2 * Code for manipulating bucket marks for garbage collection.
4 * Copyright 2014 Datera, Inc.
7 * - free bucket: mark == 0
8 * The bucket contains no data and will not be read
10 * - allocator bucket: owned_by_allocator == 1
11 * The bucket is on a free list, or it is an open bucket
13 * - cached bucket: owned_by_allocator == 0 &&
14 * dirty_sectors == 0 &&
16 * The bucket contains data but may be safely discarded as there are
17 * enough replicas of the data on other cache devices, or it has been
18 * written back to the backing device
20 * - dirty bucket: owned_by_allocator == 0 &&
22 * The bucket contains data that we must not discard (either the only copy,
23 * or one of the 'main copies' for data requiring multiple replicas)
25 * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
26 * This is a btree node, journal or gen/prio bucket
30 * bucket invalidated => bucket on freelist => open bucket =>
31 * [dirty bucket =>] cached bucket => bucket invalidated => ...
33 * Note that cache promotion can skip the dirty bucket step, as data
34 * is copied from a deeper tier to a shallower tier, onto a cached
36 * Note also that a cached bucket can spontaneously become dirty --
39 * Only a traversal of the key space can determine whether a bucket is
40 * truly dirty or cached.
44 * - free => allocator: bucket was invalidated
45 * - cached => allocator: bucket was invalidated
47 * - allocator => dirty: open bucket was filled up
48 * - allocator => cached: open bucket was filled up
49 * - allocator => metadata: metadata was allocated
51 * - dirty => cached: dirty sectors were copied to a deeper tier
52 * - dirty => free: dirty sectors were overwritten or moved (copy gc)
53 * - cached => free: cached sectors were overwritten
55 * - metadata => free: metadata was freed
58 * - cached => dirty: a device was removed so formerly replicated data
59 * is no longer sufficiently replicated
60 * - free => cached: cannot happen
61 * - free => dirty: cannot happen
62 * - free => metadata: cannot happen
66 #include "alloc_background.h"
69 #include "btree_update.h"
75 #include <linux/preempt.h>
76 #include <trace/events/bcachefs.h>
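/*
 * Illustrative sketch only (not used by the code below): the bucket states
 * described in the comment above can be read off a bucket_mark. The enum and
 * helper are local to this example and are not part of the bcachefs API;
 * only the bucket_mark fields and BCH_DATA_* values are real.
 */
enum example_bucket_state {
	EXAMPLE_BUCKET_FREE,
	EXAMPLE_BUCKET_ALLOCATOR,
	EXAMPLE_BUCKET_CACHED,
	EXAMPLE_BUCKET_DIRTY,
	EXAMPLE_BUCKET_METADATA,
};

static inline enum example_bucket_state
example_bucket_state(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return EXAMPLE_BUCKET_ALLOCATOR;	/* on a freelist, or an open bucket */
	if (m.data_type == BCH_DATA_SB ||
	    m.data_type == BCH_DATA_JOURNAL ||
	    m.data_type == BCH_DATA_BTREE)
		return EXAMPLE_BUCKET_METADATA;
	if (m.dirty_sectors)
		return EXAMPLE_BUCKET_DIRTY;		/* must not be discarded */
	if (m.cached_sectors)
		return EXAMPLE_BUCKET_CACHED;		/* discardable copy */
	return EXAMPLE_BUCKET_FREE;
}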
78 static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
82 #define lg_local_lock lg_global_lock
83 #define lg_local_unlock lg_global_unlock
85 static void bch2_fs_stats_verify(struct bch_fs *c)
87 struct bch_fs_usage stats = __bch2_fs_usage_read(c, 0);
90 for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
91 for (j = 0; j < ARRAY_SIZE(stats.replicas[i].data); j++)
92 if ((s64) stats.replicas[i].data[j] < 0)
93 panic("replicas %u %s sectors underflow: %lli\n",
94 i + 1, bch2_data_types[j],
95 stats.replicas[i].data[j]);
97 if ((s64) stats.replicas[i].persistent_reserved < 0)
98 panic("replicas %u reserved underflow: %lli\n",
99 i + 1, stats.replicas[i].persistent_reserved);
102 for (j = 0; j < ARRAY_SIZE(stats.buckets); j++)
103 if ((s64) stats.buckets[j] < 0)
104 panic("%s buckets underflow: %lli\n",
108 if ((s64) stats.s.online_reserved < 0)
109 panic("sectors_online_reserved underflow: %lli\n",
110 stats.s.online_reserved);
113 static void bch2_dev_stats_verify(struct bch_dev *ca)
115 struct bch_dev_usage stats =
116 __bch2_dev_usage_read(ca);
117 u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
120 for (i = 0; i < ARRAY_SIZE(stats.buckets); i++)
121 BUG_ON(stats.buckets[i] > n);
122 BUG_ON(stats.buckets_alloc > n);
123 BUG_ON(stats.buckets_unavailable > n);
126 static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
128 if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
129 u64 used = __bch2_fs_sectors_used(c, bch2_fs_usage_read(c));
131 u64 avail = atomic64_read(&c->sectors_available);
134 for_each_possible_cpu(cpu)
135 cached += per_cpu_ptr(c->pcpu, cpu)->sectors_available;
137 if (used + avail + cached > c->capacity)
138 panic("used %llu avail %llu cached %llu capacity %llu\n",
139 used, avail, cached, c->capacity);
145 static void bch2_fs_stats_verify(struct bch_fs *c) {}
146 static void bch2_dev_stats_verify(struct bch_dev *ca) {}
147 static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
152 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
155 void bch2_bucket_seq_cleanup(struct bch_fs *c)
157 u64 journal_seq = atomic64_read(&c->journal.seq);
158 u16 last_seq_ondisk = c->journal.last_seq_ondisk;
160 struct bucket_array *buckets;
162 struct bucket_mark m;
165 if (journal_seq - c->last_bucket_seq_cleanup <
166 (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
169 c->last_bucket_seq_cleanup = journal_seq;
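	/*
	 * Note: a bucket_mark only holds the low BUCKET_JOURNAL_SEQ_BITS of
	 * the 64-bit journal sequence number, so comparisons against
	 * last_seq_ondisk are modular; the check above rate-limits this sweep
	 * to roughly once per quarter of that window, which is enough to
	 * clear stale journal_seq values before they could alias current
	 * ones.
	 */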
171 for_each_member_device(ca, c, i) {
172 down_read(&ca->bucket_lock);
173 buckets = bucket_array(ca);
175 for_each_bucket(g, buckets) {
176 bucket_cmpxchg(g, m, ({
177 if (!m.journal_seq_valid ||
178 bucket_needs_journal_commit(m, last_seq_ondisk))
181 m.journal_seq_valid = 0;
184 up_read(&ca->bucket_lock);
188 #define bch2_usage_add(_acc, _stats) \
190 typeof(_acc) _a = (_acc), _s = (_stats); \
193 for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
194 ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
197 #define bch2_usage_read_raw(_stats) \
199 typeof(*this_cpu_ptr(_stats)) _acc; \
202 memset(&_acc, 0, sizeof(_acc)); \
204 for_each_possible_cpu(cpu) \
205 bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
210 struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
212 return bch2_usage_read_raw(ca->usage[gc]);
215 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
217 return bch2_usage_read_raw(ca->usage[0]);
220 struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
222 return bch2_usage_read_raw(c->usage[gc]);
225 struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
227 return bch2_usage_read_raw(c->usage[0]);
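/*
 * Example (illustrative only): reading device usage amounts to summing every
 * CPU's copy of the counters one u64 at a time, which is what
 * bch2_usage_read_raw() above expands to. example_dev_usage_sum() is a
 * sketch, not part of the real interface.
 */
static inline struct bch_dev_usage example_dev_usage_sum(struct bch_dev *ca)
{
	struct bch_dev_usage acc;
	int cpu;

	memset(&acc, 0, sizeof(acc));

	for_each_possible_cpu(cpu) {
		u64 *dst = (u64 *) &acc;
		u64 *src = (u64 *) per_cpu_ptr(ca->usage[0], cpu);
		unsigned i;

		for (i = 0; i < sizeof(acc) / sizeof(u64); i++)
			dst[i] += src[i];
	}

	return acc;
}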
230 #define RESERVE_FACTOR 6
232 static u64 reserve_factor(u64 r)
234 return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
237 static u64 avail_factor(u64 r)
239 return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
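/*
 * Worked example with RESERVE_FACTOR == 6 (i.e. hold back roughly 1/64th):
 *
 *	reserve_factor(128) = 128 + (round_up(128, 64) >> 6) = 128 + 2 = 130
 *	avail_factor(130)   = (130 << 6) / (64 + 1) = 8320 / 65 = 128
 *
 * so avail_factor() approximately undoes reserve_factor(): of the raw free
 * space, about 64/65ths is reported as available for new reservations.
 */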
242 static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
244 return fs_usage.s.hidden +
246 reserve_factor(fs_usage.s.reserved +
247 fs_usage.s.online_reserved);
250 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
252 return min(c->capacity, __bch2_fs_sectors_used(c, fs_usage));
255 struct bch_fs_usage_short
256 bch2_fs_usage_read_short(struct bch_fs *c)
258 struct bch_fs_usage_summarized usage =
259 bch2_usage_read_raw(&c->usage[0]->s);
260 struct bch_fs_usage_short ret;
262 ret.capacity = READ_ONCE(c->capacity) - usage.hidden;
263 ret.used = min(ret.capacity, usage.data +
264 reserve_factor(usage.reserved +
265 usage.online_reserved));
266 ret.nr_inodes = usage.nr_inodes;
271 static inline int is_unavailable_bucket(struct bucket_mark m)
273 return !is_available_bucket(m);
276 static inline int is_fragmented_bucket(struct bucket_mark m,
279 if (!m.owned_by_allocator &&
280 m.data_type == BCH_DATA_USER &&
281 bucket_sectors_used(m))
282 return max_t(int, 0, (int) ca->mi.bucket_size -
283 bucket_sectors_used(m));
287 static inline enum bch_data_type bucket_type(struct bucket_mark m)
289 return m.cached_sectors && !m.dirty_sectors
294 static bool bucket_became_unavailable(struct bucket_mark old,
295 struct bucket_mark new)
297 return is_available_bucket(old) &&
298 !is_available_bucket(new);
301 void bch2_fs_usage_apply(struct bch_fs *c,
302 struct bch_fs_usage *fs_usage,
303 struct disk_reservation *disk_res,
304 struct gc_pos gc_pos)
306 s64 added = fs_usage->s.data + fs_usage->s.reserved;
307 s64 should_not_have_added;
309 percpu_rwsem_assert_held(&c->mark_lock);
312 * Not allowed to reduce sectors_available except by getting a
315 should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
316 if (WARN_ONCE(should_not_have_added > 0,
317 "disk usage increased without a reservation")) {
318 atomic64_sub(should_not_have_added, &c->sectors_available);
319 added -= should_not_have_added;
323 disk_res->sectors -= added;
324 fs_usage->s.online_reserved -= added;
327 bch2_usage_add(this_cpu_ptr(c->usage[0]), fs_usage);
329 if (gc_visited(c, gc_pos))
330 bch2_usage_add(this_cpu_ptr(c->usage[1]), fs_usage);
332 bch2_fs_stats_verify(c);
334 memset(fs_usage, 0, sizeof(*fs_usage));
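/*
 * Typical caller pattern (sketch; see bch2_mark_update() below): deltas are
 * accumulated into a stack-local struct bch_fs_usage while marking keys, then
 * folded into the percpu totals and charged against the reservation in a
 * single bch2_fs_usage_apply() call:
 *
 *	struct bch_fs_usage fs_usage = { 0 };
 *
 *	bch2_mark_key_locked(c, k, true, sectors, pos, &fs_usage, seq, 0);
 *	bch2_fs_usage_apply(c, &fs_usage, disk_res, pos);
 */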
337 static inline void account_bucket(struct bch_fs_usage *fs_usage,
338 struct bch_dev_usage *dev_usage,
339 enum bch_data_type type,
342 if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
343 fs_usage->s.hidden += size;
345 fs_usage->buckets[type] += size;
346 dev_usage->buckets[type] += nr;
349 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
350 struct bch_fs_usage *fs_usage,
351 struct bucket_mark old, struct bucket_mark new,
354 struct bch_dev_usage *dev_usage;
356 percpu_rwsem_assert_held(&c->mark_lock);
358 bch2_fs_inconsistent_on(old.data_type && new.data_type &&
359 old.data_type != new.data_type, c,
360 "different types of data in same bucket: %s, %s",
361 bch2_data_types[old.data_type],
362 bch2_data_types[new.data_type]);
364 dev_usage = this_cpu_ptr(ca->usage[gc]);
366 if (bucket_type(old))
367 account_bucket(fs_usage, dev_usage, bucket_type(old),
368 -1, -ca->mi.bucket_size);
370 if (bucket_type(new))
371 account_bucket(fs_usage, dev_usage, bucket_type(new),
372 1, ca->mi.bucket_size);
374 dev_usage->buckets_alloc +=
375 (int) new.owned_by_allocator - (int) old.owned_by_allocator;
376 dev_usage->buckets_ec +=
377 (int) new.stripe - (int) old.stripe;
378 dev_usage->buckets_unavailable +=
379 is_unavailable_bucket(new) - is_unavailable_bucket(old);
381 dev_usage->sectors[old.data_type] -= old.dirty_sectors;
382 dev_usage->sectors[new.data_type] += new.dirty_sectors;
383 dev_usage->sectors[BCH_DATA_CACHED] +=
384 (int) new.cached_sectors - (int) old.cached_sectors;
385 dev_usage->sectors_fragmented +=
386 is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
388 if (!is_available_bucket(old) && is_available_bucket(new))
389 bch2_wake_allocator(ca);
391 bch2_dev_stats_verify(ca);
394 void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
396 struct bucket_mark old = { .v.counter = 0 };
397 struct bch_fs_usage *fs_usage;
398 struct bucket_array *buckets;
401 percpu_down_read_preempt_disable(&c->mark_lock);
402 fs_usage = this_cpu_ptr(c->usage[0]);
403 buckets = bucket_array(ca);
405 for_each_bucket(g, buckets)
406 if (g->mark.data_type)
407 bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
408 percpu_up_read_preempt_enable(&c->mark_lock);
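/*
 * bucket_data_cmpxchg() below wraps bucket_cmpxchg(): it atomically applies
 * @expr to the bucket's mark and then feeds the old and new marks to
 * bch2_dev_usage_update() so the per-device counters track the change.
 */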
411 #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \
413 struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
415 bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc); \
419 static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
420 size_t b, struct bucket_mark *old,
423 struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
424 struct bucket *g = __bucket(ca, b, gc);
425 struct bucket_mark new;
427 *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
428 BUG_ON(!is_available_bucket(new));
430 new.owned_by_allocator = 1;
432 new.cached_sectors = 0;
433 new.dirty_sectors = 0;
437 fs_usage->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
438 fs_usage->s.cached -= old->cached_sectors;
441 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
442 size_t b, struct bucket_mark *old)
444 percpu_rwsem_assert_held(&c->mark_lock);
446 __bch2_invalidate_bucket(c, ca, b, old, false);
448 if (!old->owned_by_allocator && old->cached_sectors)
449 trace_invalidate(ca, bucket_to_sector(ca, b),
450 old->cached_sectors);
453 static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
454 size_t b, bool owned_by_allocator,
457 struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
458 struct bucket *g = __bucket(ca, b, gc);
459 struct bucket_mark old, new;
461 old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
462 new.owned_by_allocator = owned_by_allocator;
466 !owned_by_allocator && !old.owned_by_allocator);
469 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
470 size_t b, bool owned_by_allocator,
471 struct gc_pos pos, unsigned flags)
473 percpu_rwsem_assert_held(&c->mark_lock);
475 if (!(flags & BCH_BUCKET_MARK_GC))
476 __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
478 if ((flags & BCH_BUCKET_MARK_GC) ||
480 __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
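/*
 * checked_add() stores the sum back into what is typically a narrow
 * bucket_mark bitfield (e.g. dirty_sectors) and BUG()s if the stored value no
 * longer compares equal to the full-width sum, i.e. if the field overflowed.
 */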
483 #define checked_add(a, b) \
485 unsigned _res = (unsigned) (a) + (b); \
486 (a) = _res; \
487 BUG_ON((a) != _res); \
490 static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
491 size_t b, enum bch_data_type type,
492 unsigned sectors, bool gc)
494 struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
495 struct bucket *g = __bucket(ca, b, gc);
496 struct bucket_mark new;
498 BUG_ON(type != BCH_DATA_SB &&
499 type != BCH_DATA_JOURNAL);
501 bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
502 new.data_type = type;
503 checked_add(new.dirty_sectors, sectors);
506 if (type == BCH_DATA_BTREE ||
507 type == BCH_DATA_USER)
508 fs_usage->s.data += sectors;
509 fs_usage->replicas[0].data[type] += sectors;
512 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
513 size_t b, enum bch_data_type type,
514 unsigned sectors, struct gc_pos pos,
517 BUG_ON(type != BCH_DATA_SB &&
518 type != BCH_DATA_JOURNAL);
521 percpu_rwsem_assert_held(&c->mark_lock);
523 if (!(flags & BCH_BUCKET_MARK_GC))
524 __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
526 if ((flags & BCH_BUCKET_MARK_GC) ||
528 __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
532 struct bucket_mark old, new;
537 old = bucket_cmpxchg(g, new, ({
538 new.data_type = type;
539 checked_add(new.dirty_sectors, sectors);
546 static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
551 * marking a new extent, which _will have size_ @delta
553 * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
554 * case, we haven't actually created the key we'll be inserting
555 * yet (for the split) - so we don't want to be using
556 * k->size/crc.live_size here:
558 return __ptr_disk_sectors(p, delta);
560 BUG_ON(-delta > p.crc.live_size);
562 return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
563 (s64) ptr_disk_sectors(p);
568 * Checking against gc's position has to be done here, inside the cmpxchg()
569 * loop, to avoid racing with the start of gc clearing all the marks - GC does
570 * that with the gc pos seqlock held.
572 static void bch2_mark_pointer(struct bch_fs *c,
573 struct extent_ptr_decoded p,
574 s64 sectors, enum bch_data_type data_type,
575 struct bch_fs_usage *fs_usage,
576 unsigned journal_seq, unsigned flags,
579 struct bucket_mark old, new;
580 struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
581 size_t b = PTR_BUCKET_NR(ca, &p.ptr);
582 struct bucket *g = __bucket(ca, b, gc);
585 v = atomic64_read(&g->_mark.v);
587 new.v.counter = old.v.counter = v;
590 * Check this after reading bucket mark to guard against
591 * the allocator invalidating a bucket after we've already
594 if (gen_after(new.gen, p.ptr.gen)) {
595 BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
596 EBUG_ON(!p.ptr.cached &&
597 test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
602 checked_add(new.dirty_sectors, sectors);
604 checked_add(new.cached_sectors, sectors);
606 if (!new.dirty_sectors &&
607 !new.cached_sectors) {
611 new.journal_seq_valid = 1;
612 new.journal_seq = journal_seq;
615 new.data_type = data_type;
618 if (flags & BCH_BUCKET_MARK_NOATOMIC) {
622 } while ((v = atomic64_cmpxchg(&g->_mark.v,
624 new.v.counter)) != old.v.counter);
626 bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
628 BUG_ON(!gc && bucket_became_unavailable(old, new));
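/*
 * Worked example for the parity accounting below (illustrative numbers): with
 * nr_blocks == 6 and nr_redundant == 2 there are nr_data == 4 data blocks, so
 * marking sectors == 8 data sectors charges an extra
 * DIV_ROUND_UP(8 * 2, 4) == 4 parity sectors to the extent's disk usage.
 */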
631 static int bch2_mark_stripe_ptr(struct bch_fs *c,
632 struct bch_extent_stripe_ptr p,
633 s64 sectors, unsigned flags,
634 s64 *adjusted_disk_sectors,
635 unsigned *redundancy,
639 unsigned old, new, nr_data;
640 int blocks_nonempty_delta;
643 m = genradix_ptr(&c->stripes[gc], p.idx);
645 if (!m || !m->alive) {
646 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
651 nr_data = m->nr_blocks - m->nr_redundant;
653 parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
656 parity_sectors = -parity_sectors;
658 *adjusted_disk_sectors += parity_sectors;
660 *redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
662 new = atomic_add_return(sectors, &m->block_sectors[p.block]);
665 blocks_nonempty_delta = (int) !!new - (int) !!old;
666 if (!blocks_nonempty_delta)
669 atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
671 BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
674 bch2_stripes_heap_update(c, m, p.idx);
679 static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
680 s64 sectors, enum bch_data_type data_type,
681 struct bch_fs_usage *fs_usage,
682 unsigned journal_seq, unsigned flags,
685 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
686 const union bch_extent_entry *entry;
687 struct extent_ptr_decoded p;
688 s64 cached_sectors = 0;
689 s64 dirty_sectors = 0;
691 unsigned replicas = 0;
692 unsigned ec_redundancy = 0;
698 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
699 s64 disk_sectors = data_type == BCH_DATA_BTREE
701 : ptr_disk_sectors_delta(p, sectors);
702 s64 adjusted_disk_sectors = disk_sectors;
704 bch2_mark_pointer(c, p, disk_sectors, data_type,
705 fs_usage, journal_seq, flags, gc);
708 for (i = 0; i < p.ec_nr; i++) {
709 ret = bch2_mark_stripe_ptr(c, p.ec[i],
711 &adjusted_disk_sectors,
720 cached_sectors += adjusted_disk_sectors;
722 dirty_sectors += adjusted_disk_sectors;
724 ec_sectors += adjusted_disk_sectors;
727 replicas = clamp_t(unsigned, replicas,
728 1, ARRAY_SIZE(fs_usage->replicas));
729 ec_redundancy = clamp_t(unsigned, ec_redundancy,
730 1, ARRAY_SIZE(fs_usage->replicas));
732 fs_usage->s.cached += cached_sectors;
733 fs_usage->replicas[0].data[BCH_DATA_CACHED] += cached_sectors;
735 fs_usage->s.data += dirty_sectors;
736 fs_usage->replicas[replicas - 1].data[data_type] += dirty_sectors;
738 fs_usage->s.data += ec_sectors;
739 fs_usage->replicas[ec_redundancy - 1].ec_data += ec_sectors;
744 static void bucket_set_stripe(struct bch_fs *c,
745 const struct bch_stripe *v,
747 struct bch_fs_usage *fs_usage,
753 for (i = 0; i < v->nr_blocks; i++) {
754 const struct bch_extent_ptr *ptr = v->ptrs + i;
755 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
756 size_t b = PTR_BUCKET_NR(ca, ptr);
757 struct bucket *g = __bucket(ca, b, gc);
758 struct bucket_mark new, old;
760 BUG_ON(ptr_stale(ca, ptr));
762 old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
763 new.stripe = enabled;
765 new.journal_seq_valid = 1;
766 new.journal_seq = journal_seq;
770 BUG_ON(old.stripe == enabled);
774 static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
776 struct bch_fs_usage *fs_usage,
777 u64 journal_seq, unsigned flags,
780 struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
781 size_t idx = s.k->p.offset;
782 struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
785 if (!m || (!inserting && !m->alive)) {
786 bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
791 if (inserting && m->alive) {
792 bch_err_ratelimited(c, "error marking stripe %zu: already exists",
797 BUG_ON(atomic_read(&m->blocks_nonempty));
799 for (i = 0; i < EC_STRIPE_MAX; i++)
800 BUG_ON(atomic_read(&m->block_sectors[i]));
803 m->sectors = le16_to_cpu(s.v->sectors);
804 m->algorithm = s.v->algorithm;
805 m->nr_blocks = s.v->nr_blocks;
806 m->nr_redundant = s.v->nr_redundant;
811 bch2_stripes_heap_insert(c, m, idx);
813 bch2_stripes_heap_del(c, m, idx);
815 m->alive = inserting;
818 bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
822 static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
823 bool inserting, s64 sectors,
824 struct bch_fs_usage *fs_usage,
825 unsigned journal_seq, unsigned flags,
831 case KEY_TYPE_btree_ptr:
832 ret = bch2_mark_extent(c, k, inserting
833 ? c->opts.btree_node_size
834 : -c->opts.btree_node_size,
836 fs_usage, journal_seq, flags, gc);
838 case KEY_TYPE_extent:
839 ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
840 fs_usage, journal_seq, flags, gc);
842 case KEY_TYPE_stripe:
843 ret = bch2_mark_stripe(c, k, inserting,
844 fs_usage, journal_seq, flags, gc);
848 fs_usage->s.nr_inodes++;
850 fs_usage->s.nr_inodes--;
852 case KEY_TYPE_reservation: {
853 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
856 replicas = clamp_t(unsigned, replicas,
857 1, ARRAY_SIZE(fs_usage->replicas));
859 fs_usage->s.reserved += sectors;
860 fs_usage->replicas[replicas - 1].persistent_reserved += sectors;
870 int bch2_mark_key_locked(struct bch_fs *c,
872 bool inserting, s64 sectors,
874 struct bch_fs_usage *fs_usage,
875 u64 journal_seq, unsigned flags)
879 if (!(flags & BCH_BUCKET_MARK_GC)) {
880 ret = __bch2_mark_key(c, k, inserting, sectors,
881 fs_usage ?: this_cpu_ptr(c->usage[0]),
882 journal_seq, flags, false);
887 if ((flags & BCH_BUCKET_MARK_GC) ||
888 gc_visited(c, pos)) {
889 ret = __bch2_mark_key(c, k, inserting, sectors,
890 this_cpu_ptr(c->usage[1]),
891 journal_seq, flags, true);
899 int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
900 bool inserting, s64 sectors,
902 struct bch_fs_usage *fs_usage,
903 u64 journal_seq, unsigned flags)
907 percpu_down_read_preempt_disable(&c->mark_lock);
908 ret = bch2_mark_key_locked(c, k, inserting, sectors,
909 pos, fs_usage, journal_seq, flags);
910 percpu_up_read_preempt_enable(&c->mark_lock);
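/*
 * Note on the two targets above: usage[0]/stripes[0] hold the live
 * accounting, usage[1]/stripes[1] hold the copy GC is rebuilding. While GC is
 * running, a normal update at a position GC has already passed
 * (gc_visited()) must be marked into both copies; marking done by GC itself
 * (BCH_BUCKET_MARK_GC) only goes to the gc copy.
 */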
915 void bch2_mark_update(struct btree_insert *trans,
916 struct btree_insert_entry *insert)
918 struct bch_fs *c = trans->c;
919 struct btree_iter *iter = insert->iter;
920 struct btree *b = iter->l[0].b;
921 struct btree_node_iter node_iter = iter->l[0].iter;
922 struct bch_fs_usage fs_usage = { 0 };
923 struct gc_pos pos = gc_pos_btree_node(b);
924 struct bkey_packed *_k;
926 if (!btree_node_type_needs_gc(iter->btree_id))
929 percpu_down_read_preempt_disable(&c->mark_lock);
931 if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
932 bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
933 bpos_min(insert->k->k.p, b->key.k.p).offset -
934 bkey_start_offset(&insert->k->k),
935 pos, &fs_usage, trans->journal_res.seq, 0);
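	/*
	 * Walk the existing keys that the new key overlaps and mark the
	 * overwritten portions with negative sector counts. Example
	 * (illustrative offsets): if an existing extent covers [20, 30) and
	 * the new key covers [25, 35), that's BCH_EXTENT_OVERLAP_BACK and the
	 * old key is marked with 25 - 30 == -5 sectors.
	 */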
937 while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
938 KEY_TYPE_discard))) {
939 struct bkey unpacked;
943 k = bkey_disassemble(b, _k, &unpacked);
945 if (btree_node_is_extents(b)
946 ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
947 : bkey_cmp(insert->k->k.p, k.k->p))
950 if (btree_node_is_extents(b)) {
951 switch (bch2_extent_overlap(&insert->k->k, k.k)) {
952 case BCH_EXTENT_OVERLAP_ALL:
953 sectors = -((s64) k.k->size);
955 case BCH_EXTENT_OVERLAP_BACK:
956 sectors = bkey_start_offset(&insert->k->k) -
959 case BCH_EXTENT_OVERLAP_FRONT:
960 sectors = bkey_start_offset(k.k) -
961 insert->k->k.p.offset;
963 case BCH_EXTENT_OVERLAP_MIDDLE:
964 sectors = k.k->p.offset - insert->k->k.p.offset;
965 BUG_ON(sectors <= 0);
967 bch2_mark_key_locked(c, k, true, sectors,
968 pos, &fs_usage, trans->journal_res.seq, 0);
970 sectors = bkey_start_offset(&insert->k->k) -
975 BUG_ON(sectors >= 0);
978 bch2_mark_key_locked(c, k, false, sectors,
979 pos, &fs_usage, trans->journal_res.seq, 0);
981 bch2_btree_node_iter_advance(&node_iter, b);
984 bch2_fs_usage_apply(c, &fs_usage, trans->disk_res, pos);
986 percpu_up_read_preempt_enable(&c->mark_lock);
989 /* Disk reservations: */
991 static u64 bch2_recalc_sectors_available(struct bch_fs *c)
995 for_each_possible_cpu(cpu)
996 per_cpu_ptr(c->pcpu, cpu)->sectors_available = 0;
998 return avail_factor(bch2_fs_sectors_free(c));
1001 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
1003 percpu_down_read_preempt_disable(&c->mark_lock);
1004 this_cpu_sub(c->usage[0]->s.online_reserved,
1007 bch2_fs_stats_verify(c);
1008 percpu_up_read_preempt_enable(&c->mark_lock);
1013 #define SECTORS_CACHE 1024
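/*
 * Fast path sketch: each CPU keeps a small cache of pre-reserved sectors in
 * pcpu->sectors_available, so most reservations never touch the shared
 * c->sectors_available atomic; when the cache runs dry, the cmpxchg loop
 * below pulls the request plus up to SECTORS_CACHE more from the global
 * counter in one go.
 */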
1015 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
1016 unsigned sectors, int flags)
1018 struct bch_fs_pcpu *pcpu;
1020 s64 sectors_available;
1023 percpu_down_read_preempt_disable(&c->mark_lock);
1024 pcpu = this_cpu_ptr(c->pcpu);
1026 if (sectors <= pcpu->sectors_available)
1029 v = atomic64_read(&c->sectors_available);
1032 get = min((u64) sectors + SECTORS_CACHE, old);
1034 if (get < sectors) {
1035 percpu_up_read_preempt_enable(&c->mark_lock);
1038 } while ((v = atomic64_cmpxchg(&c->sectors_available,
1039 old, old - get)) != old);
1041 pcpu->sectors_available += get;
1044 pcpu->sectors_available -= sectors;
1045 this_cpu_add(c->usage[0]->s.online_reserved, sectors);
1046 res->sectors += sectors;
1048 bch2_disk_reservations_verify(c, flags);
1049 bch2_fs_stats_verify(c);
1050 percpu_up_read_preempt_enable(&c->mark_lock);
1055 * GC recalculates sectors_available when it starts, so that hopefully
1056 * we don't normally end up blocking here:
1060 * Piss fuck, we can be called from extent_insert_fixup() with btree
1064 if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
1065 if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
1066 down_read(&c->gc_lock);
1067 else if (!down_read_trylock(&c->gc_lock))
1071 percpu_down_write(&c->mark_lock);
1072 sectors_available = bch2_recalc_sectors_available(c);
1074 if (sectors <= sectors_available ||
1075 (flags & BCH_DISK_RESERVATION_NOFAIL)) {
1076 atomic64_set(&c->sectors_available,
1077 max_t(s64, 0, sectors_available - sectors));
1078 this_cpu_add(c->usage[0]->s.online_reserved, sectors);
1079 res->sectors += sectors;
1082 bch2_disk_reservations_verify(c, flags);
1084 atomic64_set(&c->sectors_available, sectors_available);
1088 bch2_fs_stats_verify(c);
1089 percpu_up_write(&c->mark_lock);
1091 if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
1092 up_read(&c->gc_lock);
1097 /* Startup/shutdown: */
1099 static void buckets_free_rcu(struct rcu_head *rcu)
1101 struct bucket_array *buckets =
1102 container_of(rcu, struct bucket_array, rcu);
1105 sizeof(struct bucket_array) +
1106 buckets->nbuckets * sizeof(struct bucket));
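/*
 * Resizing a device's bucket arrays: allocate the new arrays and fifos up
 * front, then with gc_lock, bucket_lock and mark_lock held copy the old
 * contents across and swap the pointers. The old bucket_array is freed via
 * call_rcu() (buckets_free_rcu() above) so lockless readers under
 * rcu_read_lock() stay safe.
 */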
1109 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1111 struct bucket_array *buckets = NULL, *old_buckets = NULL;
1112 unsigned long *buckets_nouse = NULL;
1113 unsigned long *buckets_written = NULL;
1114 u8 *oldest_gens = NULL;
1115 alloc_fifo free[RESERVE_NR];
1116 alloc_fifo free_inc;
1117 alloc_heap alloc_heap;
1118 copygc_heap copygc_heap;
1120 size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
1121 ca->mi.bucket_size / c->opts.btree_node_size);
1122 /* XXX: these should be tunable */
1123 size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
1124 size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
1125 size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
1127 bool resize = ca->buckets[0] != NULL,
1128 start_copygc = ca->copygc_thread != NULL;
1132 memset(&free, 0, sizeof(free));
1133 memset(&free_inc, 0, sizeof(free_inc));
1134 memset(&alloc_heap, 0, sizeof(alloc_heap));
1135 memset(&copygc_heap, 0, sizeof(copygc_heap));
1137 if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
1138 nbuckets * sizeof(struct bucket),
1139 GFP_KERNEL|__GFP_ZERO)) ||
1140 !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
1141 GFP_KERNEL|__GFP_ZERO)) ||
1142 !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1143 sizeof(unsigned long),
1144 GFP_KERNEL|__GFP_ZERO)) ||
1145 !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1146 sizeof(unsigned long),
1147 GFP_KERNEL|__GFP_ZERO)) ||
1148 !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
1149 !init_fifo(&free[RESERVE_MOVINGGC],
1150 copygc_reserve, GFP_KERNEL) ||
1151 !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1152 !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
1153 !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
1154 !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
1157 buckets->first_bucket = ca->mi.first_bucket;
1158 buckets->nbuckets = nbuckets;
1160 bch2_copygc_stop(ca);
1163 down_write(&c->gc_lock);
1164 down_write(&ca->bucket_lock);
1165 percpu_down_write(&c->mark_lock);
1168 old_buckets = bucket_array(ca);
1171 size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
1175 n * sizeof(struct bucket));
1179 memcpy(buckets_nouse,
1181 BITS_TO_LONGS(n) * sizeof(unsigned long));
1182 memcpy(buckets_written,
1183 ca->buckets_written,
1184 BITS_TO_LONGS(n) * sizeof(unsigned long));
1187 rcu_assign_pointer(ca->buckets[0], buckets);
1188 buckets = old_buckets;
1190 swap(ca->oldest_gens, oldest_gens);
1191 swap(ca->buckets_nouse, buckets_nouse);
1192 swap(ca->buckets_written, buckets_written);
1195 percpu_up_write(&c->mark_lock);
1197 spin_lock(&c->freelist_lock);
1198 for (i = 0; i < RESERVE_NR; i++) {
1199 fifo_move(&free[i], &ca->free[i]);
1200 swap(ca->free[i], free[i]);
1202 fifo_move(&free_inc, &ca->free_inc);
1203 swap(ca->free_inc, free_inc);
1204 spin_unlock(&c->freelist_lock);
1206 /* with gc lock held, alloc_heap can't be in use: */
1207 swap(ca->alloc_heap, alloc_heap);
1209 /* and we shut down copygc: */
1210 swap(ca->copygc_heap, copygc_heap);
1212 nbuckets = ca->mi.nbuckets;
1215 up_write(&ca->bucket_lock);
1216 up_write(&c->gc_lock);
1220 bch2_copygc_start(c, ca))
1221 bch_err(ca, "error restarting copygc thread");
1225 free_heap(&copygc_heap);
1226 free_heap(&alloc_heap);
1227 free_fifo(&free_inc);
1228 for (i = 0; i < RESERVE_NR; i++)
1229 free_fifo(&free[i]);
1230 kvpfree(buckets_nouse,
1231 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1232 kvpfree(buckets_written,
1233 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1234 kvpfree(oldest_gens,
1235 nbuckets * sizeof(u8));
1237 call_rcu(&old_buckets->rcu, buckets_free_rcu);
1242 void bch2_dev_buckets_free(struct bch_dev *ca)
1246 free_heap(&ca->copygc_heap);
1247 free_heap(&ca->alloc_heap);
1248 free_fifo(&ca->free_inc);
1249 for (i = 0; i < RESERVE_NR; i++)
1250 free_fifo(&ca->free[i]);
1251 kvpfree(ca->buckets_written,
1252 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1253 kvpfree(ca->buckets_nouse,
1254 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1255 kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
1256 kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
1257 sizeof(struct bucket_array) +
1258 ca->mi.nbuckets * sizeof(struct bucket));
1260 free_percpu(ca->usage[0]);
1263 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
1265 if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
1268 return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);