/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 *
 * Bucket states:
 * - free bucket: mark == 0
 *   The bucket contains no data and will not be read
 *
 * - allocator bucket: owned_by_allocator == 1
 *   The bucket is on a free list, or it is an open bucket
 *
 * - cached bucket: owned_by_allocator == 0 &&
 *   dirty_sectors == 0 &&
 *   cached_sectors > 0
 *   The bucket contains data but may be safely discarded as there are
 *   enough replicas of the data on other cache devices, or it has been
 *   written back to the backing device
 *
 * - dirty bucket: owned_by_allocator == 0 &&
 *   dirty_sectors > 0
 *   The bucket contains data that we must not discard (either only copy,
 *   or one of the 'main copies' for data requiring multiple replicas)
 *
 * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
 *   This is a btree node, journal or gen/prio bucket
 *
 * Lifecycle:
 *
 * bucket invalidated => bucket on freelist => open bucket =>
 *     [dirty bucket =>] cached bucket => bucket invalidated => ...
 *
 * Note that cache promotion can skip the dirty bucket step, as data
 * is copied from a deeper tier to a shallower tier, onto a cached
 * bucket.
 * Note also that a cached bucket can spontaneously become dirty --
 * see below.
 *
 * Only a traversal of the key space can determine whether a bucket is
 * truly dirty or cached.
 *
 * Transitions:
 *
 * - free => allocator: bucket was invalidated
 * - cached => allocator: bucket was invalidated
 *
 * - allocator => dirty: open bucket was filled up
 * - allocator => cached: open bucket was filled up
 * - allocator => metadata: metadata was allocated
 *
 * - dirty => cached: dirty sectors were copied to a deeper tier
 * - dirty => free: dirty sectors were overwritten or moved (copy gc)
 * - cached => free: cached sectors were overwritten
 *
 * - metadata => free: metadata was freed
 *
 * pathological:
 * - cached => dirty: a device was removed so formerly replicated data
 *		      is no longer sufficiently replicated
 * - free => cached: cannot happen
 * - free => dirty: cannot happen
 * - free => metadata: cannot happen
 */
#include "bcachefs.h"
#include "alloc_background.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "ec.h"
#include "error.h"
#include "movinggc.h"
#include "replicas.h"

#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
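
/*
 * Illustrative sketch only -- this helper is not part of the original file or
 * the bcachefs API; it is added here purely to show how the bucket states in
 * the comment above map onto the bucket_mark fields that the marking code
 * below manipulates.
 */
static inline const char *bucket_state_name(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return "allocator";	/* on a freelist, or an open bucket */
	if (m.data_type == BCH_DATA_SB ||
	    m.data_type == BCH_DATA_JOURNAL ||
	    m.data_type == BCH_DATA_BTREE)
		return "metadata";
	if (m.dirty_sectors)
		return "dirty";		/* data we must not discard */
	if (m.cached_sectors)
		return "cached";	/* discardable, replicas exist elsewhere */
	return "free";
}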
/*
 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
 * wraparound:
 */
void bch2_bucket_seq_cleanup(struct bch_fs *c)
{
	u64 journal_seq = atomic64_read(&c->journal.seq);
	u16 last_seq_ondisk = c->journal.last_seq_ondisk;
	struct bch_dev *ca;
	struct bucket_array *buckets;
	struct bucket *g;
	struct bucket_mark m;
	unsigned i;

	if (journal_seq - c->last_bucket_seq_cleanup <
	    (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
		return;

	c->last_bucket_seq_cleanup = journal_seq;

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets) {
			bucket_cmpxchg(g, m, ({
				if (!m.journal_seq_valid ||
				    bucket_needs_journal_commit(m, last_seq_ondisk))
					break;

				m.journal_seq_valid = 0;
			}));
		}
		up_read(&ca->bucket_lock);
	}
}
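
/*
 * Sum a percpu struct of u64 counters into a single on-stack copy; used below
 * to snapshot per-device and filesystem-wide usage.
 */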
#define bch2_usage_read_raw(_stats)				\
({								\
	typeof(*this_cpu_ptr(_stats)) _acc;			\
								\
	memset(&_acc, 0, sizeof(_acc));				\
	acc_u64s_percpu((u64 *) &_acc,				\
			(u64 __percpu *) _stats,		\
			sizeof(_acc) / sizeof(u64));		\
								\
	_acc;							\
})
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
	return bch2_usage_read_raw(ca->usage[0]);
}
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
	struct bch_fs_usage *ret;
	unsigned nr = READ_ONCE(c->replicas.nr);
retry:
	ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
	if (unlikely(!ret))
		return NULL;

	percpu_down_read_preempt_disable(&c->mark_lock);

	if (unlikely(nr < c->replicas.nr)) {
		nr = c->replicas.nr;
		percpu_up_read_preempt_enable(&c->mark_lock);
		kfree(ret);
		goto retry;
	}

	acc_u64s_percpu((u64 *) ret,
			(u64 __percpu *) c->usage[0],
			sizeof(*ret) / sizeof(u64) + nr);

	return ret;
}
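
/*
 * reserve_factor() inflates a sector count by roughly 1/64th, and
 * avail_factor() scales raw space by 64/65 -- approximately its inverse.  The
 * intent, as far as the surrounding code shows, is to keep a small slack of
 * capacity out of user-visible "available" space.  Worked example with
 * RESERVE_FACTOR == 6: reserve_factor(984) == 984 + 16 == 1000, and
 * avail_factor(1000) == 64000 / 65 == 984.
 */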
#define RESERVE_FACTOR	6

static u64 reserve_factor(u64 r)
{
	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}

static u64 avail_factor(u64 r)
{
	return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
{
	return min(fs_usage.s.hidden +
		   fs_usage.s.data +
		   reserve_factor(fs_usage.s.reserved +
				  fs_usage.s.online_reserved),
		   c->capacity);
}
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_summarized usage =
		bch2_usage_read_raw(&c->usage[0]->s);
	struct bch_fs_usage_short ret;

	ret.capacity = READ_ONCE(c->capacity) - usage.hidden;
	ret.used = min(ret.capacity, usage.data +
		       reserve_factor(usage.reserved +
				      usage.online_reserved));
	ret.nr_inodes = usage.nr_inodes;

	return ret;
}
static inline int is_unavailable_bucket(struct bucket_mark m)
{
	return !is_available_bucket(m);
}
static inline int is_fragmented_bucket(struct bucket_mark m,
				       struct bch_dev *ca)
{
	if (!m.owned_by_allocator &&
	    m.data_type == BCH_DATA_USER &&
	    bucket_sectors_used(m))
		return max_t(int, 0, (int) ca->mi.bucket_size -
			     bucket_sectors_used(m));
	return 0;
}
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
	return m.cached_sectors && !m.dirty_sectors
		? BCH_DATA_CACHED
		: m.data_type;
}
static bool bucket_became_unavailable(struct bucket_mark old,
				      struct bucket_mark new)
{
	return is_available_bucket(old) &&
	       !is_available_bucket(new);
}
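
/*
 * Fold a stack-allocated delta of usage counters into the percpu running
 * totals, charging any newly added data/reserved sectors against the caller's
 * disk reservation.  Returns nonzero if usage increased without a reservation
 * to cover it.
 */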
int bch2_fs_usage_apply(struct bch_fs *c,
			struct bch_fs_usage *fs_usage,
			struct disk_reservation *disk_res,
			struct gc_pos gc_pos)
{
	s64 added = fs_usage->s.data + fs_usage->s.reserved;
	s64 should_not_have_added;
	int ret = 0;

	percpu_rwsem_assert_held(&c->mark_lock);

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
	if (WARN_ONCE(should_not_have_added > 0,
		      "disk usage increased without a reservation")) {
		atomic64_sub(should_not_have_added, &c->sectors_available);
		added -= should_not_have_added;
		ret = -1;
	}

	if (added > 0) {
		disk_res->sectors		-= added;
		fs_usage->s.online_reserved	-= added;
	}

	acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
		 (u64 *) fs_usage,
		 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);

	if (gc_visited(c, gc_pos)) {
		BUG_ON(!c->usage[1]);
		acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
			 (u64 *) fs_usage,
			 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
	}

	return ret;
}
static inline void account_bucket(struct bch_fs_usage *fs_usage,
				  struct bch_dev_usage *dev_usage,
				  enum bch_data_type type,
				  int nr, s64 size)
{
	if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
		fs_usage->s.hidden	+= size;

	dev_usage->buckets[type]	+= nr;
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
				  struct bch_fs_usage *fs_usage,
				  struct bucket_mark old, struct bucket_mark new,
				  bool gc)
{
	struct bch_dev_usage *dev_usage;

	percpu_rwsem_assert_held(&c->mark_lock);

	bch2_fs_inconsistent_on(old.data_type && new.data_type &&
				old.data_type != new.data_type, c,
		"different types of data in same bucket: %s, %s",
		bch2_data_types[old.data_type],
		bch2_data_types[new.data_type]);

	dev_usage = this_cpu_ptr(ca->usage[gc]);

	if (bucket_type(old))
		account_bucket(fs_usage, dev_usage, bucket_type(old),
			       -1, -ca->mi.bucket_size);

	if (bucket_type(new))
		account_bucket(fs_usage, dev_usage, bucket_type(new),
			       1, ca->mi.bucket_size);

	dev_usage->buckets_alloc +=
		(int) new.owned_by_allocator - (int) old.owned_by_allocator;
	dev_usage->buckets_ec +=
		(int) new.stripe - (int) old.stripe;
	dev_usage->buckets_unavailable +=
		is_unavailable_bucket(new) - is_unavailable_bucket(old);

	dev_usage->sectors[old.data_type] -= old.dirty_sectors;
	dev_usage->sectors[new.data_type] += new.dirty_sectors;
	dev_usage->sectors[BCH_DATA_CACHED] +=
		(int) new.cached_sectors - (int) old.cached_sectors;
	dev_usage->sectors_fragmented +=
		is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);

	if (!is_available_bucket(old) && is_available_bucket(new))
		bch2_wake_allocator(ca);
}
void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_mark old = { .v.counter = 0 };
	struct bch_fs_usage *fs_usage;
	struct bucket_array *buckets;
	struct bucket *g;

	percpu_down_read_preempt_disable(&c->mark_lock);
	fs_usage = this_cpu_ptr(c->usage[0]);
	buckets = bucket_array(ca);

	for_each_bucket(g, buckets)
		if (g->mark.data_type)
			bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
	percpu_up_read_preempt_enable(&c->mark_lock);
}
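
/*
 * Wrapper around bucket_cmpxchg() that also updates the per-device usage
 * counters for the old -> new mark transition and evaluates to the old mark.
 * Note that it picks up the local variable 'gc' from the caller's scope.
 */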
#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr)	\
({								\
	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);\
								\
	bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc);	\
	_old;							\
})
static inline void update_replicas(struct bch_fs *c,
				   struct bch_fs_usage *fs_usage,
				   struct bch_replicas_entry *r,
				   s64 sectors)
{
	int idx = bch2_replicas_entry_idx(c, r);

	BUG_ON(idx < 0);
	BUG_ON(!sectors);

	if (r->data_type == BCH_DATA_CACHED)
		fs_usage->s.cached	+= sectors;
	else
		fs_usage->s.data	+= sectors;
	fs_usage->data[idx]		+= sectors;
}
static inline void update_cached_sectors(struct bch_fs *c,
					 struct bch_fs_usage *fs_usage,
					 unsigned dev, s64 sectors)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	update_replicas(c, fs_usage, &r.e, sectors);
}
static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
				     size_t b, struct bucket_mark *old,
				     bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark new;

	*old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		BUG_ON(!is_available_bucket(new));

		new.owned_by_allocator	= true;
		new.data_type		= 0;
		new.cached_sectors	= 0;
		new.dirty_sectors	= 0;
		new.gen++;
	}));

	if (old->cached_sectors)
		update_cached_sectors(c, fs_usage, ca->dev_idx,
				      -old->cached_sectors);
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, struct bucket_mark *old)
{
	percpu_rwsem_assert_held(&c->mark_lock);

	__bch2_invalidate_bucket(c, ca, b, old, false);

	if (!old->owned_by_allocator && old->cached_sectors)
		trace_invalidate(ca, bucket_to_sector(ca, b),
				 old->cached_sectors);
}
static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
				     size_t b, bool owned_by_allocator,
				     bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		new.owned_by_allocator	= owned_by_allocator;
	}));

	BUG_ON(!gc &&
	       !owned_by_allocator && !old.owned_by_allocator);
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, bool owned_by_allocator,
			    struct gc_pos pos, unsigned flags)
{
	percpu_rwsem_assert_held(&c->mark_lock);

	if (!(flags & BCH_BUCKET_MARK_GC))
		__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);

	if ((flags & BCH_BUCKET_MARK_GC) ||
	    gc_visited(c, pos))
		__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
}
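
/*
 * The sector counts in struct bucket_mark are narrow bitfields; checked_add()
 * asserts that an addition did not overflow/wrap the destination field.
 */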
#define checked_add(a, b)				\
do {							\
	unsigned _res = (unsigned) (a) + (b);		\
	(a) = _res;					\
	BUG_ON((a) != _res);				\
} while (0)
static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
					size_t b, enum bch_data_type type,
					unsigned sectors, bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark new;

	BUG_ON(type != BCH_DATA_SB &&
	       type != BCH_DATA_JOURNAL);

	bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		new.data_type	= type;
		checked_add(new.dirty_sectors, sectors);
	}));
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
			       size_t b, enum bch_data_type type,
			       unsigned sectors, struct gc_pos pos,
			       unsigned flags)
{
	BUG_ON(type != BCH_DATA_SB &&
	       type != BCH_DATA_JOURNAL);

	if (likely(c)) {
		percpu_rwsem_assert_held(&c->mark_lock);

		if (!(flags & BCH_BUCKET_MARK_GC))
			__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
						    false);
		if ((flags & BCH_BUCKET_MARK_GC) ||
		    gc_visited(c, pos))
			__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
						    true);
	} else {
		struct bucket *g;
		struct bucket_mark new;

		rcu_read_lock();

		g = bucket(ca, b);
		bucket_cmpxchg(g, new, ({
			new.data_type	= type;
			checked_add(new.dirty_sectors, sectors);
		}));

		rcu_read_unlock();
	}
}
static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
				  s64 delta)
{
	if (delta > 0) {
		/*
		 * marking a new extent, which _will have size_ @delta
		 *
		 * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
		 * case, we haven't actually created the key we'll be inserting
		 * yet (for the split) - so we don't want to be using
		 * k->size/crc.live_size here:
		 */
		return __ptr_disk_sectors(p, delta);
	} else {
		BUG_ON(-delta > p.crc.live_size);

		return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
			(s64) ptr_disk_sectors(p);
	}
}
/*
 * Checking against gc's position has to be done here, inside the cmpxchg()
 * loop, to avoid racing with the start of gc clearing all the marks - GC does
 * that with the gc pos seqlock held.
 */
static void bch2_mark_pointer(struct bch_fs *c,
			      struct extent_ptr_decoded p,
			      s64 sectors, enum bch_data_type data_type,
			      struct bch_fs_usage *fs_usage,
			      unsigned journal_seq, unsigned flags,
			      bool gc)
{
	struct bucket_mark old, new;
	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
	size_t b = PTR_BUCKET_NR(ca, &p.ptr);
	struct bucket *g = __bucket(ca, b, gc);
	u64 v;

	v = atomic64_read(&g->_mark.v);
	do {
		new.v.counter = old.v.counter = v;

		/*
		 * Check this after reading bucket mark to guard against
		 * the allocator invalidating a bucket after we've already
		 * checked the gen
		 */
		if (gen_after(new.gen, p.ptr.gen)) {
			BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
			EBUG_ON(!p.ptr.cached &&
				test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
			return;
		}

		if (!p.ptr.cached)
			checked_add(new.dirty_sectors, sectors);
		else
			checked_add(new.cached_sectors, sectors);

		if (!new.dirty_sectors &&
		    !new.cached_sectors) {
			new.data_type	= 0;

			if (journal_seq) {
				new.journal_seq_valid	= 1;
				new.journal_seq		= journal_seq;
			}
		} else {
			new.data_type = data_type;
		}

		if (flags & BCH_BUCKET_MARK_NOATOMIC) {
			g->_mark = new;
			break;
		}
	} while ((v = atomic64_cmpxchg(&g->_mark.v,
			      old.v.counter,
			      new.v.counter)) != old.v.counter);

	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);

	BUG_ON(!gc && bucket_became_unavailable(old, new));
}
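
/*
 * For erasure coded data, each data block carries a proportional share of the
 * stripe's parity: below, a delta of @sectors in one data block is charged an
 * additional DIV_ROUND_UP(abs(sectors) * nr_redundant, nr_data) parity
 * sectors.  E.g. (illustrative numbers) in a 6+2 stripe, marking 4 new data
 * sectors charges DIV_ROUND_UP(4 * 2, 6) == 2 parity sectors, 6 sectors total.
 */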
static int bch2_mark_stripe_ptr(struct bch_fs *c,
				struct bch_extent_stripe_ptr p,
				enum bch_data_type data_type,
				struct bch_fs_usage *fs_usage,
				s64 sectors, unsigned flags,
				bool gc)
{
	struct stripe *m;
	unsigned old, new, nr_data;
	int blocks_nonempty_delta;
	s64 parity_sectors;

	BUG_ON(!sectors);

	m = genradix_ptr(&c->stripes[gc], p.idx);

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || !m->alive) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
				    (u64) p.idx);
		return -1;
	}

	BUG_ON(m->r.e.data_type != data_type);

	nr_data = m->nr_blocks - m->nr_redundant;

	parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);

	if (sectors < 0)
		parity_sectors = -parity_sectors;
	sectors += parity_sectors;

	old = m->block_sectors[p.block];
	m->block_sectors[p.block] += sectors;
	new = m->block_sectors[p.block];

	blocks_nonempty_delta = (int) !!new - (int) !!old;
	if (blocks_nonempty_delta) {
		m->blocks_nonempty += blocks_nonempty_delta;

		if (!gc)
			bch2_stripes_heap_update(c, m, p.idx);
	}

	m->dirty = true;

	spin_unlock(&c->ec_stripes_heap_lock);

	update_replicas(c, fs_usage, &m->r.e, sectors);

	return 0;
}
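
/*
 * Mark all pointers in an extent: cached pointers only count towards cached
 * sectors, pointers into stripes are accounted via their stripe, and the
 * remaining dirty sectors are accounted against the extent's replicas entry.
 */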
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
			    s64 sectors, enum bch_data_type data_type,
			    struct bch_fs_usage *fs_usage,
			    unsigned journal_seq, unsigned flags,
			    bool gc)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	s64 dirty_sectors = 0;
	unsigned i;
	int ret;

	r.e.data_type	= data_type;
	r.e.nr_devs	= 0;
	r.e.nr_required	= 1;

	BUG_ON(!sectors);

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors = data_type == BCH_DATA_BTREE
			? sectors
			: ptr_disk_sectors_delta(p, sectors);

		bch2_mark_pointer(c, p, disk_sectors, data_type,
				  fs_usage, journal_seq, flags, gc);

		if (p.ptr.cached) {
			update_cached_sectors(c, fs_usage, p.ptr.dev,
					      disk_sectors);
		} else if (!p.ec_nr) {
			dirty_sectors	       += disk_sectors;
			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
		} else {
			for (i = 0; i < p.ec_nr; i++) {
				ret = bch2_mark_stripe_ptr(c, p.ec[i],
						data_type, fs_usage,
						disk_sectors, flags, gc);
				if (ret)
					return ret;
			}

			r.e.nr_required = 0;
		}
	}

	if (dirty_sectors)
		update_replicas(c, fs_usage, &r.e, dirty_sectors);

	return 0;
}
static void bucket_set_stripe(struct bch_fs *c,
			      const struct bch_stripe *v,
			      bool enabled,
			      struct bch_fs_usage *fs_usage,
			      u64 journal_seq,
			      bool gc)
{
	unsigned i;

	for (i = 0; i < v->nr_blocks; i++) {
		const struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		size_t b = PTR_BUCKET_NR(ca, ptr);
		struct bucket *g = __bucket(ca, b, gc);
		struct bucket_mark new, old;

		BUG_ON(ptr_stale(ca, ptr));

		old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
			new.stripe			= enabled;
			if (journal_seq) {
				new.journal_seq_valid	= 1;
				new.journal_seq		= journal_seq;
			}
		}));
	}
}
static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
			    bool inserting,
			    struct bch_fs_usage *fs_usage,
			    u64 journal_seq, unsigned flags,
			    bool gc)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	size_t idx = s.k->p.offset;
	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
	unsigned i;

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || (!inserting && !m->alive)) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
				    idx);
		return -1;
	}

	if (!gc && m->alive)
		bch2_stripes_heap_del(c, m, idx);

	memset(m, 0, sizeof(*m));

	if (inserting) {
		m->sectors	= le16_to_cpu(s.v->sectors);
		m->algorithm	= s.v->algorithm;
		m->nr_blocks	= s.v->nr_blocks;
		m->nr_redundant	= s.v->nr_redundant;

		memset(&m->r, 0, sizeof(m->r));

		m->r.e.data_type	= BCH_DATA_USER;
		m->r.e.nr_devs		= s.v->nr_blocks;
		m->r.e.nr_required	= s.v->nr_blocks - s.v->nr_redundant;

		for (i = 0; i < s.v->nr_blocks; i++)
			m->r.e.devs[i] = s.v->ptrs[i].dev;

		/*
		 * XXX: account for stripes somehow here
		 */
#if 0
		update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
#endif

		/* gc recalculates these fields: */
		if (!(flags & BCH_BUCKET_MARK_GC)) {
			for (i = 0; i < s.v->nr_blocks; i++) {
				m->block_sectors[i] =
					stripe_blockcount_get(s.v, i);
				m->blocks_nonempty += !!m->block_sectors[i];
			}
		}

		if (!gc)
			bch2_stripes_heap_insert(c, m, idx);
		else
			m->alive = true;
	}

	spin_unlock(&c->ec_stripes_heap_lock);

	bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
	return 0;
}
static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
			   bool inserting, s64 sectors,
			   struct bch_fs_usage *fs_usage,
			   unsigned journal_seq, unsigned flags,
			   bool gc)
{
	int ret = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
		ret = bch2_mark_extent(c, k, inserting
				       ?  c->opts.btree_node_size
				       : -c->opts.btree_node_size,
				       BCH_DATA_BTREE,
				       fs_usage, journal_seq, flags, gc);
		break;
	case KEY_TYPE_extent:
		ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
				       fs_usage, journal_seq, flags, gc);
		break;
	case KEY_TYPE_stripe:
		ret = bch2_mark_stripe(c, k, inserting,
				       fs_usage, journal_seq, flags, gc);
		break;
	case KEY_TYPE_inode:
		if (inserting)
			fs_usage->s.nr_inodes++;
		else
			fs_usage->s.nr_inodes--;
		break;
	case KEY_TYPE_reservation: {
		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;

		sectors *= replicas;
		replicas = clamp_t(unsigned, replicas, 1,
				   ARRAY_SIZE(fs_usage->persistent_reserved));

		fs_usage->s.reserved				+= sectors;
		fs_usage->persistent_reserved[replicas - 1]	+= sectors;
		break;
	}
	default:
		break;
	}

	return ret;
}
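
/*
 * While gc is running, keys are marked twice: once into the normal accounting
 * (c->usage[0]) and once into gc's shadow copy (c->usage[1]), so gc can
 * rebuild and check usage without losing updates that race with its pass.
 */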
int bch2_mark_key_locked(struct bch_fs *c,
			 struct bkey_s_c k,
			 bool inserting, s64 sectors,
			 struct gc_pos pos,
			 struct bch_fs_usage *fs_usage,
			 u64 journal_seq, unsigned flags)
{
	int ret;

	if (!(flags & BCH_BUCKET_MARK_GC)) {
		ret = __bch2_mark_key(c, k, inserting, sectors,
				      fs_usage ?: this_cpu_ptr(c->usage[0]),
				      journal_seq, flags, false);
		if (ret)
			return ret;
	}

	if ((flags & BCH_BUCKET_MARK_GC) ||
	    gc_visited(c, pos)) {
		ret = __bch2_mark_key(c, k, inserting, sectors,
				      this_cpu_ptr(c->usage[1]),
				      journal_seq, flags, true);
		if (ret)
			return ret;
	}

	return 0;
}
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
		  bool inserting, s64 sectors,
		  struct gc_pos pos,
		  struct bch_fs_usage *fs_usage,
		  u64 journal_seq, unsigned flags)
{
	int ret;

	percpu_down_read_preempt_disable(&c->mark_lock);
	ret = bch2_mark_key_locked(c, k, inserting, sectors,
				   pos, fs_usage, journal_seq, flags);
	percpu_up_read_preempt_enable(&c->mark_lock);

	return ret;
}
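
/*
 * Mark the key being inserted by a btree update, and re-mark (with negative
 * sector counts) the portions of existing keys that the insert overwrites, so
 * disk accounting stays in sync with what the insert path will do to the
 * btree.
 */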
void bch2_mark_update(struct btree_insert *trans,
		      struct btree_insert_entry *insert)
{
	struct bch_fs		*c = trans->c;
	struct btree_iter	*iter = insert->iter;
	struct btree		*b = iter->l[0].b;
	struct btree_node_iter	node_iter = iter->l[0].iter;
	struct bch_fs_usage	*fs_usage;
	struct gc_pos		pos = gc_pos_btree_node(b);
	struct bkey_packed	*_k;
	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
	static int warned_disk_usage = 0;

	if (!btree_node_type_needs_gc(iter->btree_id))
		return;

	percpu_down_read_preempt_disable(&c->mark_lock);
	fs_usage = bch2_fs_usage_get_scratch(c);

	if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
		bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
			bpos_min(insert->k->k.p, b->key.k.p).offset -
			bkey_start_offset(&insert->k->k),
			pos, fs_usage, trans->journal_res.seq, 0);

	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
						      KEY_TYPE_discard))) {
		struct bkey		unpacked;
		struct bkey_s_c		k;
		s64			sectors = 0;

		k = bkey_disassemble(b, _k, &unpacked);

		if (btree_node_is_extents(b)
		    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
		    : bkey_cmp(insert->k->k.p, k.k->p))
			break;

		if (btree_node_is_extents(b)) {
			switch (bch2_extent_overlap(&insert->k->k, k.k)) {
			case BCH_EXTENT_OVERLAP_ALL:
				sectors = -((s64) k.k->size);
				break;
			case BCH_EXTENT_OVERLAP_BACK:
				sectors = bkey_start_offset(&insert->k->k) -
					k.k->p.offset;
				break;
			case BCH_EXTENT_OVERLAP_FRONT:
				sectors = bkey_start_offset(k.k) -
					insert->k->k.p.offset;
				break;
			case BCH_EXTENT_OVERLAP_MIDDLE:
				sectors = k.k->p.offset - insert->k->k.p.offset;
				BUG_ON(sectors <= 0);

				bch2_mark_key_locked(c, k, true, sectors,
					pos, fs_usage, trans->journal_res.seq, 0);

				sectors = bkey_start_offset(&insert->k->k) -
					k.k->p.offset;
				break;
			}

			BUG_ON(sectors >= 0);
		}

		bch2_mark_key_locked(c, k, false, sectors,
			pos, fs_usage, trans->journal_res.seq, 0);

		bch2_btree_node_iter_advance(&node_iter, b);
	}

	if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
	    !warned_disk_usage &&
	    !xchg(&warned_disk_usage, 1)) {
		char buf[200];

		pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);

		pr_err("while inserting");
		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
		pr_err("%s", buf);
		pr_err("overlapping with");

		node_iter = iter->l[0].iter;
		while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
							      KEY_TYPE_discard))) {
			struct bkey		unpacked;
			struct bkey_s_c		k;

			k = bkey_disassemble(b, _k, &unpacked);

			if (btree_node_is_extents(b)
			    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
			    : bkey_cmp(insert->k->k.p, k.k->p))
				break;

			bch2_bkey_val_to_text(&PBUF(buf), c, k);
			pr_err("%s", buf);

			bch2_btree_node_iter_advance(&node_iter, b);
		}
	}

	percpu_up_read_preempt_enable(&c->mark_lock);
}
/* Disk reservations: */

static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
	int cpu;

	for_each_possible_cpu(cpu)
		per_cpu_ptr(c->pcpu, cpu)->sectors_available = 0;

	return avail_factor(bch2_fs_sectors_free(c));
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
	percpu_down_read_preempt_disable(&c->mark_lock);
	this_cpu_sub(c->usage[0]->s.online_reserved,
		     res->sectors);
	percpu_up_read_preempt_enable(&c->mark_lock);

	res->sectors = 0;
}
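
/*
 * Disk reservations are taken against c->sectors_available; each cpu caches up
 * to SECTORS_CACHE sectors locally (pcpu->sectors_available) so the common
 * case never touches the shared atomic.  Typical usage looks roughly like the
 * following sketch, paired with bch2_disk_reservation_put(), which ends up in
 * __bch2_disk_reservation_put() above:
 *
 *	struct disk_reservation res = { 0 };
 *
 *	if (bch2_disk_reservation_add(c, &res, sectors, 0))
 *		return -ENOSPC;
 *	// ... perform the write, applying usage deltas against &res ...
 *	bch2_disk_reservation_put(c, &res);
 */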
#define SECTORS_CACHE	1024

int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
			      unsigned sectors, int flags)
{
	struct bch_fs_pcpu *pcpu;
	u64 old, v, get;
	s64 sectors_available;
	int ret;

	percpu_down_read_preempt_disable(&c->mark_lock);
	pcpu = this_cpu_ptr(c->pcpu);

	if (sectors <= pcpu->sectors_available)
		goto out;

	v = atomic64_read(&c->sectors_available);
	do {
		old = v;
		get = min((u64) sectors + SECTORS_CACHE, old);

		if (get < sectors) {
			percpu_up_read_preempt_enable(&c->mark_lock);
			goto recalculate;
		}
	} while ((v = atomic64_cmpxchg(&c->sectors_available,
				       old, old - get)) != old);

	pcpu->sectors_available		+= get;

out:
	pcpu->sectors_available		-= sectors;
	this_cpu_add(c->usage[0]->s.online_reserved, sectors);
	res->sectors			+= sectors;

	percpu_up_read_preempt_enable(&c->mark_lock);
	return 0;

recalculate:
	/*
	 * GC recalculates sectors_available when it starts, so that hopefully
	 * we don't normally end up blocking here:
	 */

	/*
	 * Unfortunately, we can be called from extent_insert_fixup() with
	 * btree locks held:
	 */
	if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
		if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
			down_read(&c->gc_lock);
		else if (!down_read_trylock(&c->gc_lock))
			return -EINTR;
	}

	percpu_down_write(&c->mark_lock);
	sectors_available = bch2_recalc_sectors_available(c);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		this_cpu_add(c->usage[0]->s.online_reserved, sectors);
		res->sectors			+= sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -ENOSPC;
	}

	percpu_up_write(&c->mark_lock);

	if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
		up_read(&c->gc_lock);

	return ret;
}
/* Startup/shutdown: */

static void buckets_free_rcu(struct rcu_head *rcu)
{
	struct bucket_array *buckets =
		container_of(rcu, struct bucket_array, rcu);

	kvpfree(buckets,
		sizeof(struct bucket_array) +
		buckets->nbuckets * sizeof(struct bucket));
}
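
/*
 * Resizing a device's bucket arrays: allocate the new arrays and fifos up
 * front, copy the old contents over under the gc/bucket/mark locks, publish
 * the new bucket array with rcu_assign_pointer(), and free the old array via
 * call_rcu() so lockless readers stay safe.
 */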
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bucket_array *buckets = NULL, *old_buckets = NULL;
	unsigned long *buckets_nouse = NULL;
	unsigned long *buckets_written = NULL;
	u8 *oldest_gens = NULL;
	alloc_fifo	free[RESERVE_NR];
	alloc_fifo	free_inc;
	alloc_heap	alloc_heap;
	copygc_heap	copygc_heap;
	size_t btree_reserve	= DIV_ROUND_UP(BTREE_NODE_RESERVE,
			     ca->mi.bucket_size / c->opts.btree_node_size);
	/* XXX: these should be tunable */
	size_t reserve_none	= max_t(size_t, 1, nbuckets >> 9);
	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 7);
	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
				      btree_reserve);
	bool resize = ca->buckets[0] != NULL,
	     start_copygc = ca->copygc_thread != NULL;
	int ret = -ENOMEM;
	unsigned i;

	memset(&free,		0, sizeof(free));
	memset(&free_inc,	0, sizeof(free_inc));
	memset(&alloc_heap,	0, sizeof(alloc_heap));
	memset(&copygc_heap,	0, sizeof(copygc_heap));

	if (!(buckets		= kvpmalloc(sizeof(struct bucket_array) +
					    nbuckets * sizeof(struct bucket),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !(oldest_gens	= kvpmalloc(nbuckets * sizeof(u8),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
					    sizeof(unsigned long),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_written	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
					    sizeof(unsigned long),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_MOVINGGC],
		       copygc_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
	    !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
	    !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
	    !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
		goto err;
	buckets->first_bucket	= ca->mi.first_bucket;
	buckets->nbuckets	= nbuckets;

	bch2_copygc_stop(ca);

	if (resize) {
		down_write(&c->gc_lock);
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_buckets = bucket_array(ca);

	if (resize) {
		size_t n = min(buckets->nbuckets, old_buckets->nbuckets);

		memcpy(buckets->b,
		       old_buckets->b,
		       n * sizeof(struct bucket));
		memcpy(oldest_gens,
		       ca->oldest_gens,
		       n * sizeof(u8));
		memcpy(buckets_nouse,
		       ca->buckets_nouse,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
		memcpy(buckets_written,
		       ca->buckets_written,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
	}

	rcu_assign_pointer(ca->buckets[0], buckets);
	buckets = old_buckets;

	swap(ca->oldest_gens, oldest_gens);
	swap(ca->buckets_nouse, buckets_nouse);
	swap(ca->buckets_written, buckets_written);

	if (resize)
		percpu_up_write(&c->mark_lock);

	spin_lock(&c->freelist_lock);
	for (i = 0; i < RESERVE_NR; i++) {
		fifo_move(&free[i], &ca->free[i]);
		swap(ca->free[i], free[i]);
	}
	fifo_move(&free_inc, &ca->free_inc);
	swap(ca->free_inc, free_inc);
	spin_unlock(&c->freelist_lock);

	/* with gc lock held, alloc_heap can't be in use: */
	swap(ca->alloc_heap, alloc_heap);

	/* and we shut down copygc: */
	swap(ca->copygc_heap, copygc_heap);

	nbuckets = ca->mi.nbuckets;

	if (resize) {
		up_write(&ca->bucket_lock);
		up_write(&c->gc_lock);
	}

	if (start_copygc &&
	    bch2_copygc_start(c, ca))
		bch_err(ca, "error restarting copygc thread");

	ret = 0;
err:
	free_heap(&copygc_heap);
	free_heap(&alloc_heap);
	free_fifo(&free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&free[i]);
	kvpfree(buckets_nouse,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	kvpfree(buckets_written,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	kvpfree(oldest_gens,
		nbuckets * sizeof(u8));
	if (old_buckets)
		call_rcu(&old_buckets->rcu, buckets_free_rcu);

	return ret;
}
void bch2_dev_buckets_free(struct bch_dev *ca)
{
	unsigned i;

	free_heap(&ca->copygc_heap);
	free_heap(&ca->alloc_heap);
	free_fifo(&ca->free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);
	kvpfree(ca->buckets_written,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(ca->buckets_nouse,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
	kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
		sizeof(struct bucket_array) +
		ca->mi.nbuckets * sizeof(struct bucket));

	free_percpu(ca->usage[0]);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
		return -ENOMEM;

	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}