2 * Code for manipulating bucket marks for garbage collection.
4 * Copyright 2014 Datera, Inc.
7 * - free bucket: mark == 0
8 * The bucket contains no data and will not be read
10 * - allocator bucket: owned_by_allocator == 1
11 * The bucket is on a free list, or it is an open bucket
13 * - cached bucket: owned_by_allocator == 0 &&
14 * dirty_sectors == 0 &&
16 * The bucket contains data but may be safely discarded as there are
17 * enough replicas of the data on other cache devices, or it has been
18 * written back to the backing device
20 * - dirty bucket: owned_by_allocator == 0 &&
22 * The bucket contains data that we must not discard (either the only copy,
23 * or one of the 'main copies' for data requiring multiple replicas)
25 * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
26 * This is a btree node, journal or gen/prio bucket
30 * bucket invalidated => bucket on freelist => open bucket =>
31 * [dirty bucket =>] cached bucket => bucket invalidated => ...
33 * Note that cache promotion can skip the dirty bucket step, as data
34 * is copied from a deeper tier to a shallower tier, onto a cached bucket.
36 * Note also that a cached bucket can spontaneously become dirty --
39 * Only a traversal of the key space can determine whether a bucket is
40 * truly dirty or cached.
44 * - free => allocator: bucket was invalidated
45 * - cached => allocator: bucket was invalidated
47 * - allocator => dirty: open bucket was filled up
48 * - allocator => cached: open bucket was filled up
49 * - allocator => metadata: metadata was allocated
51 * - dirty => cached: dirty sectors were copied to a deeper tier
52 * - dirty => free: dirty sectors were overwritten or moved (copy gc)
53 * - cached => free: cached sectors were overwritten
55 * - metadata => free: metadata was freed
58 * - cached => dirty: a device was removed so formerly replicated data
59 * is no longer sufficiently replicated
60 * - free => cached: cannot happen
61 * - free => dirty: cannot happen
62 * - free => metadata: cannot happen
66 #include "alloc_background.h"
69 #include "btree_update.h"
76 #include <linux/preempt.h>
77 #include <trace/events/bcachefs.h>
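/*
 * Illustrative sketch, not part of this file: a self-contained model of the
 * bucket states described in the header comment above. The struct, enum and
 * function below are simplified stand-ins for struct bucket_mark and its
 * helpers, not the real bcachefs types.
 */
enum example_bucket_state {
	EXAMPLE_BUCKET_FREE,		/* mark == 0 */
	EXAMPLE_BUCKET_ALLOCATOR,	/* on a freelist, or an open bucket */
	EXAMPLE_BUCKET_CACHED,		/* clean data, safe to discard */
	EXAMPLE_BUCKET_DIRTY,		/* data we must not discard */
	EXAMPLE_BUCKET_METADATA,	/* btree node, journal, gen/prio */
};

struct example_bucket_mark {
	unsigned	owned_by_allocator:1;
	unsigned	is_metadata:1;
	unsigned	dirty_sectors;
	unsigned	cached_sectors;
};

static inline enum example_bucket_state
example_bucket_state(struct example_bucket_mark m)
{
	if (m.owned_by_allocator)
		return EXAMPLE_BUCKET_ALLOCATOR;
	if (m.is_metadata)
		return EXAMPLE_BUCKET_METADATA;
	if (m.dirty_sectors)
		return EXAMPLE_BUCKET_DIRTY;
	if (m.cached_sectors)
		return EXAMPLE_BUCKET_CACHED;
	return EXAMPLE_BUCKET_FREE;
}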
80 * Clear journal_seq_valid for buckets for which it's not needed, to prevent wraparound:
83 void bch2_bucket_seq_cleanup(struct bch_fs *c)
85 u64 journal_seq = atomic64_read(&c->journal.seq);
86 u16 last_seq_ondisk = c->journal.last_seq_ondisk;
88 struct bucket_array *buckets;
93 if (journal_seq - c->last_bucket_seq_cleanup <
94 (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
97 c->last_bucket_seq_cleanup = journal_seq;
99 for_each_member_device(ca, c, i) {
100 down_read(&ca->bucket_lock);
101 buckets = bucket_array(ca);
103 for_each_bucket(g, buckets) {
104 bucket_cmpxchg(g, m, ({
105 if (!m.journal_seq_valid ||
106 bucket_needs_journal_commit(m, last_seq_ondisk))
109 m.journal_seq_valid = 0;
112 up_read(&ca->bucket_lock);
116 #define bch2_usage_read_raw(_stats) \
118 typeof(*this_cpu_ptr(_stats)) _acc; \
120 memset(&_acc, 0, sizeof(_acc)); \
121 acc_u64s_percpu((u64 *) &_acc, \
122 (u64 __percpu *) _stats, \
123 sizeof(_acc) / sizeof(u64)); \
128 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
130 return bch2_usage_read_raw(ca->usage[0]);
133 struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
135 struct bch_fs_usage *ret;
136 unsigned nr = READ_ONCE(c->replicas.nr);
138 ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
142 percpu_down_read_preempt_disable(&c->mark_lock);
144 if (unlikely(nr < c->replicas.nr)) {
146 percpu_up_read_preempt_enable(&c->mark_lock);
151 acc_u64s_percpu((u64 *) ret,
152 (u64 __percpu *) c->usage[0],
153 sizeof(*ret) / sizeof(u64) + nr);
154 percpu_up_read_preempt_enable(&c->mark_lock);
159 #define RESERVE_FACTOR 6
161 static u64 reserve_factor(u64 r)
163 return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
166 static u64 avail_factor(u64 r)
168 return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
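/*
 * Worked example (illustrative, with RESERVE_FACTOR == 6 as defined above):
 * reserve_factor() pads a sector count by roughly 1/64th, and avail_factor()
 * undoes that padding:
 *
 *	reserve_factor(640) = 640 + (round_up(640, 64) >> 6) = 640 + 10 = 650
 *	avail_factor(650)   = (650 << 6) / 65 = 41600 / 65   = 640
 *
 * so avail_factor(reserve_factor(r)) == r, up to a sector or so of rounding
 * error when r is not a multiple of 1 << RESERVE_FACTOR.
 */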
171 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
173 return min(fs_usage.s.hidden +
175 reserve_factor(fs_usage.s.reserved +
176 fs_usage.s.online_reserved),
180 struct bch_fs_usage_short
181 bch2_fs_usage_read_short(struct bch_fs *c)
183 struct bch_fs_usage_summarized usage =
184 bch2_usage_read_raw(&c->usage[0]->s);
185 struct bch_fs_usage_short ret;
187 ret.capacity = READ_ONCE(c->capacity) - usage.hidden;
188 ret.used = min(ret.capacity, usage.data +
189 reserve_factor(usage.reserved +
190 usage.online_reserved));
191 ret.nr_inodes = usage.nr_inodes;
196 static inline int is_unavailable_bucket(struct bucket_mark m)
198 return !is_available_bucket(m);
201 static inline int is_fragmented_bucket(struct bucket_mark m,
204 if (!m.owned_by_allocator &&
205 m.data_type == BCH_DATA_USER &&
206 bucket_sectors_used(m))
207 return max_t(int, 0, (int) ca->mi.bucket_size -
208 bucket_sectors_used(m));
212 static inline enum bch_data_type bucket_type(struct bucket_mark m)
214 return m.cached_sectors && !m.dirty_sectors
219 static bool bucket_became_unavailable(struct bucket_mark old,
220 struct bucket_mark new)
222 return is_available_bucket(old) &&
223 !is_available_bucket(new);
226 void bch2_fs_usage_apply(struct bch_fs *c,
227 struct bch_fs_usage *fs_usage,
228 struct disk_reservation *disk_res,
229 struct gc_pos gc_pos)
231 s64 added = fs_usage->s.data + fs_usage->s.reserved;
232 s64 should_not_have_added;
234 percpu_rwsem_assert_held(&c->mark_lock);
237 * Not allowed to reduce sectors_available except by getting a
240 should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
241 if (WARN_ONCE(should_not_have_added > 0,
242 "disk usage increased without a reservation")) {
243 atomic64_sub(should_not_have_added, &c->sectors_available);
244 added -= should_not_have_added;
248 disk_res->sectors -= added;
249 fs_usage->s.online_reserved -= added;
252 acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
254 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
256 if (gc_visited(c, gc_pos)) {
257 BUG_ON(!c->usage[1]);
258 acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
260 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
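/*
 * Illustrative sketch of the accumulate-as-u64-array pattern used above: a
 * struct made up purely of u64 counters (plus a trailing array of
 * c->replicas.nr counters) can be summed elementwise by treating it as a flat
 * array of u64s. This is a simplified stand-in for acc_u64s(), not this
 * file's helper.
 */
static inline void example_acc_u64s(u64 *dst, const u64 *src, unsigned nr)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		dst[i] += src[i];
}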
264 static inline void account_bucket(struct bch_fs_usage *fs_usage,
265 struct bch_dev_usage *dev_usage,
266 enum bch_data_type type,
269 if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
270 fs_usage->s.hidden += size;
272 dev_usage->buckets[type] += nr;
275 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
276 struct bch_fs_usage *fs_usage,
277 struct bucket_mark old, struct bucket_mark new,
280 struct bch_dev_usage *dev_usage;
282 percpu_rwsem_assert_held(&c->mark_lock);
284 bch2_fs_inconsistent_on(old.data_type && new.data_type &&
285 old.data_type != new.data_type, c,
286 "different types of data in same bucket: %s, %s",
287 bch2_data_types[old.data_type],
288 bch2_data_types[new.data_type]);
290 dev_usage = this_cpu_ptr(ca->usage[gc]);
292 if (bucket_type(old))
293 account_bucket(fs_usage, dev_usage, bucket_type(old),
294 -1, -ca->mi.bucket_size);
296 if (bucket_type(new))
297 account_bucket(fs_usage, dev_usage, bucket_type(new),
298 1, ca->mi.bucket_size);
300 dev_usage->buckets_alloc +=
301 (int) new.owned_by_allocator - (int) old.owned_by_allocator;
302 dev_usage->buckets_ec +=
303 (int) new.stripe - (int) old.stripe;
304 dev_usage->buckets_unavailable +=
305 is_unavailable_bucket(new) - is_unavailable_bucket(old);
307 dev_usage->sectors[old.data_type] -= old.dirty_sectors;
308 dev_usage->sectors[new.data_type] += new.dirty_sectors;
309 dev_usage->sectors[BCH_DATA_CACHED] +=
310 (int) new.cached_sectors - (int) old.cached_sectors;
311 dev_usage->sectors_fragmented +=
312 is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
314 if (!is_available_bucket(old) && is_available_bucket(new))
315 bch2_wake_allocator(ca);
318 void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
320 struct bucket_mark old = { .v.counter = 0 };
321 struct bch_fs_usage *fs_usage;
322 struct bucket_array *buckets;
325 percpu_down_read_preempt_disable(&c->mark_lock);
326 fs_usage = this_cpu_ptr(c->usage[0]);
327 buckets = bucket_array(ca);
329 for_each_bucket(g, buckets)
330 if (g->mark.data_type)
331 bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
332 percpu_up_read_preempt_enable(&c->mark_lock);
335 #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \
337 struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
339 bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc); \
343 static inline void update_replicas(struct bch_fs *c,
344 struct bch_fs_usage *fs_usage,
345 struct bch_replicas_entry *r,
348 int idx = bch2_replicas_entry_idx(c, r);
353 if (r->data_type == BCH_DATA_CACHED)
354 fs_usage->s.cached += sectors;
356 fs_usage->s.data += sectors;
357 fs_usage->data[idx] += sectors;
360 static inline void update_cached_sectors(struct bch_fs *c,
361 struct bch_fs_usage *fs_usage,
362 unsigned dev, s64 sectors)
364 struct bch_replicas_padded r;
366 r.e.data_type = BCH_DATA_CACHED;
371 update_replicas(c, fs_usage, &r.e, sectors);
374 static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
375 size_t b, struct bucket_mark *old,
378 struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
379 struct bucket *g = __bucket(ca, b, gc);
380 struct bucket_mark new;
382 *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
383 BUG_ON(!is_available_bucket(new));
385 new.owned_by_allocator = 1;
387 new.cached_sectors = 0;
388 new.dirty_sectors = 0;
392 if (old->cached_sectors)
393 update_cached_sectors(c, fs_usage, ca->dev_idx,
394 -old->cached_sectors);
397 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
398 size_t b, struct bucket_mark *old)
400 percpu_rwsem_assert_held(&c->mark_lock);
402 __bch2_invalidate_bucket(c, ca, b, old, false);
404 if (!old->owned_by_allocator && old->cached_sectors)
405 trace_invalidate(ca, bucket_to_sector(ca, b),
406 old->cached_sectors);
409 static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
410 size_t b, bool owned_by_allocator,
413 struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
414 struct bucket *g = __bucket(ca, b, gc);
415 struct bucket_mark old, new;
417 old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
418 new.owned_by_allocator = owned_by_allocator;
422 !owned_by_allocator && !old.owned_by_allocator);
425 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
426 size_t b, bool owned_by_allocator,
427 struct gc_pos pos, unsigned flags)
429 percpu_rwsem_assert_held(&c->mark_lock);
431 if (!(flags & BCH_BUCKET_MARK_GC))
432 __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
434 if ((flags & BCH_BUCKET_MARK_GC) ||
436 __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
439 #define checked_add(a, b) \
441 unsigned _res = (unsigned) (a) + (b); \
443 BUG_ON((a) != _res); \
446 static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
447 size_t b, enum bch_data_type type,
448 unsigned sectors, bool gc)
450 struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
451 struct bucket *g = __bucket(ca, b, gc);
452 struct bucket_mark new;
454 BUG_ON(type != BCH_DATA_SB &&
455 type != BCH_DATA_JOURNAL);
457 bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
458 new.data_type = type;
459 checked_add(new.dirty_sectors, sectors);
463 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
464 size_t b, enum bch_data_type type,
465 unsigned sectors, struct gc_pos pos,
468 BUG_ON(type != BCH_DATA_SB &&
469 type != BCH_DATA_JOURNAL);
472 percpu_rwsem_assert_held(&c->mark_lock);
474 if (!(flags & BCH_BUCKET_MARK_GC))
475 __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
477 if ((flags & BCH_BUCKET_MARK_GC) ||
479 __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
483 struct bucket_mark old, new;
488 old = bucket_cmpxchg(g, new, ({
489 new.data_type = type;
490 checked_add(new.dirty_sectors, sectors);
497 static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
502 * marking a new extent, which _will have size_ @delta
504 * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
505 * case, we haven't actually created the key we'll be inserting
506 * yet (for the split) - so we don't want to be using
507 * k->size/crc.live_size here:
509 return __ptr_disk_sectors(p, delta);
511 BUG_ON(-delta > p.crc.live_size);
513 return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
514 (s64) ptr_disk_sectors(p);
519 * Checking against gc's position has to be done here, inside the cmpxchg()
520 * loop, to avoid racing with the start of gc clearing all the marks - GC does
521 * that with the gc pos seqlock held.
523 static void bch2_mark_pointer(struct bch_fs *c,
524 struct extent_ptr_decoded p,
525 s64 sectors, enum bch_data_type data_type,
526 struct bch_fs_usage *fs_usage,
527 unsigned journal_seq, unsigned flags,
530 struct bucket_mark old, new;
531 struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
532 size_t b = PTR_BUCKET_NR(ca, &p.ptr);
533 struct bucket *g = __bucket(ca, b, gc);
536 v = atomic64_read(&g->_mark.v);
538 new.v.counter = old.v.counter = v;
541 * Check this after reading bucket mark to guard against
542 * the allocator invalidating a bucket after we've already
545 if (gen_after(new.gen, p.ptr.gen)) {
546 BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
547 EBUG_ON(!p.ptr.cached &&
548 test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
553 checked_add(new.dirty_sectors, sectors);
555 checked_add(new.cached_sectors, sectors);
557 if (!new.dirty_sectors &&
558 !new.cached_sectors) {
562 new.journal_seq_valid = 1;
563 new.journal_seq = journal_seq;
566 new.data_type = data_type;
569 if (flags & BCH_BUCKET_MARK_NOATOMIC) {
573 } while ((v = atomic64_cmpxchg(&g->_mark.v,
575 new.v.counter)) != old.v.counter);
577 bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
579 BUG_ON(!gc && bucket_became_unavailable(old, new));
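/*
 * Illustrative sketch of the lockless update loop in bch2_mark_pointer()
 * above: because the whole bucket mark fits in a single 64 bit word, it can
 * be updated with a read/modify/cmpxchg retry loop instead of a lock. The
 * struct and function below are simplified stand-ins, not the real
 * struct bucket_mark.
 */
struct example_mark {
	union {
		struct {
			u32	dirty_sectors;
			u32	cached_sectors;
		};
		u64		counter;
	};
};

static inline void example_mark_add_dirty(atomic64_t *mark, u32 sectors)
{
	struct example_mark old, new;
	u64 v = atomic64_read(mark);

	do {
		old.counter = new.counter = v;

		/* all modifications happen on the private copy: */
		new.dirty_sectors += sectors;
	} while ((v = atomic64_cmpxchg(mark, old.counter,
				       new.counter)) != old.counter);

	/* on success, old and new can be compared to update derived stats */
}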
582 static int bch2_mark_stripe_ptr(struct bch_fs *c,
583 struct bch_extent_stripe_ptr p,
584 enum bch_data_type data_type,
585 struct bch_fs_usage *fs_usage,
586 s64 sectors, unsigned flags,
590 unsigned old, new, nr_data;
591 int blocks_nonempty_delta;
594 m = genradix_ptr(&c->stripes[gc], p.idx);
596 if (!m || !m->alive) {
597 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
602 BUG_ON(m->r.e.data_type != data_type);
604 nr_data = m->nr_blocks - m->nr_redundant;
606 parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
609 parity_sectors = -parity_sectors;
610 sectors += parity_sectors;
612 new = atomic_add_return(sectors, &m->block_sectors[p.block]);
615 blocks_nonempty_delta = (int) !!new - (int) !!old;
616 if (!blocks_nonempty_delta)
619 atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
621 BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
624 bch2_stripes_heap_update(c, m, p.idx);
626 update_replicas(c, fs_usage, &m->r.e, sectors);
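/*
 * Worked example for the parity accounting above (illustrative numbers): for
 * a stripe with nr_blocks == 6 and nr_redundant == 2, nr_data == 4. Marking
 * sectors == 8 in one data block gives
 *
 *	parity_sectors = DIV_ROUND_UP(8 * 2, 4) = 4
 *
 * so update_replicas() is called with 8 + 4 = 12 sectors: the extent is
 * charged its proportional share of the parity blocks as well as the data it
 * actually wrote.
 */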
631 static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
633 enum bch_data_type data_type,
634 struct bch_fs_usage *fs_usage,
635 unsigned journal_seq, unsigned flags,
638 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
639 const union bch_extent_entry *entry;
640 struct extent_ptr_decoded p;
641 struct bch_replicas_padded r;
642 s64 dirty_sectors = 0;
646 r.e.data_type = data_type;
652 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
653 s64 disk_sectors = data_type == BCH_DATA_BTREE
655 : ptr_disk_sectors_delta(p, sectors);
657 bch2_mark_pointer(c, p, disk_sectors, data_type,
658 fs_usage, journal_seq, flags, gc);
661 update_cached_sectors(c, fs_usage, p.ptr.dev,
663 } else if (!p.ec_nr) {
664 dirty_sectors += disk_sectors;
665 r.e.devs[r.e.nr_devs++] = p.ptr.dev;
667 for (i = 0; i < p.ec_nr; i++) {
668 ret = bch2_mark_stripe_ptr(c, p.ec[i],
670 disk_sectors, flags, gc);
680 update_replicas(c, fs_usage, &r.e, dirty_sectors);
685 static void bucket_set_stripe(struct bch_fs *c,
686 const struct bch_stripe *v,
688 struct bch_fs_usage *fs_usage,
694 for (i = 0; i < v->nr_blocks; i++) {
695 const struct bch_extent_ptr *ptr = v->ptrs + i;
696 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
697 size_t b = PTR_BUCKET_NR(ca, ptr);
698 struct bucket *g = __bucket(ca, b, gc);
699 struct bucket_mark new, old;
701 BUG_ON(ptr_stale(ca, ptr));
703 old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
704 new.stripe = enabled;
706 new.journal_seq_valid = 1;
707 new.journal_seq = journal_seq;
711 BUG_ON(old.stripe == enabled);
715 static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
717 struct bch_fs_usage *fs_usage,
718 u64 journal_seq, unsigned flags,
721 struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
722 size_t idx = s.k->p.offset;
723 struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
726 if (!m || (!inserting && !m->alive)) {
727 bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
732 if (inserting && m->alive) {
733 bch_err_ratelimited(c, "error marking stripe %zu: already exists",
738 BUG_ON(atomic_read(&m->blocks_nonempty));
740 for (i = 0; i < EC_STRIPE_MAX; i++)
741 BUG_ON(atomic_read(&m->block_sectors[i]));
744 m->sectors = le16_to_cpu(s.v->sectors);
745 m->algorithm = s.v->algorithm;
746 m->nr_blocks = s.v->nr_blocks;
747 m->nr_redundant = s.v->nr_redundant;
749 memset(&m->r, 0, sizeof(m->r));
751 m->r.e.data_type = BCH_DATA_USER;
752 m->r.e.nr_devs = s.v->nr_blocks;
753 m->r.e.nr_required = s.v->nr_blocks - s.v->nr_redundant;
755 for (i = 0; i < s.v->nr_blocks; i++)
756 m->r.e.devs[i] = s.v->ptrs[i].dev;
760 * XXX: account for stripes somehow here
763 update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
768 bch2_stripes_heap_insert(c, m, idx);
770 bch2_stripes_heap_del(c, m, idx);
772 m->alive = inserting;
775 bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
779 static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
780 bool inserting, s64 sectors,
781 struct bch_fs_usage *fs_usage,
782 unsigned journal_seq, unsigned flags,
788 case KEY_TYPE_btree_ptr:
789 ret = bch2_mark_extent(c, k, inserting
790 ? c->opts.btree_node_size
791 : -c->opts.btree_node_size,
793 fs_usage, journal_seq, flags, gc);
795 case KEY_TYPE_extent:
796 ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
797 fs_usage, journal_seq, flags, gc);
799 case KEY_TYPE_stripe:
800 ret = bch2_mark_stripe(c, k, inserting,
801 fs_usage, journal_seq, flags, gc);
805 fs_usage->s.nr_inodes++;
807 fs_usage->s.nr_inodes--;
809 case KEY_TYPE_reservation: {
810 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
813 replicas = clamp_t(unsigned, replicas, 1,
814 ARRAY_SIZE(fs_usage->persistent_reserved));
816 fs_usage->s.reserved += sectors;
817 fs_usage->persistent_reserved[replicas - 1] += sectors;
827 int bch2_mark_key_locked(struct bch_fs *c,
829 bool inserting, s64 sectors,
831 struct bch_fs_usage *fs_usage,
832 u64 journal_seq, unsigned flags)
836 if (!(flags & BCH_BUCKET_MARK_GC)) {
837 ret = __bch2_mark_key(c, k, inserting, sectors,
838 fs_usage ?: this_cpu_ptr(c->usage[0]),
839 journal_seq, flags, false);
844 if ((flags & BCH_BUCKET_MARK_GC) ||
845 gc_visited(c, pos)) {
846 ret = __bch2_mark_key(c, k, inserting, sectors,
847 this_cpu_ptr(c->usage[1]),
848 journal_seq, flags, true);
856 int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
857 bool inserting, s64 sectors,
859 struct bch_fs_usage *fs_usage,
860 u64 journal_seq, unsigned flags)
864 percpu_down_read_preempt_disable(&c->mark_lock);
865 ret = bch2_mark_key_locked(c, k, inserting, sectors,
866 pos, fs_usage, journal_seq, flags);
867 percpu_up_read_preempt_enable(&c->mark_lock);
872 void bch2_mark_update(struct btree_insert *trans,
873 struct btree_insert_entry *insert)
875 struct bch_fs *c = trans->c;
876 struct btree_iter *iter = insert->iter;
877 struct btree *b = iter->l[0].b;
878 struct btree_node_iter node_iter = iter->l[0].iter;
879 struct bch_fs_usage *fs_usage;
880 struct gc_pos pos = gc_pos_btree_node(b);
881 struct bkey_packed *_k;
883 if (!btree_node_type_needs_gc(iter->btree_id))
886 percpu_down_read_preempt_disable(&c->mark_lock);
887 fs_usage = bch2_fs_usage_get_scratch(c);
889 if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
890 bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
891 bpos_min(insert->k->k.p, b->key.k.p).offset -
892 bkey_start_offset(&insert->k->k),
893 pos, fs_usage, trans->journal_res.seq, 0);
895 while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
896 KEY_TYPE_discard))) {
897 struct bkey unpacked;
901 k = bkey_disassemble(b, _k, &unpacked);
903 if (btree_node_is_extents(b)
904 ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
905 : bkey_cmp(insert->k->k.p, k.k->p))
908 if (btree_node_is_extents(b)) {
909 switch (bch2_extent_overlap(&insert->k->k, k.k)) {
910 case BCH_EXTENT_OVERLAP_ALL:
911 sectors = -((s64) k.k->size);
913 case BCH_EXTENT_OVERLAP_BACK:
914 sectors = bkey_start_offset(&insert->k->k) -
917 case BCH_EXTENT_OVERLAP_FRONT:
918 sectors = bkey_start_offset(k.k) -
919 insert->k->k.p.offset;
921 case BCH_EXTENT_OVERLAP_MIDDLE:
922 sectors = k.k->p.offset - insert->k->k.p.offset;
923 BUG_ON(sectors <= 0);
925 bch2_mark_key_locked(c, k, true, sectors,
926 pos, fs_usage, trans->journal_res.seq, 0);
928 sectors = bkey_start_offset(&insert->k->k) -
933 BUG_ON(sectors >= 0);
936 bch2_mark_key_locked(c, k, false, sectors,
937 pos, fs_usage, trans->journal_res.seq, 0);
939 bch2_btree_node_iter_advance(&node_iter, b);
942 bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos);
944 percpu_up_read_preempt_enable(&c->mark_lock);
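/*
 * Illustrative sketch of the extent overlap cases handled above, using plain
 * sector ranges instead of bkeys (a simplified model, not this file's
 * helpers). Extents are half-open ranges [start, end); when a new extent is
 * inserted over an old one, the old extent loses the intersection,
 * min(old_end, new_end) - max(old_start, new_start) sectors:
 */
enum example_overlap {
	EXAMPLE_OVERLAP_ALL,	/* new covers all of old */
	EXAMPLE_OVERLAP_BACK,	/* new covers the back of old */
	EXAMPLE_OVERLAP_FRONT,	/* new covers the front of old */
	EXAMPLE_OVERLAP_MIDDLE,	/* new is strictly inside old, old splits */
};

static inline enum example_overlap
example_extent_overlap(u64 old_start, u64 old_end,
		       u64 new_start, u64 new_end)
{
	if (new_start <= old_start && new_end >= old_end)
		return EXAMPLE_OVERLAP_ALL;
	if (new_start <= old_start)
		return EXAMPLE_OVERLAP_FRONT;
	if (new_end >= old_end)
		return EXAMPLE_OVERLAP_BACK;
	return EXAMPLE_OVERLAP_MIDDLE;
}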
947 /* Disk reservations: */
949 static u64 bch2_recalc_sectors_available(struct bch_fs *c)
953 for_each_possible_cpu(cpu)
954 per_cpu_ptr(c->pcpu, cpu)->sectors_available = 0;
956 return avail_factor(bch2_fs_sectors_free(c));
959 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
961 percpu_down_read_preempt_disable(&c->mark_lock);
962 this_cpu_sub(c->usage[0]->s.online_reserved,
964 percpu_up_read_preempt_enable(&c->mark_lock);
969 #define SECTORS_CACHE 1024
971 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
972 unsigned sectors, int flags)
974 struct bch_fs_pcpu *pcpu;
976 s64 sectors_available;
979 percpu_down_read_preempt_disable(&c->mark_lock);
980 pcpu = this_cpu_ptr(c->pcpu);
982 if (sectors <= pcpu->sectors_available)
985 v = atomic64_read(&c->sectors_available);
988 get = min((u64) sectors + SECTORS_CACHE, old);
991 percpu_up_read_preempt_enable(&c->mark_lock);
994 } while ((v = atomic64_cmpxchg(&c->sectors_available,
995 old, old - get)) != old);
997 pcpu->sectors_available += get;
1000 pcpu->sectors_available -= sectors;
1001 this_cpu_add(c->usage[0]->s.online_reserved, sectors);
1002 res->sectors += sectors;
1004 percpu_up_read_preempt_enable(&c->mark_lock);
1009 * GC recalculates sectors_available when it starts, so that hopefully
1010 * we don't normally end up blocking here:
1014 * Piss fuck, we can be called from extent_insert_fixup() with btree
1018 if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
1019 if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
1020 down_read(&c->gc_lock);
1021 else if (!down_read_trylock(&c->gc_lock))
1025 percpu_down_write(&c->mark_lock);
1026 sectors_available = bch2_recalc_sectors_available(c);
1028 if (sectors <= sectors_available ||
1029 (flags & BCH_DISK_RESERVATION_NOFAIL)) {
1030 atomic64_set(&c->sectors_available,
1031 max_t(s64, 0, sectors_available - sectors));
1032 this_cpu_add(c->usage[0]->s.online_reserved, sectors);
1033 res->sectors += sectors;
1036 atomic64_set(&c->sectors_available, sectors_available);
1040 percpu_up_write(&c->mark_lock);
1042 if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
1043 up_read(&c->gc_lock);
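/*
 * Illustrative sketch of the per-cpu reservation cache implemented above
 * (simplified, with made-up names; the real code also deals with preemption
 * and the gc lock): a cpu takes sectors from its local cache when it can, and
 * refills the cache in SECTORS_CACHE sized chunks from the shared atomic
 * counter, so the shared counter is touched roughly once per 1024 sectors
 * reserved rather than once per reservation.
 */
static int example_reserve(atomic64_t *global, u64 *pcpu_cache, u64 sectors)
{
	u64 old, get, v = atomic64_read(global);

	if (sectors <= *pcpu_cache)
		goto take;

	do {
		old = v;
		get = min(sectors + SECTORS_CACHE, old);

		if (get < sectors)
			return -ENOSPC;	/* caller falls back to a slow path */
	} while ((v = atomic64_cmpxchg(global, old, old - get)) != old);

	*pcpu_cache += get;
take:
	*pcpu_cache -= sectors;
	return 0;
}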
1048 /* Startup/shutdown: */
1050 static void buckets_free_rcu(struct rcu_head *rcu)
1052 struct bucket_array *buckets =
1053 container_of(rcu, struct bucket_array, rcu);
1056 sizeof(struct bucket_array) +
1057 buckets->nbuckets * sizeof(struct bucket));
1060 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
1062 struct bucket_array *buckets = NULL, *old_buckets = NULL;
1063 unsigned long *buckets_nouse = NULL;
1064 unsigned long *buckets_written = NULL;
1065 u8 *oldest_gens = NULL;
1066 alloc_fifo free[RESERVE_NR];
1067 alloc_fifo free_inc;
1068 alloc_heap alloc_heap;
1069 copygc_heap copygc_heap;
1071 size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
1072 ca->mi.bucket_size / c->opts.btree_node_size);
1073 /* XXX: these should be tunable */
1074 size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
1075 size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
1076 size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
1078 bool resize = ca->buckets[0] != NULL,
1079 start_copygc = ca->copygc_thread != NULL;
1083 memset(&free, 0, sizeof(free));
1084 memset(&free_inc, 0, sizeof(free_inc));
1085 memset(&alloc_heap, 0, sizeof(alloc_heap));
1086 memset(&copygc_heap, 0, sizeof(copygc_heap));
1088 if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
1089 nbuckets * sizeof(struct bucket),
1090 GFP_KERNEL|__GFP_ZERO)) ||
1091 !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
1092 GFP_KERNEL|__GFP_ZERO)) ||
1093 !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1094 sizeof(unsigned long),
1095 GFP_KERNEL|__GFP_ZERO)) ||
1096 !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) *
1097 sizeof(unsigned long),
1098 GFP_KERNEL|__GFP_ZERO)) ||
1099 !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
1100 !init_fifo(&free[RESERVE_MOVINGGC],
1101 copygc_reserve, GFP_KERNEL) ||
1102 !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
1103 !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
1104 !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
1105 !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL))
1108 buckets->first_bucket = ca->mi.first_bucket;
1109 buckets->nbuckets = nbuckets;
1111 bch2_copygc_stop(ca);
1114 down_write(&c->gc_lock);
1115 down_write(&ca->bucket_lock);
1116 percpu_down_write(&c->mark_lock);
1119 old_buckets = bucket_array(ca);
1122 size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
1126 n * sizeof(struct bucket));
1130 memcpy(buckets_nouse,
1132 BITS_TO_LONGS(n) * sizeof(unsigned long));
1133 memcpy(buckets_written,
1134 ca->buckets_written,
1135 BITS_TO_LONGS(n) * sizeof(unsigned long));
1138 rcu_assign_pointer(ca->buckets[0], buckets);
1139 buckets = old_buckets;
1141 swap(ca->oldest_gens, oldest_gens);
1142 swap(ca->buckets_nouse, buckets_nouse);
1143 swap(ca->buckets_written, buckets_written);
1146 percpu_up_write(&c->mark_lock);
1148 spin_lock(&c->freelist_lock);
1149 for (i = 0; i < RESERVE_NR; i++) {
1150 fifo_move(&free[i], &ca->free[i]);
1151 swap(ca->free[i], free[i]);
1153 fifo_move(&free_inc, &ca->free_inc);
1154 swap(ca->free_inc, free_inc);
1155 spin_unlock(&c->freelist_lock);
1157 /* with gc lock held, alloc_heap can't be in use: */
1158 swap(ca->alloc_heap, alloc_heap);
1160 /* and we shut down copygc: */
1161 swap(ca->copygc_heap, copygc_heap);
1163 nbuckets = ca->mi.nbuckets;
1166 up_write(&ca->bucket_lock);
1167 up_write(&c->gc_lock);
1171 bch2_copygc_start(c, ca))
1172 bch_err(ca, "error restarting copygc thread");
1176 free_heap(&copygc_heap);
1177 free_heap(&alloc_heap);
1178 free_fifo(&free_inc);
1179 for (i = 0; i < RESERVE_NR; i++)
1180 free_fifo(&free[i]);
1181 kvpfree(buckets_nouse,
1182 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1183 kvpfree(buckets_written,
1184 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
1185 kvpfree(oldest_gens,
1186 nbuckets * sizeof(u8));
1188 call_rcu(&old_buckets->rcu, buckets_free_rcu);
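/*
 * Illustrative sketch of the RCU resize pattern used above (generic kernel
 * idiom with made-up names; the real function also swaps freelists, heaps and
 * bitmaps, and holds bucket_lock/mark_lock/gc_lock around the switch):
 * readers walk the array under rcu_read_lock(), while resize builds a new
 * array, copies the old contents, publishes it with rcu_assign_pointer(), and
 * frees the old array only after a grace period.
 */
struct example_array {
	struct rcu_head	rcu;
	size_t		nr;
	u64		data[];
};

static int example_array_resize(struct example_array __rcu **p, size_t new_nr)
{
	struct example_array *new, *old =
		rcu_dereference_protected(*p, true);
	size_t copy = old ? min(old->nr, new_nr) : 0;

	new = kzalloc(sizeof(*new) + new_nr * sizeof(new->data[0]),
		      GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->nr = new_nr;
	if (copy)
		memcpy(new->data, old->data, copy * sizeof(new->data[0]));

	rcu_assign_pointer(*p, new);
	if (old)
		kfree_rcu(old, rcu);
	return 0;
}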
1193 void bch2_dev_buckets_free(struct bch_dev *ca)
1197 free_heap(&ca->copygc_heap);
1198 free_heap(&ca->alloc_heap);
1199 free_fifo(&ca->free_inc);
1200 for (i = 0; i < RESERVE_NR; i++)
1201 free_fifo(&ca->free[i]);
1202 kvpfree(ca->buckets_written,
1203 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1204 kvpfree(ca->buckets_nouse,
1205 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
1206 kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
1207 kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
1208 sizeof(struct bucket_array) +
1209 ca->mi.nbuckets * sizeof(struct bucket));
1211 free_percpu(ca->usage[0]);
1214 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
1216 if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
1219 return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);