1 // SPDX-License-Identifier: GPL-2.0
3 * Code for manipulating bucket marks for garbage collection.
5 * Copyright 2014 Datera, Inc.
9 #include "alloc_background.h"
12 #include "btree_update.h"
20 #include "subvolume.h"
22 #include <linux/preempt.h>
23 #include <trace/events/bcachefs.h>
25 static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
26 enum bch_data_type data_type,
31 fs_usage->btree += sectors;
35 fs_usage->data += sectors;
38 fs_usage->cached += sectors;
46 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
49 void bch2_bucket_seq_cleanup(struct bch_fs *c)
51 u64 journal_seq = atomic64_read(&c->journal.seq);
52 u16 last_seq_ondisk = c->journal.last_seq_ondisk;
54 struct bucket_array *buckets;
59 if (journal_seq - c->last_bucket_seq_cleanup <
60 (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
63 c->last_bucket_seq_cleanup = journal_seq;
65 for_each_member_device(ca, c, i) {
66 down_read(&ca->bucket_lock);
67 buckets = bucket_array(ca);
69 for_each_bucket(g, buckets) {
70 bucket_cmpxchg(g, m, ({
71 if (!m.journal_seq_valid ||
72 bucket_needs_journal_commit(m, last_seq_ondisk))
75 m.journal_seq_valid = 0;
78 up_read(&ca->bucket_lock);
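/*
 * Rough worked example of the throttle above, assuming a hypothetical
 * BUCKET_JOURNAL_SEQ_BITS of 16 (the real value is defined elsewhere):
 * buckets only store the low bits of the journal sequence number, so this
 * scan has to run before that truncated counter can wrap.  The early return
 * skips the scan until at least a quarter of the wrap period has elapsed
 * since the last cleanup:
 *
 *	threshold = 1U << (16 - 2) = 16384 journal entries
 *	scan only if journal_seq - last_bucket_seq_cleanup >= threshold
 */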
82 void bch2_fs_usage_initialize(struct bch_fs *c)
84 struct bch_fs_usage *usage;
88 percpu_down_write(&c->mark_lock);
89 usage = c->usage_base;
91 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
92 bch2_fs_usage_acc_to_base(c, i);
94 for (i = 0; i < BCH_REPLICAS_MAX; i++)
95 usage->reserved += usage->persistent_reserved[i];
97 for (i = 0; i < c->replicas.nr; i++) {
98 struct bch_replicas_entry *e =
99 cpu_replicas_entry(&c->replicas, i);
101 fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
104 for_each_member_device(ca, c, i) {
105 struct bch_dev_usage dev = bch2_dev_usage_read(ca);
107 usage->hidden += (dev.d[BCH_DATA_sb].buckets +
108 dev.d[BCH_DATA_journal].buckets) *
112 percpu_up_write(&c->mark_lock);
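/*
 * "hidden" is capacity consumed by superblock and journal buckets; it is
 * subtracted from what's reported as usable capacity rather than shown as
 * data.  Small worked example with assumed numbers (not from a real device):
 *
 *	8 sb buckets + 512 journal buckets, bucket_size = 1024 sectors
 *	usage->hidden += (8 + 512) * 1024 = 532480 sectors
 */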
115 static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
116 unsigned journal_seq,
119 return this_cpu_ptr(gc
121 : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
124 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
126 struct bch_fs *c = ca->fs;
127 struct bch_dev_usage ret;
128 unsigned seq, i, u64s = dev_usage_u64s();
131 seq = read_seqcount_begin(&c->usage_lock);
132 memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
133 for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
134 acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
135 } while (read_seqcount_retry(&c->usage_lock, seq));
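/*
 * The do/while above is the usual seqcount read pattern: readers copy the
 * counters without blocking writers and retry if bch2_fs_usage_acc_to_base()
 * (the writer, below) ran concurrently.  The same pattern repeats in
 * bch2_fs_usage_read_one() and bch2_fs_usage_read():
 *
 *	do {
 *		seq = read_seqcount_begin(&c->usage_lock);
 *		... copy usage_base, then fold in the percpu buffers ...
 *	} while (read_seqcount_retry(&c->usage_lock, seq));
 */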
140 static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
141 unsigned journal_seq,
144 return this_cpu_ptr(gc
146 : c->usage[journal_seq & JOURNAL_BUF_MASK]);
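/*
 * Usage deltas are accumulated into one of a small ring of percpu buffers
 * selected by the low bits of the journal sequence number, presumably so that
 * counters for a journal entry still in flight are folded into usage_base
 * (by bch2_fs_usage_acc_to_base()) only once that entry is persistent.
 * Minimal sketch, assuming JOURNAL_BUF_NR == 2 and hence
 * JOURNAL_BUF_MASK == 1 (values defined elsewhere):
 *
 *	idx = journal_seq & JOURNAL_BUF_MASK;	// 0 or 1
 *	dst = this_cpu_ptr(c->usage[idx]);
 */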
149 u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
151 ssize_t offset = v - (u64 *) c->usage_base;
155 BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
156 percpu_rwsem_assert_held(&c->mark_lock);
159 seq = read_seqcount_begin(&c->usage_lock);
162 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
163 ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
164 } while (read_seqcount_retry(&c->usage_lock, seq));
169 struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
171 struct bch_fs_usage_online *ret;
172 unsigned seq, i, u64s;
174 percpu_down_read(&c->mark_lock);
176 ret = kmalloc(sizeof(struct bch_fs_usage_online) +
177 sizeof(u64) * c->replicas.nr, GFP_NOFS);
178 if (unlikely(!ret)) {
179 percpu_up_read(&c->mark_lock);
183 ret->online_reserved = percpu_u64_get(c->online_reserved);
185 u64s = fs_usage_u64s(c);
187 seq = read_seqcount_begin(&c->usage_lock);
188 memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
189 for (i = 0; i < ARRAY_SIZE(c->usage); i++)
190 acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
191 } while (read_seqcount_retry(&c->usage_lock, seq));
196 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
199 unsigned i, u64s = fs_usage_u64s(c);
201 BUG_ON(idx >= ARRAY_SIZE(c->usage));
204 write_seqcount_begin(&c->usage_lock);
206 acc_u64s_percpu((u64 *) c->usage_base,
207 (u64 __percpu *) c->usage[idx], u64s);
208 percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
211 for_each_member_device_rcu(ca, c, i, NULL) {
212 u64s = dev_usage_u64s();
214 acc_u64s_percpu((u64 *) ca->usage_base,
215 (u64 __percpu *) ca->usage[idx], u64s);
216 percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
220 write_seqcount_end(&c->usage_lock);
224 void bch2_fs_usage_to_text(struct printbuf *out,
226 struct bch_fs_usage_online *fs_usage)
230 pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
232 pr_buf(out, "hidden:\t\t\t\t%llu\n",
234 pr_buf(out, "data:\t\t\t\t%llu\n",
236 pr_buf(out, "cached:\t\t\t\t%llu\n",
238 pr_buf(out, "reserved:\t\t\t%llu\n",
239 fs_usage->u.reserved);
240 pr_buf(out, "nr_inodes:\t\t\t%llu\n",
241 fs_usage->u.nr_inodes);
242 pr_buf(out, "online reserved:\t\t%llu\n",
243 fs_usage->online_reserved);
246 i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
248 pr_buf(out, "%u replicas:\n", i + 1);
249 pr_buf(out, "\treserved:\t\t%llu\n",
250 fs_usage->u.persistent_reserved[i]);
253 for (i = 0; i < c->replicas.nr; i++) {
254 struct bch_replicas_entry *e =
255 cpu_replicas_entry(&c->replicas, i);
258 bch2_replicas_entry_to_text(out, e);
259 pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
263 static u64 reserve_factor(u64 r)
265 return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
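/*
 * Worked example, assuming RESERVE_FACTOR == 6 (defined elsewhere in this
 * file): reservations are over-counted by roughly 1/64th, rounded up:
 *
 *	r = 1000:  round_up(1000, 64) = 1024;  1024 >> 6 = 16;  returns 1016
 */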
268 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
270 return min(fs_usage->u.hidden +
273 reserve_factor(fs_usage->u.reserved +
274 fs_usage->online_reserved),
278 static struct bch_fs_usage_short
279 __bch2_fs_usage_read_short(struct bch_fs *c)
281 struct bch_fs_usage_short ret;
284 ret.capacity = c->capacity -
285 bch2_fs_usage_read_one(c, &c->usage_base->hidden);
287 data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
288 bch2_fs_usage_read_one(c, &c->usage_base->btree);
289 reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
290 percpu_u64_get(c->online_reserved);
292 ret.used = min(ret.capacity, data + reserve_factor(reserved));
293 ret.free = ret.capacity - ret.used;
295 ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
300 struct bch_fs_usage_short
301 bch2_fs_usage_read_short(struct bch_fs *c)
303 struct bch_fs_usage_short ret;
305 percpu_down_read(&c->mark_lock);
306 ret = __bch2_fs_usage_read_short(c);
307 percpu_up_read(&c->mark_lock);
312 static inline int is_unavailable_bucket(struct bucket_mark m)
314 return !is_available_bucket(m);
317 static inline int bucket_sectors_fragmented(struct bch_dev *ca,
318 struct bucket_mark m)
320 return bucket_sectors_used(m)
321 ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
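/*
 * A bucket only counts as fragmented if it's partially used -- the unused
 * remainder is what could be reclaimed by rewriting its contents.  Worked
 * example, assuming bucket_size == 512 sectors:
 *
 *	bucket_sectors_used(m) == 100  ->  512 - 100 = 412 fragmented sectors
 *	bucket_sectors_used(m) == 0    ->  0 (elided else branch)
 */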
325 static inline int is_stripe_data_bucket(struct bucket_mark m)
327 return m.stripe && m.data_type != BCH_DATA_parity;
330 static inline enum bch_data_type bucket_type(struct bucket_mark m)
332 return m.cached_sectors && !m.dirty_sectors
337 static bool bucket_became_unavailable(struct bucket_mark old,
338 struct bucket_mark new)
340 return is_available_bucket(old) &&
341 !is_available_bucket(new);
344 static inline void account_bucket(struct bch_fs_usage *fs_usage,
345 struct bch_dev_usage *dev_usage,
346 enum bch_data_type type,
349 if (type == BCH_DATA_sb || type == BCH_DATA_journal)
350 fs_usage->hidden += size;
352 dev_usage->d[type].buckets += nr;
355 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
356 struct bucket_mark old, struct bucket_mark new,
357 u64 journal_seq, bool gc)
359 struct bch_fs_usage *fs_usage;
360 struct bch_dev_usage *u;
362 percpu_rwsem_assert_held(&c->mark_lock);
365 fs_usage = fs_usage_ptr(c, journal_seq, gc);
366 u = dev_usage_ptr(ca, journal_seq, gc);
368 if (bucket_type(old))
369 account_bucket(fs_usage, u, bucket_type(old),
370 -1, -ca->mi.bucket_size);
372 if (bucket_type(new))
373 account_bucket(fs_usage, u, bucket_type(new),
374 1, ca->mi.bucket_size);
376 u->buckets_ec += (int) new.stripe - (int) old.stripe;
377 u->buckets_unavailable +=
378 is_unavailable_bucket(new) - is_unavailable_bucket(old);
380 u->d[old.data_type].sectors -= old.dirty_sectors;
381 u->d[new.data_type].sectors += new.dirty_sectors;
382 u->d[BCH_DATA_cached].sectors +=
383 (int) new.cached_sectors - (int) old.cached_sectors;
385 u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
386 u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
390 if (!is_available_bucket(old) && is_available_bucket(new))
391 bch2_wake_allocator(ca);
394 static inline int __update_replicas(struct bch_fs *c,
395 struct bch_fs_usage *fs_usage,
396 struct bch_replicas_entry *r,
399 int idx = bch2_replicas_entry_idx(c, r);
404 fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
405 fs_usage->replicas[idx] += sectors;
409 static inline int update_replicas(struct bch_fs *c,
410 struct bch_replicas_entry *r, s64 sectors,
411 unsigned journal_seq, bool gc)
413 struct bch_fs_usage __percpu *fs_usage;
414 int idx = bch2_replicas_entry_idx(c, r);
420 fs_usage = fs_usage_ptr(c, journal_seq, gc);
421 fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
422 fs_usage->replicas[idx] += sectors;
427 static inline int update_cached_sectors(struct bch_fs *c,
428 unsigned dev, s64 sectors,
429 unsigned journal_seq, bool gc)
431 struct bch_replicas_padded r;
433 bch2_replicas_entry_cached(&r.e, dev);
435 return update_replicas(c, &r.e, sectors, journal_seq, gc);
438 static struct replicas_delta_list *
439 replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
441 struct replicas_delta_list *d = trans->fs_usage_deltas;
442 unsigned new_size = d ? (d->size + more) * 2 : 128;
443 unsigned alloc_size = sizeof(*d) + new_size;
445 WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
447 if (!d || d->used + more > d->size) {
448 d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO);
450 BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX);
453 d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO);
454 memset(d, 0, REPLICAS_DELTA_LIST_MAX);
456 if (trans->fs_usage_deltas)
457 memcpy(d, trans->fs_usage_deltas,
458 trans->fs_usage_deltas->size + sizeof(*d));
460 new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
461 kfree(trans->fs_usage_deltas);
465 trans->fs_usage_deltas = d;
470 static inline void update_replicas_list(struct btree_trans *trans,
471 struct bch_replicas_entry *r,
474 struct replicas_delta_list *d;
475 struct replicas_delta *n;
481 b = replicas_entry_bytes(r) + 8;
482 d = replicas_deltas_realloc(trans, b);
484 n = (void *) d->d + d->used;
486 memcpy(&n->r, r, replicas_entry_bytes(r));
487 bch2_replicas_entry_sort(&n->r);
491 static inline void update_cached_sectors_list(struct btree_trans *trans,
492 unsigned dev, s64 sectors)
494 struct bch_replicas_padded r;
496 bch2_replicas_entry_cached(&r.e, dev);
498 update_replicas_list(trans, &r.e, sectors);
501 #define do_mark_fn(fn, c, pos, flags, ...) \
505 percpu_rwsem_assert_held(&c->mark_lock); \
507 for (gc = 0; gc < 2 && !ret; gc++) \
508 if (!gc == !(flags & BTREE_TRIGGER_GC) || \
509 (gc && gc_visited(c, pos))) \
510 ret = fn(c, __VA_ARGS__, gc); \
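/*
 * do_mark_fn() runs the marking function against the primary bucket counters,
 * and also against gc's shadow copy when gc is running and has already
 * visited this position, so the two sets of counters stay consistent.
 * Sketch of the expansion for the bch2_mark_metadata_bucket() call below:
 *
 *	for (gc = 0; gc < 2 && !ret; gc++)
 *		if (!gc == !(flags & BTREE_TRIGGER_GC) ||
 *		    (gc && gc_visited(c, pos)))
 *			ret = __bch2_mark_metadata_bucket(c, ca, b, type,
 *							  sectors, gc);
 */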
514 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
515 size_t b, bool owned_by_allocator)
517 struct bucket *g = bucket(ca, b);
518 struct bucket_mark old, new;
520 old = bucket_cmpxchg(g, new, ({
521 new.owned_by_allocator = owned_by_allocator;
524 BUG_ON(owned_by_allocator == old.owned_by_allocator);
527 static int bch2_mark_alloc(struct bch_fs *c,
528 struct bkey_s_c old, struct bkey_s_c new,
529 u64 journal_seq, unsigned flags)
531 bool gc = flags & BTREE_TRIGGER_GC;
532 struct bkey_alloc_unpacked u;
535 struct bucket_mark old_m, m;
537 /* We don't do anything for deletions - do we?: */
538 if (new.k->type != KEY_TYPE_alloc &&
539 new.k->type != KEY_TYPE_alloc_v2)
543 * alloc btree is read in by bch2_alloc_read, not gc:
545 if ((flags & BTREE_TRIGGER_GC) &&
546 !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
549 ca = bch_dev_bkey_exists(c, new.k->p.inode);
551 if (new.k->p.offset >= ca->mi.nbuckets)
554 g = __bucket(ca, new.k->p.offset, gc);
555 u = bch2_alloc_unpack(new);
557 old_m = bucket_cmpxchg(g, m, ({
559 m.data_type = u.data_type;
560 m.dirty_sectors = u.dirty_sectors;
561 m.cached_sectors = u.cached_sectors;
562 m.stripe = u.stripe != 0;
565 m.journal_seq_valid = 1;
566 m.journal_seq = journal_seq;
570 bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
572 g->io_time[READ] = u.read_time;
573 g->io_time[WRITE] = u.write_time;
574 g->oldest_gen = u.oldest_gen;
576 g->stripe = u.stripe;
577 g->stripe_redundancy = u.stripe_redundancy;
580 * need to know if we're getting called from the invalidate path or
584 if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
585 old_m.cached_sectors) {
586 if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors,
588 bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
592 trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
593 old_m.cached_sectors);
599 #define checked_add(a, b) \
601 unsigned _res = (unsigned) (a) + (b); \
602 bool overflow = _res > U16_MAX; \
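/*
 * The rest of checked_add() is elided above; from its users below it
 * evidently saturates the 16-bit counter and evaluates to true when the sum
 * would exceed U16_MAX, e.g.:
 *
 *	overflow = checked_add(new.dirty_sectors, sectors);
 *	// new.dirty_sectors clamped to U16_MAX; overflow triggers the
 *	// "sector count overflow" inconsistency message below
 */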
609 static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
610 size_t b, enum bch_data_type data_type,
611 unsigned sectors, bool gc)
613 struct bucket *g = __bucket(ca, b, gc);
614 struct bucket_mark old, new;
617 BUG_ON(data_type != BCH_DATA_sb &&
618 data_type != BCH_DATA_journal);
620 old = bucket_cmpxchg(g, new, ({
621 new.data_type = data_type;
622 overflow = checked_add(new.dirty_sectors, sectors);
625 bch2_fs_inconsistent_on(old.data_type &&
626 old.data_type != data_type, c,
627 "different types of data in same bucket: %s, %s",
628 bch2_data_types[old.data_type],
629 bch2_data_types[data_type]);
631 bch2_fs_inconsistent_on(overflow, c,
632 "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX",
633 ca->dev_idx, b, new.gen,
634 bch2_data_types[old.data_type ?: data_type],
635 old.dirty_sectors, sectors);
638 bch2_dev_usage_update(c, ca, old, new, 0, gc);
643 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
644 size_t b, enum bch_data_type type,
645 unsigned sectors, struct gc_pos pos,
648 BUG_ON(type != BCH_DATA_sb &&
649 type != BCH_DATA_journal);
652 * Backup superblock might be past the end of our normal usable space:
654 if (b >= ca->mi.nbuckets)
658 do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
659 ca, b, type, sectors);
661 __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
665 static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
667 EBUG_ON(sectors < 0);
669 return p.crc.compression_type &&
670 p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
671 ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
672 p.crc.uncompressed_size)
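/*
 * For compressed extents, the disk footprint of a (possibly partial)
 * reference is scaled by the compression ratio; otherwise the elided branch
 * presumably just returns sectors.  Worked example with assumed crc fields:
 *
 *	sectors = 16, compressed_size = 8, uncompressed_size = 32
 *	DIV_ROUND_UP(16 * 8, 32) = 4 disk sectors
 */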
676 static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
677 const struct bch_extent_ptr *ptr,
678 s64 sectors, enum bch_data_type ptr_data_type,
679 u8 bucket_gen, u8 bucket_data_type,
680 u16 dirty_sectors, u16 cached_sectors)
682 size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
683 u16 bucket_sectors = !ptr->cached
688 if (gen_after(ptr->gen, bucket_gen)) {
689 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
690 "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
692 ptr->dev, bucket_nr, bucket_gen,
693 bch2_data_types[bucket_data_type ?: ptr_data_type],
695 (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
699 if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
700 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
701 "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
703 ptr->dev, bucket_nr, bucket_gen,
704 bch2_data_types[bucket_data_type ?: ptr_data_type],
706 (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
710 if (bucket_gen != ptr->gen && !ptr->cached) {
711 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
712 "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
714 ptr->dev, bucket_nr, bucket_gen,
715 bch2_data_types[bucket_data_type ?: ptr_data_type],
717 (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
721 if (bucket_gen != ptr->gen)
724 if (bucket_data_type && ptr_data_type &&
725 bucket_data_type != ptr_data_type) {
726 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
727 "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
729 ptr->dev, bucket_nr, bucket_gen,
730 bch2_data_types[bucket_data_type],
731 bch2_data_types[ptr_data_type],
732 (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
736 if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
737 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
738 "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
740 ptr->dev, bucket_nr, bucket_gen,
741 bch2_data_types[bucket_data_type ?: ptr_data_type],
742 bucket_sectors, sectors,
743 (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
750 static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
752 u64 journal_seq, unsigned flags)
754 const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
755 unsigned nr_data = s->nr_blocks - s->nr_redundant;
756 bool parity = ptr_idx >= nr_data;
757 const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
758 bool gc = flags & BTREE_TRIGGER_GC;
759 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
760 struct bucket *g = PTR_BUCKET(ca, ptr, gc);
761 struct bucket_mark new, old;
765 if (g->stripe && g->stripe != k.k->p.offset) {
766 bch2_fs_inconsistent(c,
767 "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
768 ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
769 (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
773 old = bucket_cmpxchg(g, new, ({
774 ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
775 new.dirty_sectors, new.cached_sectors);
780 new.data_type = BCH_DATA_parity;
781 new.dirty_sectors = le16_to_cpu(s->sectors);
785 new.journal_seq_valid = 1;
786 new.journal_seq = journal_seq;
790 g->stripe = k.k->p.offset;
791 g->stripe_redundancy = s->nr_redundant;
793 bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
797 static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
798 const struct bch_extent_ptr *ptr,
799 s64 sectors, enum bch_data_type ptr_data_type,
800 u8 bucket_gen, u8 *bucket_data_type,
801 u16 *dirty_sectors, u16 *cached_sectors)
803 u16 *dst_sectors = !ptr->cached
806 int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
807 bucket_gen, *bucket_data_type,
808 *dirty_sectors, *cached_sectors);
813 *dst_sectors += sectors;
814 *bucket_data_type = *dirty_sectors || *cached_sectors
819 static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
820 struct extent_ptr_decoded p,
821 s64 sectors, enum bch_data_type data_type,
822 u64 journal_seq, unsigned flags)
824 bool gc = flags & BTREE_TRIGGER_GC;
825 struct bucket_mark old, new;
826 struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
827 struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
832 v = atomic64_read(&g->_mark.v);
834 new.v.counter = old.v.counter = v;
835 bucket_data_type = new.data_type;
837 ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
840 &new.cached_sectors);
844 new.data_type = bucket_data_type;
847 new.journal_seq_valid = 1;
848 new.journal_seq = journal_seq;
851 if (flags & BTREE_TRIGGER_NOATOMIC) {
855 } while ((v = atomic64_cmpxchg(&g->_mark.v,
857 new.v.counter)) != old.v.counter);
859 bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
861 BUG_ON(!gc && bucket_became_unavailable(old, new));
866 static int bch2_mark_stripe_ptr(struct bch_fs *c,
867 struct bch_extent_stripe_ptr p,
868 enum bch_data_type data_type,
870 unsigned journal_seq, unsigned flags)
872 bool gc = flags & BTREE_TRIGGER_GC;
873 struct bch_replicas_padded r;
875 unsigned i, blocks_nonempty = 0;
877 m = genradix_ptr(&c->stripes[gc], p.idx);
879 spin_lock(&c->ec_stripes_heap_lock);
881 if (!m || !m->alive) {
882 spin_unlock(&c->ec_stripes_heap_lock);
883 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
885 bch2_inconsistent_error(c);
889 m->block_sectors[p.block] += sectors;
893 for (i = 0; i < m->nr_blocks; i++)
894 blocks_nonempty += m->block_sectors[i] != 0;
896 if (m->blocks_nonempty != blocks_nonempty) {
897 m->blocks_nonempty = blocks_nonempty;
899 bch2_stripes_heap_update(c, m, p.idx);
902 spin_unlock(&c->ec_stripes_heap_lock);
904 r.e.data_type = data_type;
905 update_replicas(c, &r.e, sectors, journal_seq, gc);
910 static int bch2_mark_extent(struct bch_fs *c,
911 struct bkey_s_c old, struct bkey_s_c new,
912 unsigned journal_seq, unsigned flags)
914 bool gc = flags & BTREE_TRIGGER_GC;
915 struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
916 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
917 const union bch_extent_entry *entry;
918 struct extent_ptr_decoded p;
919 struct bch_replicas_padded r;
920 enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
923 s64 sectors = bkey_is_btree_ptr(k.k)
924 ? c->opts.btree_node_size
926 s64 dirty_sectors = 0;
930 BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
931 (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
933 r.e.data_type = data_type;
937 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
938 s64 disk_sectors = ptr_disk_sectors(sectors, p);
940 if (flags & BTREE_TRIGGER_OVERWRITE)
941 disk_sectors = -disk_sectors;
943 ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type,
952 if (update_cached_sectors(c, p.ptr.dev, disk_sectors,
954 bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
958 } else if (!p.has_ec) {
959 dirty_sectors += disk_sectors;
960 r.e.devs[r.e.nr_devs++] = p.ptr.dev;
962 ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
963 disk_sectors, journal_seq, flags);
968 * There may be other dirty pointers in this extent, but
969 * if so they're not required for mounting if we have an
970 * erasure coded pointer in this extent:
977 if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) {
980 bch2_bkey_val_to_text(&PBUF(buf), c, k);
981 bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
989 static int bch2_mark_stripe(struct bch_fs *c,
990 struct bkey_s_c old, struct bkey_s_c new,
991 u64 journal_seq, unsigned flags)
993 bool gc = flags & BTREE_TRIGGER_GC;
994 size_t idx = new.k->p.offset;
995 const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
996 ? bkey_s_c_to_stripe(old).v : NULL;
997 const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
998 ? bkey_s_c_to_stripe(new).v : NULL;
999 struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
1003 BUG_ON(gc && old_s);
1005 if (!m || (old_s && !m->alive)) {
1006 bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
1008 bch2_inconsistent_error(c);
1013 spin_lock(&c->ec_stripes_heap_lock);
1014 bch2_stripes_heap_del(c, m, idx);
1015 spin_unlock(&c->ec_stripes_heap_lock);
1017 memset(m, 0, sizeof(*m));
1020 m->sectors = le16_to_cpu(new_s->sectors);
1021 m->algorithm = new_s->algorithm;
1022 m->nr_blocks = new_s->nr_blocks;
1023 m->nr_redundant = new_s->nr_redundant;
1024 m->blocks_nonempty = 0;
1026 for (i = 0; i < new_s->nr_blocks; i++) {
1027 m->block_sectors[i] =
1028 stripe_blockcount_get(new_s, i);
1029 m->blocks_nonempty += !!m->block_sectors[i];
1031 m->ptrs[i] = new_s->ptrs[i];
1034 bch2_bkey_to_replicas(&m->r.e, new);
1037 spin_lock(&c->ec_stripes_heap_lock);
1038 bch2_stripes_heap_update(c, m, idx);
1039 spin_unlock(&c->ec_stripes_heap_lock);
1045 * gc recalculates this field from stripe ptr
1048 memset(m->block_sectors, 0, sizeof(m->block_sectors));
1049 m->blocks_nonempty = 0;
1051 for (i = 0; i < new_s->nr_blocks; i++) {
1052 ret = mark_stripe_bucket(c, new, i, journal_seq, flags);
1057 if (update_replicas(c, &m->r.e,
1058 ((s64) m->sectors * m->nr_redundant),
1062 bch2_bkey_val_to_text(&PBUF(buf), c, new);
1063 bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
1071 static int bch2_mark_inode(struct bch_fs *c,
1072 struct bkey_s_c old, struct bkey_s_c new,
1073 u64 journal_seq, unsigned flags)
1075 struct bch_fs_usage __percpu *fs_usage;
1078 fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
1079 fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
1080 fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
1085 static int bch2_mark_reservation(struct bch_fs *c,
1086 struct bkey_s_c old, struct bkey_s_c new,
1087 u64 journal_seq, unsigned flags)
1089 struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
1090 struct bch_fs_usage __percpu *fs_usage;
1091 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
1092 s64 sectors = (s64) k.k->size;
1094 if (flags & BTREE_TRIGGER_OVERWRITE)
1096 sectors *= replicas;
1099 fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
1100 replicas = clamp_t(unsigned, replicas, 1,
1101 ARRAY_SIZE(fs_usage->persistent_reserved));
1103 fs_usage->reserved += sectors;
1104 fs_usage->persistent_reserved[replicas - 1] += sectors;
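/*
 * Worked example of the reservation accounting above: a KEY_TYPE_reservation
 * key of size 8 sectors with nr_replicas == 2 charges
 *
 *	sectors = 8 * 2 = 16
 *	fs_usage->reserved                   += 16
 *	fs_usage->persistent_reserved[2 - 1] += 16
 *
 * and the same amount comes back off when the key is overwritten (the
 * BTREE_TRIGGER_OVERWRITE branch, elided above, presumably negates sectors).
 */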
1110 static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
1111 u64 *idx, unsigned flags, size_t r_idx)
1113 struct reflink_gc *r;
1114 int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
1117 if (r_idx >= c->reflink_gc_nr)
1120 r = genradix_ptr(&c->reflink_gc_table, r_idx);
1121 if (*idx < r->offset - r->size)
1124 BUG_ON((s64) r->refcount + add < 0);
1134 * XXX: we're replacing the entire reflink pointer with an error
1135 * key, we should just be replacing the part that was missing:
1137 if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
1138 p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
1139 struct bkey_i_error *new;
1141 new = kmalloc(sizeof(*new), GFP_KERNEL);
1143 bch_err(c, "%s: error allocating new key", __func__);
1148 new->k.type = KEY_TYPE_error;
1150 new->k.size = p.k->size;
1151 ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i);
1157 static int bch2_mark_reflink_p(struct bch_fs *c,
1158 struct bkey_s_c old, struct bkey_s_c new,
1159 u64 journal_seq, unsigned flags)
1161 struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
1162 struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
1163 struct reflink_gc *ref;
1165 u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
1166 u64 end_idx = le64_to_cpu(p.v->idx) + p.k->size +
1167 le32_to_cpu(p.v->back_pad);
1170 BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
1171 (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
1174 r = c->reflink_gc_nr;
1176 m = l + (r - l) / 2;
1178 ref = genradix_ptr(&c->reflink_gc_table, m);
1179 if (ref->offset <= idx)
1185 while (idx < end_idx && !ret)
1186 ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++);
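/*
 * The loop above is a binary search over the reflink_gc_table (sorted by
 * offset) for the first entry ending after idx; the branch that narrows l/r
 * is partially elided.  Presumed shape:
 *
 *	while (l < r) {
 *		m = l + (r - l) / 2;
 *		ref = genradix_ptr(&c->reflink_gc_table, m);
 *		if (ref->offset <= idx)
 *			l = m + 1;
 *		else
 *			r = m;
 *	}
 *
 * Each indirect extent covering [idx, end_idx) is then marked by
 * __bch2_mark_reflink_p(), which advances *idx past it.
 */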
1191 static int bch2_mark_key_locked(struct bch_fs *c,
1192 struct bkey_s_c old,
1193 struct bkey_s_c new,
1194 u64 journal_seq, unsigned flags)
1196 struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
1198 BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
1200 switch (k.k->type) {
1201 case KEY_TYPE_alloc:
1202 case KEY_TYPE_alloc_v2:
1203 return bch2_mark_alloc(c, old, new, journal_seq, flags);
1204 case KEY_TYPE_btree_ptr:
1205 case KEY_TYPE_btree_ptr_v2:
1206 case KEY_TYPE_extent:
1207 case KEY_TYPE_reflink_v:
1208 return bch2_mark_extent(c, old, new, journal_seq, flags);
1209 case KEY_TYPE_stripe:
1210 return bch2_mark_stripe(c, old, new, journal_seq, flags);
1211 case KEY_TYPE_inode:
1212 return bch2_mark_inode(c, old, new, journal_seq, flags);
1213 case KEY_TYPE_reservation:
1214 return bch2_mark_reservation(c, old, new, journal_seq, flags);
1215 case KEY_TYPE_reflink_p:
1216 return bch2_mark_reflink_p(c, old, new, journal_seq, flags);
1217 case KEY_TYPE_snapshot:
1218 return bch2_mark_snapshot(c, old, new, journal_seq, flags);
1224 int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags)
1226 struct bkey deleted = KEY(0, 0, 0);
1227 struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
1230 percpu_down_read(&c->mark_lock);
1231 ret = bch2_mark_key_locked(c, old, new, 0, flags);
1232 percpu_up_read(&c->mark_lock);
1237 int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
1238 struct bkey_i *new, unsigned flags)
1240 struct bch_fs *c = trans->c;
1241 struct bkey _deleted = KEY(0, 0, 0);
1242 struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
1243 struct bkey_s_c old;
1244 struct bkey unpacked;
1247 if (unlikely(flags & BTREE_TRIGGER_NORUN))
1250 if (!btree_node_type_needs_gc(path->btree_id))
1253 old = bch2_btree_path_peek_slot(path, &unpacked);
1255 if (old.k->type == new->k.type &&
1256 ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
1257 ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
1258 trans->journal_res.seq,
1259 BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
1261 ret = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new),
1262 trans->journal_res.seq,
1263 BTREE_TRIGGER_INSERT|flags) ?:
1264 bch2_mark_key_locked(c, old, deleted,
1265 trans->journal_res.seq,
1266 BTREE_TRIGGER_OVERWRITE|flags);
1272 static noinline __cold
1273 void fs_usage_apply_warn(struct btree_trans *trans,
1274 unsigned disk_res_sectors,
1275 s64 should_not_have_added)
1277 struct bch_fs *c = trans->c;
1278 struct btree_insert_entry *i;
1281 bch_err(c, "disk usage increased %lli more than %u sectors reserved",
1282 should_not_have_added, disk_res_sectors);
1284 trans_for_each_update(trans, i) {
1285 pr_err("while inserting");
1286 bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
1288 pr_err("overlapping with");
1292 struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
1294 bch2_bkey_val_to_text(&PBUF(buf), c, k);
1297 struct bkey_cached *ck = (void *) i->path->l[0].b;
1300 bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
1308 void bch2_trans_fs_usage_apply(struct btree_trans *trans,
1309 struct replicas_delta_list *deltas)
1311 struct bch_fs *c = trans->c;
1312 static int warned_disk_usage = 0;
1314 unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
1315 struct replicas_delta *d = deltas->d;
1316 struct replicas_delta *top = (void *) deltas->d + deltas->used;
1317 struct bch_fs_usage *dst;
1318 s64 added = 0, should_not_have_added;
1321 percpu_rwsem_assert_held(&c->mark_lock);
1324 dst = fs_usage_ptr(c, trans->journal_res.seq, false);
1326 for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
1327 switch (d->r.data_type) {
1328 case BCH_DATA_btree:
1330 case BCH_DATA_parity:
1334 BUG_ON(__update_replicas(c, dst, &d->r, d->delta));
1337 dst->nr_inodes += deltas->nr_inodes;
1339 for (i = 0; i < BCH_REPLICAS_MAX; i++) {
1340 added += deltas->persistent_reserved[i];
1341 dst->reserved += deltas->persistent_reserved[i];
1342 dst->persistent_reserved[i] += deltas->persistent_reserved[i];
1346 * Not allowed to reduce sectors_available except by getting a
1349 should_not_have_added = added - (s64) disk_res_sectors;
1350 if (unlikely(should_not_have_added > 0)) {
1351 u64 old, new, v = atomic64_read(&c->sectors_available);
1355 new = max_t(s64, 0, old - should_not_have_added);
1356 } while ((v = atomic64_cmpxchg(&c->sectors_available,
1359 added -= should_not_have_added;
1364 trans->disk_res->sectors -= added;
1365 this_cpu_sub(*c->online_reserved, added);
1370 if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
1371 fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added);
1376 static struct bkey_alloc_buf *
1377 bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
1378 const struct bch_extent_ptr *ptr,
1379 struct bkey_alloc_unpacked *u)
1381 struct bch_fs *c = trans->c;
1382 struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
1383 struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
1385 struct bkey_alloc_buf *a;
1386 struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
1389 a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
1393 bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
1395 BTREE_ITER_CACHED_NOFILL|
1397 ret = bch2_btree_iter_traverse(iter);
1399 bch2_trans_iter_exit(trans, iter);
1400 return ERR_PTR(ret);
1403 if (update && !bpos_cmp(update->k.p, pos)) {
1404 *u = bch2_alloc_unpack(bkey_i_to_s_c(update));
1406 percpu_down_read(&c->mark_lock);
1407 g = bucket(ca, pos.offset);
1408 *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
1409 percpu_up_read(&c->mark_lock);
1415 static int bch2_trans_mark_pointer(struct btree_trans *trans,
1416 struct bkey_s_c k, struct extent_ptr_decoded p,
1417 s64 sectors, enum bch_data_type data_type)
1419 struct bch_fs *c = trans->c;
1420 struct btree_iter iter;
1421 struct bkey_alloc_unpacked u;
1422 struct bkey_alloc_buf *a;
1425 a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
1429 ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
1430 &u.dirty_sectors, &u.cached_sectors);
1434 bch2_alloc_pack(c, a, u);
1435 bch2_trans_update(trans, &iter, &a->k, 0);
1437 bch2_trans_iter_exit(trans, &iter);
1441 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
1442 struct extent_ptr_decoded p,
1443 s64 sectors, enum bch_data_type data_type)
1445 struct bch_fs *c = trans->c;
1446 struct btree_iter iter;
1448 struct bkey_i_stripe *s;
1449 struct bch_replicas_padded r;
1452 bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx),
1454 BTREE_ITER_WITH_UPDATES);
1455 k = bch2_btree_iter_peek_slot(&iter);
1460 if (k.k->type != KEY_TYPE_stripe) {
1461 bch2_fs_inconsistent(c,
1462 "pointer to nonexistent stripe %llu",
1464 bch2_inconsistent_error(c);
1469 if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
1470 bch2_fs_inconsistent(c,
1471 "stripe pointer doesn't match stripe %llu",
1477 s = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1478 ret = PTR_ERR_OR_ZERO(s);
1482 bkey_reassemble(&s->k_i, k);
1483 stripe_blockcount_set(&s->v, p.ec.block,
1484 stripe_blockcount_get(&s->v, p.ec.block) +
1486 bch2_trans_update(trans, &iter, &s->k_i, 0);
1488 bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
1489 r.e.data_type = data_type;
1490 update_replicas_list(trans, &r.e, sectors);
1492 bch2_trans_iter_exit(trans, &iter);
1496 static int bch2_trans_mark_extent(struct btree_trans *trans,
1497 struct bkey_s_c k, unsigned flags)
1499 struct bch_fs *c = trans->c;
1500 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
1501 const union bch_extent_entry *entry;
1502 struct extent_ptr_decoded p;
1503 struct bch_replicas_padded r;
1504 enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
1507 s64 sectors = bkey_is_btree_ptr(k.k)
1508 ? c->opts.btree_node_size
1510 s64 dirty_sectors = 0;
1514 BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
1515 (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
1517 r.e.data_type = data_type;
1519 r.e.nr_required = 1;
1521 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
1522 s64 disk_sectors = ptr_disk_sectors(sectors, p);
1524 if (flags & BTREE_TRIGGER_OVERWRITE)
1525 disk_sectors = -disk_sectors;
1527 ret = bch2_trans_mark_pointer(trans, k, p,
1528 disk_sectors, data_type);
1536 update_cached_sectors_list(trans, p.ptr.dev,
1538 } else if (!p.has_ec) {
1539 dirty_sectors += disk_sectors;
1540 r.e.devs[r.e.nr_devs++] = p.ptr.dev;
1542 ret = bch2_trans_mark_stripe_ptr(trans, p,
1543 disk_sectors, data_type);
1547 r.e.nr_required = 0;
1552 update_replicas_list(trans, &r.e, dirty_sectors);
1557 static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
1558 struct bkey_s_c_stripe s,
1559 unsigned idx, bool deleting)
1561 struct bch_fs *c = trans->c;
1562 const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
1563 struct bkey_alloc_buf *a;
1564 struct btree_iter iter;
1565 struct bkey_alloc_unpacked u;
1566 bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
1569 a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
1574 s64 sectors = le16_to_cpu(s.v->sectors);
1579 u.dirty_sectors += sectors;
1580 u.data_type = u.dirty_sectors
1586 if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
1587 "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
1588 iter.pos.inode, iter.pos.offset, u.gen,
1589 u.stripe, s.k->p.offset)) {
1594 u.stripe = s.k->p.offset;
1595 u.stripe_redundancy = s.v->nr_redundant;
1598 u.stripe_redundancy = 0;
1601 bch2_alloc_pack(c, a, u);
1602 bch2_trans_update(trans, &iter, &a->k, 0);
1604 bch2_trans_iter_exit(trans, &iter);
1608 static int bch2_trans_mark_stripe(struct btree_trans *trans,
1609 struct bkey_s_c old, struct bkey_s_c new,
1612 struct bkey_s_c_stripe old_s = { .k = NULL };
1613 struct bkey_s_c_stripe new_s = { .k = NULL };
1614 struct bch_replicas_padded r;
1618 if (old.k->type == KEY_TYPE_stripe)
1619 old_s = bkey_s_c_to_stripe(old);
1620 if (new.k->type == KEY_TYPE_stripe)
1621 new_s = bkey_s_c_to_stripe(new);
1624 * If the pointers aren't changing, we don't need to do anything:
1626 if (new_s.k && old_s.k &&
1627 new_s.v->nr_blocks == old_s.v->nr_blocks &&
1628 new_s.v->nr_redundant == old_s.v->nr_redundant &&
1629 !memcmp(old_s.v->ptrs, new_s.v->ptrs,
1630 new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
1634 s64 sectors = le16_to_cpu(new_s.v->sectors);
1636 bch2_bkey_to_replicas(&r.e, new);
1637 update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
1639 for (i = 0; i < new_s.v->nr_blocks; i++) {
1640 ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
1648 s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
1650 bch2_bkey_to_replicas(&r.e, old);
1651 update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
1653 for (i = 0; i < old_s.v->nr_blocks; i++) {
1654 ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
1664 static int bch2_trans_mark_inode(struct btree_trans *trans,
1665 struct bkey_s_c old,
1666 struct bkey_s_c new,
1669 int nr = (new.k->type == KEY_TYPE_inode) -
1670 (old.k->type == KEY_TYPE_inode);
1673 struct replicas_delta_list *d =
1674 replicas_deltas_realloc(trans, 0);
1681 static int bch2_trans_mark_reservation(struct btree_trans *trans,
1682 struct bkey_s_c k, unsigned flags)
1684 unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
1685 s64 sectors = (s64) k.k->size;
1686 struct replicas_delta_list *d;
1688 BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
1689 (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
1691 if (flags & BTREE_TRIGGER_OVERWRITE)
1693 sectors *= replicas;
1695 d = replicas_deltas_realloc(trans, 0);
1697 replicas = clamp_t(unsigned, replicas, 1,
1698 ARRAY_SIZE(d->persistent_reserved));
1700 d->persistent_reserved[replicas - 1] += sectors;
1704 static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
1705 struct bkey_s_c_reflink_p p,
1706 u64 *idx, unsigned flags)
1708 struct bch_fs *c = trans->c;
1709 struct btree_iter iter;
1713 int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
1717 bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx),
1719 BTREE_ITER_WITH_UPDATES);
1720 k = bch2_btree_iter_peek_slot(&iter);
1725 n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
1726 ret = PTR_ERR_OR_ZERO(n);
1730 bkey_reassemble(n, k);
1732 refcount = bkey_refcount(n);
1734 bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
1735 bch2_fs_inconsistent(c,
1736 "nonexistent indirect extent at %llu while marking\n %s",
1742 if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
1743 bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
1744 bch2_fs_inconsistent(c,
1745 "indirect extent refcount underflow at %llu while marking\n %s",
1751 if (flags & BTREE_TRIGGER_INSERT) {
1752 struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
1755 pad = max_t(s64, le32_to_cpu(v->front_pad),
1756 le64_to_cpu(v->idx) - bkey_start_offset(k.k));
1757 BUG_ON(pad > U32_MAX);
1758 v->front_pad = cpu_to_le32(pad);
1760 pad = max_t(s64, le32_to_cpu(v->back_pad),
1761 k.k->p.offset - p.k->size - le64_to_cpu(v->idx));
1762 BUG_ON(pad > U32_MAX);
1763 v->back_pad = cpu_to_le32(pad);
1766 le64_add_cpu(refcount, add);
1769 n->k.type = KEY_TYPE_deleted;
1770 set_bkey_val_u64s(&n->k, 0);
1773 bch2_btree_iter_set_pos_to_extent_start(&iter);
1774 ret = bch2_trans_update(trans, &iter, n, 0);
1778 *idx = k.k->p.offset;
1780 bch2_trans_iter_exit(trans, &iter);
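/*
 * Worked example of the front_pad/back_pad adjustment above, with assumed
 * values: the reflink pointer references [idx = 100, idx + size) with
 * size == 8, but the indirect extent found at the iterator spans [96, 112).
 * Then:
 *
 *	front_pad = max(front_pad, 100 - 96)      = 4
 *	back_pad  = max(back_pad,  112 - 8 - 100) = 4
 *
 * i.e. the pointer records how far the indirect extents it references extend
 * past it on either side, which is what the idx/end_idx computations in
 * bch2_mark_reflink_p() and bch2_trans_mark_reflink_p() rely on when the
 * refcounts are later dropped.
 */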
1784 static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
1785 struct bkey_s_c k, unsigned flags)
1787 struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
1791 if (flags & BTREE_TRIGGER_INSERT) {
1792 struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
1794 v->front_pad = v->back_pad = 0;
1797 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
1798 end_idx = le64_to_cpu(p.v->idx) + p.k->size +
1799 le32_to_cpu(p.v->back_pad);
1801 while (idx < end_idx && !ret)
1802 ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
1807 int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
1808 struct bkey_s_c new, unsigned flags)
1810 struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
1812 BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
1814 switch (k.k->type) {
1815 case KEY_TYPE_btree_ptr:
1816 case KEY_TYPE_btree_ptr_v2:
1817 case KEY_TYPE_extent:
1818 case KEY_TYPE_reflink_v:
1819 return bch2_trans_mark_extent(trans, k, flags);
1820 case KEY_TYPE_stripe:
1821 return bch2_trans_mark_stripe(trans, old, new, flags);
1822 case KEY_TYPE_inode:
1823 return bch2_trans_mark_inode(trans, old, new, flags);
1824 case KEY_TYPE_reservation:
1825 return bch2_trans_mark_reservation(trans, k, flags);
1826 case KEY_TYPE_reflink_p:
1827 return bch2_trans_mark_reflink_p(trans, k, flags);
1833 int bch2_trans_mark_update(struct btree_trans *trans,
1834 struct btree_path *path,
1838 struct bkey _deleted = KEY(0, 0, 0);
1839 struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
1840 struct bkey_s_c old;
1841 struct bkey unpacked;
1844 if (unlikely(flags & BTREE_TRIGGER_NORUN))
1847 if (!btree_node_type_needs_gc(path->btree_id))
1850 old = bch2_btree_path_peek_slot(path, &unpacked);
1852 if (old.k->type == new->k.type &&
1853 ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
1854 ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
1855 BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
1857 ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new),
1858 BTREE_TRIGGER_INSERT|flags) ?:
1859 bch2_trans_mark_key(trans, old, deleted,
1860 BTREE_TRIGGER_OVERWRITE|flags);
1866 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
1867 struct bch_dev *ca, size_t b,
1868 enum bch_data_type type,
1871 struct bch_fs *c = trans->c;
1872 struct btree_iter iter;
1873 struct bkey_alloc_unpacked u;
1874 struct bkey_alloc_buf *a;
1875 struct bch_extent_ptr ptr = {
1877 .offset = bucket_to_sector(ca, b),
1882 * Backup superblock might be past the end of our normal usable space:
1884 if (b >= ca->mi.nbuckets)
1887 a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
1891 if (u.data_type && u.data_type != type) {
1892 bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
1893 "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
1895 iter.pos.inode, iter.pos.offset, u.gen,
1896 bch2_data_types[u.data_type],
1897 bch2_data_types[type],
1898 bch2_data_types[type]);
1904 u.dirty_sectors = sectors;
1906 bch2_alloc_pack(c, a, u);
1907 bch2_trans_update(trans, &iter, &a->k, 0);
1909 bch2_trans_iter_exit(trans, &iter);
1913 int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
1914 struct bch_dev *ca, size_t b,
1915 enum bch_data_type type,
1918 return __bch2_trans_do(trans, NULL, NULL, 0,
1919 __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
1922 static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
1925 enum bch_data_type type,
1926 u64 *bucket, unsigned *bucket_sectors)
1929 u64 b = sector_to_bucket(ca, start);
1931 min_t(u64, bucket_to_sector(ca, b + 1), end) - start;
1933 if (b != *bucket && *bucket_sectors) {
1934 int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
1935 type, *bucket_sectors);
1939 *bucket_sectors = 0;
1943 *bucket_sectors += sectors;
1945 } while (start < end);
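/*
 * Worked example of the loop above, assuming bucket_size == 1024 sectors and
 * a metadata range of sectors [1000, 3000):
 *
 *	bucket 0: sectors 1000..1023 -> 24 sectors accumulated
 *	bucket 1: sectors 1024..2047 -> the 24 are flushed via
 *	          bch2_trans_mark_metadata_bucket(), then 1024 accumulated
 *	bucket 2: sectors 2048..2999 -> 1024 flushed, then 952 accumulated
 *
 * The final partial bucket is flushed by the caller -- see the bucket_sectors
 * check in __bch2_trans_mark_dev_sb() below.
 */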
1950 static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
1953 struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
1955 unsigned i, bucket_sectors = 0;
1958 for (i = 0; i < layout->nr_superblocks; i++) {
1959 u64 offset = le64_to_cpu(layout->sb_offset[i]);
1961 if (offset == BCH_SB_SECTOR) {
1962 ret = bch2_trans_mark_metadata_sectors(trans, ca,
1964 BCH_DATA_sb, &bucket, &bucket_sectors);
1969 ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
1970 offset + (1 << layout->sb_max_size_bits),
1971 BCH_DATA_sb, &bucket, &bucket_sectors);
1976 if (bucket_sectors) {
1977 ret = bch2_trans_mark_metadata_bucket(trans, ca,
1978 bucket, BCH_DATA_sb, bucket_sectors);
1983 for (i = 0; i < ca->journal.nr; i++) {
1984 ret = bch2_trans_mark_metadata_bucket(trans, ca,
1985 ca->journal.buckets[i],
1986 BCH_DATA_journal, ca->mi.bucket_size);
1994 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
1996 return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
1997 __bch2_trans_mark_dev_sb(&trans, ca));
2000 /* Disk reservations: */
2002 #define SECTORS_CACHE 1024
2004 int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
2005 u64 sectors, int flags)
2007 struct bch_fs_pcpu *pcpu;
2009 s64 sectors_available;
2012 percpu_down_read(&c->mark_lock);
2014 pcpu = this_cpu_ptr(c->pcpu);
2016 if (sectors <= pcpu->sectors_available)
2019 v = atomic64_read(&c->sectors_available);
2022 get = min((u64) sectors + SECTORS_CACHE, old);
2024 if (get < sectors) {
2028 } while ((v = atomic64_cmpxchg(&c->sectors_available,
2029 old, old - get)) != old);
2031 pcpu->sectors_available += get;
2034 pcpu->sectors_available -= sectors;
2035 this_cpu_add(*c->online_reserved, sectors);
2036 res->sectors += sectors;
2039 percpu_up_read(&c->mark_lock);
2043 mutex_lock(&c->sectors_available_lock);
2045 percpu_u64_set(&c->pcpu->sectors_available, 0);
2046 sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
2048 if (sectors <= sectors_available ||
2049 (flags & BCH_DISK_RESERVATION_NOFAIL)) {
2050 atomic64_set(&c->sectors_available,
2051 max_t(s64, 0, sectors_available - sectors));
2052 this_cpu_add(*c->online_reserved, sectors);
2053 res->sectors += sectors;
2056 atomic64_set(&c->sectors_available, sectors_available);
2060 mutex_unlock(&c->sectors_available_lock);
2061 percpu_up_read(&c->mark_lock);
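/*
 * bch2_disk_reservation_add() has a per-cpu fast path: each cpu caches up to
 * SECTORS_CACHE (1024) sectors taken from the global c->sectors_available
 * counter, so most reservations only touch pcpu->sectors_available.  When the
 * cache runs dry it refills via the cmpxchg loop, and only as a last resort
 * takes sectors_available_lock and recomputes from actual usage.  Rough shape
 * of the fast path (label name assumed, details elided above):
 *
 *	if (sectors <= pcpu->sectors_available)
 *		goto out;
 *	... refill pcpu->sectors_available from c->sectors_available ...
 * out:
 *	pcpu->sectors_available -= sectors;
 *	this_cpu_add(*c->online_reserved, sectors);
 *	res->sectors += sectors;
 */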
2066 /* Startup/shutdown: */
2068 static void buckets_free_rcu(struct rcu_head *rcu)
2070 struct bucket_array *buckets =
2071 container_of(rcu, struct bucket_array, rcu);
2074 sizeof(struct bucket_array) +
2075 buckets->nbuckets * sizeof(struct bucket));
2078 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
2080 struct bucket_array *buckets = NULL, *old_buckets = NULL;
2081 unsigned long *buckets_nouse = NULL;
2082 alloc_fifo free[RESERVE_NR];
2083 alloc_fifo free_inc;
2084 alloc_heap alloc_heap;
2086 size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
2087 ca->mi.bucket_size / c->opts.btree_node_size);
2088 /* XXX: these should be tunable */
2089 size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
2090 size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
2091 size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
2093 bool resize = ca->buckets[0] != NULL;
2097 memset(&free, 0, sizeof(free));
2098 memset(&free_inc, 0, sizeof(free_inc));
2099 memset(&alloc_heap, 0, sizeof(alloc_heap));
2101 if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
2102 nbuckets * sizeof(struct bucket),
2103 GFP_KERNEL|__GFP_ZERO)) ||
2104 !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
2105 sizeof(unsigned long),
2106 GFP_KERNEL|__GFP_ZERO)) ||
2107 !init_fifo(&free[RESERVE_MOVINGGC],
2108 copygc_reserve, GFP_KERNEL) ||
2109 !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
2110 !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
2111 !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
2114 buckets->first_bucket = ca->mi.first_bucket;
2115 buckets->nbuckets = nbuckets;
2117 bch2_copygc_stop(c);
2120 down_write(&c->gc_lock);
2121 down_write(&ca->bucket_lock);
2122 percpu_down_write(&c->mark_lock);
2125 old_buckets = bucket_array(ca);
2128 size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
2132 n * sizeof(struct bucket));
2133 memcpy(buckets_nouse,
2135 BITS_TO_LONGS(n) * sizeof(unsigned long));
2138 rcu_assign_pointer(ca->buckets[0], buckets);
2139 buckets = old_buckets;
2141 swap(ca->buckets_nouse, buckets_nouse);
2144 percpu_up_write(&c->mark_lock);
2145 up_write(&c->gc_lock);
2148 spin_lock(&c->freelist_lock);
2149 for (i = 0; i < RESERVE_NR; i++) {
2150 fifo_move(&free[i], &ca->free[i]);
2151 swap(ca->free[i], free[i]);
2153 fifo_move(&free_inc, &ca->free_inc);
2154 swap(ca->free_inc, free_inc);
2155 spin_unlock(&c->freelist_lock);
2157 /* with gc lock held, alloc_heap can't be in use: */
2158 swap(ca->alloc_heap, alloc_heap);
2160 nbuckets = ca->mi.nbuckets;
2163 up_write(&ca->bucket_lock);
2167 free_heap(&alloc_heap);
2168 free_fifo(&free_inc);
2169 for (i = 0; i < RESERVE_NR; i++)
2170 free_fifo(&free[i]);
2171 kvpfree(buckets_nouse,
2172 BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
2174 call_rcu(&old_buckets->rcu, buckets_free_rcu);
2179 void bch2_dev_buckets_free(struct bch_dev *ca)
2183 free_heap(&ca->alloc_heap);
2184 free_fifo(&ca->free_inc);
2185 for (i = 0; i < RESERVE_NR; i++)
2186 free_fifo(&ca->free[i]);
2187 kvpfree(ca->buckets_nouse,
2188 BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
2189 kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
2190 sizeof(struct bucket_array) +
2191 ca->mi.nbuckets * sizeof(struct bucket));
2193 for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
2194 free_percpu(ca->usage[i]);
2195 kfree(ca->usage_base);
2198 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
2202 ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
2203 if (!ca->usage_base)
2206 for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
2207 ca->usage[i] = alloc_percpu(struct bch_dev_usage);
2212 return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);