// SPDX-License-Identifier: GPL-2.0
/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "ec.h"
#include "error.h"
#include "movinggc.h"
#include "recovery.h"
#include "reflink.h"
#include "replicas.h"
#include "subvolume.h"

#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
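
/*
 * Overview (added commentary, not from the original file header):
 *
 * Disk usage is tracked at two levels: per-device bucket marks
 * (struct bucket_mark, updated with cmpxchg loops) and filesystem-wide
 * percpu counters (struct bch_fs_usage), both guarded by c->mark_lock and
 * read consistently via the c->usage_lock seqcount.
 *
 * Keys are accounted by two families of triggers: bch2_mark_*() updates the
 * in-memory counters directly, while bch2_trans_mark_*() accumulates
 * replicas deltas and alloc-key updates inside a btree transaction so that
 * they commit atomically with the keys themselves.
 */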
static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
					      enum bch_data_type data_type,
					      s64 sectors)
{
	switch (data_type) {
	case BCH_DATA_btree:
		fs_usage->btree += sectors;
		break;
	case BCH_DATA_user:
	case BCH_DATA_parity:
		fs_usage->data += sectors;
		break;
	case BCH_DATA_cached:
		fs_usage->cached += sectors;
		break;
	default:
		break;
	}
}
/*
 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
 * wraparound:
 */
void bch2_bucket_seq_cleanup(struct bch_fs *c)
{
	u64 journal_seq = atomic64_read(&c->journal.seq);
	u16 last_seq_ondisk = c->journal.last_seq_ondisk;
	struct bch_dev *ca;
	struct bucket_array *buckets;
	struct bucket *g;
	struct bucket_mark m;
	unsigned i;

	if (journal_seq - c->last_bucket_seq_cleanup <
	    (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
		return;

	c->last_bucket_seq_cleanup = journal_seq;

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets)
			bucket_cmpxchg(g, m, ({
				if (!m.journal_seq_valid ||
				    bucket_needs_journal_commit(m, last_seq_ondisk))
					break;

				m.journal_seq_valid = 0;
			}));
		up_read(&ca->bucket_lock);
	}
}
void bch2_fs_usage_initialize(struct bch_fs *c)
{
	struct bch_fs_usage *usage;
	struct bch_dev *ca;
	unsigned i;

	percpu_down_write(&c->mark_lock);
	usage = c->usage_base;

	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		bch2_fs_usage_acc_to_base(c, i);

	for (i = 0; i < BCH_REPLICAS_MAX; i++)
		usage->reserved += usage->persistent_reserved[i];

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
	}

	for_each_member_device(ca, c, i) {
		struct bch_dev_usage dev = bch2_dev_usage_read(ca);

		usage->hidden += (dev.d[BCH_DATA_sb].buckets +
				  dev.d[BCH_DATA_journal].buckets) *
			ca->mi.bucket_size;
	}

	percpu_up_write(&c->mark_lock);
}
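
/*
 * Usage counters are kept in several percpu copies: one set per journal
 * write buffer (indexed by journal_seq & JOURNAL_BUF_MASK, so counters can
 * be flushed along with the journal write they belong to) plus a separate
 * set used by gc. The helpers below select the copy for a given journal_seq
 * or for gc; readers sum usage_base plus all percpu copies under the
 * usage_lock seqcount.
 */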
static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
						  unsigned journal_seq,
						  bool gc)
{
	return this_cpu_ptr(gc
			    ? ca->usage_gc
			    : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}

struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;
	struct bch_dev_usage ret;
	unsigned seq, i, u64s = dev_usage_u64s();

	do {
		seq = read_seqcount_begin(&c->usage_lock);
		memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
		for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
			acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
	} while (read_seqcount_retry(&c->usage_lock, seq));

	return ret;
}

static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
						unsigned journal_seq,
						bool gc)
{
	return this_cpu_ptr(gc
			    ? c->usage_gc
			    : c->usage[journal_seq & JOURNAL_BUF_MASK]);
}

u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
{
	ssize_t offset = v - (u64 *) c->usage_base;
	unsigned i, seq;
	u64 ret;

	BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
	percpu_rwsem_assert_held(&c->mark_lock);

	do {
		seq = read_seqcount_begin(&c->usage_lock);
		ret = *v;

		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
			ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
	} while (read_seqcount_retry(&c->usage_lock, seq));

	return ret;
}
struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
{
	struct bch_fs_usage_online *ret;
	unsigned seq, i, u64s;

	percpu_down_read(&c->mark_lock);

	ret = kmalloc(sizeof(struct bch_fs_usage_online) +
		      sizeof(u64) * c->replicas.nr, GFP_NOFS);
	if (unlikely(!ret)) {
		percpu_up_read(&c->mark_lock);
		return NULL;
	}

	ret->online_reserved = percpu_u64_get(c->online_reserved);

	u64s = fs_usage_u64s(c);
	do {
		seq = read_seqcount_begin(&c->usage_lock);
		memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
		for (i = 0; i < ARRAY_SIZE(c->usage); i++)
			acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
	} while (read_seqcount_retry(&c->usage_lock, seq));

	return ret;
}

void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
	struct bch_dev *ca;
	unsigned i, u64s = fs_usage_u64s(c);

	BUG_ON(idx >= ARRAY_SIZE(c->usage));

	preempt_disable();
	write_seqcount_begin(&c->usage_lock);

	acc_u64s_percpu((u64 *) c->usage_base,
			(u64 __percpu *) c->usage[idx], u64s);
	percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));

	rcu_read_lock();
	for_each_member_device_rcu(ca, c, i, NULL) {
		u64s = dev_usage_u64s();

		acc_u64s_percpu((u64 *) ca->usage_base,
				(u64 __percpu *) ca->usage[idx], u64s);
		percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
	}
	rcu_read_unlock();

	write_seqcount_end(&c->usage_lock);
	preempt_enable();
}
void bch2_fs_usage_to_text(struct printbuf *out,
			   struct bch_fs *c,
			   struct bch_fs_usage_online *fs_usage)
{
	unsigned i;

	pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);

	pr_buf(out, "hidden:\t\t\t\t%llu\n",
	       fs_usage->u.hidden);
	pr_buf(out, "data:\t\t\t\t%llu\n",
	       fs_usage->u.data);
	pr_buf(out, "cached:\t\t\t\t%llu\n",
	       fs_usage->u.cached);
	pr_buf(out, "reserved:\t\t\t%llu\n",
	       fs_usage->u.reserved);
	pr_buf(out, "nr_inodes:\t\t\t%llu\n",
	       fs_usage->u.nr_inodes);
	pr_buf(out, "online reserved:\t\t%llu\n",
	       fs_usage->online_reserved);

	for (i = 0;
	     i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
	     i++) {
		pr_buf(out, "%u replicas:\n", i + 1);
		pr_buf(out, "\treserved:\t\t%llu\n",
		       fs_usage->u.persistent_reserved[i]);
	}

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		pr_buf(out, "\t");
		bch2_replicas_entry_to_text(out, e);
		pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
	}
}
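
/*
 * reserve_factor() pads a reservation by 1/(2^RESERVE_FACTOR) of its size,
 * rounded up, so "sectors used" deliberately overestimates slightly rather
 * than letting us run out of space: e.g. if RESERVE_FACTOR is 6, a
 * 64-sector reservation is reported as 64 + (round_up(64, 64) >> 6) = 65.
 */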
static u64 reserve_factor(u64 r)
{
	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}

u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
	return min(fs_usage->u.hidden +
		   fs_usage->u.btree +
		   fs_usage->u.data +
		   reserve_factor(fs_usage->u.reserved +
				  fs_usage->online_reserved),
		   c->capacity);
}

static struct bch_fs_usage_short
__bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;
	u64 data, reserved;

	ret.capacity = c->capacity -
		bch2_fs_usage_read_one(c, &c->usage_base->hidden);

	data		= bch2_fs_usage_read_one(c, &c->usage_base->data) +
		bch2_fs_usage_read_one(c, &c->usage_base->btree);
	reserved	= bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
		percpu_u64_get(c->online_reserved);

	ret.used	= min(ret.capacity, data + reserve_factor(reserved));
	ret.free	= ret.capacity - ret.used;

	ret.nr_inodes	= bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);

	return ret;
}

struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;

	percpu_down_read(&c->mark_lock);
	ret = __bch2_fs_usage_read_short(c);
	percpu_up_read(&c->mark_lock);

	return ret;
}
static inline int is_unavailable_bucket(struct bucket_mark m)
{
	return !is_available_bucket(m);
}

static inline int bucket_sectors_fragmented(struct bch_dev *ca,
					    struct bucket_mark m)
{
	return bucket_sectors_used(m)
		? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
		: 0;
}

static inline int is_stripe_data_bucket(struct bucket_mark m)
{
	return m.stripe && m.data_type != BCH_DATA_parity;
}

static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
	return m.cached_sectors && !m.dirty_sectors
		? BCH_DATA_cached
		: m.data_type;
}

static bool bucket_became_unavailable(struct bucket_mark old,
				      struct bucket_mark new)
{
	return is_available_bucket(old) &&
	       !is_available_bucket(new);
}

static inline void account_bucket(struct bch_fs_usage *fs_usage,
				  struct bch_dev_usage *dev_usage,
				  enum bch_data_type type,
				  int nr, s64 size)
{
	if (type == BCH_DATA_sb || type == BCH_DATA_journal)
		fs_usage->hidden	+= size;

	dev_usage->d[type].buckets	+= nr;
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
				  struct bucket_mark old, struct bucket_mark new,
				  u64 journal_seq, bool gc)
{
	struct bch_fs_usage *fs_usage;
	struct bch_dev_usage *u;

	percpu_rwsem_assert_held(&c->mark_lock);

	preempt_disable();
	fs_usage = fs_usage_ptr(c, journal_seq, gc);
	u = dev_usage_ptr(ca, journal_seq, gc);

	if (bucket_type(old))
		account_bucket(fs_usage, u, bucket_type(old),
			       -1, -ca->mi.bucket_size);

	if (bucket_type(new))
		account_bucket(fs_usage, u, bucket_type(new),
			       1, ca->mi.bucket_size);

	u->buckets_ec += (int) new.stripe - (int) old.stripe;
	u->buckets_unavailable +=
		is_unavailable_bucket(new) - is_unavailable_bucket(old);

	u->d[old.data_type].sectors -= old.dirty_sectors;
	u->d[new.data_type].sectors += new.dirty_sectors;
	u->d[BCH_DATA_cached].sectors +=
		(int) new.cached_sectors - (int) old.cached_sectors;

	u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
	u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);

	preempt_enable();

	if (!is_available_bucket(old) && is_available_bucket(new))
		bch2_wake_allocator(ca);
}
static inline int __update_replicas(struct bch_fs *c,
				    struct bch_fs_usage *fs_usage,
				    struct bch_replicas_entry *r,
				    s64 sectors)
{
	int idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0)
		return -1;

	fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
	fs_usage->replicas[idx] += sectors;
	return 0;
}

static inline int update_replicas(struct bch_fs *c,
				  struct bch_replicas_entry *r, s64 sectors,
				  unsigned journal_seq, bool gc)
{
	struct bch_fs_usage __percpu *fs_usage;
	int idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0)
		return -1;

	preempt_disable();
	fs_usage = fs_usage_ptr(c, journal_seq, gc);
	fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
	fs_usage->replicas[idx] += sectors;
	preempt_enable();
	return 0;
}

static inline int update_cached_sectors(struct bch_fs *c,
					unsigned dev, s64 sectors,
					unsigned journal_seq, bool gc)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	return update_replicas(c, &r.e, sectors, journal_seq, gc);
}
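
/*
 * Transactional triggers don't update counters directly; they queue up
 * deltas in trans->fs_usage_deltas, which bch2_trans_fs_usage_apply()
 * flushes at commit time. The list grows by doubling, falling back to a
 * maximum-size mempool allocation if krealloc fails.
 */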
static struct replicas_delta_list *
replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
{
	struct replicas_delta_list *d = trans->fs_usage_deltas;
	unsigned new_size = d ? (d->size + more) * 2 : 128;
	unsigned alloc_size = sizeof(*d) + new_size;

	WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);

	if (!d || d->used + more > d->size) {
		d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO);

		BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX);

		if (!d) {
			d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO);
			memset(d, 0, REPLICAS_DELTA_LIST_MAX);

			if (trans->fs_usage_deltas)
				memcpy(d, trans->fs_usage_deltas,
				       trans->fs_usage_deltas->size + sizeof(*d));

			new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
			kfree(trans->fs_usage_deltas);
		}

		d->size = new_size;
		trans->fs_usage_deltas = d;
	}
	return d;
}

static inline void update_replicas_list(struct btree_trans *trans,
					struct bch_replicas_entry *r,
					s64 sectors)
{
	struct replicas_delta_list *d;
	struct replicas_delta *n;
	unsigned b;

	if (!sectors)
		return;

	b = replicas_entry_bytes(r) + 8;
	d = replicas_deltas_realloc(trans, b);

	n = (void *) d->d + d->used;
	n->delta = sectors;
	memcpy(&n->r, r, replicas_entry_bytes(r));
	bch2_replicas_entry_sort(&n->r);
	d->used += b;
}

static inline void update_cached_sectors_list(struct btree_trans *trans,
					      unsigned dev, s64 sectors)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	update_replicas_list(trans, &r.e, sectors);
}
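
/*
 * While gc is running, a key may need to be marked in both the primary set
 * of counters and gc's set; do_mark_fn() runs the marking function for
 * whichever of the two apply (the gc copy only once gc has visited this
 * position).
 */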
#define do_mark_fn(fn, c, pos, flags, ...)				\
({									\
	int gc, ret = 0;						\
									\
	percpu_rwsem_assert_held(&c->mark_lock);			\
									\
	for (gc = 0; gc < 2 && !ret; gc++)				\
		if (!gc == !(flags & BTREE_TRIGGER_GC) ||		\
		    (gc && gc_visited(c, pos)))				\
			ret = fn(c, __VA_ARGS__, gc);			\
									\
	ret;								\
})

void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, bool owned_by_allocator)
{
	struct bucket *g = bucket(ca, b);
	struct bucket_mark old, new;

	old = bucket_cmpxchg(g, new, ({
		new.owned_by_allocator = owned_by_allocator;
	}));

	BUG_ON(owned_by_allocator == old.owned_by_allocator);
}
static int bch2_mark_alloc(struct bch_fs *c,
			   struct bkey_s_c old, struct bkey_s_c new,
			   u64 journal_seq, unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	struct bkey_alloc_unpacked u;
	struct bch_dev *ca;
	struct bucket *g;
	struct bucket_mark old_m, m;

	/* We don't do anything for deletions - do we?: */
	if (new.k->type != KEY_TYPE_alloc &&
	    new.k->type != KEY_TYPE_alloc_v2)
		return 0;

	/*
	 * alloc btree is read in by bch2_alloc_read, not gc:
	 */
	if ((flags & BTREE_TRIGGER_GC) &&
	    !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
		return 0;

	ca = bch_dev_bkey_exists(c, new.k->p.inode);

	if (new.k->p.offset >= ca->mi.nbuckets)
		return 0;

	g = __bucket(ca, new.k->p.offset, gc);
	u = bch2_alloc_unpack(new);

	old_m = bucket_cmpxchg(g, m, ({
		m.gen			= u.gen;
		m.data_type		= u.data_type;
		m.dirty_sectors		= u.dirty_sectors;
		m.cached_sectors	= u.cached_sectors;
		m.stripe		= u.stripe != 0;

		if (journal_seq) {
			m.journal_seq_valid	= 1;
			m.journal_seq		= journal_seq;
		}
	}));

	bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);

	g->io_time[READ]	= u.read_time;
	g->io_time[WRITE]	= u.write_time;
	g->oldest_gen		= u.oldest_gen;
	g->gen_valid		= 1;
	g->stripe		= u.stripe;
	g->stripe_redundancy	= u.stripe_redundancy;

	/*
	 * need to know if we're getting called from the invalidate path or
	 * not:
	 */

	if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
	    old_m.cached_sectors) {
		if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors,
					  journal_seq, gc)) {
			bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
			return -1;
		}

		trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
				 old_m.cached_sectors);
	}

	return 0;
}
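
/*
 * checked_add() is a saturating add: the result is clamped to U16_MAX, and
 * the macro evaluates to whether the addition overflowed, so callers can
 * report sector-count overflow instead of silently wrapping.
 */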
#define checked_add(a, b)					\
({								\
	unsigned _res = (unsigned) (a) + (b);			\
	bool overflow = _res > U16_MAX;				\
	if (overflow)						\
		_res = U16_MAX;					\
	(a) = _res;						\
	overflow;						\
})

static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
				       size_t b, enum bch_data_type data_type,
				       unsigned sectors, bool gc)
{
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;
	bool overflow;

	BUG_ON(data_type != BCH_DATA_sb &&
	       data_type != BCH_DATA_journal);

	old = bucket_cmpxchg(g, new, ({
		new.data_type	= data_type;
		overflow = checked_add(new.dirty_sectors, sectors);
	}));

	bch2_fs_inconsistent_on(old.data_type &&
				old.data_type != data_type, c,
		"different types of data in same bucket: %s, %s",
		bch2_data_types[old.data_type],
		bch2_data_types[data_type]);

	bch2_fs_inconsistent_on(overflow, c,
		"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX",
		ca->dev_idx, b, new.gen,
		bch2_data_types[old.data_type ?: data_type],
		old.dirty_sectors, sectors);

	if (c)
		bch2_dev_usage_update(c, ca, old, new, 0, gc);

	return 0;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
			       size_t b, enum bch_data_type type,
			       unsigned sectors, struct gc_pos pos,
			       unsigned flags)
{
	BUG_ON(type != BCH_DATA_sb &&
	       type != BCH_DATA_journal);

	/*
	 * Backup superblock might be past the end of our normal usable space:
	 */
	if (b >= ca->mi.nbuckets)
		return;

	if (likely(c))
		do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
			   ca, b, type, sectors);
	else
		__bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
}

static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
{
	EBUG_ON(sectors < 0);

	return p.crc.compression_type &&
		p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
		? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
				   p.crc.uncompressed_size)
		: sectors;
}
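
/*
 * check_bucket_ref() is the common validation for marking a pointer into a
 * bucket: it fsck-errors on pointers whose generation is newer than the
 * bucket's, on stale dirty pointers, on mismatched data types, and on
 * sector-count overflow. A return of 1 (stale cached pointer) tells the
 * caller to skip accounting without treating it as an error.
 */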
static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
			    const struct bch_extent_ptr *ptr,
			    s64 sectors, enum bch_data_type ptr_data_type,
			    u8 bucket_gen, u8 bucket_data_type,
			    u16 dirty_sectors, u16 cached_sectors)
{
	size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
	u16 bucket_sectors = !ptr->cached
		? dirty_sectors
		: cached_sectors;
	char buf[200];

	if (gen_after(ptr->gen, bucket_gen)) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
			"while marking %s",
			ptr->dev, bucket_nr, bucket_gen,
			bch2_data_types[bucket_data_type ?: ptr_data_type],
			ptr->gen,
			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
		return -EIO;
	}

	if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
			"while marking %s",
			ptr->dev, bucket_nr, bucket_gen,
			bch2_data_types[bucket_data_type ?: ptr_data_type],
			ptr->gen,
			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
		return -EIO;
	}

	if (bucket_gen != ptr->gen && !ptr->cached) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
			"while marking %s",
			ptr->dev, bucket_nr, bucket_gen,
			bch2_data_types[bucket_data_type ?: ptr_data_type],
			ptr->gen,
			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
		return -EIO;
	}

	if (bucket_gen != ptr->gen)
		return 1;

	if (bucket_data_type && ptr_data_type &&
	    bucket_data_type != ptr_data_type) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
			"while marking %s",
			ptr->dev, bucket_nr, bucket_gen,
			bch2_data_types[bucket_data_type],
			bch2_data_types[ptr_data_type],
			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
		return -EIO;
	}

	if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
			"while marking %s",
			ptr->dev, bucket_nr, bucket_gen,
			bch2_data_types[bucket_data_type ?: ptr_data_type],
			bucket_sectors, sectors,
			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
		return -EIO;
	}

	return 0;
}
static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
			      unsigned ptr_idx,
			      u64 journal_seq, unsigned flags)
{
	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
	unsigned nr_data = s->nr_blocks - s->nr_redundant;
	bool parity = ptr_idx >= nr_data;
	const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
	bool gc = flags & BTREE_TRIGGER_GC;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	struct bucket *g = PTR_BUCKET(ca, ptr, gc);
	struct bucket_mark new, old;
	char buf[200];
	int ret = 0;

	if (g->stripe && g->stripe != k.k->p.offset) {
		bch2_fs_inconsistent(c,
			      "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
			      ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
			      (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
		return -EINVAL;
	}

	old = bucket_cmpxchg(g, new, ({
		ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
				       new.dirty_sectors, new.cached_sectors);
		if (ret)
			return ret;

		if (parity) {
			new.data_type		= BCH_DATA_parity;
			new.dirty_sectors	= le16_to_cpu(s->sectors);
		}

		if (journal_seq) {
			new.journal_seq_valid	= 1;
			new.journal_seq		= journal_seq;
		}
	}));

	g->stripe		= k.k->p.offset;
	g->stripe_redundancy	= s->nr_redundant;

	bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
	return 0;
}
static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
			  const struct bch_extent_ptr *ptr,
			  s64 sectors, enum bch_data_type ptr_data_type,
			  u8 bucket_gen, u8 *bucket_data_type,
			  u16 *dirty_sectors, u16 *cached_sectors)
{
	u16 *dst_sectors = !ptr->cached
		? dirty_sectors
		: cached_sectors;
	int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
				   bucket_gen, *bucket_data_type,
				   *dirty_sectors, *cached_sectors);

	if (ret)
		return ret;

	*dst_sectors += sectors;
	*bucket_data_type = *dirty_sectors || *cached_sectors
		? ptr_data_type : 0;
	return 0;
}

static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
			     struct extent_ptr_decoded p,
			     s64 sectors, enum bch_data_type data_type,
			     u64 journal_seq, unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	struct bucket_mark old, new;
	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
	struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
	u8 bucket_data_type;
	u64 v;
	int ret;

	v = atomic64_read(&g->_mark.v);
	do {
		new.v.counter = old.v.counter = v;
		bucket_data_type = new.data_type;

		ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
				     &bucket_data_type,
				     &new.dirty_sectors,
				     &new.cached_sectors);
		if (ret)
			return ret;

		new.data_type = bucket_data_type;

		if (journal_seq) {
			new.journal_seq_valid = 1;
			new.journal_seq = journal_seq;
		}

		if (flags & BTREE_TRIGGER_NOATOMIC) {
			g->_mark = new;
			break;
		}
	} while ((v = atomic64_cmpxchg(&g->_mark.v,
			      old.v.counter,
			      new.v.counter)) != old.v.counter);

	bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);

	BUG_ON(!gc && bucket_became_unavailable(old, new));

	return 0;
}
static int bch2_mark_stripe_ptr(struct bch_fs *c,
				struct bch_extent_stripe_ptr p,
				enum bch_data_type data_type,
				s64 sectors,
				unsigned journal_seq, unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	struct bch_replicas_padded r;
	struct stripe *m;
	unsigned i, blocks_nonempty = 0;

	m = genradix_ptr(&c->stripes[gc], p.idx);

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || !m->alive) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
				    (u64) p.idx);
		bch2_inconsistent_error(c);
		return -EIO;
	}

	m->block_sectors[p.block] += sectors;

	r = m->r;

	for (i = 0; i < m->nr_blocks; i++)
		blocks_nonempty += m->block_sectors[i] != 0;

	if (m->blocks_nonempty != blocks_nonempty) {
		m->blocks_nonempty = blocks_nonempty;
		if (!gc)
			bch2_stripes_heap_update(c, m, p.idx);
	}

	spin_unlock(&c->ec_stripes_heap_lock);

	r.e.data_type = data_type;
	update_replicas(c, &r.e, sectors, journal_seq, gc);

	return 0;
}
static int bch2_mark_extent(struct bch_fs *c,
			    struct bkey_s_c old, struct bkey_s_c new,
			    unsigned journal_seq, unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
		? BCH_DATA_btree
		: BCH_DATA_user;
	s64 sectors = bkey_is_btree_ptr(k.k)
		? c->opts.btree_node_size
		: k.k->size;
	s64 dirty_sectors = 0;
	bool stale;
	int ret;

	BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
	       (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));

	r.e.data_type	= data_type;
	r.e.nr_devs	= 0;
	r.e.nr_required	= 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors = ptr_disk_sectors(sectors, p);

		if (flags & BTREE_TRIGGER_OVERWRITE)
			disk_sectors = -disk_sectors;

		ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type,
					journal_seq, flags);
		if (ret < 0)
			return ret;

		stale = ret > 0;

		if (p.ptr.cached) {
			if (!stale &&
			    update_cached_sectors(c, p.ptr.dev, disk_sectors,
						  journal_seq, gc)) {
				bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
				return -1;
			}
		} else if (!p.has_ec) {
			dirty_sectors	       += disk_sectors;
			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
		} else {
			ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
					disk_sectors, journal_seq, flags);
			if (ret)
				return ret;

			/*
			 * There may be other dirty pointers in this extent, but
			 * if so they're not required for mounting if we have an
			 * erasure coded pointer in this extent:
			 */
			r.e.nr_required = 0;
		}
	}

	if (r.e.nr_devs) {
		if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) {
			char buf[200];

			bch2_bkey_val_to_text(&PBUF(buf), c, k);
			bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
			return -1;
		}
	}

	return 0;
}
static int bch2_mark_stripe(struct bch_fs *c,
			    struct bkey_s_c old, struct bkey_s_c new,
			    u64 journal_seq, unsigned flags)
{
	bool gc = flags & BTREE_TRIGGER_GC;
	size_t idx = new.k->p.offset;
	const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(old).v : NULL;
	const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
		? bkey_s_c_to_stripe(new).v : NULL;
	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
	unsigned i;
	int ret;

	BUG_ON(gc && old_s);

	if (!m || (old_s && !m->alive)) {
		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
				    idx);
		bch2_inconsistent_error(c);
		return -1;
	}

	if (!new_s) {
		spin_lock(&c->ec_stripes_heap_lock);
		bch2_stripes_heap_del(c, m, idx);
		spin_unlock(&c->ec_stripes_heap_lock);

		memset(m, 0, sizeof(*m));
	} else {
		m->alive	= true;
		m->sectors	= le16_to_cpu(new_s->sectors);
		m->algorithm	= new_s->algorithm;
		m->nr_blocks	= new_s->nr_blocks;
		m->nr_redundant	= new_s->nr_redundant;
		m->blocks_nonempty = 0;

		for (i = 0; i < new_s->nr_blocks; i++) {
			m->block_sectors[i] =
				stripe_blockcount_get(new_s, i);
			m->blocks_nonempty += !!m->block_sectors[i];

			m->ptrs[i] = new_s->ptrs[i];
		}

		bch2_bkey_to_replicas(&m->r.e, new);

		if (!gc) {
			spin_lock(&c->ec_stripes_heap_lock);
			bch2_stripes_heap_update(c, m, idx);
			spin_unlock(&c->ec_stripes_heap_lock);
		}
	}

	if (gc) {
		/*
		 * gc recalculates this field from stripe ptr
		 * references:
		 */
		memset(m->block_sectors, 0, sizeof(m->block_sectors));
		m->blocks_nonempty = 0;

		for (i = 0; i < new_s->nr_blocks; i++) {
			ret = mark_stripe_bucket(c, new, i, journal_seq, flags);
			if (ret)
				return ret;
		}

		if (update_replicas(c, &m->r.e,
				((s64) m->sectors * m->nr_redundant),
				journal_seq, gc)) {
			char buf[200];

			bch2_bkey_val_to_text(&PBUF(buf), c, new);
			bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
			return -1;
		}
	}

	return 0;
}
static int bch2_mark_inode(struct bch_fs *c,
			   struct bkey_s_c old, struct bkey_s_c new,
			   u64 journal_seq, unsigned flags)
{
	struct bch_fs_usage __percpu *fs_usage;

	preempt_disable();
	fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
	fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
	fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
	preempt_enable();

	return 0;
}

static int bch2_mark_reservation(struct bch_fs *c,
				 struct bkey_s_c old, struct bkey_s_c new,
				 u64 journal_seq, unsigned flags)
{
	struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
	struct bch_fs_usage __percpu *fs_usage;
	unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
	s64 sectors = (s64) k.k->size;

	if (flags & BTREE_TRIGGER_OVERWRITE)
		sectors = -sectors;
	sectors *= replicas;

	preempt_disable();
	fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
	replicas = clamp_t(unsigned, replicas, 1,
			   ARRAY_SIZE(fs_usage->persistent_reserved));

	fs_usage->reserved				+= sectors;
	fs_usage->persistent_reserved[replicas - 1]	+= sectors;
	preempt_enable();

	return 0;
}
static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
				 u64 idx, unsigned flags, size_t *r_idx)
{
	struct reflink_gc *r;
	int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
	s64 ret = 0;

	while (*r_idx < c->reflink_gc_nr) {
		r = genradix_ptr(&c->reflink_gc_table, *r_idx);
		BUG_ON(!r);

		if (idx < r->offset)
			break;
		(*r_idx)++;
	}

	if (*r_idx >= c->reflink_gc_nr ||
	    idx < r->offset - r->size) {
		ret = p.k->size;
		goto not_found;
	}

	BUG_ON((s64) r->refcount + add < 0);

	r->refcount += add;
	return r->offset - idx;
not_found:
	if ((flags & BTREE_TRIGGER_GC) &&
	    (flags & BTREE_TRIGGER_NOATOMIC)) {
		/*
		 * XXX: we're replacing the entire reflink pointer with an error
		 * key, we should just be replacing the part that was missing:
		 */
		if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
			     p.k->p.inode, p.k->p.offset, p.k->size, idx)) {
			struct bkey_i_error *new;

			new = kmalloc(sizeof(*new), GFP_KERNEL);
			if (!new) {
				bch_err(c, "%s: error allocating new key", __func__);
				return -ENOMEM;
			}

			bkey_init(&new->k);
			new->k.type	= KEY_TYPE_error;
			new->k.p	= p.k->p;
			new->k.size	= p.k->size;
			ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new->k_i);
		}
	} else {
		bch2_fs_inconsistent(c,
			"%llu:%llu len %u points to nonexistent indirect extent %llu",
			p.k->p.inode, p.k->p.offset, p.k->size, idx);
		bch2_inconsistent_error(c);
		ret = -EIO;
	}
fsck_err:
	return ret;
}

static int bch2_mark_reflink_p(struct bch_fs *c,
			       struct bkey_s_c old, struct bkey_s_c new,
			       u64 journal_seq, unsigned flags)
{
	struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
	struct reflink_gc *ref;
	size_t l, r, m;
	u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
	u64 sectors = (u64) le32_to_cpu(p.v->front_pad) +
		le32_to_cpu(p.v->back_pad) +
		p.k->size;
	s64 ret = 0;

	BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
	       (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));

	l = 0;
	r = c->reflink_gc_nr;
	while (l < r) {
		m = l + (r - l) / 2;

		ref = genradix_ptr(&c->reflink_gc_table, m);
		if (ref->offset <= idx)
			l = m + 1;
		else
			r = m;
	}

	while (sectors) {
		ret = __bch2_mark_reflink_p(c, p, idx, flags, &l);
		if (ret <= 0)
			return ret;

		ret = min_t(s64, ret, sectors);
		idx	+= ret;
		sectors -= ret;
	}

	return 0;
}
static int bch2_mark_key_locked(struct bch_fs *c,
				struct bkey_s_c old,
				struct bkey_s_c new,
				u64 journal_seq, unsigned flags)
{
	struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;

	BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));

	switch (k.k->type) {
	case KEY_TYPE_alloc:
	case KEY_TYPE_alloc_v2:
		return bch2_mark_alloc(c, old, new, journal_seq, flags);
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		return bch2_mark_extent(c, old, new, journal_seq, flags);
	case KEY_TYPE_stripe:
		return bch2_mark_stripe(c, old, new, journal_seq, flags);
	case KEY_TYPE_inode:
		return bch2_mark_inode(c, old, new, journal_seq, flags);
	case KEY_TYPE_reservation:
		return bch2_mark_reservation(c, old, new, journal_seq, flags);
	case KEY_TYPE_reflink_p:
		return bch2_mark_reflink_p(c, old, new, journal_seq, flags);
	case KEY_TYPE_snapshot:
		return bch2_mark_snapshot(c, old, new, journal_seq, flags);
	default:
		return 0;
	}
}

int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags)
{
	struct bkey deleted = KEY(0, 0, 0);
	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
	int ret;

	percpu_down_read(&c->mark_lock);
	ret = bch2_mark_key_locked(c, old, new, 0, flags);
	percpu_up_read(&c->mark_lock);

	return ret;
}
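
/*
 * Trigger dispatch at commit time: most key types are marked as a plain
 * insert of the new key plus a separate overwrite of the old key, but types
 * in BTREE_TRIGGER_WANTS_OLD_AND_NEW get a single call that sees both
 * versions, so the trigger can compute the delta itself.
 */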
int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
		     struct bkey_i *new, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey _deleted = KEY(0, 0, 0);
	struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
	struct bkey_s_c old;
	struct bkey unpacked;
	int ret;

	if (unlikely(flags & BTREE_TRIGGER_NORUN))
		return 0;

	if (!btree_node_type_needs_gc(path->btree_id))
		return 0;

	old = bch2_btree_path_peek_slot(path, &unpacked);

	if (old.k->type == new->k.type &&
	    ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
		ret   = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
				trans->journal_res.seq,
				BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
	} else {
		ret   = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new),
				trans->journal_res.seq,
				BTREE_TRIGGER_INSERT|flags) ?:
			bch2_mark_key_locked(c, old, deleted,
				trans->journal_res.seq,
				BTREE_TRIGGER_OVERWRITE|flags);
	}

	return ret;
}
static noinline __cold
void fs_usage_apply_warn(struct btree_trans *trans,
			 unsigned disk_res_sectors,
			 s64 should_not_have_added)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	char buf[200];

	bch_err(c, "disk usage increased %lli more than %u sectors reserved",
		should_not_have_added, disk_res_sectors);

	trans_for_each_update(trans, i) {
		pr_err("while inserting");
		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
		pr_err("%s", buf);
		pr_err("overlapping with");

		if (!i->cached) {
			struct bkey u;
			struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);

			bch2_bkey_val_to_text(&PBUF(buf), c, k);
			pr_err("%s", buf);
		} else {
			struct bkey_cached *ck = (void *) i->path->l[0].b;

			if (ck->valid) {
				bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
				pr_err("%s", buf);
			}
		}
	}
}

void bch2_trans_fs_usage_apply(struct btree_trans *trans,
			       struct replicas_delta_list *deltas)
{
	struct bch_fs *c = trans->c;
	static int warned_disk_usage = 0;
	bool warn = false;
	unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
	struct replicas_delta *d = deltas->d;
	struct replicas_delta *top = (void *) deltas->d + deltas->used;
	struct bch_fs_usage *dst;
	s64 added = 0, should_not_have_added;
	unsigned i;

	percpu_rwsem_assert_held(&c->mark_lock);

	preempt_disable();
	dst = fs_usage_ptr(c, trans->journal_res.seq, false);

	for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
		switch (d->r.data_type) {
		case BCH_DATA_btree:
		case BCH_DATA_user:
		case BCH_DATA_parity:
			added += d->delta;
		}

		BUG_ON(__update_replicas(c, dst, &d->r, d->delta));
	}

	dst->nr_inodes += deltas->nr_inodes;

	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
		added				+= deltas->persistent_reserved[i];
		dst->reserved			+= deltas->persistent_reserved[i];
		dst->persistent_reserved[i]	+= deltas->persistent_reserved[i];
	}

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	should_not_have_added = added - (s64) disk_res_sectors;
	if (unlikely(should_not_have_added > 0)) {
		u64 old, new, v = atomic64_read(&c->sectors_available);

		do {
			old = v;
			new = max_t(s64, 0, old - should_not_have_added);
		} while ((v = atomic64_cmpxchg(&c->sectors_available,
					       old, new)) != old);

		added -= should_not_have_added;
		warn = true;
	}

	if (added > 0) {
		trans->disk_res->sectors -= added;
		this_cpu_sub(*c->online_reserved, added);
	}

	preempt_enable();

	if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
		fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added);
}
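
/*
 * The remainder of the file implements the transactional versions of the
 * triggers above: instead of updating in-memory marks directly, these read
 * the current alloc/stripe/reflink keys through the btree and emit updated
 * keys and replicas deltas as part of the same transaction, so on-disk
 * usage stays consistent with the keys even across a crash.
 */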
static struct bkey_alloc_buf *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
			      const struct bch_extent_ptr *ptr,
			      struct bkey_alloc_unpacked *u)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
	struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
	struct bucket *g;
	struct bkey_alloc_buf *a;
	struct bkey_i *update = btree_trans_peek_updates(trans, BTREE_ID_alloc, pos);
	int ret;

	a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
	if (IS_ERR(a))
		return a;

	bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos,
			     BTREE_ITER_CACHED|
			     BTREE_ITER_CACHED_NOFILL|
			     BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(iter);
	if (ret) {
		bch2_trans_iter_exit(trans, iter);
		return ERR_PTR(ret);
	}

	if (update && !bpos_cmp(update->k.p, pos)) {
		*u = bch2_alloc_unpack(bkey_i_to_s_c(update));
	} else {
		percpu_down_read(&c->mark_lock);
		g = bucket(ca, pos.offset);
		*u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
		percpu_up_read(&c->mark_lock);
	}

	return a;
}

static int bch2_trans_mark_pointer(struct btree_trans *trans,
			struct bkey_s_c k, struct extent_ptr_decoded p,
			s64 sectors, enum bch_data_type data_type)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_alloc_unpacked u;
	struct bkey_alloc_buf *a;
	int ret;

	a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
	if (IS_ERR(a))
		return PTR_ERR(a);

	ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
			     &u.dirty_sectors, &u.cached_sectors);
	if (ret)
		goto out;

	bch2_alloc_pack(c, a, u);
	bch2_trans_update(trans, &iter, &a->k, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
			struct extent_ptr_decoded p,
			s64 sectors, enum bch_data_type data_type)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i_stripe *s;
	struct bch_replicas_padded r;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx),
			     BTREE_ITER_INTENT|
			     BTREE_ITER_WITH_UPDATES);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != KEY_TYPE_stripe) {
		bch2_fs_inconsistent(c,
			"pointer to nonexistent stripe %llu",
			(u64) p.ec.idx);
		bch2_inconsistent_error(c);
		ret = -EIO;
		goto err;
	}

	if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
		bch2_fs_inconsistent(c,
			"stripe pointer doesn't match stripe %llu",
			(u64) p.ec.idx);
		ret = -EIO;
		goto err;
	}

	s = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		goto err;

	bkey_reassemble(&s->k_i, k);
	stripe_blockcount_set(&s->v, p.ec.block,
		stripe_blockcount_get(&s->v, p.ec.block) +
		sectors);
	bch2_trans_update(trans, &iter, &s->k_i, 0);

	bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
	r.e.data_type = data_type;
	update_replicas_list(trans, &r.e, sectors);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
static int bch2_trans_mark_extent(struct btree_trans *trans,
			struct bkey_s_c k, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
		? BCH_DATA_btree
		: BCH_DATA_user;
	s64 sectors = bkey_is_btree_ptr(k.k)
		? c->opts.btree_node_size
		: k.k->size;
	s64 dirty_sectors = 0;
	bool stale;
	int ret;

	BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
	       (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));

	r.e.data_type	= data_type;
	r.e.nr_devs	= 0;
	r.e.nr_required	= 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors = ptr_disk_sectors(sectors, p);

		if (flags & BTREE_TRIGGER_OVERWRITE)
			disk_sectors = -disk_sectors;

		ret = bch2_trans_mark_pointer(trans, k, p,
					disk_sectors, data_type);
		if (ret < 0)
			return ret;

		stale = ret > 0;

		if (p.ptr.cached) {
			if (!stale)
				update_cached_sectors_list(trans, p.ptr.dev,
							   disk_sectors);
		} else if (!p.has_ec) {
			dirty_sectors	       += disk_sectors;
			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
		} else {
			ret = bch2_trans_mark_stripe_ptr(trans, p,
					disk_sectors, data_type);
			if (ret)
				return ret;

			r.e.nr_required = 0;
		}
	}

	if (r.e.nr_devs)
		update_replicas_list(trans, &r.e, dirty_sectors);

	return 0;
}
static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
					    struct bkey_s_c_stripe s,
					    unsigned idx, bool deleting)
{
	struct bch_fs *c = trans->c;
	const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
	struct bkey_alloc_buf *a;
	struct btree_iter iter;
	struct bkey_alloc_unpacked u;
	bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
	int ret = 0;

	a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
	if (IS_ERR(a))
		return PTR_ERR(a);

	if (parity) {
		s64 sectors = le16_to_cpu(s.v->sectors);

		if (deleting)
			sectors = -sectors;

		u.dirty_sectors += sectors;
		u.data_type = u.dirty_sectors
			? BCH_DATA_parity
			: 0;
	}

	if (!deleting) {
		if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
				"bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
				iter.pos.inode, iter.pos.offset, u.gen,
				u.stripe, s.k->p.offset)) {
			ret = -EIO;
			goto err;
		}

		u.stripe		= s.k->p.offset;
		u.stripe_redundancy	= s.v->nr_redundant;
	} else {
		u.stripe		= 0;
		u.stripe_redundancy	= 0;
	}

	bch2_alloc_pack(c, a, u);
	bch2_trans_update(trans, &iter, &a->k, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int bch2_trans_mark_stripe(struct btree_trans *trans,
				  struct bkey_s_c old, struct bkey_s_c new,
				  unsigned flags)
{
	struct bkey_s_c_stripe old_s = { .k = NULL };
	struct bkey_s_c_stripe new_s = { .k = NULL };
	struct bch_replicas_padded r;
	unsigned i;
	int ret = 0;

	if (old.k->type == KEY_TYPE_stripe)
		old_s = bkey_s_c_to_stripe(old);
	if (new.k->type == KEY_TYPE_stripe)
		new_s = bkey_s_c_to_stripe(new);

	/*
	 * If the pointers aren't changing, we don't need to do anything:
	 */
	if (new_s.k && old_s.k &&
	    new_s.v->nr_blocks		== old_s.v->nr_blocks &&
	    new_s.v->nr_redundant	== old_s.v->nr_redundant &&
	    !memcmp(old_s.v->ptrs, new_s.v->ptrs,
		    new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
		return 0;

	if (new_s.k) {
		s64 sectors = le16_to_cpu(new_s.v->sectors);

		bch2_bkey_to_replicas(&r.e, new);
		update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);

		for (i = 0; i < new_s.v->nr_blocks; i++) {
			ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
							       i, false);
			if (ret)
				return ret;
		}
	}

	if (old_s.k) {
		s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));

		bch2_bkey_to_replicas(&r.e, old);
		update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);

		for (i = 0; i < old_s.v->nr_blocks; i++) {
			ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
							       i, true);
			if (ret)
				return ret;
		}
	}

	return ret;
}
static int bch2_trans_mark_inode(struct btree_trans *trans,
				 struct bkey_s_c old,
				 struct bkey_s_c new,
				 unsigned flags)
{
	int nr = (new.k->type == KEY_TYPE_inode) -
		 (old.k->type == KEY_TYPE_inode);

	if (nr) {
		struct replicas_delta_list *d =
			replicas_deltas_realloc(trans, 0);
		d->nr_inodes += nr;
	}

	return 0;
}

static int bch2_trans_mark_reservation(struct btree_trans *trans,
				       struct bkey_s_c k, unsigned flags)
{
	unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
	s64 sectors = (s64) k.k->size;
	struct replicas_delta_list *d;

	BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
	       (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));

	if (flags & BTREE_TRIGGER_OVERWRITE)
		sectors = -sectors;
	sectors *= replicas;

	d = replicas_deltas_realloc(trans, 0);

	replicas = clamp_t(unsigned, replicas, 1,
			   ARRAY_SIZE(d->persistent_reserved));

	d->persistent_reserved[replicas - 1] += sectors;

	return 0;
}
static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
			struct bkey_s_c_reflink_p p,
			u64 idx, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey_i *n;
	__le64 *refcount;
	int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
	s64 ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, idx),
			     BTREE_ITER_INTENT|
			     BTREE_ITER_WITH_UPDATES);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
	ret = PTR_ERR_OR_ZERO(n);
	if (ret)
		goto err;

	bkey_reassemble(n, k);

	refcount = bkey_refcount(n);
	if (!refcount) {
		bch2_fs_inconsistent(c,
			"%llu:%llu len %u points to nonexistent indirect extent %llu",
			p.k->p.inode, p.k->p.offset, p.k->size, idx);
		ret = -EIO;
		goto err;
	}

	if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
		bch2_fs_inconsistent(c,
			"%llu:%llu len %u idx %llu indirect extent refcount underflow",
			p.k->p.inode, p.k->p.offset, p.k->size, idx);
		ret = -EIO;
		goto err;
	}

	if (flags & BTREE_TRIGGER_INSERT) {
		struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
		u64 pad;

		pad = max_t(s64, le32_to_cpu(v->front_pad),
			    le64_to_cpu(v->idx) - bkey_start_offset(k.k));
		BUG_ON(pad > U32_MAX);
		v->front_pad = cpu_to_le32(pad);

		pad = max_t(s64, le32_to_cpu(v->back_pad),
			    k.k->p.offset - p.k->size - le64_to_cpu(v->idx));
		BUG_ON(pad > U32_MAX);
		v->back_pad = cpu_to_le32(pad);
	}

	le64_add_cpu(refcount, add);

	if (!*refcount) {
		n->k.type = KEY_TYPE_deleted;
		set_bkey_val_u64s(&n->k, 0);
	}

	bch2_btree_iter_set_pos_to_extent_start(&iter);
	ret = bch2_trans_update(trans, &iter, n, 0);
	if (ret)
		goto err;

	ret = k.k->p.offset - idx;
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
			struct bkey_s_c k, unsigned flags)
{
	struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
	u64 idx, sectors;
	s64 ret = 0;

	if (flags & BTREE_TRIGGER_INSERT) {
		struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;

		v->front_pad = v->back_pad = 0;
	}

	idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
	sectors = (u64) le32_to_cpu(p.v->front_pad) +
		le32_to_cpu(p.v->back_pad) +
		p.k->size;

	while (sectors) {
		ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags);
		if (ret < 0)
			return ret;

		ret = min_t(s64, ret, sectors);
		idx	+= ret;
		sectors -= ret;
	}

	return 0;
}
int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
			struct bkey_s_c new, unsigned flags)
{
	struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;

	BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		return bch2_trans_mark_extent(trans, k, flags);
	case KEY_TYPE_stripe:
		return bch2_trans_mark_stripe(trans, old, new, flags);
	case KEY_TYPE_inode:
		return bch2_trans_mark_inode(trans, old, new, flags);
	case KEY_TYPE_reservation:
		return bch2_trans_mark_reservation(trans, k, flags);
	case KEY_TYPE_reflink_p:
		return bch2_trans_mark_reflink_p(trans, k, flags);
	default:
		return 0;
	}
}

int bch2_trans_mark_update(struct btree_trans *trans,
			   struct btree_path *path,
			   struct bkey_i *new,
			   unsigned flags)
{
	struct bkey _deleted = KEY(0, 0, 0);
	struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
	struct bkey_s_c old;
	struct bkey unpacked;
	int ret;

	if (unlikely(flags & BTREE_TRIGGER_NORUN))
		return 0;

	if (!btree_node_type_needs_gc(path->btree_id))
		return 0;

	old = bch2_btree_path_peek_slot(path, &unpacked);

	if (old.k->type == new->k.type &&
	    ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
		ret   = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
				BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
	} else {
		ret   = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new),
				BTREE_TRIGGER_INSERT|flags) ?:
			bch2_trans_mark_key(trans, old, deleted,
				BTREE_TRIGGER_OVERWRITE|flags);
	}

	return ret;
}
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
					     struct bch_dev *ca, size_t b,
					     enum bch_data_type type,
					     unsigned sectors)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_alloc_unpacked u;
	struct bkey_alloc_buf *a;
	struct bch_extent_ptr ptr = {
		.dev	= ca->dev_idx,
		.offset	= bucket_to_sector(ca, b),
	};
	int ret = 0;

	/*
	 * Backup superblock might be past the end of our normal usable space:
	 */
	if (b >= ca->mi.nbuckets)
		return 0;

	a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
	if (IS_ERR(a))
		return PTR_ERR(a);

	if (u.data_type && u.data_type != type) {
		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
			"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
			"while marking %s",
			iter.pos.inode, iter.pos.offset, u.gen,
			bch2_data_types[u.data_type],
			bch2_data_types[type],
			bch2_data_types[type]);
		ret = -EIO;
		goto out;
	}

	u.data_type	= type;
	u.dirty_sectors	= sectors;

	bch2_alloc_pack(c, a, u);
	bch2_trans_update(trans, &iter, &a->k, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
				    struct bch_dev *ca, size_t b,
				    enum bch_data_type type,
				    unsigned sectors)
{
	return __bch2_trans_do(trans, NULL, NULL, 0,
			__bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
}

static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
					    struct bch_dev *ca,
					    u64 start, u64 end,
					    enum bch_data_type type,
					    u64 *bucket, unsigned *bucket_sectors)
{
	do {
		u64 b = sector_to_bucket(ca, start);
		unsigned sectors =
			min_t(u64, bucket_to_sector(ca, b + 1), end) - start;

		if (b != *bucket && *bucket_sectors) {
			int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
								  type, *bucket_sectors);
			if (ret)
				return ret;

			*bucket_sectors = 0;
		}

		*bucket		 = b;
		*bucket_sectors	+= sectors;
		start += sectors;
	} while (start < end);

	return 0;
}
static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
				    struct bch_dev *ca)
{
	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
	u64 bucket = 0;
	unsigned i, bucket_sectors = 0;
	int ret;

	for (i = 0; i < layout->nr_superblocks; i++) {
		u64 offset = le64_to_cpu(layout->sb_offset[i]);

		if (offset == BCH_SB_SECTOR) {
			ret = bch2_trans_mark_metadata_sectors(trans, ca,
						0, BCH_SB_SECTOR,
						BCH_DATA_sb, &bucket, &bucket_sectors);
			if (ret)
				return ret;
		}

		ret = bch2_trans_mark_metadata_sectors(trans, ca, offset,
				      offset + (1 << layout->sb_max_size_bits),
				      BCH_DATA_sb, &bucket, &bucket_sectors);
		if (ret)
			return ret;
	}

	if (bucket_sectors) {
		ret = bch2_trans_mark_metadata_bucket(trans, ca,
				bucket, BCH_DATA_sb, bucket_sectors);
		if (ret)
			return ret;
	}

	for (i = 0; i < ca->journal.nr; i++) {
		ret = bch2_trans_mark_metadata_bucket(trans, ca,
				ca->journal.buckets[i],
				BCH_DATA_journal, ca->mi.bucket_size);
		if (ret)
			return ret;
	}

	return 0;
}

int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
	return bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
			     __bch2_trans_mark_dev_sb(&trans, ca));
}
/* Disk reservations: */

#define SECTORS_CACHE	1024
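
/*
 * Fast path: each CPU keeps a small cache of reserved sectors
 * (pcpu->sectors_available), refilled from the global atomic in
 * SECTORS_CACHE-sized chunks; only when the global pool looks exhausted do
 * we take sectors_available_lock and recompute availability from actual
 * usage.
 */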
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
			      u64 sectors, int flags)
{
	struct bch_fs_pcpu *pcpu;
	u64 old, v, get;
	s64 sectors_available;
	int ret;

	percpu_down_read(&c->mark_lock);
	preempt_disable();
	pcpu = this_cpu_ptr(c->pcpu);

	if (sectors <= pcpu->sectors_available)
		goto out;

	v = atomic64_read(&c->sectors_available);
	do {
		old = v;
		get = min((u64) sectors + SECTORS_CACHE, old);

		if (get < sectors) {
			preempt_enable();
			goto recalculate;
		}
	} while ((v = atomic64_cmpxchg(&c->sectors_available,
				       old, old - get)) != old);

	pcpu->sectors_available		+= get;

out:
	pcpu->sectors_available		-= sectors;
	this_cpu_add(*c->online_reserved, sectors);
	res->sectors			+= sectors;

	preempt_enable();
	percpu_up_read(&c->mark_lock);
	return 0;

recalculate:
	mutex_lock(&c->sectors_available_lock);

	percpu_u64_set(&c->pcpu->sectors_available, 0);
	sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		this_cpu_add(*c->online_reserved, sectors);
		res->sectors		+= sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -ENOSPC;
	}

	mutex_unlock(&c->sectors_available_lock);
	percpu_up_read(&c->mark_lock);

	return ret;
}
/* Startup/shutdown: */

static void buckets_free_rcu(struct rcu_head *rcu)
{
	struct bucket_array *buckets =
		container_of(rcu, struct bucket_array, rcu);

	kvpfree(buckets,
		sizeof(struct bucket_array) +
		buckets->nbuckets * sizeof(struct bucket));
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bucket_array *buckets = NULL, *old_buckets = NULL;
	unsigned long *buckets_nouse = NULL;
	alloc_fifo	free[RESERVE_NR];
	alloc_fifo	free_inc;
	alloc_heap	alloc_heap;

	size_t btree_reserve	= DIV_ROUND_UP(BTREE_NODE_RESERVE,
			     ca->mi.bucket_size / c->opts.btree_node_size);
	/* XXX: these should be tunable */
	size_t reserve_none	= max_t(size_t, 1, nbuckets >> 9);
	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 6);
	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
				      btree_reserve * 2);
	bool resize = ca->buckets[0] != NULL;
	int ret = -ENOMEM;
	unsigned i;

	memset(&free,		0, sizeof(free));
	memset(&free_inc,	0, sizeof(free_inc));
	memset(&alloc_heap,	0, sizeof(alloc_heap));

	if (!(buckets		= kvpmalloc(sizeof(struct bucket_array) +
					    nbuckets * sizeof(struct bucket),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
					    sizeof(unsigned long),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !init_fifo(&free[RESERVE_MOVINGGC],
		       copygc_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
	    !init_fifo(&free_inc,	free_inc_nr, GFP_KERNEL) ||
	    !init_heap(&alloc_heap,	ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
		goto err;

	buckets->first_bucket	= ca->mi.first_bucket;
	buckets->nbuckets	= nbuckets;

	bch2_copygc_stop(c);

	if (resize) {
		down_write(&c->gc_lock);
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_buckets = bucket_array(ca);

	if (resize) {
		size_t n = min(buckets->nbuckets, old_buckets->nbuckets);

		memcpy(buckets->b,
		       old_buckets->b,
		       n * sizeof(struct bucket));
		memcpy(buckets_nouse,
		       ca->buckets_nouse,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
	}

	rcu_assign_pointer(ca->buckets[0], buckets);
	buckets = old_buckets;

	swap(ca->buckets_nouse, buckets_nouse);

	if (resize) {
		percpu_up_write(&c->mark_lock);
		up_write(&c->gc_lock);
	}

	spin_lock(&c->freelist_lock);
	for (i = 0; i < RESERVE_NR; i++) {
		fifo_move(&free[i], &ca->free[i]);
		swap(ca->free[i], free[i]);
	}
	fifo_move(&free_inc, &ca->free_inc);
	swap(ca->free_inc, free_inc);
	spin_unlock(&c->freelist_lock);

	/* with gc lock held, alloc_heap can't be in use: */
	swap(ca->alloc_heap, alloc_heap);

	nbuckets = ca->mi.nbuckets;

	if (resize)
		up_write(&ca->bucket_lock);

	ret = 0;
err:
	free_heap(&alloc_heap);
	free_fifo(&free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&free[i]);
	kvpfree(buckets_nouse,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	/*
	 * On success buckets now aliases the old (unpublished) array; on the
	 * error path it's the never-published new one. Either way it's safe
	 * to free via RCU, and NULL-checking here avoids dereferencing
	 * old_buckets before it's assigned:
	 */
	if (buckets)
		call_rcu(&buckets->rcu, buckets_free_rcu);

	return ret;
}
void bch2_dev_buckets_free(struct bch_dev *ca)
{
	unsigned i;

	free_heap(&ca->alloc_heap);
	free_fifo(&ca->free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);
	kvpfree(ca->buckets_nouse,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
		sizeof(struct bucket_array) +
		ca->mi.nbuckets * sizeof(struct bucket));

	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
		free_percpu(ca->usage[i]);
	kfree(ca->usage_base);
}

int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
	if (!ca->usage_base)
		return -ENOMEM;

	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
		if (!ca->usage[i])
			return -ENOMEM;
	}

	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}