#include "ec.h"
#include "error.h"
#include "movinggc.h"
+#include "replicas.h"
#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
-static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
-
-#ifdef DEBUG_BUCKETS
-
-#define lg_local_lock lg_global_lock
-#define lg_local_unlock lg_global_unlock
-
-static void bch2_fs_stats_verify(struct bch_fs *c)
-{
- struct bch_fs_usage stats = __bch2_fs_usage_read(c);
- unsigned i, j;
-
- for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
- for (j = 0; j < ARRAY_SIZE(stats.replicas[i].data); j++)
- if ((s64) stats.replicas[i].data[j] < 0)
- panic("replicas %u %s sectors underflow: %lli\n",
- i + 1, bch_data_types[j],
- stats.replicas[i].data[j]);
-
- if ((s64) stats.replicas[i].persistent_reserved < 0)
- panic("replicas %u reserved underflow: %lli\n",
- i + 1, stats.replicas[i].persistent_reserved);
- }
-
- for (j = 0; j < ARRAY_SIZE(stats.buckets); j++)
- if ((s64) stats.buckets[j] < 0)
- panic("%s buckets underflow: %lli\n",
- bch_data_types[j],
- stats.buckets[j]);
-
- if ((s64) stats.s.online_reserved < 0)
- panic("sectors_online_reserved underflow: %lli\n",
- stats.s.online_reserved);
-}
-
-static void bch2_dev_stats_verify(struct bch_dev *ca)
-{
- struct bch_dev_usage stats =
- __bch2_dev_usage_read(ca);
- u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(stats.buckets); i++)
- BUG_ON(stats.buckets[i] > n);
- BUG_ON(stats.buckets_alloc > n);
- BUG_ON(stats.buckets_unavailable > n);
-}
-
-static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
-{
- if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
- u64 used = __bch2_fs_sectors_used(c);
- u64 cached = 0;
- u64 avail = atomic64_read(&c->sectors_available);
- int cpu;
-
- for_each_possible_cpu(cpu)
- cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
-
- if (used + avail + cached > c->capacity)
- panic("used %llu avail %llu cached %llu capacity %llu\n",
- used, avail, cached, c->capacity);
- }
-}
-
-#else
-
-static void bch2_fs_stats_verify(struct bch_fs *c) {}
-static void bch2_dev_stats_verify(struct bch_dev *ca) {}
-static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
-
-#endif
-
/*
* Clear journal_seq_valid for buckets for which it's not needed, to prevent
* wraparound:
}
}
-#define bch2_usage_add(_acc, _stats) \
-do { \
- typeof(_acc) _a = (_acc), _s = (_stats); \
- unsigned i; \
- \
- for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
- ((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
-} while (0)
-
#define bch2_usage_read_raw(_stats) \
({ \
typeof(*this_cpu_ptr(_stats)) _acc; \
- int cpu; \
\
memset(&_acc, 0, sizeof(_acc)); \
- \
- for_each_possible_cpu(cpu) \
- bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
+ acc_u64s_percpu((u64 *) &_acc, \
+ (u64 __percpu *) _stats, \
+ sizeof(_acc) / sizeof(u64)); \
\
_acc; \
})
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
-{
- return bch2_usage_read_raw(ca->usage[gc]);
-}
-
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_usage_read_raw(ca->usage[0]);
}
-struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
- return bch2_usage_read_raw(c->usage[gc]);
-}
+ struct bch_fs_usage *ret;
+ unsigned nr = READ_ONCE(c->replicas.nr);
+retry:
+ ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
+ if (unlikely(!ret))
+ return NULL;
-struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
-{
- return bch2_usage_read_raw(c->usage[0]);
+ percpu_down_read_preempt_disable(&c->mark_lock);
+
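+ /*
+ * The usage struct was sized for @nr replicas entries; if the replicas
+ * table grew before we took mark_lock, reallocate and retry:
+ */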
+ if (unlikely(nr < c->replicas.nr)) {
+ nr = c->replicas.nr;
+ percpu_up_read_preempt_enable(&c->mark_lock);
+ kfree(ret);
+ goto retry;
+ }
+
+ acc_u64s_percpu((u64 *) ret,
+ (u64 __percpu *) c->usage[0],
+ sizeof(*ret) / sizeof(u64) + nr);
+
+ return ret;
}
#define RESERVE_FACTOR 6
return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}
-static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
-{
- return fs_usage.s.hidden +
- fs_usage.s.data +
- reserve_factor(fs_usage.s.reserved +
- fs_usage.s.online_reserved);
-}
-
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
{
- return min(c->capacity, __bch2_fs_sectors_used(c, fs_usage));
+ return min(fs_usage.s.hidden +
+ fs_usage.s.data +
+ reserve_factor(fs_usage.s.reserved +
+ fs_usage.s.online_reserved),
+ c->capacity);
}
struct bch_fs_usage_short
!is_available_bucket(new);
}
-void bch2_fs_usage_apply(struct bch_fs *c,
- struct bch_fs_usage *fs_usage,
- struct disk_reservation *disk_res,
- struct gc_pos gc_pos)
+int bch2_fs_usage_apply(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct disk_reservation *disk_res,
+ struct gc_pos gc_pos)
{
s64 added = fs_usage->s.data + fs_usage->s.reserved;
s64 should_not_have_added;
+ int ret = 0;
percpu_rwsem_assert_held(&c->mark_lock);
"disk usage increased without a reservation")) {
atomic64_sub(should_not_have_added, &c->sectors_available);
added -= should_not_have_added;
+ ret = -1;
}
if (added > 0) {
fs_usage->s.online_reserved -= added;
}
- bch2_usage_add(this_cpu_ptr(c->usage[0]), fs_usage);
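+ /*
+ * bch_fs_usage is followed by one counter per replicas entry; fold the
+ * whole delta, including those counters, into the percpu totals:
+ */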
+ acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
+ (u64 *) fs_usage,
+ sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
- if (gc_visited(c, gc_pos))
- bch2_usage_add(this_cpu_ptr(c->usage[1]), fs_usage);
-
- bch2_fs_stats_verify(c);
+ if (gc_visited(c, gc_pos)) {
+ BUG_ON(!c->usage[1]);
+ acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
+ (u64 *) fs_usage,
+ sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
+ }
- memset(fs_usage, 0, sizeof(*fs_usage));
+ return ret;
}
static inline void account_bucket(struct bch_fs_usage *fs_usage,
if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
fs_usage->s.hidden += size;
- fs_usage->buckets[type] += size;
dev_usage->buckets[type] += nr;
}
if (!is_available_bucket(old) && is_available_bucket(new))
bch2_wake_allocator(ca);
-
- bch2_dev_stats_verify(ca);
}
void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
_old; \
})
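+/*
+ * Account @sectors against the replicas entry @r: the entry must already
+ * exist in c->replicas; the summary totals are updated as well.
+ */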
+static inline void update_replicas(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ struct bch_replicas_entry *r,
+ s64 sectors)
+{
+ int idx = bch2_replicas_entry_idx(c, r);
+
+ BUG_ON(idx < 0);
+ BUG_ON(!sectors);
+
+ if (r->data_type == BCH_DATA_CACHED)
+ fs_usage->s.cached += sectors;
+ else
+ fs_usage->s.data += sectors;
+ fs_usage->data[idx] += sectors;
+}
+
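+/*
+ * Cached data is accounted with a single-device "cached" replicas entry
+ * per device:
+ */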
+static inline void update_cached_sectors(struct bch_fs *c,
+ struct bch_fs_usage *fs_usage,
+ unsigned dev, s64 sectors)
+{
+ struct bch_replicas_padded r;
+
+ bch2_replicas_entry_cached(&r.e, dev);
+
+ update_replicas(c, fs_usage, &r.e, sectors);
+}
+
static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old,
bool gc)
*old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
BUG_ON(!is_available_bucket(new));
- new.owned_by_allocator = 1;
+ new.owned_by_allocator = true;
+ new.dirty = true;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
new.gen++;
}));
- fs_usage->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
- fs_usage->s.cached -= old->cached_sectors;
+ if (old->cached_sectors)
+ update_cached_sectors(c, fs_usage, ca->dev_idx,
+ -old->cached_sectors);
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
type != BCH_DATA_JOURNAL);
bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ new.dirty = true;
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
-
- if (type == BCH_DATA_BTREE ||
- type == BCH_DATA_USER)
- fs_usage->s.data += sectors;
- fs_usage->replicas[0].data[type] += sectors;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
true);
} else {
struct bucket *g;
- struct bucket_mark old, new;
+ struct bucket_mark new;
rcu_read_lock();
g = bucket(ca, b);
- old = bucket_cmpxchg(g, new, ({
- new.data_type = type;
+ bucket_cmpxchg(g, new, ({
+ new.dirty = true;
+ new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
do {
new.v.counter = old.v.counter = v;
+ new.dirty = true;
+
/*
* Check this after reading bucket mark to guard against
* the allocator invalidating a bucket after we've already
static int bch2_mark_stripe_ptr(struct bch_fs *c,
struct bch_extent_stripe_ptr p,
+ enum bch_data_type data_type,
+ struct bch_fs_usage *fs_usage,
s64 sectors, unsigned flags,
- s64 *adjusted_disk_sectors,
- unsigned *redundancy,
bool gc)
{
struct stripe *m;
int blocks_nonempty_delta;
s64 parity_sectors;
+ BUG_ON(!sectors);
+
m = genradix_ptr(&c->stripes[gc], p.idx);
+ spin_lock(&c->ec_stripes_heap_lock);
+
if (!m || !m->alive) {
+ spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
(u64) p.idx);
return -1;
}
+ BUG_ON(m->r.e.data_type != data_type);
+
nr_data = m->nr_blocks - m->nr_redundant;
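+ /*
+ * Data written to a stripe also consumes a proportional share of its
+ * parity blocks; include that share in the accounted sectors:
+ */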
parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
if (sectors < 0)
parity_sectors = -parity_sectors;
+ sectors += parity_sectors;
- *adjusted_disk_sectors += parity_sectors;
-
- *redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
-
- new = atomic_add_return(sectors, &m->block_sectors[p.block]);
- old = new - sectors;
+ old = m->block_sectors[p.block];
+ m->block_sectors[p.block] += sectors;
+ new = m->block_sectors[p.block];
blocks_nonempty_delta = (int) !!new - (int) !!old;
- if (!blocks_nonempty_delta)
- return 0;
+ if (blocks_nonempty_delta) {
+ m->blocks_nonempty += blocks_nonempty_delta;
- atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
+ if (!gc)
+ bch2_stripes_heap_update(c, m, p.idx);
+ }
- BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
+ m->dirty = true;
- if (!gc)
- bch2_stripes_heap_update(c, m, p.idx);
+ spin_unlock(&c->ec_stripes_heap_lock);
+
+ update_replicas(c, fs_usage, &m->r.e, sectors);
return 0;
}
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- s64 cached_sectors = 0;
- s64 dirty_sectors = 0;
- s64 ec_sectors = 0;
- unsigned replicas = 0;
- unsigned ec_redundancy = 0;
+ struct bch_replicas_padded r;
+ s64 dirty_sectors = 0;
unsigned i;
int ret;
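+
+ /*
+ * Collect this key's non-cached pointers into a single replicas entry;
+ * cached pointers are accounted per device, and erasure coded pointers
+ * against their stripe's entry instead:
+ */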
+ r.e.data_type = data_type;
+ r.e.nr_devs = 0;
+ r.e.nr_required = 1;
+
BUG_ON(!sectors);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors = data_type == BCH_DATA_BTREE
? sectors
: ptr_disk_sectors_delta(p, sectors);
- s64 adjusted_disk_sectors = disk_sectors;
bch2_mark_pointer(c, p, disk_sectors, data_type,
fs_usage, journal_seq, flags, gc);
- if (!p.ptr.cached)
+ if (p.ptr.cached) {
+ update_cached_sectors(c, fs_usage, p.ptr.dev,
+ disk_sectors);
+ } else if (!p.ec_nr) {
+ dirty_sectors += disk_sectors;
+ r.e.devs[r.e.nr_devs++] = p.ptr.dev;
+ } else {
for (i = 0; i < p.ec_nr; i++) {
ret = bch2_mark_stripe_ptr(c, p.ec[i],
- disk_sectors, flags,
- &adjusted_disk_sectors,
- &ec_redundancy, gc);
+ data_type, fs_usage,
+ disk_sectors, flags, gc);
if (ret)
return ret;
}
- if (!p.ptr.cached)
- replicas++;
- if (p.ptr.cached)
- cached_sectors += adjusted_disk_sectors;
- else if (!p.ec_nr)
- dirty_sectors += adjusted_disk_sectors;
- else
- ec_sectors += adjusted_disk_sectors;
+ r.e.nr_required = 0;
+ }
}
- replicas = clamp_t(unsigned, replicas,
- 1, ARRAY_SIZE(fs_usage->replicas));
- ec_redundancy = clamp_t(unsigned, ec_redundancy,
- 1, ARRAY_SIZE(fs_usage->replicas));
-
- fs_usage->s.cached += cached_sectors;
- fs_usage->replicas[0].data[BCH_DATA_CACHED] += cached_sectors;
-
- fs_usage->s.data += dirty_sectors;
- fs_usage->replicas[replicas - 1].data[data_type] += dirty_sectors;
-
- fs_usage->s.data += ec_sectors;
- fs_usage->replicas[ec_redundancy - 1].ec_data += ec_sectors;
+ if (dirty_sectors)
+ update_replicas(c, fs_usage, &r.e, dirty_sectors);
return 0;
}
BUG_ON(ptr_stale(ca, ptr));
old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ new.dirty = true;
new.stripe = enabled;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
}));
-
- BUG_ON(old.stripe == enabled);
}
}
struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
unsigned i;
+ spin_lock(&c->ec_stripes_heap_lock);
+
if (!m || (!inserting && !m->alive)) {
+ spin_unlock(&c->ec_stripes_heap_lock);
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
return -1;
}
- if (inserting && m->alive) {
- bch_err_ratelimited(c, "error marking stripe %zu: already exists",
- idx);
- return -1;
- }
-
- BUG_ON(atomic_read(&m->blocks_nonempty));
+ if (m->alive)
+ bch2_stripes_heap_del(c, m, idx);
- for (i = 0; i < EC_STRIPE_MAX; i++)
- BUG_ON(atomic_read(&m->block_sectors[i]));
+ memset(m, 0, sizeof(*m));
if (inserting) {
m->sectors = le16_to_cpu(s.v->sectors);
m->algorithm = s.v->algorithm;
m->nr_blocks = s.v->nr_blocks;
m->nr_redundant = s.v->nr_redundant;
- }
- if (!gc) {
- if (inserting)
+ memset(&m->r, 0, sizeof(m->r));
+
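+ /*
+ * Build the replicas entry covering the stripe's member devices;
+ * bch2_mark_stripe_ptr() accounts data written to the stripe
+ * against it:
+ */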
+ m->r.e.data_type = BCH_DATA_USER;
+ m->r.e.nr_devs = s.v->nr_blocks;
+ m->r.e.nr_required = s.v->nr_blocks - s.v->nr_redundant;
+
+ for (i = 0; i < s.v->nr_blocks; i++)
+ m->r.e.devs[i] = s.v->ptrs[i].dev;
+
+ /*
+ * XXX: account for stripes somehow here
+ */
+#if 0
+ update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
+#endif
+
+ /* gc recalculates these fields: */
+ if (!(flags & BCH_BUCKET_MARK_GC)) {
+ for (i = 0; i < s.v->nr_blocks; i++) {
+ m->block_sectors[i] =
+ stripe_blockcount_get(s.v, i);
+ m->blocks_nonempty += !!m->block_sectors[i];
+ }
+ }
+
+ if (!gc)
bch2_stripes_heap_insert(c, m, idx);
else
- bch2_stripes_heap_del(c, m, idx);
- } else {
- m->alive = inserting;
+ m->alive = true;
}
+ spin_unlock(&c->ec_stripes_heap_lock);
+
bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
return 0;
}
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
sectors *= replicas;
- replicas = clamp_t(unsigned, replicas,
- 1, ARRAY_SIZE(fs_usage->replicas));
+ replicas = clamp_t(unsigned, replicas, 1,
+ ARRAY_SIZE(fs_usage->persistent_reserved));
- fs_usage->s.reserved += sectors;
- fs_usage->replicas[replicas - 1].persistent_reserved += sectors;
+ fs_usage->s.reserved += sectors;
+ fs_usage->persistent_reserved[replicas - 1] += sectors;
break;
}
default:
struct btree_iter *iter = insert->iter;
struct btree *b = iter->l[0].b;
struct btree_node_iter node_iter = iter->l[0].iter;
- struct bch_fs_usage fs_usage = { 0 };
+ struct bch_fs_usage *fs_usage;
struct gc_pos pos = gc_pos_btree_node(b);
struct bkey_packed *_k;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+ static int warned_disk_usage = 0;
if (!btree_node_type_needs_gc(iter->btree_id))
return;
percpu_down_read_preempt_disable(&c->mark_lock);
+ fs_usage = bch2_fs_usage_get_scratch(c);
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
bpos_min(insert->k->k.p, b->key.k.p).offset -
bkey_start_offset(&insert->k->k),
- pos, &fs_usage, trans->journal_res.seq, 0);
+ pos, fs_usage, trans->journal_res.seq, 0);
while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
KEY_TYPE_discard))) {
BUG_ON(sectors <= 0);
bch2_mark_key_locked(c, k, true, sectors,
- pos, &fs_usage, trans->journal_res.seq, 0);
+ pos, fs_usage, trans->journal_res.seq, 0);
sectors = bkey_start_offset(&insert->k->k) -
k.k->p.offset;
}
bch2_mark_key_locked(c, k, false, sectors,
- pos, &fs_usage, trans->journal_res.seq, 0);
+ pos, fs_usage, trans->journal_res.seq, 0);
bch2_btree_node_iter_advance(&node_iter, b);
}
- bch2_fs_usage_apply(c, &fs_usage, trans->disk_res, pos);
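+ /*
+ * If the update used more sectors than the disk reservation covered,
+ * warn once and dump the keys involved:
+ */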
+ if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+ !warned_disk_usage &&
+ !xchg(&warned_disk_usage, 1)) {
+ char buf[200];
+
+ pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+
+ pr_err("while inserting");
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
+ pr_err("%s", buf);
+ pr_err("overlapping with");
+
+ node_iter = iter->l[0].iter;
+ while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+ KEY_TYPE_discard))) {
+ struct bkey unpacked;
+ struct bkey_s_c k;
+
+ k = bkey_disassemble(b, _k, &unpacked);
+
+ if (btree_node_is_extents(b)
+ ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+ : bkey_cmp(insert->k->k.p, k.k->p))
+ break;
+
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
+ pr_err("%s", buf);
+
+ bch2_btree_node_iter_advance(&node_iter, b);
+ }
+ }
percpu_up_read_preempt_enable(&c->mark_lock);
}
percpu_down_read_preempt_disable(&c->mark_lock);
this_cpu_sub(c->usage[0]->s.online_reserved,
res->sectors);
-
- bch2_fs_stats_verify(c);
percpu_up_read_preempt_enable(&c->mark_lock);
res->sectors = 0;
this_cpu_add(c->usage[0]->s.online_reserved, sectors);
res->sectors += sectors;
- bch2_disk_reservations_verify(c, flags);
- bch2_fs_stats_verify(c);
percpu_up_read_preempt_enable(&c->mark_lock);
return 0;
this_cpu_add(c->usage[0]->s.online_reserved, sectors);
res->sectors += sectors;
ret = 0;
-
- bch2_disk_reservations_verify(c, flags);
} else {
atomic64_set(&c->sectors_available, sectors_available);
ret = -ENOSPC;
}
- bch2_fs_stats_verify(c);
percpu_up_write(&c->mark_lock);
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
- btree_reserve);
+ btree_reserve * 2);
bool resize = ca->buckets[0] != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;