-09a546543006b60d44c4c51e7b40cd3ec7837a5e
+75e8a078b85703322fcf558f75a6845c0ef5dbb0
struct bkey_i_extent *e;
BKEY_PADDED(k) k;
u64 b = sector_to_bucket(ca, physical);
+ struct bucket_mark m;
struct disk_reservation res;
unsigned sectors;
int ret;
.gen = bucket(ca, b)->mark.gen,
});
- bucket_set_dirty(ca, b);
+ bucket_cmpxchg(bucket(ca, b), m, m.dirty = true);
ret = bch2_disk_reservation_get(c, &res, sectors, 1,
BCH_DISK_RESERVATION_NOFAIL);
*p += bytes;
}
+/*
+ * Decode a packed on-disk bch_alloc value (gen plus the variable-width
+ * fields listed in BCH_ALLOC_FIELDS()) into the fixed-layout
+ * struct bkey_alloc_unpacked.  Fields are read in declaration order via
+ * get_alloc_field().
+ */
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
+{
+ struct bkey_alloc_unpacked ret = { .gen = a->gen };
+ const void *d = a->data;
+ unsigned idx = 0;
+
+#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
+ BCH_ALLOC_FIELDS()
+#undef x
+ return ret;
+}
+
+/*
+ * Inverse of bch2_alloc_unpack(): encode @src into the alloc key @dst,
+ * writing each BCH_ALLOC_FIELDS() entry in order with put_alloc_field()
+ * and then sizing the key's value to the bytes actually used.
+ */
+static void bch2_alloc_pack(struct bkey_i_alloc *dst,
+ const struct bkey_alloc_unpacked src)
+{
+ unsigned idx = 0;
+ void *d = dst->v.data;
+
+ /* put_alloc_field() fills in the per-field width bits as it goes: */
+ dst->v.fields = 0;
+ dst->v.gen = src.gen;
+
+#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
+ BCH_ALLOC_FIELDS()
+#undef x
+
+ set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v);
+}
+
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
{
const void *d = a->data;
- unsigned idx = 0;
+ unsigned idx = 0, data_type, dirty_sectors, cached_sectors;
+ struct bucket_mark m;
- g->_mark.gen = a->gen;
- g->gen_valid = 1;
g->io_time[READ] = get_alloc_field(a, &d, idx++);
g->io_time[WRITE] = get_alloc_field(a, &d, idx++);
- g->_mark.data_type = get_alloc_field(a, &d, idx++);
- g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++);
- g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
+ data_type = get_alloc_field(a, &d, idx++);
+ dirty_sectors = get_alloc_field(a, &d, idx++);
+ cached_sectors = get_alloc_field(a, &d, idx++);
+ g->oldest_gen = get_alloc_field(a, &d, idx++);
+
+ bucket_cmpxchg(g, m, ({
+ m.gen = a->gen;
+ m.data_type = data_type;
+ m.dirty_sectors = dirty_sectors;
+ m.cached_sectors = cached_sectors;
+ }));
+
+ g->gen_valid = 1;
}
static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
put_alloc_field(a, &d, idx++, m.data_type);
put_alloc_field(a, &d, idx++, m.dirty_sectors);
put_alloc_field(a, &d, idx++, m.cached_sectors);
+ put_alloc_field(a, &d, idx++, g->oldest_gen);
set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
}
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_USE_ALLOC_RESERVE|
+ BTREE_INSERT_NOMARK|
flags,
BTREE_INSERT_ENTRY(iter, &a->k_i));
if (ret)
? 0
: bch2_btree_insert_at(c, NULL, NULL,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY,
+ BTREE_INSERT_JOURNAL_REPLAY|
+ BTREE_INSERT_NOMARK,
BTREE_INSERT_ENTRY(&iter, k));
err:
bch2_btree_iter_unlock(&iter);
return -1;
}
+/*
+ * returns sequence number of most recent journal entry that updated this
+ * bucket:
+ */
+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
+{
+ if (m.journal_seq_valid) {
+ u64 journal_seq = atomic64_read(&c->journal.seq);
+ u64 bucket_seq = journal_seq;
+
+ /* m.journal_seq stores only the low 16 bits; splice them
+ * into the current journal sequence number: */
+ bucket_seq &= ~((u64) U16_MAX);
+ bucket_seq |= m.journal_seq;
+
+ /* a result in the future means the low bits wrapped since
+ * the bucket was updated - step back one epoch: */
+ if (bucket_seq > journal_seq)
+ bucket_seq -= 1 << 16;
+
+ return bucket_seq;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Invalidate the bucket at the top of ca->alloc_heap: push it onto
+ * free_inc and mark it owned by the allocator, then persist the updated
+ * alloc key (gen bumped, data type and sector counts zeroed, clock hands
+ * refreshed).  On btree insert failure the free_inc push is undone.
+ *
+ * @journal_seq is raised to the seq of the last journal entry that
+ * touched this bucket so the caller can flush it before reuse.
+ */
+static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca,
+ struct btree_iter *iter,
+ u64 *journal_seq, unsigned flags)
+{
+#if 0
+ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
+#else
+ /* hack: */
+ __BKEY_PADDED(k, 8) alloc_key;
+#endif
+ struct bkey_i_alloc *a;
+ struct bkey_alloc_unpacked u;
+ struct bucket_mark m;
+ struct bkey_s_c k;
+ bool invalidating_cached_data;
+ size_t b;
+ int ret;
+
+ BUG_ON(!ca->alloc_heap.used ||
+ !ca->alloc_heap.data[0].nr);
+ b = ca->alloc_heap.data[0].bucket;
+
+ /* first, put on free_inc and mark as owned by allocator: */
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ spin_lock(&c->freelist_lock);
+
+ verify_not_on_freelist(c, ca, b);
+
+ BUG_ON(!fifo_push(&ca->free_inc, b));
+
+ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+ m = bucket(ca, b)->mark;
+
+ spin_unlock(&c->freelist_lock);
+ percpu_up_read_preempt_enable(&c->mark_lock);
+
+ bch2_btree_iter_cond_resched(iter);
+
+ /* alloc_key above is padded to only 8 u64s - keep that honest: */
+ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
+
+ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
+retry:
+ k = bch2_btree_iter_peek_slot(iter);
+ ret = btree_iter_err(k);
+ /*
+ * NOTE(review): this error return does not undo the free_inc push
+ * above, unlike the insert-failure path below - confirm intentional.
+ */
+ if (ret)
+ return ret;
+
+ if (k.k && k.k->type == KEY_TYPE_alloc)
+ u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+ else
+ memset(&u, 0, sizeof(u));
+
+ invalidating_cached_data = u.cached_sectors != 0;
+
+ //BUG_ON(u.dirty_sectors);
+ u.data_type = 0;
+ u.dirty_sectors = 0;
+ u.cached_sectors = 0;
+ u.read_time = c->bucket_clock[READ].hand;
+ u.write_time = c->bucket_clock[WRITE].hand;
+ /* bumping gen is what invalidates stale cached pointers: */
+ u.gen++;
+
+ a = bkey_alloc_init(&alloc_key.k);
+ a->k.p = iter->pos;
+ bch2_alloc_pack(a, u);
+
+ ret = bch2_btree_insert_at(c, NULL,
+ invalidating_cached_data ? journal_seq : NULL,
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_NOCHECK_RW|
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE|
+ BTREE_INSERT_USE_ALLOC_RESERVE|
+ flags,
+ BTREE_INSERT_ENTRY(iter, &a->k_i));
+ if (ret == -EINTR)
+ goto retry;
+
+ if (!ret) {
+ /* remove from alloc_heap: */
+ struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+
+ top->bucket++;
+ top->nr--;
+
+ if (!top->nr)
+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+
+ /*
+ * Make sure we flush the last journal entry that updated this
+ * bucket (i.e. deleting the last reference) before writing to
+ * this bucket again:
+ */
+ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+ } else {
+ size_t b2;
+
+ /* remove from free_inc: */
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ spin_lock(&c->freelist_lock);
+
+ bch2_mark_alloc_bucket(c, ca, b, false,
+ gc_pos_alloc(c, NULL), 0);
+
+ BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
+ BUG_ON(b != b2);
+
+ spin_unlock(&c->freelist_lock);
+ percpu_up_read_preempt_enable(&c->mark_lock);
+ }
+
+ return ret;
+}
+
static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t bucket, u64 *flush_seq)
{
percpu_up_read_preempt_enable(&c->mark_lock);
- if (m.journal_seq_valid) {
- u64 journal_seq = atomic64_read(&c->journal.seq);
- u64 bucket_seq = journal_seq;
-
- bucket_seq &= ~((u64) U16_MAX);
- bucket_seq |= m.journal_seq;
-
- if (bucket_seq > journal_seq)
- bucket_seq -= 1 << 16;
-
- *flush_seq = max(*flush_seq, bucket_seq);
- }
+ *flush_seq = max(*flush_seq, bucket_journal_seq(c, m));
return m.cached_sectors != 0;
}
struct btree_iter iter;
u64 journal_seq = 0;
int ret = 0;
- long b;
bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
/* Only use nowait if we've already invalidated at least one bucket: */
while (!ret &&
!fifo_full(&ca->free_inc) &&
- (b = next_alloc_bucket(ca)) >= 0) {
- bool must_flush =
- bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
-
- ret = __bch2_alloc_write_key(c, ca, b, &iter,
- must_flush ? &journal_seq : NULL,
- !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
- }
+ ca->alloc_heap.used)
+ ret = bch2_invalidate_one_bucket2(c, ca, &iter, &journal_seq,
+ BTREE_INSERT_GC_LOCK_HELD|
+ (!fifo_empty(&ca->free_inc)
+ ? BTREE_INSERT_NOWAIT : 0));
bch2_btree_iter_unlock(&iter);
return 0;
}
-static void flush_held_btree_writes(struct bch_fs *c)
+static bool flush_done(struct bch_fs *c)
{
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
- bool nodes_blocked;
+ bool nodes_unwritten;
size_t i;
- struct closure cl;
-
- closure_init_stack(&cl);
-
- clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
- pr_debug("flushing dirty btree nodes");
cond_resched();
- closure_wait(&c->btree_interior_update_wait, &cl);
-
- nodes_blocked = false;
+ nodes_unwritten = false;
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
six_unlock_read(&b->lock);
goto again;
} else {
- nodes_blocked = true;
+ nodes_unwritten = true;
}
}
rcu_read_unlock();
- if (c->btree_roots_dirty)
+ if (c->btree_roots_dirty) {
bch2_journal_meta(&c->journal);
-
- if (nodes_blocked) {
- closure_sync(&cl);
goto again;
}
- closure_wake_up(&c->btree_interior_update_wait);
- closure_sync(&cl);
+ return !nodes_unwritten &&
+ !bch2_btree_interior_updates_nr_pending(c);
+}
- closure_wait_event(&c->btree_interior_update_wait,
- !bch2_btree_interior_updates_nr_pending(c));
+static void flush_held_btree_writes(struct bch_fs *c)
+{
+ clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+
+ closure_wait_event(&c->btree_interior_update_wait, flush_done(c));
}
static void allocator_start_issue_discards(struct bch_fs *c)
&journal_seq);
fifo_push(&ca->free[RESERVE_BTREE], bu);
- bucket_set_dirty(ca, bu);
}
}
{
struct bch_dev *ca;
unsigned i;
- bool wrote;
int ret;
down_read(&c->gc_lock);
}
set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
- return bch2_alloc_write(c, false, &wrote);
+ return 0;
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
#include "alloc_types.h"
#include "debug.h"
+/*
+ * Fixed-layout, in-memory representation of a packed bch_alloc value:
+ * the generation plus one u<_bits> member per BCH_ALLOC_FIELDS() entry.
+ */
+struct bkey_alloc_unpacked {
+ u8 gen;
+#define x(_name, _bits) u##_bits _name;
+ BCH_ALLOC_FIELDS()
+#undef x
+};
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *);
+
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
{
u64 stranded = c->write_points_nr * c->bucket_size_max;
- u64 free = bch2_fs_sectors_free(c);
+ u64 free = bch2_fs_usage_read_short(c).free;
return stranded * factor > free;
}
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_nouse;
unsigned long *buckets_written;
- /* most out of date gen in the btree */
- u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];
} __attribute__((packed, aligned(8)));
#define BCH_ALLOC_FIELDS() \
- x(read_time, 2) \
- x(write_time, 2) \
- x(data_type, 1) \
- x(dirty_sectors, 2) \
- x(cached_sectors, 2)
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8)
enum {
#define x(name, bytes) BCH_ALLOC_FIELD_##name,
};
static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes,
+#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
BCH_ALLOC_FIELDS()
#undef x
};
-#define x(name, bytes) + bytes
+#define x(name, bits) + (bits / 8)
static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
DIV_ROUND_UP(offsetof(struct bch_alloc, data)
BCH_ALLOC_FIELDS(), sizeof(u64));
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- size_t b = PTR_BUCKET_NR(ca, ptr);
- struct bucket *g = PTR_BUCKET(ca, ptr);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
+ struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
if (mustfix_fsck_err_on(!g->gen_valid, c,
"found ptr with missing gen in alloc btree,\n"
"type %u gen %u",
k.k->type, ptr->gen)) {
- g->_mark.gen = ptr->gen;
- g->gen_valid = 1;
- bucket_set_dirty(ca, b);
+ g2->_mark.gen = g->_mark.gen = ptr->gen;
+ g2->_mark.dirty = g->_mark.dirty = true;
+ g2->gen_valid = g->gen_valid = true;
}
if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
"%u ptr gen in the future: %u > %u",
k.k->type, ptr->gen, g->mark.gen)) {
- g->_mark.gen = ptr->gen;
- g->gen_valid = 1;
- bucket_set_dirty(ca, b);
+ g2->_mark.gen = g->_mark.gen = ptr->gen;
+ g2->_mark.dirty = g->_mark.dirty = true;
+ g2->gen_valid = g->gen_valid = true;
set_bit(BCH_FS_FIXED_GENS, &c->flags);
}
}
bkey_for_each_ptr(ptrs, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- size_t b = PTR_BUCKET_NR(ca, ptr);
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
- if (gen_after(ca->oldest_gens[b], ptr->gen))
- ca->oldest_gens[b] = ptr->gen;
+ if (gen_after(g->oldest_gen, ptr->gen))
+ g->oldest_gen = ptr->gen;
*max_stale = max(*max_stale, ptr_stale(ca, ptr));
}
bch2_verify_btree_nr_keys(b);
+ gc_pos_set(c, gc_pos_btree_node(b));
+
ret = btree_gc_mark_node(c, b, &max_stale, initial);
if (ret)
break;
- gc_pos_set(c, gc_pos_btree_node(b));
-
if (!initial) {
if (max_stale > 64)
bch2_btree_node_rewrite(c, &iter,
percpu_up_write(&c->mark_lock);
}
-static void bch2_gc_done_nocheck(struct bch_fs *c)
-{
- struct bch_dev *ca;
- unsigned i;
-
- {
- struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
- struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
- struct stripe *dst, *src;
-
- c->ec_stripes_heap.used = 0;
-
- while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
- (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
- *dst = *src;
-
- if (dst->alive)
- bch2_stripes_heap_insert(c, dst, dst_iter.pos);
-
- genradix_iter_advance(&dst_iter, &c->stripes[0]);
- genradix_iter_advance(&src_iter, &c->stripes[1]);
- }
- }
-
- for_each_member_device(ca, c, i) {
- struct bucket_array *src = __bucket_array(ca, 1);
-
- memcpy(__bucket_array(ca, 0), src,
- sizeof(struct bucket_array) +
- sizeof(struct bucket) * src->nbuckets);
- };
-
- for_each_member_device(ca, c, i) {
- unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
- struct bch_dev_usage *dst = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
- struct bch_dev_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
-
- *dst = *src;
- }
-
- {
- unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
- c->replicas.nr;
- struct bch_fs_usage *dst = (void *)
- bch2_acc_percpu_u64s((void *) c->usage[0], nr);
- struct bch_fs_usage *src = (void *)
- bch2_acc_percpu_u64s((void *) c->usage[1], nr);
-
- memcpy(&dst->s.gc_start[0],
- &src->s.gc_start[0],
- nr * sizeof(u64) - offsetof(typeof(*dst), s.gc_start));
- }
-}
-
static void bch2_gc_done(struct bch_fs *c, bool initial)
{
struct bch_dev *ca;
+ bool verify = !initial ||
+ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO));
unsigned i;
#define copy_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
- bch_err(c, _msg ": got %llu, should be %llu, fixing" \
- , ##__VA_ARGS__, dst->_f, src->_f); \
+ if (verify) \
+ bch_err(c, _msg ": got %llu, should be %llu, fixing"\
+ , ##__VA_ARGS__, dst->_f, src->_f); \
dst->_f = src->_f; \
}
#define copy_stripe_field(_f, _msg, ...) \
if (dst->_f != src->_f) { \
- bch_err_ratelimited(c, "stripe %zu has wrong "_msg \
- ": got %u, should be %u, fixing", \
- dst_iter.pos, ##__VA_ARGS__, \
- dst->_f, src->_f); \
+ if (verify) \
+ bch_err_ratelimited(c, "stripe %zu has wrong "_msg\
+ ": got %u, should be %u, fixing", \
+ dst_iter.pos, ##__VA_ARGS__, \
+ dst->_f, src->_f); \
dst->_f = src->_f; \
dst->dirty = true; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
- bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
- ": got %u, should be %u, fixing", \
- i, b, dst->b[b].mark._f, src->b[b].mark._f); \
+ if (verify) \
+ bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
+ ": got %u, should be %u, fixing", i, b, \
+ dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
+ dst->b[b]._mark.dirty = true; \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
percpu_down_write(&c->mark_lock);
- if (initial &&
- !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) {
- bch2_gc_done_nocheck(c);
- goto out;
- }
-
{
struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
+
+ if (dst->b[b].oldest_gen != src->b[b].oldest_gen) {
+ dst->b[b].oldest_gen = src->b[b].oldest_gen;
+ dst->b[b]._mark.dirty = true;
+ }
}
};
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
- copy_dev_field(buckets[b],
- "buckets[%s]", bch2_data_types[b]);
- copy_dev_field(buckets_alloc, "buckets_alloc");
- copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets[b], "buckets[%s]",
+ bch2_data_types[b]);
+ copy_dev_field(buckets_alloc, "buckets_alloc");
+ copy_dev_field(buckets_ec, "buckets_ec");
+ copy_dev_field(buckets_unavailable, "buckets_unavailable");
for (b = 0; b < BCH_DATA_NR; b++)
- copy_dev_field(sectors[b],
- "sectors[%s]", bch2_data_types[b]);
- copy_dev_field(sectors_fragmented,
- "sectors_fragmented");
+ copy_dev_field(sectors[b], "sectors[%s]",
+ bch2_data_types[b]);
+ copy_dev_field(sectors_fragmented, "sectors_fragmented");
}
{
- unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
- c->replicas.nr;
+ unsigned nr = fs_usage_u64s(c);
struct bch_fs_usage *dst = (void *)
bch2_acc_percpu_u64s((void *) c->usage[0], nr);
struct bch_fs_usage *src = (void *)
bch2_acc_percpu_u64s((void *) c->usage[1], nr);
- copy_fs_field(s.hidden, "hidden");
- copy_fs_field(s.data, "data");
- copy_fs_field(s.cached, "cached");
- copy_fs_field(s.reserved, "reserved");
- copy_fs_field(s.nr_inodes, "nr_inodes");
+ copy_fs_field(hidden, "hidden");
+ copy_fs_field(data, "data");
+ copy_fs_field(cached, "cached");
+ copy_fs_field(reserved, "reserved");
+ copy_fs_field(nr_inodes, "nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(persistent_reserved[i],
"persistent_reserved[%i]", i);
for (i = 0; i < c->replicas.nr; i++) {
- /*
- * XXX: print out replicas entry
- */
- copy_fs_field(data[i], "data[%i]", i);
+ struct bch_replicas_entry *e =
+ cpu_replicas_entry(&c->replicas, i);
+ char buf[80];
+
+ bch2_replicas_entry_to_text(&PBUF(buf), e);
+
+ copy_fs_field(replicas[i], "%s", buf);
}
}
-out:
+
percpu_up_write(&c->mark_lock);
#undef copy_fs_field
struct bch_dev *ca;
unsigned i;
+ percpu_down_write(&c->mark_lock);
+
/*
* indicate to stripe code that we need to allocate for the gc stripes
* radix tree, too
*/
gc_pos_set(c, gc_phase(GC_PHASE_START));
- percpu_down_write(&c->mark_lock);
BUG_ON(c->usage[1]);
- c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) +
- sizeof(u64) * c->replicas.nr,
- sizeof(u64),
- GFP_KERNEL);
+ c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
+ sizeof(u64), GFP_KERNEL);
percpu_up_write(&c->mark_lock);
if (!c->usage[1])
dst->first_bucket = src->first_bucket;
dst->nbuckets = src->nbuckets;
- for (b = 0; b < src->nbuckets; b++)
- dst->b[b]._mark.gen = src->b[b].mark.gen;
+ for (b = 0; b < src->nbuckets; b++) {
+ dst->b[b]._mark.gen =
+ dst->b[b].oldest_gen =
+ src->b[b].mark.gen;
+ dst->b[b].gen_valid = src->b[b].gen_valid;
+ }
};
percpu_up_write(&c->mark_lock);
if (iter++ <= 2) {
bch_info(c, "Fixed gens, restarting mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+ bch2_gc_free(c);
goto again;
}
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
switch (type) {
+ case BKEY_TYPE_ALLOC:
case BKEY_TYPE_BTREE:
case BKEY_TYPE_EXTENTS:
case BKEY_TYPE_INODES:
/* leaf node needs to be split */
BTREE_INSERT_BTREE_NODE_FULL,
BTREE_INSERT_ENOSPC,
- BTREE_INSERT_NEED_GC_LOCK,
BTREE_INSERT_NEED_MARK_REPLICAS,
};
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
+ __BTREE_INSERT_NOMARK,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
-/*
- * Insert is for journal replay: don't get journal reservations, or mark extents
- * (bch_mark_key)
- */
+/* Insert is for journal replay - don't get journal reservations: */
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
+/* Don't call bch2_mark_key: */
+#define BTREE_INSERT_NOMARK (1 << __BTREE_INSERT_NOMARK)
+
/* Don't block on allocation failure (for new btree nodes: */
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
struct btree *b;
struct disk_reservation disk_res = { 0, 0 };
unsigned sectors = nr_nodes * c->opts.btree_node_size;
- int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
+ int ret, disk_res_flags = 0;
if (flags & BTREE_INSERT_NOFAIL)
disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&old->key),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
- gc_pos_btree_root(b->btree_id));
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
bkey_disassemble(b, k, &tmp),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
- gc_pos_btree_node(b));
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
closure_init_stack(&cl);
/* Hack, because gc and splitting nodes doesn't mix yet: */
- if (!down_read_trylock(&c->gc_lock)) {
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
+ !down_read_trylock(&c->gc_lock)) {
if (flags & BTREE_INSERT_NOUNLOCK)
return -EINTR;
*/
__bch2_btree_iter_downgrade(iter, 1);
out:
- up_read(&c->gc_lock);
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
closure_sync(&cl);
return ret;
}
}
/* We're changing btree topology, doesn't mix with gc: */
- if (!down_read_trylock(&c->gc_lock))
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
+ !down_read_trylock(&c->gc_lock))
goto err_cycle_gc_lock;
if (!bch2_btree_iter_upgrade(iter, U8_MAX,
bch2_btree_update_done(as);
- up_read(&c->gc_lock);
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
out:
bch2_btree_iter_verify_locks(iter);
err_unlock:
six_unlock_intent(&m->lock);
- up_read(&c->gc_lock);
+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+ up_read(&c->gc_lock);
err:
BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
c->opts.btree_node_size *
bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
- BCH_DISK_RESERVATION_NOFAIL|
- BCH_DISK_RESERVATION_GC_LOCK_HELD);
+ BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
parent = btree_node_parent(iter, b);
bch2_btree_node_free_index(as, NULL,
bkey_i_to_s_c(&b->key),
fs_usage);
- bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
- gc_pos_btree_root(b->btree_id));
+ bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
percpu_up_read_preempt_enable(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
btree_iter_cmp(l.iter, r.iter);
}
+/*
+ * NOTE(review): the loop body returns unconditionally, so only the first
+ * entry's iterator is ever relocked - presumably relocking one linked
+ * iterator suffices for the rest; confirm against bch2_btree_iter_relock()
+ * semantics.
+ */
+static bool btree_trans_relock(struct btree_insert *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_iter(trans, i)
+ return bch2_btree_iter_relock(i->iter);
+ /* no iterators in the transaction - nothing to relock: */
+ return true;
+}
+
+/*
+ * Counterpart to btree_trans_relock(): like it, only the first entry's
+ * iterator is unlocked (loop breaks after one iteration) - presumably
+ * unlocking one linked iterator covers the rest; confirm.
+ */
+static void btree_trans_unlock(struct btree_insert *trans)
+{
+ struct btree_insert_entry *i;
+
+ trans_for_each_iter(trans, i) {
+ bch2_btree_iter_unlock(i->iter);
+ break;
+ }
+}
+
/* Normal update interface: */
static enum btree_insert_ret
struct btree_iter *linked;
unsigned u64s;
int ret;
-
+retry:
trans_for_each_iter(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
- /* reserve space for deferred updates */
- __trans_for_each_entry(trans, i, i->deferred) {
-
- }
-
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- u64s = 0;
- trans_for_each_entry(trans, i)
- u64s += jset_u64s(i->k->k.u64s);
-
- while ((ret = bch2_journal_res_get(&c->journal,
- &trans->journal_res, u64s,
- JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
- struct btree_iter *iter = NULL;
-
- trans_for_each_iter(trans, i)
- iter = i->iter;
-
- if (iter)
- bch2_btree_iter_unlock(iter);
-
- ret = bch2_journal_res_get(&c->journal,
- &trans->journal_res, u64s,
- JOURNAL_RES_GET_CHECK);
- if (ret)
- return ret;
-
- if (iter && !bch2_btree_iter_relock(iter)) {
- trans_restart(" (iter relock after journal res get blocked)");
- return -EINTR;
- }
- }
-
- if (ret)
- return ret;
- }
-
multi_lock_write(c, trans);
if (race_fault()) {
}
}
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ u64s = 0;
+ trans_for_each_entry(trans, i)
+ u64s += jset_u64s(i->k->k.u64s);
+
+ ret = bch2_journal_res_get(&c->journal,
+ &trans->journal_res, u64s,
+ JOURNAL_RES_GET_NONBLOCK);
+ if (likely(!ret))
+ goto got_journal_res;
+ if (ret != -EAGAIN)
+ goto out;
+
+ multi_unlock_write(trans);
+ btree_trans_unlock(trans);
+
+ ret = bch2_journal_res_get(&c->journal,
+ &trans->journal_res, u64s,
+ JOURNAL_RES_GET_CHECK);
+ if (ret)
+ return ret;
+
+ if (!btree_trans_relock(trans)) {
+ trans_restart(" (iter relock after journal res get blocked)");
+ return -EINTR;
+ }
+
+ goto retry;
+ }
+got_journal_res:
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
trans_for_each_entry(trans, i)
/* for the sake of sanity: */
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
+ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
+ lockdep_assert_held(&c->gc_lock);
+
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
trans_for_each_entry(trans, i)
ret = -EINTR;
}
break;
- case BTREE_INSERT_NEED_GC_LOCK:
- ret = -EINTR;
-
- if (!down_read_trylock(&c->gc_lock)) {
- if (flags & BTREE_INSERT_NOUNLOCK)
- goto out;
-
- bch2_btree_iter_unlock(trans->entries[0].iter);
- down_read(&c->gc_lock);
- }
- up_read(&c->gc_lock);
- break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
- unsigned i, nr;
+ unsigned i;
percpu_down_write(&c->mark_lock);
- nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr;
- usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr);
+ usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0],
+ fs_usage_u64s(c));
for (i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->s.reserved += usage->persistent_reserved[i];
+ usage->reserved += usage->persistent_reserved[i];
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
switch (e->data_type) {
case BCH_DATA_BTREE:
case BCH_DATA_USER:
- usage->s.data += usage->data[i];
+ usage->data += usage->replicas[i];
break;
case BCH_DATA_CACHED:
- usage->s.cached += usage->data[i];
+ usage->cached += usage->replicas[i];
break;
}
}
percpu_up_write(&c->mark_lock);
}
-#define bch2_usage_read_raw(_stats) \
-({ \
- typeof(*this_cpu_ptr(_stats)) _acc; \
- \
- memset(&_acc, 0, sizeof(_acc)); \
- acc_u64s_percpu((u64 *) &_acc, \
- (u64 __percpu *) _stats, \
- sizeof(_acc) / sizeof(u64)); \
- \
- _acc; \
-})
-
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
- return bch2_usage_read_raw(ca->usage[0]);
+ struct bch_dev_usage ret;
+
+ memset(&ret, 0, sizeof(ret));
+ acc_u64s_percpu((u64 *) &ret,
+ (u64 __percpu *) ca->usage[0],
+ sizeof(ret) / sizeof(u64));
+
+ return ret;
}
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
struct bch_fs_usage *ret;
- unsigned nr = READ_ONCE(c->replicas.nr);
+ unsigned v, u64s = fs_usage_u64s(c);
retry:
- ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
+ ret = kzalloc(u64s * sizeof(u64), GFP_NOFS);
if (unlikely(!ret))
return NULL;
percpu_down_read_preempt_disable(&c->mark_lock);
- if (unlikely(nr < c->replicas.nr)) {
- nr = c->replicas.nr;
+ v = fs_usage_u64s(c);
+ if (unlikely(u64s != v)) {
+ u64s = v;
percpu_up_read_preempt_enable(&c->mark_lock);
kfree(ret);
goto retry;
}
- acc_u64s_percpu((u64 *) ret,
- (u64 __percpu *) c->usage[0],
- sizeof(*ret) / sizeof(u64) + nr);
+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
return ret;
}
return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
{
- return min(fs_usage.s.hidden +
- fs_usage.s.data +
- reserve_factor(fs_usage.s.reserved +
- fs_usage.s.online_reserved),
+ return min(fs_usage->hidden +
+ fs_usage->data +
+ reserve_factor(fs_usage->reserved +
+ fs_usage->online_reserved),
c->capacity);
}
+/*
+ * Summarize capacity/used/free/nr_inodes from the c->usage[0] percpu
+ * counters.  Callers (bch2_fs_usage_read_short()) take c->mark_lock
+ * around this so the counters are read consistently.
+ */
+static struct bch_fs_usage_short
+__bch2_fs_usage_read_short(struct bch_fs *c)
+{
+ struct bch_fs_usage_short ret;
+ u64 data, reserved;
+
+ /* sectors in superblock/journal don't count toward capacity: */
+ ret.capacity = c->capacity -
+ percpu_u64_get(&c->usage[0]->hidden);
+
+ data = percpu_u64_get(&c->usage[0]->data);
+ reserved = percpu_u64_get(&c->usage[0]->reserved) +
+ percpu_u64_get(&c->usage[0]->online_reserved);
+
+ /* clamp: counters can transiently exceed capacity: */
+ ret.used = min(ret.capacity, data + reserve_factor(reserved));
+ ret.free = ret.capacity - ret.used;
+
+ ret.nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
+
+ return ret;
+}
+
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
- struct bch_fs_usage_summarized usage =
- bch2_usage_read_raw(&c->usage[0]->s);
struct bch_fs_usage_short ret;
- ret.capacity = READ_ONCE(c->capacity) - usage.hidden;
- ret.used = min(ret.capacity, usage.data +
- reserve_factor(usage.reserved +
- usage.online_reserved));
- ret.nr_inodes = usage.nr_inodes;
+ percpu_down_read_preempt_disable(&c->mark_lock);
+ ret = __bch2_fs_usage_read_short(c);
+ percpu_up_read_preempt_enable(&c->mark_lock);
return ret;
}
int bch2_fs_usage_apply(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
- struct disk_reservation *disk_res,
- struct gc_pos gc_pos)
+ struct disk_reservation *disk_res)
{
- s64 added = fs_usage->s.data + fs_usage->s.reserved;
+ s64 added = fs_usage->data + fs_usage->reserved;
s64 should_not_have_added;
int ret = 0;
if (added > 0) {
disk_res->sectors -= added;
- fs_usage->s.online_reserved -= added;
+ fs_usage->online_reserved -= added;
}
acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
- (u64 *) fs_usage,
- sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
-
- if (gc_visited(c, gc_pos)) {
- BUG_ON(!c->usage[1]);
- acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
- (u64 *) fs_usage,
- sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
- }
+ (u64 *) fs_usage, fs_usage_u64s(c));
return ret;
}
int nr, s64 size)
{
if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
- fs_usage->s.hidden += size;
+ fs_usage->hidden += size;
dev_usage->buckets[type] += nr;
}
BUG_ON(!sectors);
if (r->data_type == BCH_DATA_CACHED)
- fs_usage->s.cached += sectors;
+ fs_usage->cached += sectors;
else
- fs_usage->s.data += sectors;
- fs_usage->data[idx] += sectors;
+ fs_usage->data += sectors;
+ fs_usage->replicas[idx] += sectors;
}
static inline void update_cached_sectors(struct bch_fs *c,
update_replicas(c, fs_usage, &r.e, sectors);
}
-static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, struct bucket_mark *old,
- bool gc)
+/*
+ * Run @fn against the normal (gc=0) and/or gc (gc=1) copies of bucket
+ * state: the non-gc copy when BCH_BUCKET_MARK_GC is not set in @flags,
+ * the gc copy when it is set or when gc has already visited @pos.
+ * Stops at the first nonzero return; caller must hold c->mark_lock
+ * (asserted).
+ */
+#define do_mark_fn(fn, c, pos, flags, ...) \
+({ \
+ int gc, ret = 0; \
+ \
+ percpu_rwsem_assert_held(&c->mark_lock); \
+ \
+ for (gc = 0; gc < 2 && !ret; gc++) \
+ if (!gc == !(flags & BCH_BUCKET_MARK_GC) || \
+ (gc && gc_visited(c, pos))) \
+ ret = fn(c, __VA_ARGS__, gc); \
+ ret; \
+})
+
+static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, struct bucket_mark *ret,
+ bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
- struct bucket_mark new;
+ struct bucket_mark old, new;
- *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+ old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
BUG_ON(!is_available_bucket(new));
new.owned_by_allocator = true;
new.gen++;
}));
- if (old->cached_sectors)
+ if (old.cached_sectors)
update_cached_sectors(c, fs_usage, ca->dev_idx,
- -old->cached_sectors);
+ -((s64) old.cached_sectors));
+
+ if (!gc)
+ *ret = old;
+ return 0;
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
- percpu_rwsem_assert_held(&c->mark_lock);
-
- __bch2_invalidate_bucket(c, ca, b, old, false);
+ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
+ ca, b, old);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
}
-static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, bool owned_by_allocator,
- bool gc)
+static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, bool owned_by_allocator,
+ bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
+
+ return 0;
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
- percpu_rwsem_assert_held(&c->mark_lock);
+ do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
+ ca, b, owned_by_allocator);
+}
- if (!(flags & BCH_BUCKET_MARK_GC))
- __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
+static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
+ bool inserting,
+ struct bch_fs_usage *fs_usage,
+ unsigned journal_seq, unsigned flags,
+ bool gc)
+{
+ struct bkey_alloc_unpacked u;
+ struct bch_dev *ca;
+ struct bucket *g;
+ struct bucket_mark old, m;
+
+ if (!inserting)
+ return 0;
+
+ /*
+ * alloc btree is read in by bch2_alloc_read, not gc:
+ */
+ if (flags & BCH_BUCKET_MARK_GC)
+ return 0;
+
+ u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = __bucket(ca, k.k->p.offset, gc);
+
+ /*
+ * this should currently only be getting called from the bucket
+ * invalidate path:
+ */
+ BUG_ON(u.dirty_sectors);
+ BUG_ON(u.cached_sectors);
+ BUG_ON(!g->mark.owned_by_allocator);
+
+ old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
+ m.gen = u.gen;
+ m.data_type = u.data_type;
+ m.dirty_sectors = u.dirty_sectors;
+ m.cached_sectors = u.cached_sectors;
+ }));
- if ((flags & BCH_BUCKET_MARK_GC) ||
- gc_visited(c, pos))
- __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
+ g->io_time[READ] = u.read_time;
+ g->io_time[WRITE] = u.write_time;
+ g->oldest_gen = u.oldest_gen;
+ g->gen_valid = 1;
+
+ if (old.cached_sectors) {
+ update_cached_sectors(c, fs_usage, ca->dev_idx,
+ -old.cached_sectors);
+ trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
+ old.cached_sectors);
+ }
+
+ return 0;
}
#define checked_add(a, b) \
BUG_ON((a) != _res); \
} while (0)
-static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
- size_t b, enum bch_data_type type,
- unsigned sectors, bool gc)
+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+ size_t b, enum bch_data_type type,
+ unsigned sectors, bool gc)
{
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
+
+ return 0;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
type != BCH_DATA_JOURNAL);
if (likely(c)) {
- percpu_rwsem_assert_held(&c->mark_lock);
-
- if (!(flags & BCH_BUCKET_MARK_GC))
- __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
- false);
- if ((flags & BCH_BUCKET_MARK_GC) ||
- gc_visited(c, pos))
- __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
- true);
+ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
+ ca, b, type, sectors);
} else {
struct bucket *g;
struct bucket_mark new;
* loop, to avoid racing with the start of gc clearing all the marks - GC does
* that with the gc pos seqlock held.
*/
-static void bch2_mark_pointer(struct bch_fs *c,
+static bool bch2_mark_pointer(struct bch_fs *c,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
EBUG_ON(!p.ptr.cached &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
- return;
+ return true;
}
if (!p.ptr.cached)
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
+
+ return false;
}
static int bch2_mark_stripe_ptr(struct bch_fs *c,
s64 disk_sectors = data_type == BCH_DATA_BTREE
? sectors
: ptr_disk_sectors_delta(p, sectors);
-
- bch2_mark_pointer(c, p, disk_sectors, data_type,
- fs_usage, journal_seq, flags, gc);
+ bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
+ fs_usage, journal_seq, flags, gc);
if (p.ptr.cached) {
- update_cached_sectors(c, fs_usage, p.ptr.dev,
- disk_sectors);
+ if (disk_sectors && !stale)
+ update_cached_sectors(c, fs_usage, p.ptr.dev,
+ disk_sectors);
} else if (!p.ec_nr) {
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
unsigned journal_seq, unsigned flags,
bool gc)
{
- int ret = 0;
+ if (!fs_usage || gc)
+ fs_usage = this_cpu_ptr(c->usage[gc]);
switch (k.k->type) {
+ case KEY_TYPE_alloc:
+ return bch2_mark_alloc(c, k, inserting,
+ fs_usage, journal_seq, flags, gc);
case KEY_TYPE_btree_ptr:
- ret = bch2_mark_extent(c, k, inserting
- ? c->opts.btree_node_size
- : -c->opts.btree_node_size,
- BCH_DATA_BTREE,
- fs_usage, journal_seq, flags, gc);
- break;
+ return bch2_mark_extent(c, k, inserting
+ ? c->opts.btree_node_size
+ : -c->opts.btree_node_size,
+ BCH_DATA_BTREE,
+ fs_usage, journal_seq, flags, gc);
case KEY_TYPE_extent:
- ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
- fs_usage, journal_seq, flags, gc);
- break;
+ return bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
+ fs_usage, journal_seq, flags, gc);
case KEY_TYPE_stripe:
- ret = bch2_mark_stripe(c, k, inserting,
- fs_usage, journal_seq, flags, gc);
- break;
+ return bch2_mark_stripe(c, k, inserting,
+ fs_usage, journal_seq, flags, gc);
case KEY_TYPE_inode:
if (inserting)
- fs_usage->s.nr_inodes++;
+ fs_usage->nr_inodes++;
else
- fs_usage->s.nr_inodes--;
- break;
+ fs_usage->nr_inodes--;
+ return 0;
case KEY_TYPE_reservation: {
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
replicas = clamp_t(unsigned, replicas, 1,
ARRAY_SIZE(fs_usage->persistent_reserved));
- fs_usage->s.reserved += sectors;
+ fs_usage->reserved += sectors;
fs_usage->persistent_reserved[replicas - 1] += sectors;
- break;
+ return 0;
}
default:
- break;
+ return 0;
}
-
- return ret;
}
int bch2_mark_key_locked(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
{
- int ret;
-
- if (!(flags & BCH_BUCKET_MARK_GC)) {
- ret = __bch2_mark_key(c, k, inserting, sectors,
- fs_usage ?: this_cpu_ptr(c->usage[0]),
- journal_seq, flags, false);
- if (ret)
- return ret;
- }
-
- if ((flags & BCH_BUCKET_MARK_GC) ||
- gc_visited(c, pos)) {
- ret = __bch2_mark_key(c, k, inserting, sectors,
- this_cpu_ptr(c->usage[1]),
- journal_seq, flags, true);
- if (ret)
- return ret;
- }
-
- return 0;
+ return do_mark_fn(__bch2_mark_key, c, pos, flags,
+ k, inserting, sectors, fs_usage,
+ journal_seq, flags);
}
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
percpu_down_read_preempt_disable(&c->mark_lock);
fs_usage = bch2_fs_usage_get_scratch(c);
- if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+ if (!(trans->flags & BTREE_INSERT_NOMARK))
bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
bpos_min(insert->k->k.p, b->key.k.p).offset -
bkey_start_offset(&insert->k->k),
bch2_btree_node_iter_advance(&node_iter, b);
}
- if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+ if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) &&
!warned_disk_usage &&
!xchg(&warned_disk_usage, 1)) {
char buf[200];
{
percpu_u64_set(&c->pcpu->sectors_available, 0);
- return avail_factor(bch2_fs_sectors_free(c));
+ return avail_factor(__bch2_fs_usage_read_short(c).free);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read_preempt_disable(&c->mark_lock);
- this_cpu_sub(c->usage[0]->s.online_reserved,
+ this_cpu_sub(c->usage[0]->online_reserved,
res->sectors);
percpu_up_read_preempt_enable(&c->mark_lock);
out:
pcpu->sectors_available -= sectors;
- this_cpu_add(c->usage[0]->s.online_reserved, sectors);
+ this_cpu_add(c->usage[0]->online_reserved, sectors);
res->sectors += sectors;
percpu_up_read_preempt_enable(&c->mark_lock);
return 0;
recalculate:
- /*
- * GC recalculates sectors_available when it starts, so that hopefully
- * we don't normally end up blocking here:
- */
-
- /*
- * Piss fuck, we can be called from extent_insert_fixup() with btree
- * locks held:
- */
-
- if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
- if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
- down_read(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock))
- return -EINTR;
- }
-
percpu_down_write(&c->mark_lock);
+
sectors_available = bch2_recalc_sectors_available(c);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
max_t(s64, 0, sectors_available - sectors));
- this_cpu_add(c->usage[0]->s.online_reserved, sectors);
+ this_cpu_add(c->usage[0]->online_reserved, sectors);
res->sectors += sectors;
ret = 0;
} else {
percpu_up_write(&c->mark_lock);
- if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
- up_read(&c->gc_lock);
-
return ret;
}
struct bucket_array *buckets = NULL, *old_buckets = NULL;
unsigned long *buckets_nouse = NULL;
unsigned long *buckets_written = NULL;
- u8 *oldest_gens = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
alloc_heap alloc_heap;
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
- !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
- GFP_KERNEL|__GFP_ZERO)) ||
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
memcpy(buckets->b,
old_buckets->b,
n * sizeof(struct bucket));
- memcpy(oldest_gens,
- ca->oldest_gens,
- n * sizeof(u8));
memcpy(buckets_nouse,
ca->buckets_nouse,
BITS_TO_LONGS(n) * sizeof(unsigned long));
rcu_assign_pointer(ca->buckets[0], buckets);
buckets = old_buckets;
- swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_nouse, buckets_nouse);
swap(ca->buckets_written, buckets_written);
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(buckets_written,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
- kvpfree(oldest_gens,
- nbuckets * sizeof(u8));
if (buckets)
call_rcu(&old_buckets->rcu, buckets_free_rcu);
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
- kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
#define bucket_cmpxchg(g, new, expr) \
({ \
+ struct bucket *_g = g; \
u64 _v = atomic64_read(&(g)->_mark.v); \
struct bucket_mark _old; \
\
do { \
(new).v.counter = _old.v.counter = _v; \
expr; \
- } while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
+ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \
_old.v.counter, \
(new).v.counter)) != _old.v.counter);\
_old; \
return __bucket(ca, b, false);
}
-static inline void bucket_set_dirty(struct bch_dev *ca, size_t b)
-{
- struct bucket *g;
- struct bucket_mark m;
-
- rcu_read_lock();
- g = bucket(ca, b);
- bucket_cmpxchg(g, m, m.dirty = true);
- rcu_read_unlock();
-
-}
-
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
{
- return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
+ struct bucket *g = bucket(ca, b);
+
+ return g->mark.gen - g->oldest_gen;
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
}
static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
- const struct bch_extent_ptr *ptr)
+ const struct bch_extent_ptr *ptr,
+ bool gc)
{
- return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
}
static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
/* Filesystem usage: */
-static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+static inline unsigned fs_usage_u64s(struct bch_fs *c)
{
- struct bch_fs_usage *ret;
- ret = this_cpu_ptr(c->usage_scratch);
+ return sizeof(struct bch_fs_usage) / sizeof(u64) +
+ READ_ONCE(c->replicas.nr);
+}
- memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64));
+static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+{
+ struct bch_fs_usage *ret = this_cpu_ptr(c->usage_scratch);
+ memset(ret, 0, fs_usage_u64s(c) * sizeof(u64));
return ret;
}
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
-static inline u64 bch2_fs_sectors_free(struct bch_fs *c)
-{
- struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
-
- return usage.capacity - usage.used;
-}
-
/* key/bucket marking: */
void bch2_bucket_seq_cleanup(struct bch_fs *);
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
-#define BCH_BUCKET_MARK_GC (1 << 1)
+#define BCH_BUCKET_MARK_GC (1 << 0)
+#define BCH_BUCKET_MARK_NOATOMIC (1 << 1)
int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
- struct disk_reservation *, struct gc_pos);
+ struct disk_reservation *);
/* disk reservations: */
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
-#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1)
-#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2)
int bch2_disk_reservation_add(struct bch_fs *,
struct disk_reservation *,
};
u16 io_time[2];
+ u8 oldest_gen;
unsigned gen_valid:1;
};
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
- /* summarized: */
- struct bch_fs_usage_summarized {
- u64 online_reserved;
+ u64 online_reserved;
- /* fields after online_reserved are cleared/recalculated by gc: */
- u64 gc_start[0];
+ /* fields after online_reserved are cleared/recalculated by gc: */
+ u64 gc_start[0];
- u64 hidden;
- u64 data;
- u64 cached;
- u64 reserved;
- u64 nr_inodes;
+ u64 hidden;
+ u64 data;
+ u64 cached;
+ u64 reserved;
+ u64 nr_inodes;
- /* XXX: add stats for compression ratio */
+ /* XXX: add stats for compression ratio */
#if 0
- u64 uncompressed;
- u64 compressed;
+ u64 uncompressed;
+ u64 compressed;
#endif
- } s;
/* broken out: */
u64 persistent_reserved[BCH_REPLICAS_MAX];
- u64 data[];
+ u64 replicas[];
};
struct bch_fs_usage_short {
u64 capacity;
u64 used;
+ u64 free;
u64 nr_inodes;
};
if (!src)
return -ENOMEM;
- percpu_up_read_preempt_enable(&c->mark_lock);
+ dst.used = bch2_fs_sectors_used(c, src);
+ dst.online_reserved = src->online_reserved;
- dst.used = bch2_fs_sectors_used(c, *src);
- dst.online_reserved = src->s.online_reserved;
+ percpu_up_read_preempt_enable(&c->mark_lock);
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
dst.persistent_reserved[i] =
if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
(sectors = bch2_extent_is_compressed(k))) {
- int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
-
- if (trans->flags & BTREE_INSERT_NOFAIL)
- flags |= BCH_DISK_RESERVATION_NOFAIL;
+ int flags = trans->flags & BTREE_INSERT_NOFAIL
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
switch (bch2_disk_reservation_add(trans->c,
trans->disk_res,
break;
case -ENOSPC:
return BTREE_INSERT_ENOSPC;
- case -EINTR:
- return BTREE_INSERT_NEED_GC_LOCK;
default:
BUG();
}
({ \
bool _r = !fifo_empty((fifo)); \
if (_r) \
- (i) = (fifo)->data[--(fifo)->back & (fifo)->mask] \
+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
_r; \
})
#include <trace/events/bcachefs.h>
-static bool journal_entry_is_open(struct journal *j)
+static bool __journal_entry_is_open(union journal_res_state state)
{
- return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
}
-void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
+static bool journal_entry_is_open(struct journal *j)
{
- struct journal_buf *w = journal_prev_buf(j);
-
- atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
-
- if (!need_write_just_set &&
- test_bit(JOURNAL_NEED_WRITE, &j->flags))
- bch2_time_stats_update(j->delay_time,
- j->need_write_time);
-
- closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+ return __journal_entry_is_open(j->reservations);
}
static void journal_pin_new_entry(struct journal *j, int count)
return true;
}
-static enum {
- JOURNAL_ENTRY_ERROR,
- JOURNAL_ENTRY_INUSE,
- JOURNAL_ENTRY_CLOSED,
- JOURNAL_UNLOCKED,
-} journal_buf_switch(struct journal *j, bool need_write_just_set)
+void bch2_journal_halt(struct journal *j)
+{
+ union journal_res_state old, new;
+ u64 v = atomic64_read(&j->reservations.counter);
+
+ do {
+ old.v = new.v = v;
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+ return;
+
+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+ old.v, new.v)) != old.v);
+
+ journal_wake(j);
+ closure_wake_up(&journal_cur_buf(j)->wait);
+}
+
+/* journal entry close/open: */
+
+void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+{
+ if (!need_write_just_set &&
+ test_bit(JOURNAL_NEED_WRITE, &j->flags))
+ bch2_time_stats_update(j->delay_time,
+ j->need_write_time);
+
+ clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+}
+
+/*
+ * Returns true if journal entry is now closed:
+ */
+static bool __journal_entry_close(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
+ bool set_need_write = false;
+ unsigned sectors;
lockdep_assert_held(&j->lock);
do {
old.v = new.v = v;
if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
- return JOURNAL_ENTRY_CLOSED;
+ return true;
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
/* this entry will never be written: */
closure_wake_up(&buf->wait);
- return JOURNAL_ENTRY_ERROR;
+ return true;
}
- if (new.prev_buf_unwritten)
- return JOURNAL_ENTRY_INUSE;
+ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
+ j->need_write_time = local_clock();
+ set_need_write = true;
+ }
- /*
- * avoid race between setting buf->data->u64s and
- * journal_res_put starting write:
- */
- journal_state_inc(&new);
+ if (new.prev_buf_unwritten)
+ return false;
new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
new.idx++;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
- j->prev_buf_sectors =
- vstruct_blocks_plus(buf->data, c->block_bits,
- buf->u64s_reserved) *
- c->opts.block_size;
- BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
+ sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+ buf->u64s_reserved) << c->block_bits;
+ BUG_ON(sectors > buf->sectors);
+ buf->sectors = sectors;
bkey_extent_init(&buf->key);
* Hence, we want update/set last_seq on the current journal entry right
* before we open a new one:
*/
- bch2_journal_reclaim_fast(j);
buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
if (journal_entry_empty(buf->data))
bch2_journal_buf_init(j);
cancel_delayed_work(&j->write_work);
- spin_unlock(&j->lock);
- /* ugh - might be called from __journal_res_get() under wait_event() */
- __set_current_state(TASK_RUNNING);
- bch2_journal_buf_put(j, old.idx, need_write_just_set);
+ bch2_journal_space_available(j);
- return JOURNAL_UNLOCKED;
+ bch2_journal_buf_put(j, old.idx, set_need_write);
+ return true;
}
-void bch2_journal_halt(struct journal *j)
+static bool journal_entry_close(struct journal *j)
{
- union journal_res_state old, new;
- u64 v = atomic64_read(&j->reservations.counter);
-
- do {
- old.v = new.v = v;
- if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return;
+ bool ret;
- new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
- } while ((v = atomic64_cmpxchg(&j->reservations.counter,
- old.v, new.v)) != old.v);
+ spin_lock(&j->lock);
+ ret = __journal_entry_close(j);
+ spin_unlock(&j->lock);
- journal_wake(j);
- closure_wake_up(&journal_cur_buf(j)->wait);
- closure_wake_up(&journal_prev_buf(j)->wait);
+ return ret;
}
/*
* journal reservation - journal entry is open means journal is dirty:
*
* returns:
- * 1: success
- * 0: journal currently full (must wait)
- * -EROFS: insufficient rw devices
- * -EIO: journal error
+ * 0: success
+ * -ENOSPC: journal currently full, must invoke reclaim
+ * -EAGAIN: journal blocked, must wait
+ * -EROFS: insufficient rw devices or journal error
*/
static int journal_entry_open(struct journal *j)
{
struct journal_buf *buf = journal_cur_buf(j);
union journal_res_state old, new;
- ssize_t u64s;
- int sectors;
+ int u64s;
u64 v;
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
- if (!fifo_free(&j->pin))
- return 0;
+ if (j->blocked)
+ return -EAGAIN;
- sectors = bch2_journal_entry_sectors(j);
- if (sectors <= 0)
- return sectors;
+ if (j->cur_entry_error)
+ return j->cur_entry_error;
- buf->disk_sectors = sectors;
- buf->u64s_reserved = j->entry_u64s_reserved;
+ BUG_ON(!j->cur_entry_sectors);
- sectors = min_t(unsigned, sectors, buf->size >> 9);
- j->cur_buf_sectors = sectors;
-
- u64s = (sectors << 9) / sizeof(u64);
-
- /* Subtract the journal header */
- u64s -= sizeof(struct jset) / sizeof(u64);
- u64s -= buf->u64s_reserved;
- u64s = max_t(ssize_t, 0L, u64s);
+ buf->u64s_reserved = j->entry_u64s_reserved;
+ buf->disk_sectors = j->cur_entry_sectors;
+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
- BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
+ u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+ journal_entry_overhead(j);
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
if (u64s <= le32_to_cpu(buf->data->u64s))
- return 0;
+ return -ENOSPC;
/*
* Must be set before marking the journal entry as open:
do {
old.v = new.v = v;
+ EBUG_ON(journal_state_count(new, new.idx));
+
if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
- return -EIO;
+ return -EROFS;
/* Handle any already added entries */
new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+ journal_state_inc(&new);
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
&j->write_work,
msecs_to_jiffies(j->write_delay_ms));
journal_wake(j);
- return 1;
+ return 0;
}
-static bool __journal_entry_close(struct journal *j)
+static bool journal_quiesced(struct journal *j)
{
- bool set_need_write;
-
- if (!journal_entry_is_open(j)) {
- spin_unlock(&j->lock);
- return true;
- }
-
- set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags);
- if (set_need_write)
- j->need_write_time = local_clock();
+ union journal_res_state state = READ_ONCE(j->reservations);
+ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
- switch (journal_buf_switch(j, set_need_write)) {
- case JOURNAL_ENTRY_INUSE:
- spin_unlock(&j->lock);
- return false;
- default:
- spin_unlock(&j->lock);
- case JOURNAL_UNLOCKED:
- return true;
- }
+ if (!ret)
+ journal_entry_close(j);
+ return ret;
}
-static bool journal_entry_close(struct journal *j)
+static void journal_quiesce(struct journal *j)
{
- spin_lock(&j->lock);
- return __journal_entry_close(j);
+ wait_event(j->wait, journal_quiesced(j));
}
static void journal_write_work(struct work_struct *work)
if (journal_res_get_fast(j, res, flags))
return 0;
+ if (bch2_journal_error(j))
+ return -EROFS;
+
spin_lock(&j->lock);
+
/*
* Recheck after taking the lock, so we don't race with another thread
* that just did journal_entry_open() and call journal_entry_close()
*/
buf = journal_cur_buf(j);
if (journal_entry_is_open(j) &&
- buf->size >> 9 < buf->disk_sectors &&
- buf->size < JOURNAL_ENTRY_SIZE_MAX)
- j->buf_size_want = max(j->buf_size_want, buf->size << 1);
+ buf->buf_size >> 9 < buf->disk_sectors &&
+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- /*
- * Close the current journal entry if necessary, then try to start a new
- * one:
- */
- switch (journal_buf_switch(j, false)) {
- case JOURNAL_ENTRY_ERROR:
- spin_unlock(&j->lock);
- return -EROFS;
- case JOURNAL_ENTRY_INUSE:
+ if (journal_entry_is_open(j) &&
+ !__journal_entry_close(j)) {
/*
- * The current journal entry is still open, but we failed to get
- * a journal reservation because there's not enough space in it,
- * and we can't close it and start another because we haven't
- * finished writing out the previous entry:
+ * We failed to get a reservation on the current open journal
+ * entry because it's full, and we can't close it because
+ * there's still a previous one in flight:
*/
- spin_unlock(&j->lock);
trace_journal_entry_full(c);
- goto blocked;
- case JOURNAL_ENTRY_CLOSED:
- break;
- case JOURNAL_UNLOCKED:
- goto retry;
+ ret = -EAGAIN;
+ } else {
+ ret = journal_entry_open(j);
}
- /* We now have a new, closed journal buf - see if we can open it: */
- ret = journal_entry_open(j);
+ if ((ret == -EAGAIN || ret == -ENOSPC) &&
+ !j->res_get_blocked_start)
+ j->res_get_blocked_start = local_clock() ?: 1;
+
spin_unlock(&j->lock);
- if (ret < 0)
- return ret;
- if (ret)
+ if (!ret)
goto retry;
+ if (ret == -ENOSPC) {
+ /*
+ * Journal is full - can't rely on reclaim from work item due to
+ * freezing:
+ */
+ trace_journal_full(c);
+ if (!(flags & JOURNAL_RES_GET_NONBLOCK))
+ bch2_journal_reclaim_work(&j->reclaim_work.work);
+ ret = -EAGAIN;
+ }
- /* Journal's full, we have to wait */
-
- /*
- * Direct reclaim - can't rely on reclaim from work item
- * due to freezing..
- */
- bch2_journal_reclaim_work(&j->reclaim_work.work);
-
- trace_journal_full(c);
-blocked:
- if (!j->res_get_blocked_start)
- j->res_get_blocked_start = local_clock() ?: 1;
- return -EAGAIN;
+ return ret;
}
/*
{
int ret;
- wait_event(j->wait,
+ closure_wait_event(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
j->entry_u64s_reserved += d;
if (d <= 0)
- goto out_unlock;
+ goto out;
- j->cur_entry_u64s -= d;
+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
smp_mb();
state = READ_ONCE(j->reservations);
* Not enough room in current journal entry, have to flush it:
*/
__journal_entry_close(j);
- goto out;
+ } else {
+ journal_cur_buf(j)->u64s_reserved += d;
}
-
- journal_cur_buf(j)->u64s_reserved += d;
-out_unlock:
- spin_unlock(&j->lock);
out:
+ spin_unlock(&j->lock);
res->u64s += d;
- return;
}
/* journal flushing: */
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
int ret;
-retry:
+
spin_lock(&j->lock);
- if (seq < journal_cur_seq(j) ||
+ /*
+ * Can't try to open more than one sequence number ahead:
+ */
+ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
+
+ if (journal_cur_seq(j) > seq ||
journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return 0;
}
- if (journal_cur_seq(j) < seq) {
- switch (journal_buf_switch(j, false)) {
- case JOURNAL_ENTRY_ERROR:
- spin_unlock(&j->lock);
- return -EROFS;
- case JOURNAL_ENTRY_INUSE:
- /* haven't finished writing out the previous one: */
- trace_journal_entry_full(c);
- goto blocked;
- case JOURNAL_ENTRY_CLOSED:
- break;
- case JOURNAL_UNLOCKED:
- goto retry;
- }
- }
-
- BUG_ON(journal_cur_seq(j) < seq);
+ if (journal_cur_seq(j) < seq &&
+ !__journal_entry_close(j)) {
+ /* haven't finished writing out the previous one: */
+ trace_journal_entry_full(c);
+ ret = -EAGAIN;
+ } else {
+ BUG_ON(journal_cur_seq(j) != seq);
- ret = journal_entry_open(j);
- if (ret) {
- spin_unlock(&j->lock);
- return ret < 0 ? ret : 0;
+ ret = journal_entry_open(j);
}
-blocked:
- if (!j->res_get_blocked_start)
+
+ if ((ret == -EAGAIN || ret == -ENOSPC) &&
+ !j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
- closure_wait(&j->async_wait, cl);
+ if (ret == -EAGAIN || ret == -ENOSPC)
+ closure_wait(&j->async_wait, cl);
+
spin_unlock(&j->lock);
- bch2_journal_reclaim_work(&j->reclaim_work.work);
- return -EAGAIN;
+ if (ret == -ENOSPC) {
+ trace_journal_full(c);
+ bch2_journal_reclaim_work(&j->reclaim_work.work);
+ ret = -EAGAIN;
+ }
+
+ return ret;
}
static int journal_seq_error(struct journal *j, u64 seq)
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
- else
- spin_unlock(&j->lock);
+ spin_unlock(&j->lock);
}
static int journal_seq_flushed(struct journal *j, u64 seq)
if (seq == journal_cur_seq(j))
__journal_entry_close(j);
- else
- spin_unlock(&j->lock);
+ spin_unlock(&j->lock);
return ret;
}
return bch2_journal_flush_seq(j, seq);
}
+/* block/unlock the journal: */
+
+void bch2_journal_unblock(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked--;
+ spin_unlock(&j->lock);
+
+ journal_wake(j);
+}
+
+void bch2_journal_block(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked++;
+ spin_unlock(&j->lock);
+
+ journal_quiesce(j);
+}
+
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
goto err;
journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
- nr + sizeof(*journal_buckets) / sizeof(u64));
+ nr + sizeof(*journal_buckets) / sizeof(u64));
if (!journal_buckets)
goto err;
ja->nr++;
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
- ca->mi.bucket_size,
- gc_phase(GC_PHASE_SB),
- 0);
+ ca->mi.bucket_size,
+ gc_phase(GC_PHASE_SB),
+ 0);
if (c) {
spin_unlock(&c->journal.lock);
*/
if (bch2_disk_reservation_get(c, &disk_res,
- bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+ bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
mutex_unlock(&c->sb_lock);
return -ENOSPC;
}
c->btree_roots_dirty)
bch2_journal_meta(j);
- BUG_ON(journal_entry_is_open(j) ||
- j->reservations.prev_buf_unwritten);
+ journal_quiesce(j);
BUG_ON(!bch2_journal_error(j) &&
test_bit(JOURNAL_NOT_EMPTY, &j->flags));
journal_pin_new_entry(j, 0);
/*
- * journal_buf_switch() only inits the next journal entry when it
+ * __journal_entry_close() only inits the next journal entry when it
* closes an open journal entry - the very first journal entry gets
* initialized here:
*/
c->last_bucket_seq_cleanup = journal_cur_seq(j);
+ bch2_journal_space_available(j);
spin_unlock(&j->lock);
/*
*/
bch2_journal_seq_blacklist_write(j);
- queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
}
/* init/exit: */
void bch2_fs_journal_exit(struct journal *j)
{
- kvpfree(j->buf[1].data, j->buf[1].size);
- kvpfree(j->buf[0].data, j->buf[0].size);
+ kvpfree(j->buf[1].data, j->buf[1].buf_size);
+ kvpfree(j->buf[0].data, j->buf[0].buf_size);
free_fifo(&j->pin);
}
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
- j->buf[0].size = JOURNAL_ENTRY_SIZE_MIN;
- j->buf[1].size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN;
+ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN;
j->write_delay_ms = 1000;
j->reclaim_delay_ms = 100;
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
- !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
- !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
+ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
ret = -ENOMEM;
goto out;
}
{
struct printbuf out = _PBUF(buf, PAGE_SIZE);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- union journal_res_state *s = &j->reservations;
+ union journal_res_state s;
struct bch_dev *ca;
unsigned iter;
rcu_read_lock();
spin_lock(&j->lock);
+ s = READ_ONCE(j->reservations);
pr_buf(&out,
"active journal entries:\t%llu\n"
"seq:\t\t\t%llu\n"
"last_seq:\t\t%llu\n"
"last_seq_ondisk:\t%llu\n"
- "reservation count:\t%u\n"
- "reservation offset:\t%u\n"
- "current entry u64s:\t%u\n"
- "io in flight:\t\t%i\n"
- "need write:\t\t%i\n"
- "dirty:\t\t\t%i\n"
- "replay done:\t\t%i\n",
+ "current entry:\t\t",
fifo_used(&j->pin),
journal_cur_seq(j),
journal_last_seq(j),
- j->last_seq_ondisk,
- journal_state_count(*s, s->idx),
- s->cur_entry_offset,
- j->cur_entry_u64s,
- s->prev_buf_unwritten,
+ j->last_seq_ondisk);
+
+ switch (s.cur_entry_offset) {
+ case JOURNAL_ENTRY_ERROR_VAL:
+ pr_buf(&out, "error\n");
+ break;
+ case JOURNAL_ENTRY_CLOSED_VAL:
+ pr_buf(&out, "closed\n");
+ break;
+ default:
+ pr_buf(&out, "%u/%u\n",
+ s.cur_entry_offset,
+ j->cur_entry_u64s);
+ break;
+ }
+
+ pr_buf(&out,
+ "current entry refs:\t%u\n"
+ "prev entry unwritten:\t",
+ journal_state_count(s, s.idx));
+
+ if (s.prev_buf_unwritten)
+ pr_buf(&out, "yes, ref %u\n",
+ journal_state_count(s, !s.idx));
+ else
+ pr_buf(&out, "no\n");
+
+ pr_buf(&out,
+ "need write:\t\t%i\n"
+ "replay done:\t\t%i\n",
test_bit(JOURNAL_NEED_WRITE, &j->flags),
- journal_entry_is_open(j),
test_bit(JOURNAL_REPLAY_DONE, &j->flags));
for_each_member_device_rcu(ca, c, iter,
pr_buf(&out,
"dev %u:\n"
"\tnr\t\t%u\n"
+ "\tavailable\t%u:%u\n"
"\tcur_idx\t\t%u (seq %llu)\n"
"\tlast_idx\t%u (seq %llu)\n",
iter, ja->nr,
+ bch2_journal_dev_buckets_available(j, ja),
+ ja->sectors_free,
ja->cur_idx, ja->bucket_seq[ja->cur_idx],
ja->last_idx, ja->bucket_seq[ja->last_idx]);
}
return u64s + sizeof(struct jset_entry) / sizeof(u64);
}
+/*
+ * Fixed per-entry overhead, in u64s: the jset header itself plus any
+ * space reserved for entries added just prior to write
+ * (j->entry_u64s_reserved):
+ */
+static inline int journal_entry_overhead(struct journal *j)
+{
+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
+
static inline struct jset_entry *
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
{
id, 0, k, k->k.u64s);
}
-void bch2_journal_buf_put_slowpath(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *, bool);
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
bool need_write_just_set)
.buf0_count = idx == 0,
.buf1_count = idx == 1,
}).v, &j->reservations.counter);
-
- EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
-
- /*
- * Do not initiate a journal write if the journal is in an error state
- * (previous journal entry write may have failed)
- */
- if (s.idx != idx &&
- !journal_state_count(s, idx) &&
- s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
- bch2_journal_buf_put_slowpath(j, need_write_just_set);
+ if (!journal_state_count(s, idx)) {
+ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
+ __bch2_journal_buf_put(j, need_write_just_set);
+ }
}
/*
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
+ EBUG_ON(!journal_state_count(new, new.idx));
+
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
return 0;
}
+/* journal_entry_res: */
+
void bch2_journal_entry_res_resize(struct journal *,
struct journal_entry_res *,
unsigned);
set_bit(JOURNAL_REPLAY_DONE, &j->flags);
}
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
+
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
- struct journal_entry_pin_list *pin_list;
struct bkey_i *k, *_n;
struct jset_entry *entry;
struct journal_replay *i, *n;
ret = bch2_btree_insert(c, entry->btree_id, k,
&disk_res, NULL,
BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY);
+ BTREE_INSERT_JOURNAL_REPLAY|
+ BTREE_INSERT_NOMARK);
}
if (ret) {
cond_resched();
}
- pin_list = journal_seq_pin(j, j->replay_journal_seq);
-
- if (atomic_dec_and_test(&pin_list->count))
- journal_wake(j);
+ bch2_journal_pin_put(j, j->replay_journal_seq);
}
j->replay_journal_seq = 0;
/* journal write: */
-static unsigned journal_dev_buckets_available(struct journal *j,
- struct journal_device *ja)
-{
- unsigned next = (ja->cur_idx + 1) % ja->nr;
- unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
-
- /*
- * Don't use the last bucket unless writing the new last_seq
- * will make another bucket available:
- */
- if (available &&
- journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
- --available;
-
- return available;
-}
-
-/* returns number of sectors available for next journal entry: */
-int bch2_journal_entry_sectors(struct journal *j)
-{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned sectors_available = UINT_MAX;
- unsigned i, nr_online = 0, nr_devs = 0;
-
- lockdep_assert_held(&j->lock);
-
- rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_JOURNAL]) {
- struct journal_device *ja = &ca->journal;
- unsigned buckets_this_device, sectors_this_device;
-
- if (!ja->nr)
- continue;
-
- buckets_this_device = journal_dev_buckets_available(j, ja);
- sectors_this_device = ja->sectors_free;
-
- nr_online++;
-
- /*
- * We that we don't allocate the space for a journal entry
- * until we write it out - thus, account for it here:
- */
- if (j->prev_buf_sectors >= sectors_this_device) {
- if (!buckets_this_device)
- continue;
-
- buckets_this_device--;
- sectors_this_device = ca->mi.bucket_size;
- }
-
- sectors_this_device -= j->prev_buf_sectors;
-
- if (buckets_this_device)
- sectors_this_device = ca->mi.bucket_size;
-
- if (!sectors_this_device)
- continue;
-
- sectors_available = min(sectors_available,
- sectors_this_device);
- nr_devs++;
- }
- rcu_read_unlock();
-
- if (nr_online < c->opts.metadata_replicas_required)
- return -EROFS;
-
- if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
- return 0;
-
- return sectors_available;
-}
-
static void __journal_write_alloc(struct journal *j,
struct journal_buf *w,
struct dev_alloc_list *devs_sorted,
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
&c->rw_devs[BCH_DATA_JOURNAL]);
- spin_lock(&j->lock);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
if (sectors > ja->sectors_free &&
sectors <= ca->mi.bucket_size &&
- journal_dev_buckets_available(j, ja)) {
+ bch2_journal_dev_buckets_available(j, ja)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
}
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
done:
- if (replicas >= replicas_want)
- j->prev_buf_sectors = 0;
-
- spin_unlock(&j->lock);
rcu_read_unlock();
return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
- if (buf->size >= new_size)
+ if (buf->buf_size >= new_size)
return;
new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
if (!new_buf)
return;
- memcpy(new_buf, buf->data, buf->size);
- kvpfree(buf->data, buf->size);
+ memcpy(new_buf, buf->data, buf->buf_size);
+ kvpfree(buf->data, buf->buf_size);
buf->data = new_buf;
- buf->size = new_size;
+ buf->buf_size = new_size;
}
static void journal_write_done(struct closure *cl)
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
out:
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
struct bch_extent_ptr *ptr;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s;
+ int ret;
+
+ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
journal_buf_realloc(j, w);
jset = w->data;
j->write_start_time = local_clock();
- start = vstruct_last(w->data);
+ start = vstruct_last(jset);
end = bch2_journal_super_entries_add_common(c, start);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
- le32_add_cpu(&w->data->u64s, u64s);
- BUG_ON(vstruct_sectors(jset, c->block_bits) >
- w->disk_sectors);
+ le32_add_cpu(&jset->u64s, u64s);
+ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
journal_write_compact(jset);
goto err;
sectors = vstruct_sectors(jset, c->block_bits);
- BUG_ON(sectors > j->prev_buf_sectors);
+ BUG_ON(sectors > w->sectors);
+
+ bytes = vstruct_bytes(jset);
+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+
+ spin_lock(&j->lock);
+ ret = journal_write_alloc(j, w, sectors);
- bytes = vstruct_bytes(w->data);
- memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+ /*
+ * write is allocated, no longer need to account for it in
+ * bch2_journal_space_available():
+ */
+ w->sectors = 0;
+
+ /*
+ * journal entry has been compacted and allocated, recalculate space
+ * available:
+ */
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
- if (journal_write_alloc(j, w, sectors)) {
+ if (ret) {
bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
trace_journal_write(bio);
closure_bio_submit(bio, cl);
- ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
}
for_each_rw_member(ca, c, i)
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);
-int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);
#endif /* _BCACHEFS_JOURNAL_IO_H */
#include "bcachefs.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
+/* Free space calculations: */
+
+/*
+ * Number of journal buckets on @ja available for the next journal write,
+ * not counting the bucket currently being written to (cur_idx):
+ */
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
+ struct journal_device *ja)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned next = (ja->cur_idx + 1) % ja->nr;
+ unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
+
+ /*
+ * Allocator startup needs some journal space before we can do journal
+ * replay:
+ */
+ if (available &&
+ test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
+ available--;
+
+ /*
+ * Don't use the last bucket unless writing the new last_seq
+ * will make another bucket available:
+ */
+ if (available &&
+ journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
+ --available;
+
+ return available;
+}
+
+/*
+ * Recalculate how much space is available for the next journal entry and
+ * cache the result in j->cur_entry_sectors / j->cur_entry_error (0, or
+ * -ENOSPC / -EROFS on failure).  Caller must hold j->lock.
+ */
+void bch2_journal_space_available(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned sectors_next_entry = UINT_MAX;
+ unsigned sectors_total = UINT_MAX;
+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
+ j->buf[1].buf_size >> 9);
+ unsigned i, nr_online = 0, nr_devs = 0;
+ /* Space still owed to the previous, not-yet-written journal entry: */
+ unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
+ ? journal_prev_buf(j)->sectors
+ : 0;
+ int ret = 0;
+
+ lockdep_assert_held(&j->lock);
+
+ rcu_read_lock();
+ for_each_member_device_rcu(ca, c, i,
+ &c->rw_devs[BCH_DATA_JOURNAL]) {
+ struct journal_device *ja = &ca->journal;
+ unsigned buckets_this_device, sectors_this_device;
+
+ if (!ja->nr)
+ continue;
+
+ nr_online++;
+
+ buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
+ sectors_this_device = ja->sectors_free;
+
+ /*
+ * Note that we don't allocate the space for a journal entry
+ * until we write it out - thus, account for it here:
+ */
+ if (unwritten_sectors >= sectors_this_device) {
+ if (!buckets_this_device)
+ continue;
+
+ buckets_this_device--;
+ sectors_this_device = ca->mi.bucket_size;
+ }
+
+ sectors_this_device -= unwritten_sectors;
+
+ /* Partial bucket too small - skip to a whole fresh bucket: */
+ if (sectors_this_device < ca->mi.bucket_size &&
+ buckets_this_device) {
+ buckets_this_device--;
+ sectors_this_device = ca->mi.bucket_size;
+ }
+
+ if (!sectors_this_device)
+ continue;
+
+ sectors_next_entry = min(sectors_next_entry,
+ sectors_this_device);
+
+ sectors_total = min(sectors_total,
+ buckets_this_device * ca->mi.bucket_size +
+ sectors_this_device);
+
+ max_entry_size = min_t(unsigned, max_entry_size,
+ ca->mi.bucket_size);
+
+ nr_devs++;
+ }
+ rcu_read_unlock();
+
+ /*
+ * NOTE(review): sectors_total and max_entry_size are computed above but
+ * not stored or returned here - presumably consumers are added by a
+ * later change; confirm.
+ */
+ if (nr_online < c->opts.metadata_replicas_required) {
+ /* Too few journal devices online for required durability: */
+ ret = -EROFS;
+ sectors_next_entry = 0;
+ } else if (!sectors_next_entry ||
+ nr_devs < min_t(unsigned, nr_online,
+ c->opts.metadata_replicas)) {
+ ret = -ENOSPC;
+ sectors_next_entry = 0;
+ } else if (!fifo_free(&j->pin)) {
+ /* Pin fifo full - journal reclaim must run before we continue: */
+ ret = -ENOSPC;
+ sectors_next_entry = 0;
+ }
+
+ j->cur_entry_sectors = sectors_next_entry;
+ j->cur_entry_error = ret;
+
+ /* Space available - wake up anyone waiting on the journal: */
+ if (!ret)
+ journal_wake(j);
+}
+
+/* Discards - last part of journal reclaim: */
+
+/*
+ * True if the oldest journal bucket on @ja (last_idx) contains no entries
+ * newer than what has reached disk (last_seq_ondisk), i.e. it may be
+ * discarded and reused:
+ */
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+ bool ret;
+
+ spin_lock(&j->lock);
+ ret = ja->nr &&
+ ja->last_idx != ja->cur_idx &&
+ ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
+ spin_unlock(&j->lock);
+
+ return ret;
+}
+
+/*
+ * Advance ja->last_idx as long as it points to buckets that are no longer
+ * dirty, issuing discards if necessary:
+ */
+static void journal_do_discards(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_dev *ca;
+ unsigned iter;
+
+ /* Serialize against other journal reclaim work: */
+ mutex_lock(&j->reclaim_lock);
+
+ for_each_rw_member(ca, c, iter) {
+ struct journal_device *ja = &ca->journal;
+
+ while (should_discard_bucket(j, ja)) {
+ /* Only issue the discard if device and queue support it: */
+ if (ca->mi.discard &&
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+ blkdev_issue_discard(ca->disk_sb.bdev,
+ bucket_to_sector(ca,
+ ja->buckets[ja->last_idx]),
+ ca->mi.bucket_size, GFP_NOIO, 0);
+
+ spin_lock(&j->lock);
+ ja->last_idx = (ja->last_idx + 1) % ja->nr;
+
+ /* A bucket was freed - recalculate available space: */
+ bch2_journal_space_available(j);
+ spin_unlock(&j->lock);
+ }
+ }
+
+ mutex_unlock(&j->reclaim_lock);
+}
+
/*
* Journal entry pinning - machinery for holding a reference on a given journal
* entry, holding it open to ensure it gets replayed during recovery:
*/
+/*
+ * bch2_journal_reclaim_fast - do the fast part of journal reclaim
+ *
+ * Pops entries off the front of the pin fifo once their reference counts
+ * reach zero, then recalculates available journal space if anything was
+ * freed.  Caller must hold j->lock.
+ */
+static void bch2_journal_reclaim_fast(struct journal *j)
+{
+ struct journal_entry_pin_list temp;
+ bool popped = false;
+
+ lockdep_assert_held(&j->lock);
+
+ /*
+ * Unpin journal entries whose reference counts reached zero, meaning
+ * all btree nodes got written out
+ */
+ while (!fifo_empty(&j->pin) &&
+ !atomic_read(&fifo_peek_front(&j->pin).count)) {
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+ BUG_ON(!fifo_pop(&j->pin, temp));
+ popped = true;
+ }
+
+ if (popped)
+ bch2_journal_space_available(j);
+}
+
+/*
+ * Drop a reference on the pin list for journal sequence number @seq; if
+ * it was the last reference, run fast reclaim (under j->lock) to pop
+ * finished entries off the pin fifo:
+ */
+void bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ if (atomic_dec_and_test(&pin_list->count)) {
+ spin_lock(&j->lock);
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
+ }
+}
+
static inline void __journal_pin_add(struct journal *j,
u64 seq,
struct journal_entry_pin *pin,
pin->seq = seq;
pin->flush = flush_fn;
- if (flush_fn)
- list_add(&pin->list, &pin_list->list);
- else
- INIT_LIST_HEAD(&pin->list);
+ list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
/*
* If the journal is currently full, we might want to call flush_fn
* data off of a specific device:
*/
-/**
- * bch2_journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Cleans up after btree
- * write completions by advancing the journal pin and each cache's last_idx,
- * kicking off discards and background reclaim as necessary.
- */
-void bch2_journal_reclaim_fast(struct journal *j)
-{
- struct journal_entry_pin_list temp;
- bool popped = false;
-
- lockdep_assert_held(&j->lock);
-
- /*
- * Unpin journal entries whose reference counts reached zero, meaning
- * all btree nodes got written out
- */
- while (!fifo_empty(&j->pin) &&
- !atomic_read(&fifo_peek_front(&j->pin).count)) {
- BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
- BUG_ON(!fifo_pop(&j->pin, temp));
- popped = true;
- }
-
- if (popped)
- journal_wake(j);
-}
-
-static void journal_pin_mark_flushing(struct journal *j,
- struct journal_entry_pin *pin,
- u64 seq)
-{
- lockdep_assert_held(&j->reclaim_lock);
-
- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
- BUG_ON(j->flush_in_progress);
- j->flush_in_progress = pin;
-}
-
-static void journal_pin_flush(struct journal *j,
- struct journal_entry_pin *pin,
- u64 seq)
-{
- pin->flush(j, pin, seq);
-
- BUG_ON(j->flush_in_progress != pin);
- j->flush_in_progress = NULL;
- wake_up(&j->pin_flush_wait);
-}
-
static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret = NULL;
- /* no need to iterate over empty fifo entries: */
- bch2_journal_reclaim_fast(j);
+ spin_lock(&j->lock);
+
+ BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count));
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
- if (*seq > seq_to_flush ||
+ if (*seq > max_seq ||
(ret = list_first_entry_or_null(&pin_list->list,
struct journal_entry_pin, list)))
break;
+ if (ret) {
+ list_move(&ret->list, &pin_list->flushed);
+ BUG_ON(j->flush_in_progress);
+ j->flush_in_progress = ret;
+ j->last_flushed = jiffies;
+ }
+
+ spin_unlock(&j->lock);
+
return ret;
}
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
+ unsigned min_nr)
{
- bool ret;
+ struct journal_entry_pin *pin;
+ u64 seq;
- spin_lock(&j->lock);
- ret = ja->nr &&
- (ja->last_idx != ja->cur_idx &&
- ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
- spin_unlock(&j->lock);
+ lockdep_assert_held(&j->reclaim_lock);
- return ret;
+ while ((pin = journal_get_next_pin(j, min_nr
+ ? U64_MAX : seq_to_flush, &seq))) {
+ if (min_nr)
+ min_nr--;
+
+ pin->flush(j, pin, seq);
+
+ BUG_ON(j->flush_in_progress != pin);
+ j->flush_in_progress = NULL;
+ wake_up(&j->pin_flush_wait);
+ }
}
/**
struct bch_fs, journal.reclaim_work);
struct journal *j = &c->journal;
struct bch_dev *ca;
- struct journal_entry_pin *pin;
- u64 seq, seq_to_flush = 0;
- unsigned iter, bucket_to_flush;
- unsigned long next_flush;
- bool reclaim_lock_held = false, need_flush;
+ unsigned iter, bucket_to_flush, min_nr = 0;
+ u64 seq_to_flush = 0;
+
+ journal_do_discards(j);
+
+ mutex_lock(&j->reclaim_lock);
+ spin_lock(&j->lock);
- /*
- * Advance last_idx to point to the oldest journal entry containing
- * btree node updates that have not yet been written out
- */
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
- while (should_discard_bucket(j, ja)) {
- if (!reclaim_lock_held) {
- /*
- * ugh:
- * might be called from __journal_res_get()
- * under wait_event() - have to go back to
- * TASK_RUNNING before doing something that
- * would block, but only if we're doing work:
- */
- __set_current_state(TASK_RUNNING);
-
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- /* recheck under reclaim_lock: */
- continue;
- }
- if (ca->mi.discard &&
- blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
- blkdev_issue_discard(ca->disk_sb.bdev,
- bucket_to_sector(ca,
- ja->buckets[ja->last_idx]),
- ca->mi.bucket_size, GFP_NOIO, 0);
-
- spin_lock(&j->lock);
- ja->last_idx = (ja->last_idx + 1) % ja->nr;
- spin_unlock(&j->lock);
-
- journal_wake(j);
- }
-
- /*
- * Write out enough btree nodes to free up 50% journal
- * buckets
- */
- spin_lock(&j->lock);
+ /* Try to keep the journal at most half full: */
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
- spin_unlock(&j->lock);
}
/* Also flush if the pin fifo is more than half full */
- spin_lock(&j->lock);
seq_to_flush = max_t(s64, seq_to_flush,
(s64) journal_cur_seq(j) -
(j->pin.size >> 1));
+ spin_unlock(&j->lock);
/*
* If it's been longer than j->reclaim_delay_ms since we last flushed,
* make sure to flush at least one journal pin:
*/
- next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
- need_flush = time_after(jiffies, next_flush);
-
- while ((pin = journal_get_next_pin(j, need_flush
- ? U64_MAX
- : seq_to_flush, &seq))) {
- if (!reclaim_lock_held) {
- spin_unlock(&j->lock);
- __set_current_state(TASK_RUNNING);
- mutex_lock(&j->reclaim_lock);
- reclaim_lock_held = true;
- spin_lock(&j->lock);
- continue;
- }
+ if (time_after(jiffies, j->last_flushed +
+ msecs_to_jiffies(j->reclaim_delay_ms)))
+ min_nr = 1;
- journal_pin_mark_flushing(j, pin, seq);
- spin_unlock(&j->lock);
-
- journal_pin_flush(j, pin, seq);
-
- need_flush = false;
- j->last_flushed = jiffies;
+ journal_flush_pins(j, seq_to_flush, min_nr);
- spin_lock(&j->lock);
- }
-
- spin_unlock(&j->lock);
-
- if (reclaim_lock_held)
- mutex_unlock(&j->reclaim_lock);
+ mutex_unlock(&j->reclaim_lock);
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
static int journal_flush_done(struct journal *j, u64 seq_to_flush)
{
- struct journal_entry_pin *pin;
- u64 pin_seq;
int ret;
ret = bch2_journal_error(j);
return ret;
mutex_lock(&j->reclaim_lock);
- spin_lock(&j->lock);
-
- while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) {
- journal_pin_mark_flushing(j, pin, pin_seq);
- spin_unlock(&j->lock);
- journal_pin_flush(j, pin, pin_seq);
+ journal_flush_pins(j, seq_to_flush, 0);
- spin_lock(&j->lock);
- }
+ spin_lock(&j->lock);
/*
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
#define JOURNAL_PIN (32 * 1024)
+unsigned bch2_journal_dev_buckets_available(struct journal *,
+ struct journal_device *);
+void bch2_journal_space_available(struct journal *);
+
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->seq != 0;
return &j->pin.data[seq & j->pin.mask];
}
+void bch2_journal_pin_put(struct journal *, u64);
+
void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
-void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
void bch2_journal_flush_pins(struct journal *, u64);
struct closure_waitlist wait;
- unsigned size;
- unsigned disk_sectors;
+ unsigned buf_size; /* size in bytes of @data */
+ unsigned sectors; /* maximum size for current entry */
+ unsigned disk_sectors; /* maximum size entry could have been, if
+ buf_size was bigger */
unsigned u64s_reserved;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
unsigned long flags;
union journal_res_state reservations;
+
+ /* Max size of current journal entry */
unsigned cur_entry_u64s;
- unsigned prev_buf_sectors;
- unsigned cur_buf_sectors;
+ unsigned cur_entry_sectors;
+
+ /*
+ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
+ * insufficient devices:
+ */
+ int cur_entry_error;
+
+ /* Reserved space in journal entry to be used just prior to write */
+ unsigned entry_u64s_reserved;
+
unsigned buf_size_want;
/*
spinlock_t lock;
+ /* if nonzero, we may not open a new journal entry: */
+ unsigned blocked;
+
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
u64 seq_ondisk;
u64 last_seq_ondisk;
- /* Reserved space in journal entry to be used just prior to write */
- unsigned entry_u64s_reserved;
-
/*
* FIFO of journal entries whose btree updates have not yet been
* written out.
le64_to_cpu(u->v));
break;
case FS_USAGE_INODES:
- percpu_u64_set(&c->usage[0]->s.nr_inodes,
+ percpu_u64_set(&c->usage[0]->nr_inodes,
le64_to_cpu(u->v));
break;
case FS_USAGE_KEY_VERSION:
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- ret = bch2_gc(c, &journal, true);
- if (ret)
- goto err;
-
- set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-
err = "unable to allocate journal buckets";
- for_each_online_member(ca, c, i)
- if (bch2_dev_journal_alloc(ca)) {
+ for_each_online_member(ca, c, i) {
+ ret = bch2_dev_journal_alloc(ca);
+ if (ret) {
percpu_ref_put(&ca->io_ref);
goto err;
}
+ }
/*
* journal_res_get() will crash if called before this has
*dst = *src;
for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
- if (!src->data[src_idx])
+ if (!src->replicas[src_idx])
continue;
dst_idx = __replicas_entry_idx(dst_r,
cpu_replicas_entry(src_r, src_idx));
BUG_ON(dst_idx < 0);
- dst->data[dst_idx] = src->data[src_idx];
+ dst->replicas[dst_idx] = src->replicas[src_idx];
}
}
static int replicas_table_update(struct bch_fs *c,
struct bch_replicas_cpu *new_r)
{
- struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL };
+ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
+ struct bch_fs_usage __percpu *new_scratch = NULL;
unsigned bytes = sizeof(struct bch_fs_usage) +
sizeof(u64) * new_r->nr;
- unsigned i;
int ret = -ENOMEM;
- for (i = 0; i < 3; i++) {
- if (i < 2 && !c->usage[i])
- continue;
-
- new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64),
- GFP_NOIO);
- if (!new_usage[i])
- goto err;
- }
-
- for (i = 0; i < 2; i++) {
- if (!c->usage[i])
- continue;
-
- __replicas_table_update(new_usage[i], new_r,
- c->usage[i], &c->replicas);
-
- swap(c->usage[i], new_usage[i]);
- }
-
- swap(c->usage_scratch, new_usage[2]);
+ if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
+ GFP_NOIO)) ||
+ (c->usage[1] &&
+ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
+ GFP_NOIO))) ||
+ !(new_scratch = __alloc_percpu_gfp(bytes, sizeof(u64),
+ GFP_NOIO)))
+ goto err;
- swap(c->replicas, *new_r);
+ if (c->usage[0])
+ __replicas_table_update(new_usage[0], new_r,
+ c->usage[0], &c->replicas);
+ if (c->usage[1])
+ __replicas_table_update(new_usage[1], new_r,
+ c->usage[1], &c->replicas);
+
+ swap(c->usage[0], new_usage[0]);
+ swap(c->usage[1], new_usage[1]);
+ swap(c->usage_scratch, new_scratch);
+ swap(c->replicas, *new_r);
ret = 0;
err:
- for (i = 0; i < 3; i++)
- free_percpu(new_usage[i]);
+ free_percpu(new_scratch);
+ free_percpu(new_usage[1]);
+ free_percpu(new_usage[0]);
return ret;
}
if (__replicas_has_entry(&c->replicas_gc, e))
continue;
- v = percpu_u64_get(&c->usage[0]->data[i]);
+ v = percpu_u64_get(&c->usage[0]->replicas[i]);
if (!v)
continue;
BUG_ON(ret < 0);
}
- percpu_u64_set(&c->usage[0]->data[idx], sectors);
+ percpu_u64_set(&c->usage[0]->replicas[idx], sectors);
return 0;
}
{
c->journal.entry_u64s_reserved +=
reserve_journal_replicas(c, &c->replicas);
- return 0;
+
+ return replicas_table_update(c, &c->replicas);
}
bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
};
-static inline struct btree_iter *
+static __always_inline struct btree_iter *
bch2_hash_lookup(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
}
-static inline struct btree_iter *
+static __always_inline struct btree_iter *
bch2_hash_hole(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
}
-static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *start)
+static __always_inline
+int bch2_hash_needs_whiteout(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *start)
{
struct btree_iter *iter;
struct bkey_s_c k;
return btree_iter_err(k);
}
-static inline int __bch2_hash_set(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- u64 inode, struct bkey_i *insert, int flags)
+static __always_inline
+int __bch2_hash_set(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ u64 inode, struct bkey_i *insert, int flags)
{
struct btree_iter *iter, *slot = NULL;
struct bkey_s_c k;
inode, insert, flags));
}
-static inline int bch2_hash_delete_at(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- struct btree_iter *iter)
+static __always_inline
+int bch2_hash_delete_at(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ struct btree_iter *iter)
{
struct bkey_i *delete;
int ret;
return 0;
}
-static inline int bch2_hash_delete(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- const struct bch_hash_info *info,
- u64 inode, const void *key)
+static __always_inline
+int bch2_hash_delete(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ u64 inode, const void *key)
{
struct btree_iter *iter;
sb->bio = bio;
}
- new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order);
if (!new_sb)
return -ENOMEM;
percpu_down_read_preempt_disable(&c->mark_lock);
{
- u64 nr_inodes = percpu_u64_get(&c->usage[0]->s.nr_inodes);
+ u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
- u64 sectors = percpu_u64_get(&c->usage[0]->data[i]);
+ u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]);
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
- unsigned i, iter_size, fs_usage_size;
+ unsigned i, iter_size;
const char *err;
pr_verbose_init(opts, "");
(btree_blocks(c) + 1) * 2 *
sizeof(struct btree_node_iter_set);
- fs_usage_size = sizeof(struct bch_fs_usage) +
- sizeof(u64) * c->replicas.nr;
-
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
!(c->copygc_wq = alloc_workqueue("bcache_copygc",
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
- !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
- !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity);
pr_buf(&out, "hidden:\t\t\t\t%llu\n",
- fs_usage->s.hidden);
+ fs_usage->hidden);
pr_buf(&out, "data:\t\t\t\t%llu\n",
- fs_usage->s.data);
+ fs_usage->data);
pr_buf(&out, "cached:\t\t\t\t%llu\n",
- fs_usage->s.cached);
+ fs_usage->cached);
pr_buf(&out, "reserved:\t\t\t%llu\n",
- fs_usage->s.reserved);
+ fs_usage->reserved);
pr_buf(&out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->s.nr_inodes);
+ fs_usage->nr_inodes);
pr_buf(&out, "online reserved:\t\t%llu\n",
- fs_usage->s.online_reserved);
+ fs_usage->online_reserved);
for (i = 0;
i < ARRAY_SIZE(fs_usage->persistent_reserved);
pr_buf(&out, "\t");
bch2_replicas_entry_to_text(&out, e);
- pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+ pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]);
}
percpu_up_read_preempt_enable(&c->mark_lock);