-938f680845d1be28979e23aee972dba010c464ba
+783085c3cc440183ba5e987b1aa7791cc1ca42ba
Q = @
endif
-CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \
+CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \
-Wno-pointer-sign \
-Wno-deprecated-declarations \
-fno-strict-aliasing \
#include "libbcachefs/bcachefs_ioctl.h"
#include "libbcachefs/btree_cache.h"
+#include "libbcachefs/move.h"
#include "cmds.h"
#include "libbcachefs.h"
die("too many arguments");
return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
- .op = BCH_DATA_OP_REREPLICATE,
+ .op = BCH_DATA_OP_rereplicate,
.start_btree = 0,
.start_pos = POS_MIN,
.end_btree = BTREE_ID_NR,
"\n"
"Kick off a data job and report progress\n"
"\n"
- "job: one of scrub, rereplicate, migrate, or rewrite_old_nodes\n"
+ "job: one of scrub, rereplicate, migrate, rewrite_old_nodes, or drop_extra_replicas\n"
"\n"
"Options:\n"
" -b btree btree to operate on\n"
exit(EXIT_SUCCESS);
}
-const char * const data_jobs[] = {
- "scrub",
- "rereplicate",
- "migrate",
- "rewrite_old_nodes",
- NULL
-};
-
int cmd_data_job(int argc, char *argv[])
{
struct bch_ioctl_data op = {
if (!job)
die("please specify which type of job");
- op.op = read_string_list_or_die(job, data_jobs, "bad job type");
-
- if (op.op == BCH_DATA_OP_SCRUB)
- die("scrub not implemented yet");
+ op.op = read_string_list_or_die(job, bch2_data_ops_strs, "bad job type");
char *fs_path = arg_pop();
if (!fs_path)
}
return bchu_data(fs, (struct bch_ioctl_data) {
- .op = BCH_DATA_OP_MIGRATE,
+ .op = BCH_DATA_OP_migrate,
.start_btree = 0,
.start_pos = POS_MIN,
.end_btree = BTREE_ID_NR,
return __ATOMIC_READ(&v->counter); \
} \
\
+static inline i_type a_type##_read_acquire(const a_type##_t *v) \
+{ \
+ i_type ret = __ATOMIC_READ(&v->counter); \
+ smp_mb__after_atomic(); \
+ return ret; \
+} \
+ \
static inline void a_type##_set(a_type##_t *v, i_type i) \
{ \
return __ATOMIC_SET(&v->counter, i); \
struct closure;
struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
+typedef void (closure_fn) (struct work_struct *);
extern struct dentry *bcache_debug;
struct closure_waitlist {
INIT_WORK(&cl->work, cl->work.func);
BUG_ON(!queue_work(wq, &cl->work));
} else
- cl->fn(cl);
+ cl->fn(&cl->work);
}
/**
__closure_wake_up(list);
}
+#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws) /*
+ * Declarator for a closure callback called @name. The callback's single
+ * argument is a work_struct pointer that is always named ws.
+ */
+#define closure_type(name, type, member) \
+ struct closure *cl = container_of(ws, struct closure, work); \
+ type *name = container_of(cl, type, member) /*
+ * Expands to two local declarations: cl, the closure whose work member is
+ * ws, and @name, the @type object whose @member is cl. It reads a variable
+ * called ws, so it is meant for the top of a CLOSURE_CALLBACK() body.
+ */
+
/**
* continue_at - jump to another function with barrier
*
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
struct list_head list;
+ void *private_data;
};
-int register_shrinker(struct shrinker *, const char *, ...);
-void unregister_shrinker(struct shrinker *);
+struct shrinker *shrinker_alloc(unsigned int, const char *, ...);
+
+int shrinker_register(struct shrinker *);
+void shrinker_unregister(struct shrinker *);
+
+/**
+ * shrinker_free - unregister and release a shrinker from shrinker_alloc()
+ * @s: shrinker to destroy; NULL is accepted and ignored
+ *
+ * The shrinker is unregistered before its memory is released. Freeing it
+ * while it is still registered would presumably leave a dangling entry
+ * linked through its list member for run_shrinkers() to walk.
+ *
+ * NOTE(review): assumes every non-NULL shrinker passed here has been through
+ * shrinker_register() - confirm shrinker_unregister() tolerates one that
+ * has not.
+ */
+static inline void shrinker_free(struct shrinker *s)
+{
+	if (!s)
+		return;
+
+	shrinker_unregister(s);
+	free(s);
+}
void run_shrinkers(gfp_t gfp_mask, bool);
return ret;
}
+ /*
+ * need to know if we're getting called from the invalidate path or
+ * not:
+ */
+
+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ old_a->cached_sectors) {
+ ret = bch2_update_cached_sectors_list(trans, new->k.p.inode,
+ -((s64) old_a->cached_sectors));
+ if (ret)
+ return ret;
+ }
+
return 0;
}
return ret;
}
-static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
struct btree_iter *iter)
{
struct bch_fs *c = trans->c;
goto out;
}
-static int bch2_check_discard_freespace_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end)
-{
- if (!btree_id_is_extents(iter->btree_id)) {
- return __bch2_check_discard_freespace_key(trans, iter);
- } else {
- int ret = 0;
-
- while (!bkey_eq(iter->pos, end) &&
- !(ret = btree_trans_too_many_iters(trans) ?:
- __bch2_check_discard_freespace_key(trans, iter)))
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
-
- return ret;
- }
-}
-
/*
* We've already checked that generation numbers in the bucket_gens btree are
* valid for buckets that exist; this just checks for keys for nonexistent
ret = for_each_btree_key2(trans, iter,
BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
- for_each_btree_key2(trans, iter,
- BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
- for_each_btree_key_commit(trans, iter,
+ bch2_check_discard_freespace_key(trans, &iter));
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ bch2_trans_begin(trans);
+ k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k) ?:
+ bch2_check_discard_freespace_key(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err(c, "while checking %s", buf.buf);
+ printbuf_exit(&buf);
+ break;
+ }
+
+ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
unsigned i;
int ret = 0;
- ret = bch2_btree_write_buffer_flush(trans);
+ ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;
return wp;
}
+/*
+ * Split @ptrs in two. A bucket whose device durability is nonzero and no
+ * larger than the remaining @extra_replicas budget is pushed onto
+ * @ptrs_no_use and its durability is subtracted from the budget; every
+ * other bucket is kept. On return @ptrs holds only the kept buckets.
+ */
+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+			  struct open_buckets *ptrs,
+			  struct open_buckets *ptrs_no_use,
+			  unsigned extra_replicas)
+{
+	struct open_buckets kept = { 0 };
+	struct open_bucket *ob;
+	unsigned i;
+
+	open_bucket_for_each(c, ptrs, ob, i) {
+		unsigned durability = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+		bool surplus = durability && durability <= extra_replicas;
+
+		if (!surplus) {
+			ob_push(c, &kept, ob);
+		} else {
+			extra_replicas -= durability;
+			ob_push(c, ptrs_no_use, ob);
+		}
+	}
+
+	*ptrs = kept;
+}
+
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
if (ret)
goto err;
+ if (nr_effective > nr_replicas)
+ deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
/* Free buckets we didn't use: */
open_bucket_for_each(c, &wp->ptrs, ob, i)
open_bucket_free_unused(c, ob);
x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
+ x(blocked_write_buffer_full) \
x(nocow_lock_contended)
enum bch_time_stats {
size_t gap;
size_t nr;
size_t size;
+ atomic_t ref;
+ bool initial_ref_held;
};
struct btree_trans_buf {
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
- x(sysfs)
+ x(sysfs) \
+ x(btree_write_buffer)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
#endif
}
+/*
+ * Try to take write reference @ref on @c; returns true on success.
+ *
+ * Without BCH_WRITE_REF_DEBUG this is a tryget on the single percpu ref and
+ * @ref is unused. With it, the attempt fails when BCH_FS_GOING_RO is set,
+ * and otherwise succeeds only if the counter for @ref was nonzero.
+ */
+static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifndef BCH_WRITE_REF_DEBUG
+	return percpu_ref_tryget(&c->writes);
+#else
+	if (test_bit(BCH_FS_GOING_RO, &c->flags))
+		return false;
+
+	return atomic_long_inc_not_zero(&c->writes[ref]);
+#endif
+}
+
static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
struct bch_val v;
};
+#define POS_KEY(_pos) \
+((struct bkey) { \
+ .u64s = BKEY_U64s, \
+ .format = KEY_FORMAT_CURRENT, \
+ .p = _pos, \
+}) /*
+ * Compound literal for a bkey positioned at @_pos. Only u64s, format and p
+ * are named; as with any designated initializer the remaining members are
+ * zero-initialized.
+ */
+
#define KEY(_inode, _offset, _size) \
((struct bkey) { \
.u64s = BKEY_U64s, \
struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
-struct bch_replicas_entry {
+struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
struct bch_sb_field_replicas {
struct bch_sb_field field;
- struct bch_replicas_entry entries[];
+ struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_quota: */
x(clock, 7) \
x(dev_usage, 8) \
x(log, 9) \
- x(overwrite, 10)
+ x(overwrite, 10) \
+ x(write_buffer_keys, 11)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
struct jset_entry_data_usage {
struct jset_entry entry;
__le64 v;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
struct jset_entry_clock {
__u64 dev;
};
+#define BCH_DATA_OPS() \
+ x(scrub, 0) \
+ x(rereplicate, 1) \
+ x(migrate, 2) \
+ x(rewrite_old_nodes, 3) \
+ x(drop_extra_replicas, 4) /*
+ * X-macro table of data job types: each x(name, value) entry pairs a job
+ * name with its numeric op code. Users define x() before expanding the
+ * table, as enum bch_data_ops does below.
+ */
+
enum bch_data_ops {
- BCH_DATA_OP_SCRUB = 0,
- BCH_DATA_OP_REREPLICATE = 1,
- BCH_DATA_OP_MIGRATE = 2,
- BCH_DATA_OP_REWRITE_OLD_NODES = 3,
- BCH_DATA_OP_NR = 4,
+#define x(t, n) BCH_DATA_OP_##t = n, /* one BCH_DATA_OP_<name> = <value> enumerator per table entry */
+ BCH_DATA_OPS()
+#undef x
+ BCH_DATA_OP_NR /* follows the last entry, so one more than the highest op code */
};
/*
struct bch_replicas_usage {
__u64 sectors;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
static inline struct bch_replicas_usage *
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
if (bch2_btree_shrinker_disabled)
static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
char *cbuf;
size_t buflen = seq_buf_get_buf(s, &cbuf);
struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
struct btree *b;
unsigned i, flags;
- unregister_shrinker(&bc->shrink);
+ shrinker_free(bc->shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
+ struct shrinker *shrink;
unsigned i;
int ret = 0;
mutex_init(&c->verify_lock);
- bc->shrink.count_objects = bch2_btree_cache_count;
- bc->shrink.scan_objects = bch2_btree_cache_scan;
- bc->shrink.to_text = bch2_btree_cache_shrinker_to_text;
- bc->shrink.seeks = 4;
- ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name);
- if (ret)
+ shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
+ if (!shrink)
goto err;
+ bc->shrink = shrink;
+ shrink->count_objects = bch2_btree_cache_count;
+ shrink->scan_objects = bch2_btree_cache_scan;
+ shrink->to_text = bch2_btree_cache_shrinker_to_text;
+ shrink->seeks = 4;
+ shrink->private_data = c;
+ shrinker_register(shrink);
return 0;
err:
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (metadata_only &&
return offset;
}
-static void btree_node_read_all_replicas_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
{
- struct btree_node_read_all *ra =
- container_of(cl, struct btree_node_read_all, cl);
+ closure_type(ra, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
struct printbuf buf = PRINTBUF;
if (sync) {
closure_sync(&ra->cl);
- btree_node_read_all_replicas_done(&ra->cl);
+ btree_node_read_all_replicas_done(&ra->cl.work);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done,
c->io_complete_wq);
struct btree_iter *iter,
struct bpos end_pos)
{
- struct bkey_i *k;
-
- if (bpos_lt(iter->path->pos, iter->journal_pos))
- iter->journal_idx = 0;
-
- k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
- iter->path->level,
- iter->path->pos,
- end_pos,
- &iter->journal_idx);
-
- iter->journal_pos = k ? k->k.p : end_pos;
- return k;
+ return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ iter->path->level,
+ iter->path->pos,
+ end_pos,
+ &iter->journal_idx);
}
static noinline
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
trans->journal_replay_not_finished =
- !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+ unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ atomic_inc_not_zero(&c->journal_keys.ref);
closure_init_stack(&trans->ref);
s = btree_trans_stats(trans);
kfree(trans->fs_usage_deltas);
}
+ if (unlikely(trans->journal_replay_not_finished))
+ bch2_journal_keys_put(c);
+
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
unsigned flags,
unsigned long ip)
{
- memset(iter, 0, sizeof(*iter));
- iter->trans = trans;
- iter->btree_id = btree_id;
- iter->flags = flags;
- iter->snapshot = pos.snapshot;
- iter->pos = pos;
- iter->k.p = pos;
-
+ iter->trans = trans;
+ iter->update_path = NULL;
+ iter->key_cache_path = NULL;
+ iter->btree_id = btree_id;
+ iter->min_depth = 0;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k = POS_KEY(pos);
+ iter->journal_idx = 0;
#ifdef CONFIG_BCACHEFS_DEBUG
iter->ip_allocated = ip;
#endif
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
+/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
struct journal_key *k;
+
+ BUG_ON(*idx > keys->nr);
search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+ while (*idx &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ --(*idx);
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
return NULL;
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
- !k->overwritten)
+ if (k->overwritten) {
+ (*idx)++;
+ continue;
+ }
+
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
return k->k;
(*idx)++;
/* Since @keys was full, there was no gap: */
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
kvfree(keys->d);
- *keys = new_keys;
+ keys->d = new_keys.d;
+ keys->nr = new_keys.nr;
+ keys->size = new_keys.size;
/* And now the gap is at the end: */
- keys->gap = keys->nr;
+ keys->gap = keys->nr;
}
journal_iters_move_gap(c, keys->gap, idx);
cmp_int(l->journal_offset, r->journal_offset);
}
-void bch2_journal_keys_free(struct journal_keys *keys)
+void bch2_journal_keys_put(struct bch_fs *c)
{
+ struct journal_keys *keys = &c->journal_keys;
struct journal_key *i;
+ BUG_ON(atomic_read(&keys->ref) <= 0);
+
+ if (!atomic_dec_and_test(&keys->ref))
+ return;
+
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
kvfree(keys->d);
keys->d = NULL;
keys->nr = keys->gap = keys->size = 0;
+
+ bch2_journal_entries_free(c);
}
static void __journal_keys_sort(struct journal_keys *keys)
struct bch_fs *,
struct btree *);
-void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_keys_put(struct bch_fs *);
+
+/*
+ * Drop the journal keys reference tracked by initial_ref_held, if it is
+ * still held. The flag is cleared afterwards either way, so calling this
+ * again does not put a second time.
+ */
+static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
+{
+	bool held = c->journal_keys.initial_ref_held;
+
+	if (held)
+		bch2_journal_keys_put(c);
+
+	c->journal_keys.initial_ref_held = false;
+}
+
void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *);
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
+ trans->journal_res.seq = ck->journal.seq;
+
/*
- * Since journal reclaim depends on us making progress here, and the
- * allocator/copygc depend on journal reclaim making progress, we need
- * to be using alloc reserves:
+ * If we're at the end of the journal, we really want to free up space
+ * in the journal right away - we don't want to pin that old journal
+ * sequence number with a new btree node write, we want to re-journal
+ * the update
*/
+ if (ck->journal.seq == journal_last_seq(j))
+ commit_flags |= BCH_WATERMARK_reclaim;
+ else
+ commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
+
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
BTREE_UPDATE_KEY_CACHE_RECLAIM|
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
- (ck->journal.seq == journal_last_seq(j)
- ? BCH_WATERMARK_reclaim
- : 0)|
commit_flags);
bch2_fs_fatal_err_on(ret &&
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_key_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
struct bkey_cached *ck, *t;
static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct bch_fs *c = container_of(shrink, struct bch_fs,
- btree_key_cache.shrink);
+ struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
long nr = atomic_long_read(&bc->nr_keys) -
atomic_long_read(&bc->nr_dirty);
int cpu;
#endif
- unregister_shrinker(&bc->shrink);
+ shrinker_free(bc->shrink);
mutex_lock(&bc->lock);
static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
{
- struct btree_key_cache *bc =
- container_of(shrink, struct btree_key_cache, shrink);
+ struct bch_fs *c = shrink->private_data;
+ struct btree_key_cache *bc = &c->btree_key_cache;
char *cbuf;
size_t buflen = seq_buf_get_buf(s, &cbuf);
struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+ struct shrinker *shrink;
#ifdef __KERNEL__
bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
bc->table_init_done = true;
- bc->shrink.seeks = 0;
- bc->shrink.count_objects = bch2_btree_key_cache_count;
- bc->shrink.scan_objects = bch2_btree_key_cache_scan;
- bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text;
- if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name))
+ shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
+ if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+ bc->shrink = shrink;
+ shrink->seeks = 0;
+ shrink->count_objects = bch2_btree_key_cache_count;
+ shrink->scan_objects = bch2_btree_key_cache_scan;
+ shrink->to_text = bch2_btree_key_cache_shrinker_to_text;
+ shrink->private_data = c;
+ shrinker_register(shrink);
return 0;
}
struct list_head freed_nonpcpu;
size_t nr_freed_nonpcpu;
- struct shrinker shrink;
+ struct shrinker *shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;
i->k->k.needs_whiteout = false;
}
- if (trans->nr_wb_updates &&
- trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
- return -BCH_ERR_btree_insert_need_flush_buffer;
-
/*
* Don't get journal reservation until after we know insert will
* succeed:
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
- if (trans->nr_wb_updates) {
- EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
-
- ret = bch2_btree_insert_keys_write_buffer(trans);
- if (ret)
- goto revert_fs_usage;
- }
-
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
trans_for_each_wb_update(trans, wb) {
entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
+ BCH_JSET_ENTRY_write_buffer_keys,
wb->btree, 0,
wb->k.k.u64s);
bkey_copy((struct bkey_i *) entry->start, &wb->k);
ret = bch2_trans_relock(trans);
break;
- case -BCH_ERR_btree_insert_need_flush_buffer: {
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- ret = 0;
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_unlock(trans);
- mutex_lock(&wb->flush_lock);
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_begin(trans);
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BCH_TRANS_COMMIT_no_check_rw, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- } else {
- mutex_unlock(&wb->flush_lock);
- ret = bch2_trans_relock(trans);
- }
- }
- break;
- }
default:
BUG_ON(ret >= 0);
break;
goto out_reset;
}
- if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
- mutex_trylock(&c->btree_write_buffer.flush_lock)) {
- bch2_trans_begin(trans);
- bch2_trans_unlock(trans);
-
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BCH_TRANS_COMMIT_no_check_rw, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- goto out;
- }
-
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
trans->journal_u64s = trans->extra_journal_entries.nr;
unsigned not_freed_will_make_reachable;
unsigned not_freed_access_bit;
atomic_t dirty;
- struct shrinker shrink;
+ struct shrinker *shrink;
/*
* If we need to allocate memory for a new btree node and that
struct btree_path *key_cache_path;
enum btree_id btree_id:8;
- unsigned min_depth:3;
- unsigned advanced:1;
+ u8 min_depth;
/* btree_iter_copy starts here: */
u16 flags;
/* BTREE_ITER_WITH_JOURNAL: */
size_t journal_idx;
- struct bpos journal_pos;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
#endif
}
}
-static void btree_update_set_nodes_written(struct closure *cl)
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
{
- struct btree_update *as = container_of(cl, struct btree_update, cl);
+ closure_type(as, struct btree_update, cl);
struct bch_fs *c = as->c;
mutex_lock(&c->btree_interior_update_lock);
#include "btree_write_buffer.h"
#include "error.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
-#include <linux/sort.h>
+#include <linux/prefetch.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
-static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
+
+static inline bool __wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) /*
+ * Portable ordering test for two write buffer key refs: compares hi, then
+ * mi, then lo, taking the first nonzero result, and returns true when that
+ * result is >= 0. Presumably cmp_int() is a three-way compare, making this
+ * "l sorts at or after r" - confirm against its definition.
+ */
{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
+ return (cmp_int(l->hi, r->hi) ?:
+ cmp_int(l->mi, r->mi) ?:
+ cmp_int(l->lo, r->lo)) >= 0;
+}
- return cmp_int(l->btree, r->btree) ?:
- bpos_cmp(l->k.k.p, r->k.k.p) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset);
+static inline bool wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
+{ /*
+ * Ordering test used by wb_sort(). With CONFIG_X86_64 the two refs are
+ * compared as one multi-word integer: a subtract followed by two
+ * subtract-with-borrow steps over the 8-byte words at offsets 0, 8 and
+ * 16, with the result taken from the "ae" (above or equal) condition of
+ * the final step. EBUG_ON() cross-checks that flag against
+ * __wb_key_cmp(); without CONFIG_X86_64, __wb_key_cmp() is called
+ * directly.
+ *
+ * NOTE(review): the asm dereferences l and r but lists only the pointers
+ * as inputs, with no "m" operands or "memory" clobber - confirm the
+ * compiler cannot move stores to those objects across it.
+ */
+#ifdef CONFIG_X86_64
+ int cmp;
+
+ asm(".intel_syntax noprefix;"
+ "mov rax, [%[l]];"
+ "sub rax, [%[r]];"
+ "mov rax, [%[l] + 8];"
+ "sbb rax, [%[r] + 8];"
+ "mov rax, [%[l] + 16];"
+ "sbb rax, [%[r] + 16];"
+ ".att_syntax prefix;"
+ : "=@ccae" (cmp)
+ : [l] "r" (l), [r] "r" (r)
+ : "rax", "cc");
+
+ EBUG_ON(cmp != __wb_key_cmp(l, r));
+ return cmp;
+#else
+ return __wb_key_cmp(l, r);
+#endif
}
-static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+/*
+ * Equality test that ignores idx: hi and mi must match exactly, lo only
+ * above its low 24 bits.
+ */
+static inline bool wb_key_eq(const void *_l, const void *_r)
{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
+	const struct wb_key_ref *a = _l;
+	const struct wb_key_ref *b = _r;
- return cmp_int(l->journal_seq, r->journal_seq);
+	return a->hi == b->hi &&
+	       a->mi == b->mi &&
+	       (a->lo >> 24) == (b->lo >> 24);
}
-static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_write_buffered_key *wb,
- unsigned commit_flags,
- bool *write_locked,
- size_t *fast)
+static noinline void wb_sort(struct wb_key_ref *base, size_t num)
+{ /*
+ * Sort the @num refs starting at @base in place with heapsort, ordered
+ * by wb_key_cmp(). No comparison callback or element size is passed;
+ * both are fixed by the wb_key_ref type. Assuming wb_key_cmp(l, r)
+ * means "l >= r", the larger child wins each sift step and the result is
+ * ascending.
+ */
+ size_t n = num, a = num / 2;
+
+ if (!a) /* num < 2: nothing to sort */
+ return;
+
+ for (;;) {
+ size_t b, c, d;
+
+ if (a) /* Building heap: sift down --a */
+ --a;
+ else if (--n) /* Sorting: Extract root to --n */
+ swap(base[0], base[n]);
+ else /* Sort complete */
+ break;
+
+ /*
+ * Sift element at "a" down into heap. This is the
+ * "bottom-up" variant, which significantly reduces
+ * calls to wb_key_cmp(): we find the sift-down path all
+ * the way to the leaves (one compare per level), then
+ * backtrack to find where to insert the target element.
+ *
+ * Because elements tend to sift down close to the leaves,
+ * this uses fewer compares than doing two per level
+ * on the way down. (A bit more than half as many on
+ * average, 3/4 worst-case.)
+ */
+ for (b = a; c = 2*b + 1, (d = c + 1) < n;) /* c, d: left and right child of b */
+ b = wb_key_cmp(base + c, base + d) ? c : d;
+ if (d == n) /* Special case last leaf with no sibling */
+ b = c;
+
+ /* Now backtrack from "b" to the correct location for "a" */
+ while (b != a && wb_key_cmp(base + a, base + b))
+ b = (b - 1) / 2;
+ c = b; /* Where "a" belongs */
+ while (b != a) { /* Shift it into place */
+ b = (b - 1) / 2;
+ swap(base[b], base[c]);
+ }
+ }
+}
+
+/*
+ * Slow path for flushing one buffered key: release the write lock on the
+ * leaf node, then push the key through a regular transaction update and
+ * commit, carrying the key's original journal sequence number.
+ *
+ * Returns the first nonzero result of the update or the commit, else 0.
+ */
+static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
+					  struct btree_iter *iter,
+					  struct btree_write_buffered_key *wb)
+{
+	int ret;
+
+	bch2_btree_node_unlock_write(trans, iter->path, iter->path->l[0].b);
+
+	trans->journal_res.seq = wb->journal_seq;
+
+	ret = bch2_trans_update(trans, iter, &wb->k,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	if (ret)
+		return ret;
+
+	return bch2_trans_commit(trans, NULL, NULL,
+				 BCH_TRANS_COMMIT_no_enospc|
+				 BCH_TRANS_COMMIT_no_check_rw|
+				 BCH_TRANS_COMMIT_no_journal_res|
+				 BCH_TRANS_COMMIT_journal_reclaim);
+}
+
+static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ bool *write_locked, size_t *fast)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
+ EBUG_ON(!wb->journal_seq);
+ EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
+ EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
*write_locked = true;
}
- if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
- goto trans_commit;
+ return wb_flush_one_slowpath(trans, iter, wb);
}
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
return 0;
-trans_commit:
- trans->journal_res.seq = wb->journal_seq;
-
- return bch2_trans_update(trans, iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- commit_flags|
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
-{
- union btree_write_buffer_state old, new;
- u64 v = READ_ONCE(wb->state.v);
-
- do {
- old.v = new.v = v;
-
- new.nr = 0;
- new.idx++;
- } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
-
- while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
- cpu_relax();
-
- smp_mb();
-
- return old;
}
/*
return ret;
}
-int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
- bool locked)
+static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
+{ /*
+ * Move buffered keys from wb->inc to wb->flushing. The caller visible in
+ * this file, bch2_btree_write_buffer_flush_locked(), holds wb->inc.lock
+ * around the call. Does nothing if wb->inc is empty; otherwise moves as
+ * many keys as wb->flushing and wb->sorted have room for and leaves the
+ * rest at the front of wb->inc.
+ */
+ struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
+ struct journal *j = &c->journal;
+
+ if (!wb->inc.keys.nr)
+ return;
+
+ bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush); /* flushing.pin is added at the journal_seq of the first inc key */
+
+ darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); /* result unchecked: the copy below is sized by the room actually available */
+ darray_resize(&wb->sorted, wb->flushing.keys.size); /* sorted is sized to match flushing.keys */
+
+ if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { /* nothing pending and sorted can index every inc key: exchange the arrays instead of copying */
+ swap(wb->flushing.keys, wb->inc.keys);
+ goto out;
+ }
+
+ size_t nr = min(darray_room(wb->flushing.keys),
+ wb->sorted.size - wb->flushing.keys.nr);
+ nr = min(nr, wb->inc.keys.nr); /* nr: how many keys fit in both flushing.keys and sorted */
+
+ memcpy(&darray_top(wb->flushing.keys),
+ wb->inc.keys.data,
+ sizeof(wb->inc.keys.data[0]) * nr);
+
+ memmove(wb->inc.keys.data,
+ wb->inc.keys.data + nr,
+ sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); /* slide the keys that did not fit to the front of inc */
+
+ wb->flushing.keys.nr += nr;
+ wb->inc.keys.nr -= nr;
+out:
+ if (!wb->inc.keys.nr) /* inc.pin is dropped when inc is empty, else moved to the first key still in inc */
+ bch2_journal_pin_drop(j, &wb->inc.pin);
+ else
+ bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ if (j->watermark) { /* only when a watermark is currently set: recompute it under j->lock */
+ spin_lock(&j->lock);
+ bch2_journal_set_watermark(j);
+ spin_unlock(&j->lock);
+ }
+
+ BUG_ON(wb->sorted.size < wb->flushing.keys.nr); /* sorted must have room for one ref per flushing key */
+}
+
+static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct journal_entry_pin pin;
- struct btree_write_buffered_key *i, *keys;
+ struct wb_key_ref *i;
struct btree_iter iter = { NULL };
- size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+ size_t skipped = 0, fast = 0, slowpath = 0;
bool write_locked = false;
- union btree_write_buffer_state s;
int ret = 0;
- memset(&pin, 0, sizeof(pin));
-
- if (!locked && !mutex_trylock(&wb->flush_lock))
- return 0;
-
- bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
- bch2_btree_write_buffer_journal_flush);
- bch2_journal_pin_drop(j, &wb->journal_pin);
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
- s = btree_write_buffer_switch(wb);
- keys = wb->keys[s.idx];
- nr = s.nr;
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
+ mutex_unlock(&wb->inc.lock);
- if (race_fault())
- goto slowpath;
+ for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
+ wb->sorted.data[i].idx = i;
+ wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
+ memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
+ }
+ wb->sorted.nr = wb->flushing.keys.nr;
/*
* We first sort so that we can detect and skip redundant updates, and
* If that happens, simply skip the key so we can optimistically insert
* as many keys as possible in the fast path.
*/
- sort(keys, nr, sizeof(keys[0]),
- btree_write_buffered_key_cmp, NULL);
+ wb_sort(wb->sorted.data, wb->sorted.nr);
+
+ darray_for_each(wb->sorted, i) {
+ struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+
+ for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
+ prefetch(&wb->flushing.keys.data[n->idx]);
+
+ BUG_ON(!k->journal_seq);
+
+ if (i + 1 < &darray_top(wb->sorted) &&
+ wb_key_eq(i, i + 1)) {
+ struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
- for (i = keys; i < keys + nr; i++) {
- if (i + 1 < keys + nr &&
- i[0].btree == i[1].btree &&
- bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++;
- i->journal_seq = 0;
+ n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);;
+ k->journal_seq = 0;
continue;
}
if (write_locked &&
- (iter.path->btree_id != i->btree ||
- bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+ (iter.path->btree_id != k->btree ||
+ bpos_gt(k->k.k.p, iter.path->l[0].b->key.k.p))) {
bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
write_locked = false;
}
- if (!iter.path || iter.path->btree_id != i->btree) {
+ if (!iter.path || iter.path->btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
}
- bch2_btree_iter_set_pos(&iter, i->k.k.p);
+ bch2_btree_iter_set_pos(&iter, k->k.k.p);
iter.path->preserve = false;
do {
- ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
- commit_flags, &write_locked, &fast);
+ if (race_fault()) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ break;
+ }
+
+ ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ if (!ret) {
+ k->journal_seq = 0;
+ } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
slowpath++;
- continue;
- }
- if (ret)
+ ret = 0;
+ } else
break;
-
- i->journal_seq = 0;
}
if (write_locked)
bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
bch2_trans_iter_exit(trans, &iter);
- trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+ if (ret)
+ goto err;
+
+ if (slowpath) {
+ /*
+ * Flush in the order they were present in the journal, so that
+ * we can release journal pins:
+ * The fastpath zapped the seq of keys that were successfully flushed so
+ * we can skip those here.
+ */
+ trace_write_buffer_flush_slowpath(trans, slowpath, wb->flushing.keys.nr);
+
+ struct btree_write_buffered_key *i;
+ darray_for_each(wb->flushing.keys, i) {
+ if (!i->journal_seq)
+ continue;
+
+ bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ bch2_trans_begin(trans);
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim,
+ btree_write_buffered_insert(trans, i));
+ if (ret)
+ goto err;
+ }
+ }
+err:
+ bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+ trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
+ bch2_journal_pin_drop(j, &wb->flushing.pin);
+ wb->flushing.keys.nr = 0;
+ return ret;
+}
- if (slowpath)
- goto slowpath;
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
+{
+ struct journal *j = &c->journal;
+ struct journal_buf *buf;
+ int ret = 0;
+
+ mutex_lock(&j->buf_lock);
+ while ((buf = bch2_next_write_buffer_flush_journal_buf(j, seq)))
+ if (bch2_journal_keys_to_write_buffer(c, buf)) {
+ ret = -ENOMEM;
+ break;
+ }
+ mutex_unlock(&j->buf_lock);
- bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
-out:
- bch2_journal_pin_drop(j, &pin);
- mutex_unlock(&wb->flush_lock);
return ret;
-slowpath:
- trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+}
+
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0, fetch_from_journal_err;
+
+ trace_write_buffer_flush_sync(trans, _RET_IP_);
+retry:
+ bch2_trans_unlock(trans);
+
+ bch2_journal_block_reservations(&c->journal);
+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, U64_MAX);
+ bch2_journal_unblock(&c->journal);
/*
- * Now sort the rest by journal seq and bump the journal pin as we go.
- * The slowpath zapped the seq of keys that were successfully flushed so
- * we can skip those here.
+ * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
+ * is not guaranteed to empty wb->inc:
*/
- sort(keys, nr, sizeof(keys[0]),
- btree_write_buffered_journal_cmp,
- NULL);
+ mutex_lock(&wb->flushing.lock);
+ while (!ret &&
+ (wb->flushing.keys.nr || wb->inc.keys.nr))
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
- commit_flags &= ~BCH_WATERMARK_MASK;
- commit_flags |= BCH_WATERMARK_reclaim;
+ if (!ret && fetch_from_journal_err)
+ goto retry;
- for (i = keys; i < keys + nr; i++) {
- if (!i->journal_seq)
- continue;
+ return ret;
+}
- bch2_journal_pin_update(j, i->journal_seq, &pin,
- bch2_btree_write_buffer_journal_flush);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0;
- ret = commit_do(trans, NULL, NULL,
- commit_flags|
- BCH_TRANS_COMMIT_no_enospc|
- BCH_TRANS_COMMIT_no_journal_res|
- BCH_TRANS_COMMIT_journal_reclaim,
- btree_write_buffered_insert(trans, i));
- if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
- break;
+ if (mutex_trylock(&wb->flushing.lock)) {
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
}
- goto out;
+ return ret;
}
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
- bch2_trans_unlock(trans);
- mutex_lock(&trans->c->btree_write_buffer.flush_lock);
- return __bch2_btree_write_buffer_flush(trans, 0, true);
-}
+ struct bch_fs *c = trans->c;
-int bch2_btree_write_buffer_flush(struct btree_trans *trans)
-{
- return __bch2_btree_write_buffer_flush(trans, 0, false);
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
+ return -BCH_ERR_erofs_no_writes;
+
+ int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ return ret;
}
static int bch2_btree_write_buffer_journal_flush(struct journal *j,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret, fetch_from_journal_err;
+
+ do {
+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
- mutex_lock(&wb->flush_lock);
+ mutex_lock(&wb->flushing.lock);
+ ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+ mutex_unlock(&wb->flushing.lock);
+ } while (!ret &&
+ (fetch_from_journal_err ||
+ (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq) ||
+ (wb->inc.pin.seq && wb->inc.pin.seq <= seq)));
- return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
+ return ret;
}
-static inline u64 btree_write_buffer_ref(int idx)
+static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
- return ((union btree_write_buffer_state) {
- .ref0 = idx == 0,
- .ref1 = idx == 1,
- }).v;
+ struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret;
+
+ mutex_lock(&wb->flushing.lock);
+ do {
+ ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+ } while (!ret && bch2_btree_write_buffer_should_flush(c));
+ mutex_unlock(&wb->flushing.lock);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
}
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+int __bch2_journal_key_to_wb(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
{
- struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key *i;
- union btree_write_buffer_state old, new;
- int ret = 0;
- u64 v;
-
- trans_for_each_wb_update(trans, i) {
- EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+ int ret;
+retry:
+ ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
+ if (!ret && dst->wb == &wb->flushing)
+ ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (unlikely(ret)) {
+ if (dst->wb == &c->btree_write_buffer.flushing) {
+ mutex_unlock(&dst->wb->lock);
+ dst->wb = &c->btree_write_buffer.inc;
+ bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
+ bch2_btree_write_buffer_journal_flush);
+ goto retry;
+ }
- i->journal_seq = trans->journal_res.seq;
- i->journal_offset = trans->journal_res.offset;
+ return ret;
}
- preempt_disable();
- v = READ_ONCE(wb->state.v);
- do {
- old.v = new.v = v;
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ BUG_ON(!dst->room);
+ BUG_ON(!dst->seq);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (mutex_trylock(&wb->flushing.lock)) {
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
+
+ /*
+ * Attempt to skip wb->inc, and add keys directly to
+ * wb->flushing, saving us a copy later:
+ */
- new.v += btree_write_buffer_ref(new.idx);
- new.nr += trans->nr_wb_updates;
- if (new.nr > wb->size) {
- ret = -BCH_ERR_btree_insert_need_flush_buffer;
- goto out;
+ if (!wb->inc.keys.nr) {
+ dst->wb = &wb->flushing;
+ } else {
+ mutex_unlock(&wb->flushing.lock);
+ dst->wb = &wb->inc;
}
- } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+ } else {
+ mutex_lock(&wb->inc.lock);
+ dst->wb = &wb->inc;
+ }
- memcpy(wb->keys[new.idx] + old.nr,
- trans->wb_updates,
- sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ dst->seq = seq;
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+ bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
+}
+
+/*
+ * Finish a batch begun with bch2_journal_keys_to_write_buffer_start():
+ * drop the destination's journal pin if it holds no keys, queue the flush
+ * worker if the buffer has grown past the should_flush threshold, and
+ * release the lock(s) still held for @dst.
+ */
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (!dst->wb->keys.nr)
+ bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
+
+ /*
+ * A write ref is taken on behalf of the flush worker (which puts it,
+ * see bch2_btree_write_buffer_flush_work()). If queue_work() returns
+ * false the ref is given back immediately.
+ */
+ if (bch2_btree_write_buffer_should_flush(c) &&
+ __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+ !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+
+ /* flushing.lock is only still held when keys went directly into wb->flushing */
+ if (dst->wb == &wb->flushing)
+ mutex_unlock(&wb->flushing.lock);
+ mutex_unlock(&wb->inc.lock);
+}
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+ struct journal_keys_to_wb dst;
+ struct jset_entry *entry;
+ struct bkey_i *k;
+ int ret = 0;
+
+ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+ jset_entry_for_each_key(entry, k) {
+ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+ if (ret)
+ goto out;
+ }
- atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+ entry->type = BCH_JSET_ENTRY_btree_keys;
+ }
+
+ buf->need_flush_to_write_buffer = false;
out:
- preempt_enable();
+ bch2_journal_keys_to_write_buffer_end(c, &dst);
+ return ret;
+}
+
+static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
+{
+ if (wb->keys.size >= new_size)
+ return 0;
+
+ if (!mutex_trylock(&wb->lock))
+ return -EINTR;
+
+ int ret = darray_resize(&wb->keys, new_size);
+ mutex_unlock(&wb->lock);
return ret;
}
+/*
+ * Grow both key arrays of the write buffer to at least @new_size entries.
+ *
+ * The flushing array is resized first; the inc array is only attempted if
+ * that succeeded. Returns 0 on success, otherwise the first nonzero result
+ * of wb_keys_resize().
+ */
+int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
+{
+	int ret = wb_keys_resize(&c->btree_write_buffer.flushing, new_size);
+
+	if (ret)
+		return ret;
+
+	return wb_keys_resize(&c->btree_write_buffer.inc, new_size);
+}
+
void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+ BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
+ !bch2_journal_error(&c->journal));
- kvfree(wb->keys[1]);
- kvfree(wb->keys[0]);
+ darray_exit(&wb->sorted);
+ darray_exit(&wb->flushing.keys);
+ darray_exit(&wb->inc.keys);
}
int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- mutex_init(&wb->flush_lock);
- wb->size = c->opts.btree_write_buffer_size;
+ mutex_init(&wb->inc.lock);
+ mutex_init(&wb->flushing.lock);
+ INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
- wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
- wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
- if (!wb->keys[0] || !wb->keys[1])
- return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+ /* Will be resized by journal as needed: */
+ unsigned initial_size = 1 << 16;
- return 0;
+ return darray_make_room(&wb->inc.keys, initial_size) ?:
+ darray_make_room(&wb->flushing.keys, initial_size) ?:
+ darray_make_room(&wb->sorted, initial_size);
}
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+#include "bkey.h"
+
+/*
+ * True when the keys held in both buffers (inc + flushing) together exceed
+ * a quarter of the inc array's allocated size.
+ */
+static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
+{
+	struct btree_write_buffer *buf = &c->btree_write_buffer;
+
+	return buf->inc.keys.size / 4 < buf->inc.keys.nr + buf->flushing.keys.nr;
+}
+
+/*
+ * True when the inc buffer alone holds more keys than three quarters of its
+ * allocated size.
+ */
+static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
+{
+	struct btree_write_buffer *buf = &c->btree_write_buffer;
+
+	return buf->inc.keys.size * 3 / 4 < buf->inc.keys.nr;
+}
+
+struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-int bch2_btree_write_buffer_flush(struct btree_trans *);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
+int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+
+/*
+ * State carried between bch2_journal_keys_to_write_buffer_start(),
+ * bch2_journal_key_to_wb() and bch2_journal_keys_to_write_buffer_end()
+ * while keys are appended to the write buffer.
+ */
+struct journal_keys_to_wb {
+ /* buffer keys are appended to: the write buffer's flushing or inc buffer */
+ struct btree_write_buffer_keys *wb;
+ /* keys that can still be appended before the slow path must be taken */
+ size_t room;
+ /* journal sequence number stamped on every key appended; must be nonzero */
+ u64 seq;
+};
+
+int __bch2_journal_key_to_wb(struct bch_fs *,
+ struct journal_keys_to_wb *,
+ enum btree_id, struct bkey_i *);
+
+/*
+ * Append key @k for btree @btree to the buffer selected in @dst.
+ *
+ * While @dst still has room the key is copied straight into the next free
+ * slot; once room runs out the work is handed to __bch2_journal_key_to_wb().
+ * Returns 0 on the inline path, otherwise whatever the slow path returns.
+ */
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+			     struct journal_keys_to_wb *dst,
+			     enum btree_id btree, struct bkey_i *k)
+{
+	EBUG_ON(!dst->seq);
+
+	if (likely(dst->room)) {
+		struct btree_write_buffered_key *slot = &darray_top(dst->wb->keys);
+
+		slot->journal_seq = dst->seq;
+		slot->btree = btree;
+		bkey_copy(&slot->k, k);
+
+		dst->wb->keys.nr++;
+		dst->room--;
+		return 0;
+	}
+
+	return __bch2_journal_key_to_wb(c, dst, btree, k);
+}
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
+int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#include "darray.h"
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-struct btree_write_buffered_key {
- u64 journal_seq;
- unsigned journal_offset;
- enum btree_id btree;
- __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-union btree_write_buffer_state {
+struct wb_key_ref {
+union {
struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned idx:24;
+ u8 pos[sizeof(struct bpos)];
+ enum btree_id btree:8;
+#else
+ enum btree_id btree:8;
+ u8 pos[sizeof(struct bpos)];
+ unsigned idx:24;
+#endif
+ } __packed;
struct {
- u64 nr:23;
- u64 idx:1;
- u64 ref0:20;
- u64 ref1:20;
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ u64 lo;
+ u64 mi;
+ u64 hi;
+#else
+ u64 hi;
+ u64 mi;
+ u64 lo;
+#endif
};
};
+};
-struct btree_write_buffer {
- struct mutex flush_lock;
- struct journal_entry_pin journal_pin;
+struct btree_write_buffered_key {
+ enum btree_id btree:8;
+ u64 journal_seq:56;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
- union btree_write_buffer_state state;
- size_t size;
+struct btree_write_buffer_keys {
+ DARRAY(struct btree_write_buffered_key) keys;
+ struct journal_entry_pin pin;
+ struct mutex lock;
+};
- struct btree_write_buffered_key *keys[2];
+struct btree_write_buffer {
+ DARRAY(struct wb_key_ref) sorted;
+ struct btree_write_buffer_keys inc;
+ struct btree_write_buffer_keys flushing;
+ struct work_struct flush_work;
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
usage->reserved += usage->persistent_reserved[i];
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
prt_printf(out, "\t");
static inline int __update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
+ struct bch_replicas_entry_v1 *r,
s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
}
static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
- struct bch_replicas_entry *r, s64 sectors,
+ struct bch_replicas_entry_v1 *r, s64 sectors,
unsigned journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
__replicas_deltas_realloc(trans, more, _gfp));
}
-static inline int update_replicas_list(struct btree_trans *trans,
- struct bch_replicas_entry *r,
- s64 sectors)
+int bch2_update_replicas_list(struct btree_trans *trans,
+ struct bch_replicas_entry_v1 *r,
+ s64 sectors)
{
struct replicas_delta_list *d;
struct replicas_delta *n;
return 0;
}
-static inline int update_cached_sectors_list(struct btree_trans *trans,
- unsigned dev, s64 sectors)
+int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
{
struct bch_replicas_padded r;
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas_list(trans, &r.e, sectors);
+ return bch2_update_replicas_list(trans, &r.e, sectors);
}
int bch2_mark_alloc(struct btree_trans *trans,
}
percpu_up_read(&c->mark_lock);
- /*
- * need to know if we're getting called from the invalidate path or
- * not:
- */
-
- if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
- old_a->cached_sectors) {
- ret = update_cached_sectors(c, new, ca->dev_idx,
- -((s64) old_a->cached_sectors),
- journal_seq, gc);
- if (ret) {
- bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
- __func__);
- return ret;
- }
- }
-
if (new_a->data_type == BCH_DATA_free &&
(!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
closure_wake_up(&c->freelist_wait);
bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
r.e.data_type = data_type;
- ret = update_replicas_list(trans, &r.e, sectors);
+ ret = bch2_update_replicas_list(trans, &r.e, sectors);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
if (p.ptr.cached) {
if (!stale) {
- ret = update_cached_sectors_list(trans, p.ptr.dev,
- disk_sectors);
+ ret = bch2_update_cached_sectors_list(trans, p.ptr.dev,
+ disk_sectors);
if (ret)
return ret;
}
}
if (r.e.nr_devs)
- ret = update_replicas_list(trans, &r.e, dirty_sectors);
+ ret = bch2_update_replicas_list(trans, &r.e, dirty_sectors);
return ret;
}
s64 sectors = le16_to_cpu(new_s->sectors);
bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
- ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
if (ret)
return ret;
}
s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
bch2_bkey_to_replicas(&r.e, old);
- ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+ ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
if (ret)
return ret;
}
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
+int bch2_update_replicas_list(struct btree_trans *,
+ struct bch_replicas_entry_v1 *, s64);
+int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
void bch2_fs_usage_initialize(struct bch_fs *);
dst_end = (void *) arg->replicas + replica_entries_bytes;
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *src_e =
+ struct bch_replicas_entry_v1 *src_e =
cpu_replicas_entry(&c->replicas, i);
/* check that we have enough space for one replicas entry */
unsigned long io_until,
unsigned long cpu_timeout)
{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait;
wait.io_timer.expire = io_until;
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread && kthread_should_stop())
+ if (kthread_should_stop())
break;
if (wait.expired)
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
- void *data = krealloc_array(d->data, new_size, element_size, gfp);
+ void *data = kvmalloc_array(new_size, element_size, gfp);
if (!data)
return -ENOMEM;
+ memcpy(data, d->data, d->size * element_size);
+ kvfree(d->data);
d->data = data;
d->size = new_size;
}
#define darray_exit(_d) \
do { \
- kfree((_d)->data); \
+ kvfree((_d)->data); \
darray_init(_d); \
} while (0)
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
int ret = 0;
- ret = bch2_btree_write_buffer_flush(trans);
+ ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
#include "bcachefs_format.h"
struct bch_replicas_padded {
- struct bch_replicas_entry e;
+ struct bch_replicas_entry_v1 e;
u8 pad[BCH_BKEY_PTRS_MAX];
};
x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
x(0, backpointer_to_overwritten_btree_node) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
}
}
-static void bch2_dio_read_complete(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_dio_read_complete)
{
- struct dio_read *dio = container_of(cl, struct dio_read, cl);
+ closure_type(dio, struct dio_read, cl);
dio->req->ki_complete(dio->req, dio->ret);
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
return 0;
}
-static void bch2_dio_write_flush_done(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
{
- struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+ closure_type(dio, struct dio_write, op.cl);
struct bch_fs *c = dio->op.c;
closure_debug_destroy(cl);
abs(pos_src - pos_dst) < len)
return -EINVAL;
- bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+ lock_two_nondirectories(&src->v, &dst->v);
+ bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
ret = bch2_flush_inode(c, dst);
err:
bch2_quota_reservation_put(c, dst, "a_res);
- bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+ bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
+ unlock_two_nondirectories(&src->v, &dst->v);
return bch2_err_class(ret);
}
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
- struct filename *name;
struct path path;
struct inode *dir;
- struct dentry *victim;
int ret = 0;
if (arg.flags)
return -EINVAL;
- name = getname((const char __user *)(unsigned long)arg.dst_ptr);
- victim = filename_path_locked(arg.dirfd, name, &path);
- putname(name);
- if (IS_ERR(victim))
- return PTR_ERR(victim);
+ ret = user_path_at(arg.dirfd,
+ (const char __user *)(unsigned long)arg.dst_ptr,
+ LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ret;
- if (victim->d_sb->s_fs_info != c) {
+ if (path.dentry->d_sb->s_fs_info != c) {
ret = -EXDEV;
goto err;
}
- dir = d_inode(path.dentry);
- ret = __bch2_unlink(dir, victim, true);
- if (!ret) {
- fsnotify_rmdir(dir, victim);
- d_delete(victim);
- }
- inode_unlock(dir);
+ dir = path.dentry->d_parent->d_inode;
+
+ ret = __bch2_unlink(dir, path.dentry, true);
+ if (ret)
+ goto err;
+
+ fsnotify_rmdir(dir, path.dentry);
+ d_delete(path.dentry);
err:
- dput(victim);
path_put(&path);
return ret;
}
if (!first)
seq_putc(seq, ':');
first = false;
- seq_puts(seq, "/dev/");
- seq_puts(seq, ca->name);
+ seq_puts(seq, ca->disk_sb.sb_name);
}
return 0;
sb->s_flags |= SB_POSIXACL;
#endif
- sb->s_shrink.seeks = 0;
+ sb->s_shrink->seeks = 0;
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
}
enum bch_inode_lock_op {
- INODE_LOCK = (1U << 0),
- INODE_PAGECACHE_BLOCK = (1U << 1),
- INODE_UPDATE_LOCK = (1U << 2),
+ INODE_PAGECACHE_BLOCK = (1U << 0),
+ INODE_UPDATE_LOCK = (1U << 1),
};
#define bch2_lock_inodes(_locks, ...) \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_LOCK) \
- down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_LOCK) \
- up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
goto out;
}
+/*
+ * Look up position @p in the deleted_inodes btree.
+ *
+ * Returns 1 if the key found there is of type KEY_TYPE_set, 0 if it is not,
+ * or the error reported by the lookup (presumably a negative error code, per
+ * kernel convention - callers rely on it being distinguishable from 0/1).
+ */
+static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	/* Read the key while the iterator that returned it is still live */
+	ret = k.k->type == KEY_TYPE_set;
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
return 0;
}
+	if (u.bi_flags & BCH_INODE_unlinked &&
+	    c->sb.version >= bcachefs_metadata_version_deleted_inodes) {
+		/*
+		 * check_inode_deleted_list() returns 1 when the inode is on
+		 * the deleted list and 0 when it is not; only a negative
+		 * value is an error. A positive result must not be returned
+		 * to our caller, and the fsck error is for the "not on the
+		 * list" case, i.e. ret == 0.
+		 */
+		ret = check_inode_deleted_list(trans, k.k->p);
+		if (ret < 0)
+			return ret;
+
+		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
+			    "inode %llu:%u unlinked, but not on deleted list",
+			    u.bi_inum, k.k->p.snapshot);
+		/* don't let the 0/1 lookup result leak into later checks */
+		ret = 0;
+	}
+
if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
fsck_err(c, inode_unlinked_but_clean,
again:
need_another_pass = false;
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
}
bch2_trans_iter_exit(trans, &iter);
- if (!ret && need_another_pass)
+ if (!ret && need_another_pass) {
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
goto again;
+ }
err:
bch2_trans_put(trans);
__wp_update_state(wp, state);
}
-static void bch2_write_index(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_write_index)
{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ closure_type(op, struct bch_write_op, cl);
struct write_point *wp = op->wp;
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
bch2_nocow_write_convert_unwritten(op);
}
-static void bch2_nocow_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_nocow_write_done)
{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ closure_type(op, struct bch_write_op, cl);
__bch2_nocow_write_done(op);
bch2_write_done(cl);
op->insert_keys.top = op->insert_keys.keys;
} else if (op->flags & BCH_WRITE_SYNC) {
closure_sync(&op->cl);
- bch2_nocow_write_done(&op->cl);
+ bch2_nocow_write_done(&op->cl.work);
} else {
/*
* XXX
* If op->discard is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
-void bch2_write(struct closure *cl)
+CLOSURE_CALLBACK(bch2_write)
{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ closure_type(op, struct bch_write_op, cl);
struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
unsigned data_len;
op->devs_need_flush = NULL;
}
-void bch2_write(struct closure *);
-
+CLOSURE_CALLBACK(bch2_write);
void bch2_write_point_do_index_updates(struct work_struct *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "error.h"
#include "journal.h"
bch2_journal_reclaim_fast(j);
if (write)
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+ wake_up(&j->wait);
}
/*
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ trace_journal_entry_close(c, vstruct_bytes(buf->data));
+
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
+ buf->need_flush_to_write_buffer = true;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
journal_quiesce(j);
}
+/*
+ * XXX: closing the current journal entry here is not ideal, but without it
+ * there is no way to avoid racing with res_get() - checking j->blocked alone
+ * would race.
+ *
+ * Returns true when all four per-buffer counts in the journal reservation
+ * state are zero.
+ */
+static bool journal_reservations_stopped(struct journal *j)
+{
+	union journal_res_state state;
+
+	journal_entry_close(j);
+
+	state.v = atomic64_read_acquire(&j->reservations.counter);
+
+	if (state.buf0_count || state.buf1_count)
+		return false;
+
+	return !state.buf2_count && !state.buf3_count;
+}
+
+/*
+ * Bump j->blocked (under j->lock) and then sleep on j->wait until
+ * journal_reservations_stopped() reports true.
+ *
+ * j->blocked is not decremented here; the caller visible in this patch
+ * follows up with bch2_journal_unblock().
+ */
+void bch2_journal_block_reservations(struct journal *j)
+{
+ spin_lock(&j->lock);
+ j->blocked++;
+ spin_unlock(&j->lock);
+
+ /* the condition itself closes the current journal entry each time it is evaluated */
+ wait_event(j->wait, journal_reservations_stopped(j));
+}
+
+/*
+ * Scan journal buffers from the last unwritten seq up to min(@max_seq,
+ * current seq) for the first one still flagged need_flush_to_write_buffer.
+ *
+ * If that buffer is the currently open entry it is closed first. Returns the
+ * buffer when its count in the reservation state is zero, ERR_PTR(-EAGAIN)
+ * when that count is still nonzero, or NULL when no buffer in range needs
+ * flushing. j->lock is held for the duration of the scan.
+ */
+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+	struct journal_buf *found = NULL;
+
+	spin_lock(&j->lock);
+	max_seq = min(max_seq, journal_cur_seq(j));
+
+	for (u64 seq = journal_last_unwritten_seq(j); seq <= max_seq; seq++) {
+		unsigned idx = seq & JOURNAL_BUF_MASK;
+		struct journal_buf *candidate = &j->buf[idx];
+		union journal_res_state state;
+
+		if (!candidate->need_flush_to_write_buffer)
+			continue;
+
+		if (seq == journal_cur_seq(j))
+			__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
+		state.v = atomic64_read_acquire(&j->reservations.counter);
+
+		/* only the first flagged buffer is ever considered */
+		if (journal_state_count(state, idx))
+			found = ERR_PTR(-EAGAIN);
+		else
+			found = candidate;
+		break;
+	}
+
+	spin_unlock(&j->lock);
+	return found;
+}
+
+/*
+ * Blocking wrapper around __bch2_next_write_buffer_flush_journal_buf():
+ * sleeps on j->wait, re-running the lookup, for as long as it returns
+ * ERR_PTR(-EAGAIN).
+ *
+ * Returns the journal buffer found, or NULL if none up to @max_seq needs
+ * flushing to the write buffer.
+ */
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+ struct journal_buf *ret;
+
+ /* ret is (re)assigned as a side effect of evaluating the wait condition */
+ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
+ return ret;
+}
+
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
static struct lock_class_key res_key;
unsigned i;
+ mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
{
union journal_res_state s;
- s.v = atomic64_sub_return(((union journal_res_state) {
+ s.v = atomic64_sub_return_release(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
.buf2_count = idx == 2,
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
+void bch2_journal_block_reservations(struct journal *);
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
journal_entry_btree_keys_to_text(out, c, entry);
}
+/*
+ * Validate a write_buffer_keys journal entry by delegating to the btree_keys
+ * validator.
+ *
+ * The caller's @flags are forwarded unchanged rather than being replaced by
+ * a hard-coded READ, so the validator sees the same read/write context as
+ * every other entry type. Returns whatever
+ * journal_entry_btree_keys_validate() returns.
+ */
+static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, flags);
+}
+
+/*
+ * Print a write_buffer_keys journal entry to @out; simply forwards to
+ * journal_entry_btree_keys_to_text().
+ */
+static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
return 0;
}
-static void bch2_journal_read_device(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_journal_read_device)
{
- struct journal_device *ja =
- container_of(cl, struct journal_device, read);
+ closure_type(ja, struct journal_device, read);
struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
/* we aren't holding j->lock: */
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
if (buf->buf_size >= new_size)
return;
+ size_t btree_write_buffer_size = new_size / 64;
+
+ if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
+ return;
+
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
-static void journal_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_done)
{
- struct journal *j = container_of(cl, struct journal, io);
+ closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
+ bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
percpu_ref_put(&ca->io_ref);
}
-static void do_journal_write(struct closure *cl)
+static CLOSURE_CALLBACK(do_journal_write)
{
- struct journal *j = container_of(cl, struct journal, io);
+ closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct jset_entry *start, *end, *i, *next, *prev = NULL;
struct jset *jset = w->data;
+ struct journal_keys_to_wb wb = { NULL };
unsigned sectors, bytes, u64s;
- bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
+ bool validate_before_checksum = false;
+ u64 seq = le64_to_cpu(jset->seq);
int ret;
/*
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
- if (i->type == BCH_JSET_ENTRY_btree_root) {
+ switch (i->type) {
+ case BCH_JSET_ENTRY_btree_root:
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
+ break;
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ EBUG_ON(!w->need_flush_to_write_buffer);
+
+ if (!wb.wb)
+ bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
+
+ struct bkey_i *k;
+ jset_entry_for_each_key(i, k) {
+ ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
+ if (ret) {
+ bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ return ret;
+ }
+ }
+ i->type = BCH_JSET_ENTRY_btree_keys;
+ break;
}
/* Can we merge with previous entry? */
memmove_u64s_down(prev, i, jset_u64s(u64s));
}
+ if (wb.wb)
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ w->need_flush_to_write_buffer = false;
+
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
- bch2_journal_super_entries_add_common(c, &end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
- j->last_empty_seq = le64_to_cpu(jset->seq);
+ j->last_empty_seq = seq;
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
- w->noflush = true;
+ w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
w->last_seq = 0;
return 0;
}
-void bch2_journal_write(struct closure *cl)
+CLOSURE_CALLBACK(bch2_journal_write)
{
- struct journal *j = container_of(cl, struct journal, io);
+ closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
if (ret)
goto err;
+ mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
+ mutex_unlock(&j->buf_lock);
if (ret)
goto err;
int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
-void bch2_journal_write(struct closure *);
+CLOSURE_CALLBACK(bch2_journal_write);
#endif /* _BCACHEFS_JOURNAL_IO_H */
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
return available;
}
-static inline void journal_set_watermark(struct journal *j)
+void bch2_journal_set_watermark(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool low_on_space = j->space[journal_space_clean].total * 4 <=
j->space[journal_space_total].total;
bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
- unsigned watermark = low_on_space || low_on_pin
+ bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
+ unsigned watermark = low_on_space || low_on_pin || low_on_wb
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
&j->low_on_space_start, low_on_space) ||
track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
- &j->low_on_pin_start, low_on_pin))
+ &j->low_on_pin_start, low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
+ &j->write_buffer_full_start, low_on_wb))
trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
- journal_set_watermark(j);
+ bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
* all btree nodes got written out
*/
while (!fifo_empty(&j->pin) &&
+ j->pin.front <= j->seq_ondisk &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
j->pin.front++;
popped = true;
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
unsigned flags;
flags = memalloc_noreclaim_save();
do {
- if (kthread && kthread_should_stop())
+ if (kthread_should_stop())
break;
if (bch2_journal_error(j)) {
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
enum journal_space_from);
+void bch2_journal_set_watermark(struct journal *);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
+ bool need_flush_to_write_buffer;
};
/*
*/
darray_u64 early_journal_entries;
+ /*
+ * Protects journal_buf->data, when accessing without a journal
+ * reservation: for synchronization between the btree write buffer code
+ * and the journal write path:
+ */
+ struct mutex buf_lock;
/*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
u64 low_on_space_start;
u64 low_on_pin_start;
u64 max_in_flight_start;
+ u64 write_buffer_full_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
#include <linux/ioprio.h>
#include <linux/kthread.h>
+/*
+ * Data job names, generated from the BCH_DATA_OPS() x-macro: each op's
+ * stringified name is stored at the index given by its number, followed by
+ * a NULL entry.
+ */
+const char * const bch2_data_ops_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DATA_OPS()
+#undef x
+ NULL
+};
+
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
{
if (trace_move_extent_enabled()) {
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
+/*
+ * Wait (via move_ctxt_wait_event()) until ctxt->reads is empty, then
+ * closure_sync() on ctxt->cl.
+ */
+static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
+{
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+ closure_sync(&ctxt->cl);
+}
+
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
struct bch_fs *c = ctxt->trans->c;
- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
- closure_sync(&ctxt->cl);
+ bch2_moving_ctxt_flush_all(ctxt);
EBUG_ON(atomic_read(&ctxt->write_sectors));
EBUG_ON(atomic_read(&ctxt->write_ios));
trace_move_data(c, stats);
}
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
+void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
memset(stats, 0, sizeof(*stats));
stats->data_type = BCH_DATA_user;
struct bch_fs *c = ctxt->trans->c;
u64 delay;
- if (ctxt->wait_on_copygc && !c->copygc_running) {
- bch2_trans_unlock_long(ctxt->trans);
+ if (ctxt->wait_on_copygc && c->copygc_running) {
+ bch2_moving_ctxt_flush_all(ctxt);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
set_current_state(TASK_INTERRUPTIBLE);
}
- if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
+ if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
return 1;
}
schedule_timeout(delay);
if (unlikely(freezing(current))) {
- move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+ bch2_moving_ctxt_flush_all(ctxt);
try_to_freeze();
}
} while (delay);
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
fragmentation = a->fragmentation_lru;
- ret = bch2_btree_write_buffer_flush(trans);
- if (ret) {
- bch_err_msg(c, ret, "flushing btree write buffer");
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ bch_err_msg(c, ret, "flushing btree write buffer");
+ if (ret)
goto err;
- }
while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
- enum btree_id start_btree_id, struct bpos start_pos,
- enum btree_id end_btree_id, struct bpos end_pos,
+ struct bbpos start,
+ struct bbpos end,
move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
- bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct moving_context ctxt;
struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
- enum btree_id id;
+ enum btree_id btree;
struct data_update_opts data_opts;
int ret = 0;
stats->data_type = BCH_DATA_btree;
- for (id = start_btree_id;
- id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
- id++) {
- stats->pos = BBPOS(id, POS_MIN);
+ for (btree = start.btree;
+ btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+ btree ++) {
+ stats->pos = BBPOS(btree, POS_MIN);
- if (!bch2_btree_id_root(c, id)->b)
+ if (!bch2_btree_id_root(c, btree)->b)
continue;
- bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
BTREE_ITER_PREFETCH);
retry:
ret = 0;
while (bch2_trans_begin(trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
- if (kthread && kthread_should_stop())
+ if (kthread_should_stop())
break;
- if ((cmp_int(id, end_btree_id) ?:
- bpos_cmp(b->key.k.p, end_pos)) > 0)
+ if ((cmp_int(btree, end.btree) ?:
+ bpos_cmp(b->key.k.p, end.pos)) > 0)
break;
stats->pos = BBPOS(iter.btree_id, iter.pos);
bch2_trans_iter_exit(trans, &iter);
- if (kthread && kthread_should_stop())
+ if (kthread_should_stop())
break;
}
int ret;
ret = bch2_move_btree(c,
- 0, POS_MIN,
- BTREE_ID_NR, SPOS_MAX,
+ BBPOS_MIN,
+ BBPOS_MAX,
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
return ret;
}
+/*
+ * Move predicate for the drop_extra_replicas data job.
+ *
+ * Walks the pointers of @k and marks a pointer for removal (its index bit in
+ * data_opts->kill_ptrs) whenever dropping that pointer's durability would
+ * still leave at least the wanted number of replicas: metadata_replicas for
+ * btree pointers, io_opts->data_replicas otherwise.
+ *
+ * Returns true when data_opts->kill_ptrs is nonzero on exit.
+ *
+ * NOTE(review): kill_ptrs is only OR-ed into and then tested, so the result
+ * depends on the caller handing in a zeroed data_opts. The data_opts
+ * declaration in bch2_move_btree() has no initializer - confirm it is
+ * cleared before the btree predicate is invoked.
+ */
+static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned durability = bch2_bkey_durability(c, k);
+ unsigned replicas = bkey_is_btree_ptr(k.k)
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+
+ /* pointers with zero durability are never selected */
+ if (d && durability - d >= replicas) {
+ data_opts->kill_ptrs |= BIT(i);
+ /* account for the drop so later pointers see the reduced total */
+ durability -= d;
+ }
+
+ i++;
+ }
+
+ return data_opts->kill_ptrs != 0;
+}
+
+/*
+ * Btree-node flavour of drop_extra_replicas_pred(): applies the same
+ * predicate to the node's own key (b->key).
+ */
+static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
{
+ struct bbpos start = BBPOS(op.start_btree, op.start_pos);
+ struct bbpos end = BBPOS(op.end_btree, op.end_pos);
int ret = 0;
+ if (op.op >= BCH_DATA_OP_NR)
+ return -EINVAL;
+
+ bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
+
switch (op.op) {
- case BCH_DATA_OP_REREPLICATE:
- bch2_move_stats_init(stats, "rereplicate");
+ case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
-
- ret = bch2_move_btree(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ ret = bch2_move_btree(c, start, end,
rereplicate_btree_pred, c, stats) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
-
- ret = bch2_move_data(c,
- (struct bbpos) { op.start_btree, op.start_pos },
- (struct bbpos) { op.end_btree, op.end_pos },
+ ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
-
- bch2_move_stats_exit(stats, c);
break;
- case BCH_DATA_OP_MIGRATE:
+ case BCH_DATA_OP_migrate:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
- bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
-
- ret = bch2_move_btree(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ ret = bch2_move_btree(c, start, end,
migrate_btree_pred, &op, stats) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
-
- ret = bch2_move_data(c,
- (struct bbpos) { op.start_btree, op.start_pos },
- (struct bbpos) { op.end_btree, op.end_pos },
+ ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
-
- bch2_move_stats_exit(stats, c);
break;
- case BCH_DATA_OP_REWRITE_OLD_NODES:
- bch2_move_stats_init(stats, "rewrite_old_nodes");
+ case BCH_DATA_OP_rewrite_old_nodes:
ret = bch2_scan_old_btree_nodes(c, stats);
- bch2_move_stats_exit(stats, c);
+ break;
+ case BCH_DATA_OP_drop_extra_replicas:
+ ret = bch2_move_btree(c, start, end,
+ drop_extra_replicas_btree_pred, c, stats) ?: ret;
+ ret = bch2_move_data(c, start, end, NULL, stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ drop_extra_replicas_pred, c) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
ret = -EINVAL;
}
+ bch2_move_stats_exit(stats, c);
return ret;
}
typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
+extern const char * const bch2_data_ops_strs[];
+
void bch2_moving_ctxt_exit(struct moving_context *);
void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, char *);
+void bch2_move_stats_init(struct bch_move_stats *, const char *);
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
move_buckets_wait(ctxt, buckets_in_flight, false);
- ret = bch2_btree_write_buffer_flush(trans);
- if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ if (bch2_err_matches(ret, EROFS))
+ return ret;
+
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
__func__, bch2_err_str(ret)))
return ret;
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
- x(btree_write_buffer_size, u32, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(16, (1U << 20) - 1), \
- BCH2_NO_SB_OPT, 1U << 13, \
- NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
goto err;
}
+ BUG_ON(!atomic_read(&keys->ref));
+
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
bch2_trans_put(trans);
trans = NULL;
+ if (!c->opts.keep_journal)
+ bch2_journal_keys_put_initial(c);
+
replay_now_at(j, j->replay_journal_seq_end);
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
- bch2_journal_flush_all_pins(j);
- ret = bch2_journal_error(j);
- if (keys->nr && !ret)
+ if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
if (trans)
bch2_move_stats_init(&stats, "recovery");
- bch_info(c, "scanning for old btree nodes");
- ret = bch2_fs_read_write(c) ?:
+ struct printbuf buf = PRINTBUF;
+ bch2_version_to_text(&buf, c->sb.version_min);
+ bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
+ printbuf_exit(&buf);
+
+ ret = bch2_fs_read_write_early(c) ?:
bch2_scan_old_btree_nodes(c, &stats);
if (ret)
goto err;
bch2_flush_fsck_errs(c);
if (!c->opts.keep_journal &&
- test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) {
- bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(c);
- }
+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+ bch2_journal_keys_put_initial(c);
kfree(clean);
if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
/* Replicas tracking - in memory: */
-static void verify_replicas_entry(struct bch_replicas_entry *e)
+static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
#endif
}
-void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
}
void bch2_replicas_entry_to_text(struct printbuf *out,
- struct bch_replicas_entry *e)
+ struct bch_replicas_entry_v1 *e)
{
unsigned i;
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_cpu_replicas_entry(r, e) {
}
static void extent_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *r)
+ struct bch_replicas_entry_v1 *r)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
}
static void stripe_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *r)
+ struct bch_replicas_entry_v1 *r)
{
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
r->devs[r->nr_devs++] = ptr->dev;
}
-void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
struct bkey_s_c k)
{
e->nr_devs = 0;
bch2_replicas_entry_sort(e);
}
-void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
- struct bch_replicas_entry *new_entry)
+ struct bch_replicas_entry_v1 *new_entry)
{
unsigned i;
struct bch_replicas_cpu new = {
}
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
int idx, entry_size = replicas_entry_bytes(search);
}
int bch2_replicas_entry_idx(struct bch_fs *c,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
bch2_replicas_entry_sort(search);
}
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
return __replicas_entry_idx(r, search) >= 0;
}
bool bch2_replicas_marked(struct bch_fs *c,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
bool marked;
static unsigned reserve_journal_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
unsigned journal_res_u64s = 0;
/* nr_inodes: */
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_entry *new_entry)
+ struct bch_replicas_entry_v1 *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
int ret = 0;
goto out;
}
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
return likely(bch2_replicas_marked(c, r))
? 0 : bch2_mark_replicas_slowpath(c, r);
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
unsigned i = 0;
lockdep_assert_held(&c->replicas_gc_lock);
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
}
int bch2_replicas_set_usage(struct bch_fs *c,
- struct bch_replicas_entry *r,
+ struct bch_replicas_entry_v1 *r,
u64 sectors)
{
int ret, idx = bch2_replicas_entry_idx(c, r);
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
struct bch_replicas_cpu *cpu_r)
{
- struct bch_replicas_entry *e, *dst;
+ struct bch_replicas_entry_v1 *e, *dst;
unsigned nr = 0, entry_size = 0, idx = 0;
for_each_replicas_entry(sb_r, e) {
nr++;
}
- entry_size += sizeof(struct bch_replicas_entry) -
+ entry_size += sizeof(struct bch_replicas_entry_v1) -
sizeof(struct bch_replicas_entry_v0);
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
- struct bch_replicas_entry *dst =
+ struct bch_replicas_entry_v1 *dst =
cpu_replicas_entry(cpu_r, idx++);
dst->data_type = e->data_type;
{
struct bch_sb_field_replicas_v0 *sb_r;
struct bch_replicas_entry_v0 *dst;
- struct bch_replicas_entry *src;
+ struct bch_replicas_entry_v1 *src;
size_t bytes;
bytes = sizeof(struct bch_sb_field_replicas);
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *dst, *src;
+ struct bch_replicas_entry_v1 *dst, *src;
bool need_v1 = false;
size_t bytes;
memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
if (e->data_type >= BCH_DATA_NR) {
}
if (i + 1 < cpu_r->nr) {
- struct bch_replicas_entry *n =
+ struct bch_replicas_entry_v1 *n =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
struct bch_sb_field *f)
{
struct bch_sb_field_replicas *r = field_to_type(f, replicas);
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_replicas_entry(r, e) {
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, bool print)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool ret = true;
percpu_down_read(&c->mark_lock);
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
- struct bch_replicas_entry *r;
+ struct bch_replicas_entry_v1 *r;
for_each_replicas_entry(replicas, r)
for (i = 0; i < r->nr_devs; i++)
#include "eytzinger.h"
#include "replicas_types.h"
-void bch2_replicas_entry_sort(struct bch_replicas_entry *);
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
-static inline struct bch_replicas_entry *
+static inline struct bch_replicas_entry_v1 *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
-void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
static inline struct replicas_delta *
replicas_delta_next(struct replicas_delta *d)
int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
-void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
unsigned dev)
{
e->data_type = BCH_DATA_cached;
int bch2_replicas_gc2(struct bch_fs *);
int bch2_replicas_set_usage(struct bch_fs *,
- struct bch_replicas_entry *,
+ struct bch_replicas_entry_v1 *,
u64);
#define for_each_cpu_replicas_entry(_r, _i) \
struct bch_replicas_cpu {
unsigned nr;
unsigned entry_size;
- struct bch_replicas_entry *entries;
+ struct bch_replicas_entry_v1 *entries;
};
struct replicas_delta {
s64 delta;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
struct replicas_delta_list {
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
x(root_inode_not_dir, 240) \
x(dir_loop, 241) \
x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243)
+ x(hash_table_key_wrong_offset, 243) \
+ x(unlinked_inode_not_on_deleted_list, 244)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
}
EXPORT_SYMBOL_GPL(six_relock_ip);
-#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
static inline bool six_owner_running(struct six_lock *lock)
{
if (!IS_ERR_OR_NULL(sb->bdev))
blkdev_put(sb->bdev, sb->holder);
kfree(sb->holder);
+ kfree(sb->sb_name);
kfree(sb->sb);
memset(sb, 0, sizeof(*sb));
return 0;
}
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
+int __bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
struct printbuf err = PRINTBUF;
+ struct printbuf err2 = PRINTBUF;
__le64 *i;
int ret;
#ifndef __KERNEL__
if (!sb->holder)
return -ENOMEM;
+ sb->sb_name = kstrdup(path, GFP_KERNEL);
+ if (!sb->sb_name)
+ return -ENOMEM;
+
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
sb->mode |= BLK_OPEN_BUFFERED;
if (opt_defined(*opts, sb))
goto err;
- printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
+ prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
+ if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
+ printk(KERN_INFO "%s", err2.buf);
+ else
+ printk(KERN_ERR "%s", err2.buf);
+
+ printbuf_exit(&err2);
printbuf_reset(&err);
/*
goto out;
}
+/*
+ * Read the superblock at @path into @sb: calls __bch2_read_super() with
+ * ignore_notbchfs_msg set to false and returns its result.
+ */
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, false);
+}
+
+/*
+ * provide a silenced version for mount.bcachefs: same as bch2_read_super()
+ * but calls __bch2_read_super() with ignore_notbchfs_msg set to true
+ */
+
+int bch2_read_super_silent(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, true);
+}
+
/* write superblock: */
static void write_super_endio(struct bio *bio)
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
void __bch2_check_set_feature(struct bch_fs *, unsigned);
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
- BUG_ON(c->btree_write_buffer.state.nr);
+ BUG_ON(c->btree_write_buffer.inc.keys.nr);
+ BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
- bch2_journal_keys_free(&c->journal_keys);
- bch2_journal_entries_free(c);
+ bch2_journal_keys_put_initial(c);
+ BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
free_percpu(c->online_reserved);
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
+ atomic_set(&c->journal_keys.ref, 1);
+ c->journal_keys.initial_ref_held = true;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);
struct bch_sb_handle {
struct bch_sb *sb;
struct block_device *bdev;
+ char *sb_name;
struct bio *bio;
void *holder;
size_t buffer_size;
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
- c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+ c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
}
if (attr == &sysfs_btree_wakeup)
TP_ARGS(c)
);
+/*
+ * journal_entry_close tracepoint: records c->dev and the @bytes value passed
+ * by the caller, printed as "major,minor entry bytes N".
+ */
+TRACE_EVENT(journal_entry_close,
+ TP_PROTO(struct bch_fs *c, unsigned bytes),
+ TP_ARGS(c, bytes),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __field(u32, bytes )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = c->dev;
+ __entry->bytes = bytes;
+ ),
+
+ TP_printk("%d,%d entry bytes %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->bytes)
+);
+
DEFINE_EVENT(bio, journal_write,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
__entry->nr, __entry->size, __entry->skipped, __entry->fast)
);
+/*
+ * write_buffer_flush_sync tracepoint: records the transaction's function
+ * name (trans->fn, copied with strscpy() into a 32-byte array) and the
+ * caller's instruction pointer.
+ */
+TRACE_EVENT(write_buffer_flush_sync,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
TRACE_EVENT(write_buffer_flush_slowpath,
- TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
- TP_ARGS(trans, nr, size),
+ TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
+ TP_ARGS(trans, slowpath, total),
TP_STRUCT__entry(
- __field(size_t, nr )
- __field(size_t, size )
+ __field(size_t, slowpath )
+ __field(size_t, total )
),
TP_fast_assign(
- __entry->nr = nr;
- __entry->size = size;
+ __entry->slowpath = slowpath;
+ __entry->total = total;
),
- TP_printk("%zu/%zu", __entry->nr, __entry->size)
+ TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
);
#endif /* _TRACE_BCACHEFS_H */
closure_debug_destroy(cl);
if (destructor)
- destructor(cl);
+ destructor(&cl->work);
if (parent)
closure_put(parent);
int done;
};
-static void closure_sync_fn(struct closure *cl)
+static CLOSURE_CALLBACK(closure_sync_fn)
{
+ struct closure *cl = container_of(ws, struct closure, work);
struct closure_syncer *s = cl->s;
struct task_struct *p;
static LIST_HEAD(shrinker_list);
static DEFINE_MUTEX(shrinker_lock);
-int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+/*
+ * Allocate a zero-filled struct shrinker with calloc().
+ *
+ * @flags and the name format string (with its varargs) are accepted but not
+ * used here. Returns NULL if the allocation fails.
+ */
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
+{
+ /* calloc() takes (element count, element size), in that order */
+ return calloc(1, sizeof(struct shrinker));
+}
+
+int shrinker_register(struct shrinker *shrinker)
{
mutex_lock(&shrinker_lock);
list_add_tail(&shrinker->list, &shrinker_list);