-8436db7aac9ced2118bf19b8f1bf3682f479d17e
+1d669389f79de8571732c13fdf4d23039e2308fd
struct bkey_inode_buf packed;
int ret;
- bch2_inode_pack(&packed, inode);
+ bch2_inode_pack(c, &packed, inode);
ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
NULL, NULL, 0);
if (ret)
#define cpu_present(cpu) ((cpu) == 0)
#define cpu_active(cpu) ((cpu) == 0)
+#define raw_smp_processor_id() 0U
+
#define for_each_cpu(cpu, mask) \
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
#define for_each_cpu_not(cpu, mask) \
#define kmap_atomic(page) page_address(page)
#define kunmap_atomic(addr) do {} while (0)
+#define PageHighMem(page) false
+
static const char zero_page[PAGE_SIZE];
#define ZERO_PAGE(o) ((struct page *) &zero_page[0])
TP_ARGS(ip)
);
-DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock,
- TP_PROTO(unsigned long ip),
- TP_ARGS(ip)
+TRACE_EVENT(trans_restart_would_deadlock,
+ TP_PROTO(unsigned long trans_ip,
+ unsigned long caller_ip,
+ unsigned reason,
+ enum btree_id have_btree_id,
+ unsigned have_iter_type,
+ enum btree_id want_btree_id,
+ unsigned want_iter_type),
+ TP_ARGS(trans_ip, caller_ip, reason,
+ have_btree_id, have_iter_type,
+ want_btree_id, want_iter_type),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, trans_ip )
+ __field(unsigned long, caller_ip )
+ __field(u8, reason )
+ __field(u8, have_btree_id )
+ __field(u8, have_iter_type )
+ __field(u8, want_btree_id )
+ __field(u8, want_iter_type )
+ ),
+
+ TP_fast_assign(
+ __entry->trans_ip = trans_ip;
+ __entry->caller_ip = caller_ip;
+ __entry->reason = reason;
+ __entry->have_btree_id = have_btree_id;
+ __entry->have_iter_type = have_iter_type;
+ __entry->want_btree_id = want_btree_id;
+ __entry->want_iter_type = want_iter_type;
+ ),
+
+ TP_printk("%pF %pF because %u have %u:%u want %u:%u",
+ (void *) __entry->trans_ip,
+ (void *) __entry->caller_ip,
+ __entry->reason,
+ __entry->have_btree_id,
+ __entry->have_iter_type,
+ __entry->want_btree_id,
+ __entry->want_iter_type)
);
TRACE_EVENT(trans_restart_iters_realloced,
static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
size_t bucket)
{
- if (expensive_debug_checks(c)) {
+ if (bch2_expensive_debug_checks) {
size_t iter;
long i;
unsigned j;
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \
+ "Verify btree accounting for keys within a node") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+BCH_DEBUG_PARAMS()
+#undef BCH_DEBUG_PARAM
+
+#ifndef CONFIG_BCACHEFS_DEBUG
+#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
+BCH_DEBUG_PARAMS_DEBUG()
+#undef BCH_DEBUG_PARAM
+#endif
+
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
x(btree_node_split) \
u64 journal_seq_base;
};
+struct btree_iter_buf {
+ struct btree_iter *iter;
+};
+
struct bch_fs {
struct closure cl;
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
mempool_t btree_iters_pool;
+ struct btree_iter_buf __percpu *btree_iters_bufs;
struct btree_key_cache btree_key_cache;
struct mutex verify_lock;
#endif
- u64 unused_inode_hint;
+ u64 *unused_inode_hints;
+ unsigned inode_shard_bits;
/*
* A btree node on disk could have too many bsets for an iterator to fit
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
-#define BCH_DEBUG_PARAM(name, description) bool name;
- BCH_DEBUG_PARAMS_ALL()
-#undef BCH_DEBUG_PARAM
-
struct time_stats times[BCH_TIME_STAT_NR];
};
} __attribute__((packed, aligned(8)));
#define BCH_INODE_FIELDS() \
- x(bi_atime, 64) \
- x(bi_ctime, 64) \
- x(bi_mtime, 64) \
- x(bi_otime, 64) \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
x(bi_size, 64) \
x(bi_sectors, 64) \
x(bi_uid, 32) \
#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
/* Dirents */
x(btree_ptr_v2, 11) \
x(extents_above_btree_updates, 12) \
x(btree_updates_journalled, 13) \
- x(reflink_inline_data, 14)
+ x(reflink_inline_data, 14) \
+ x(new_varint, 15)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
- (1ULL << BCH_FEATURE_extents_above_btree_updates))
+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
+ (1ULL << BCH_FEATURE_new_varint))\
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
if ((*p & mask) != mask) {
*p += 1ULL << offset;
- EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0);
+ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
return true;
}
}
__pure __flatten
-int __bch2_bkey_cmp_packed(const struct bkey_packed *l,
- const struct bkey_packed *r,
- const struct btree *b)
+int bch2_bkey_cmp_packed(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r)
{
struct bkey unpacked;
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
-#define bkey_packed_typecheck(_k) \
-({ \
- BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
- !type_is(_k, struct bkey_packed *)); \
- type_is(_k, struct bkey_packed *); \
-})
-
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
BKEY_PACKED_NONE,
};
-#define bkey_lr_packed_typecheck(_l, _r) \
- (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
-
#define bkey_lr_packed(_l, _r) \
((_l)->format + ((_r)->format << 1))
const struct bpos *);
__pure
-int __bch2_bkey_cmp_packed(const struct bkey_packed *,
- const struct bkey_packed *,
- const struct btree *);
+int bch2_bkey_cmp_packed(const struct btree *,
+ const struct bkey_packed *,
+ const struct bkey_packed *);
__pure
int __bch2_bkey_cmp_left_packed(const struct btree *,
return bkey_cmp_left_packed(b, l, &r);
}
-/*
- * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
- * skip dispatching on k->format:
- */
-#define bkey_cmp_packed(_b, _l, _r) \
-({ \
- int _cmp; \
- \
- switch (bkey_lr_packed_typecheck(_l, _r)) { \
- case BKEY_PACKED_NONE: \
- _cmp = bkey_cmp(((struct bkey *) (_l))->p, \
- ((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_LEFT: \
- _cmp = bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_l), \
- &((struct bkey *) (_r))->p); \
- break; \
- case BKEY_PACKED_RIGHT: \
- _cmp = -bkey_cmp_left_packed((_b), \
- (struct bkey_packed *) (_r), \
- &((struct bkey *) (_l))->p); \
- break; \
- case BKEY_PACKED_BOTH: \
- _cmp = __bch2_bkey_cmp_packed((void *) (_l), \
- (void *) (_r), (_b)); \
- break; \
- } \
- _cmp; \
-})
-
#if 1
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
{
const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type];
enum merge_result ret;
- if (key_merging_disabled(c) ||
+ if (bch2_key_merging_disabled ||
!ops->key_merge ||
l.k->type != r.k->type ||
bversion_cmp(l.k->version, r.k->version) ||
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed(b, l, r) ?:
cmp_int((unsigned long) l, (unsigned long) r);
}
* and should be dropped.
*/
return iter->used >= 2 &&
- !bkey_cmp_packed(iter->b,
+ !bch2_bkey_cmp_packed(iter->b,
iter->data[0].k,
iter->data[1].k);
}
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed(b, l, r) ?:
(int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
(int) l->needs_whiteout - (int) r->needs_whiteout;
}
continue;
while ((next = sort_iter_peek(iter)) &&
- !bkey_cmp_packed(iter->b, in, next)) {
+ !bch2_bkey_cmp_packed(iter->b, in, next)) {
BUG_ON(in->needs_whiteout &&
next->needs_whiteout);
needs_whiteout |= in->needs_whiteout;
struct bkey_packed *l,
struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r) ?:
+ return bch2_bkey_cmp_packed(b, l, r) ?:
(int) bkey_deleted(l) - (int) bkey_deleted(r);
}
return ro_aux_tree_base(b, t)->f + idx;
}
-static void bset_aux_tree_verify(struct btree *b)
+static void bset_aux_tree_verify(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
- struct bset_tree *t;
+ const struct bset_tree *t;
for_each_bset(b, t) {
if (t->aux_data_offset == U16_MAX)
#endif
}
-void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
+void bch2_btree_keys_init(struct btree *b)
{
unsigned i;
b->nsets = 0;
memset(&b->nr, 0, sizeof(b->nr));
-#ifdef CONFIG_BCACHEFS_DEBUG
- b->expensive_debug_checks = expensive_debug_checks;
-#endif
+
for (i = 0; i < MAX_BSETS; i++)
b->set[i].data_offset = U16_MAX;
struct bkey_packed *k = btree_bkey_first(b, t);
unsigned j = 0;
- if (!btree_keys_expensive_checks(b))
+ if (!bch2_expensive_debug_checks)
return;
BUG_ON(bset_has_ro_aux_tree(t));
}
/* bytes remaining - only valid for last bset: */
-static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
bset_aux_tree_verify(b);
return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
}
-static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) /
(sizeof(struct bkey_float) + sizeof(u8));
}
-static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t)
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
{
return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
}
k = p;
}
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
BUG_ON(ret >= orig_k);
for (i = ret
__flatten
static struct bkey_packed *bset_search_tree(const struct btree *b,
- struct bset_tree *t,
- struct bpos *search,
+ const struct bset_tree *t,
+ const struct bpos *search,
const struct bkey_packed *packed_search)
{
struct ro_aux_tree *base = ro_aux_tree_base(b, t);
bkey_iter_pos_cmp(b, m, search) < 0)
m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
struct btree *b)
{
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
bch2_btree_node_iter_verify(iter, b);
bch2_btree_node_iter_next_check(iter, b);
}
struct bset_tree *t;
unsigned end = 0;
- if (btree_keys_expensive_checks(b))
+ if (bch2_expensive_debug_checks)
bch2_btree_node_iter_verify(iter, b);
for_each_bset(b, t) {
iter->data[0].k = __btree_node_key_to_offset(b, prev);
iter->data[0].end = end;
- if (btree_keys_expensive_checks(b))
+ if (bch2_expensive_debug_checks)
bch2_btree_node_iter_verify(iter, b);
return prev;
}
#include <linux/kernel.h>
#include <linux/types.h>
-#include "bcachefs_format.h"
+#include "bcachefs.h"
#include "bkey.h"
#include "bkey_methods.h"
#include "btree_types.h"
* first key in that range of bytes again.
*/
-extern bool bch2_expensive_debug_checks;
-
-static inline bool btree_keys_expensive_checks(const struct btree *b)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
- return bch2_expensive_debug_checks || *b->expensive_debug_checks;
-#else
- return false;
-#endif
-}
-
enum bset_aux_tree_type {
BSET_NO_AUX_TREE,
BSET_RO_AUX_TREE,
#define BSET_CACHELINE 128
-static inline size_t btree_keys_cachelines(struct btree *b)
+static inline size_t btree_keys_cachelines(const struct btree *b)
{
return (1U << b->byte_order) / BSET_CACHELINE;
}
-static inline size_t btree_aux_data_bytes(struct btree *b)
+static inline size_t btree_aux_data_bytes(const struct btree *b)
{
return btree_keys_cachelines(b) * 8;
}
-static inline size_t btree_aux_data_u64s(struct btree *b)
+static inline size_t btree_aux_data_u64s(const struct btree *b)
{
return btree_aux_data_bytes(b) / sizeof(u64);
}
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(dst, src);
- if (btree_keys_expensive_checks(b)) {
+ if (bch2_expensive_debug_checks) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
}
-void bch2_btree_keys_init(struct btree *, bool *);
+void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct bch_fs *, struct btree *,
const struct bkey_packed *l,
const struct bkey_packed *r)
{
- return bkey_cmp_packed(b, l, r)
+ return bch2_bkey_cmp_packed(b, l, r)
?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
?: cmp_int(l, r);
}
static inline void bch2_verify_btree_nr_keys(struct btree *b)
{
- if (btree_keys_expensive_checks(b))
+ if (bch2_debug_check_btree_accounting)
__bch2_verify_btree_nr_keys(b);
}
* - unless btree verify mode is enabled, since it runs out of
* the post write cleanup:
*/
- if (verify_btree_ondisk(c))
+ if (bch2_verify_btree_ondisk)
bch2_btree_node_write(c, b, SIX_LOCK_intent);
else
__bch2_btree_node_write(c, b, SIX_LOCK_read);
unsigned long freed = 0;
unsigned i, flags;
- if (btree_shrinker_disabled(c))
+ if (bch2_btree_shrinker_disabled)
return SHRINK_STOP;
/* Return -1 if we can't do anything right now */
btree_cache.shrink);
struct btree_cache *bc = &c->btree_cache;
- if (btree_shrinker_disabled(c))
+ if (bch2_btree_shrinker_disabled)
return 0;
return btree_cache_can_free(bc) * btree_pages(c);
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
- bch2_btree_keys_init(b, &c->expensive_debug_checks);
+ bch2_btree_keys_init(b);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
*/
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k, unsigned level,
- enum six_lock_type lock_type)
+ enum six_lock_type lock_type,
+ unsigned long trace_ip)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
btree_node_unlock(iter, level + 1);
if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
- lock_node_check_fn, (void *) k)) {
+ lock_node_check_fn, (void *) k, trace_ip)) {
if (b->hash_val != btree_ptr_hash_val(k))
goto retry;
return ERR_PTR(-EINTR);
bch2_bkey_unpack(parent, &tmp.k, k);
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
- SIX_LOCK_intent);
+ SIX_LOCK_intent, _THIS_IP_);
if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
struct btree_iter *linked;
* holding other locks that would cause us to deadlock:
*/
trans_for_each_iter(trans, linked)
- if (btree_iter_cmp(iter, linked) < 0)
+ if (btree_iter_lock_cmp(iter, linked) < 0)
__bch2_btree_iter_unlock(linked);
if (sib == btree_prev_sib)
btree_node_unlock(iter, level);
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
- SIX_LOCK_intent);
+ SIX_LOCK_intent, _THIS_IP_);
/*
* before btree_iter_relock() calls btree_iter_verify_locks():
struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, unsigned,
- enum six_lock_type);
+ enum six_lock_type, unsigned long);
struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
enum btree_id, unsigned);
int ret = 0;
if (initial) {
- BUG_ON(journal_seq_verify(c) &&
+ BUG_ON(bch2_journal_seq_verify &&
k.k->version.lo > journal_cur_seq(&c->journal));
/* XXX change to fsck check */
struct btree_iter *iter;
struct btree *b;
unsigned depth = metadata_only ? 1
- : expensive_debug_checks(c) ? 0
+ : bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
BTREE_INSERT_USE_RESERVE|
BTREE_INSERT_NOWAIT|
BTREE_INSERT_GC_LOCK_HELD);
- else if (!btree_gc_rewrite_disabled(c) &&
- (btree_gc_always_rewrite(c) || max_stale > 16))
+ else if (!bch2_btree_gc_rewrite_disabled &&
+ (bch2_btree_gc_always_rewrite || max_stale > 16))
bch2_btree_node_rewrite(c, iter,
b->data->keys.seq,
BTREE_INSERT_NOWAIT|
{
struct btree *b;
unsigned target_depth = metadata_only ? 1
- : expensive_debug_checks(c) ? 0
+ : bch2_expensive_debug_checks ? 0
: !btree_node_type_needs_gc(btree_id) ? 1
: 0;
u8 max_stale = 0;
out:
if (!ret &&
(test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
- (!iter && test_restart_gc(c)))) {
+ (!iter && bch2_test_restart_gc))) {
/*
* XXX: make sure gens we fixed got saved
*/
BUG_ON(extents
? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
: bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
- //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
+ //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0);
}
#endif
}
break;
for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
- b = bkey_cmp_packed(bt,
+ b = bch2_bkey_cmp_packed(bt,
ptrs[c],
ptrs[d]) >= 0 ? c : d;
if (d == n)
b = c;
while (b != a &&
- bkey_cmp_packed(bt,
+ bch2_bkey_cmp_packed(bt,
ptrs[a],
ptrs[b]) >= 0)
b = (b - 1) / 2;
const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
if (invalid ||
- (inject_invalid_keys(c) &&
+ (bch2_inject_invalid_keys &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
char buf[160];
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
unsigned level, struct btree_iter *iter,
enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn,
- void *p)
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
struct btree_trans *trans = iter->trans;
- struct btree_iter *linked;
+ struct btree_iter *linked, *deadlock_iter = NULL;
u64 start_time = local_clock();
- bool ret = true;
+ unsigned reason = 9;
/* Check if it's safe to block: */
trans_for_each_iter(trans, linked) {
linked->locks_want = max_t(unsigned,
linked->locks_want,
__fls(linked->nodes_locked) + 1);
- if (!btree_iter_get_locks(linked, true, false))
- ret = false;
+ if (!btree_iter_get_locks(linked, true, false)) {
+ deadlock_iter = linked;
+ reason = 1;
+ }
} else {
- ret = false;
+ deadlock_iter = linked;
+ reason = 2;
+ }
+ }
+
+ if (linked->btree_id != iter->btree_id) {
+ if (linked->btree_id > iter->btree_id) {
+ deadlock_iter = linked;
+ reason = 3;
+ }
+ continue;
+ }
+
+ /*
+ * Within the same btree, cached iterators come before non
+ * cached iterators:
+ */
+ if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
+ if (btree_iter_is_cached(iter)) {
+ deadlock_iter = linked;
+ reason = 4;
}
+ continue;
}
/*
* another iterator has possible descendants locked of the node
* we're about to lock, it must have the ancestors locked too:
*/
- if (linked->btree_id == iter->btree_id &&
- level > __fls(linked->nodes_locked)) {
+ if (level > __fls(linked->nodes_locked)) {
if (!(trans->nounlock)) {
linked->locks_want =
max(level + 1, max_t(unsigned,
linked->locks_want,
iter->locks_want));
- if (!btree_iter_get_locks(linked, true, false))
- ret = false;
+ if (!btree_iter_get_locks(linked, true, false)) {
+ deadlock_iter = linked;
+ reason = 5;
+ }
} else {
- ret = false;
+ deadlock_iter = linked;
+ reason = 6;
}
}
/* Must lock btree nodes in key order: */
- if ((cmp_int(iter->btree_id, linked->btree_id) ?:
- -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0)
- ret = false;
-
- if (iter->btree_id == linked->btree_id &&
- btree_node_locked(linked, level) &&
+ if (btree_node_locked(linked, level) &&
bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
- btree_iter_type(linked))) <= 0)
- ret = false;
+ btree_iter_type(linked))) <= 0) {
+ deadlock_iter = linked;
+ reason = 7;
+ }
/*
* Recheck if this is a node we already have locked - since one
}
}
- if (unlikely(!ret)) {
- trace_trans_restart_would_deadlock(iter->trans->ip);
+ if (unlikely(deadlock_iter)) {
+ trace_trans_restart_would_deadlock(iter->trans->ip, ip,
+ reason,
+ deadlock_iter->btree_id,
+ btree_iter_type(deadlock_iter),
+ iter->btree_id,
+ btree_iter_type(iter));
return false;
}
char buf1[100], buf2[100];
const char *msg;
- if (!debug_check_iterators(iter->trans->c))
+ if (!bch2_debug_check_iterators)
return;
if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
{
struct btree_iter *iter;
- if (!debug_check_iterators(trans->c))
+ if (!bch2_debug_check_iterators)
return;
trans_for_each_iter_with_node(trans, b, iter)
__bch2_btree_node_iter_fix(iter, b, node_iter, t,
where, clobber_u64s, new_u64s);
- if (debug_check_iterators(iter->trans->c))
+ if (bch2_debug_check_iterators)
bch2_btree_node_iter_verify(node_iter, b);
}
ret = bkey_disassemble(l->b, k, u);
- if (debug_check_bkeys(iter->trans->c))
+ if (bch2_debug_check_bkeys)
bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
return ret;
}
static inline int btree_iter_lock_root(struct btree_iter *iter,
- unsigned depth_want)
+ unsigned depth_want,
+ unsigned long trace_ip)
{
struct bch_fs *c = iter->trans->c;
struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
lock_type = __btree_lock_want(iter, iter->level);
if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
iter, lock_type,
- lock_root_check_fn, rootp)))
+ lock_root_check_fn, rootp,
+ trace_ip)))
return -EINTR;
if (likely(b == READ_ONCE(*rootp) &&
btree_node_unlock(iter, plevel);
}
-static __always_inline int btree_iter_down(struct btree_iter *iter)
+static __always_inline int btree_iter_down(struct btree_iter *iter,
+ unsigned long trace_ip)
{
struct bch_fs *c = iter->trans->c;
struct btree_iter_level *l = &iter->l[iter->level];
bch2_bkey_unpack(l->b, &tmp.k,
bch2_btree_node_iter_peek(&l->iter, l->b));
- b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type);
+ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip);
if (unlikely(IS_ERR(b)))
return PTR_ERR(b);
btree_node_unlock(iter, iter->level++);
}
-static int btree_iter_traverse_one(struct btree_iter *);
+static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
{
sorted[nr_sorted++] = iter->idx;
#define btree_iter_cmp_by_idx(_l, _r) \
- btree_iter_cmp(&trans->iters[_l], &trans->iters[_r])
+ btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx
bch2_trans_unlock(trans);
+ cond_resched();
if (unlikely(ret == -ENOMEM)) {
struct closure cl;
if (!(trans->iters_linked & (1ULL << idx)))
continue;
- ret = btree_iter_traverse_one(&trans->iters[idx]);
+ ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
if (ret)
goto retry_all;
}
* On error, caller (peek_node()/peek_key()) must return NULL; the error is
* stashed in the iterator and returned from bch2_trans_exit().
*/
-static int btree_iter_traverse_one(struct btree_iter *iter)
+static int btree_iter_traverse_one(struct btree_iter *iter,
+ unsigned long trace_ip)
{
unsigned depth_want = iter->level;
*/
while (iter->level > depth_want) {
int ret = btree_iter_node(iter, iter->level)
- ? btree_iter_down(iter)
- : btree_iter_lock_root(iter, depth_want);
+ ? btree_iter_down(iter, trace_ip)
+ : btree_iter_lock_root(iter, depth_want, trace_ip);
if (unlikely(ret)) {
if (ret == 1)
return 0;
int ret;
ret = bch2_trans_cond_resched(trans) ?:
- btree_iter_traverse_one(iter);
+ btree_iter_traverse_one(iter, _RET_IP_);
if (unlikely(ret))
ret = __btree_iter_traverse_all(trans, ret);
ret.v = bkeyp_val(&l->b->format, _k);
- if (debug_check_iterators(iter->trans->c)) {
+ if (bch2_debug_check_iterators) {
struct bkey k = bkey_unpack_key(l->b, _k);
BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
}
- if (debug_check_bkeys(iter->trans->c))
+ if (bch2_debug_check_bkeys)
bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
}
return bch2_trans_iter_put(trans, iter);
}
+#if 0
static int bch2_trans_realloc_iters(struct btree_trans *trans,
unsigned new_size)
{
sizeof(struct btree_iter) * trans->nr_iters +
sizeof(struct btree_insert_entry) * trans->nr_iters);
- if (trans->iters != trans->iters_onstack)
- kfree(trans->iters);
+ kfree(trans->iters);
trans->iters = new_iters;
trans->updates = new_updates;
return 0;
}
+#endif
static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
{
goto got_slot;
if (trans->nr_iters == trans->size) {
- int ret;
-
- if (trans->nr_iters >= BTREE_ITER_MAX) {
- struct btree_iter *iter;
-
- trans_for_each_iter(trans, iter) {
- pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
- bch2_btree_ids[iter->btree_id],
- iter->pos.inode,
- iter->pos.offset,
- (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
- (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
- iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
- (void *) iter->ip_allocated);
- }
+ struct btree_iter *iter;
- panic("trans iter oveflow\n");
+ BUG_ON(trans->size < BTREE_ITER_MAX);
+
+ trans_for_each_iter(trans, iter) {
+ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
+ bch2_btree_ids[iter->btree_id],
+ iter->pos.inode,
+ iter->pos.offset,
+ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+ (void *) iter->ip_allocated);
}
+ panic("trans iter oveflow\n");
+#if 0
ret = bch2_trans_realloc_iters(trans, trans->size * 2);
if (ret)
return ERR_PTR(ret);
+#endif
}
idx = trans->nr_iters++;
bch2_btree_iter_traverse_all(trans);
}
+static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+{
+ unsigned new_size = BTREE_ITER_MAX;
+ size_t iters_bytes = sizeof(struct btree_iter) * new_size;
+ size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size;
+ void *p;
+
+ BUG_ON(trans->used_mempool);
+
+ p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
+ mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+
+ trans->iters = p; p += iters_bytes;
+ trans->updates = p; p += updates_bytes;
+ trans->updates2 = p; p += updates_bytes;
+ trans->size = new_size;
+}
+
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
unsigned expected_nr_iters,
size_t expected_mem_bytes)
{
- memset(trans, 0, offsetof(struct btree_trans, iters_onstack));
+ memset(trans, 0, sizeof(*trans));
+ trans->c = c;
+ trans->ip = _RET_IP_;
/*
* reallocating iterators currently completely breaks
- * bch2_trans_iter_put():
+ * bch2_trans_iter_put(), we always allocate the max:
*/
- expected_nr_iters = BTREE_ITER_MAX;
-
- trans->c = c;
- trans->ip = _RET_IP_;
- trans->size = ARRAY_SIZE(trans->iters_onstack);
- trans->iters = trans->iters_onstack;
- trans->updates = trans->updates_onstack;
- trans->updates2 = trans->updates2_onstack;
- trans->fs_usage_deltas = NULL;
-
- if (expected_nr_iters > trans->size)
- bch2_trans_realloc_iters(trans, expected_nr_iters);
+ bch2_trans_alloc_iters(trans, c);
if (expected_mem_bytes)
bch2_trans_preload_mem(trans, expected_mem_bytes);
int bch2_trans_exit(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
+
bch2_trans_unlock(trans);
#ifdef CONFIG_BCACHEFS_DEBUG
kfree(trans->fs_usage_deltas);
kfree(trans->mem);
- if (trans->used_mempool)
+
+ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+ if (trans->iters)
mempool_free(trans->iters, &trans->c->btree_iters_pool);
- else if (trans->iters != trans->iters_onstack)
- kfree(trans->iters);
+
trans->mem = (void *) 0x1;
trans->iters = (void *) 0x1;
return trans->error ? -EIO : 0;
}
-static void bch2_btree_iter_node_to_text(struct printbuf *out,
- struct btree_bkey_cached_common *_b,
- enum btree_iter_type type)
+static void __maybe_unused
+bch2_btree_iter_node_to_text(struct printbuf *out,
+ struct btree_bkey_cached_common *_b,
+ enum btree_iter_type type)
{
pr_buf(out, " %px l=%u %s:",
_b, _b->level, bch2_btree_ids[_b->btree_id]);
void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
-static inline int btree_iter_cmp(const struct btree_iter *l,
- const struct btree_iter *r)
+/* Sort order for locking btree iterators: */
+static inline int btree_iter_lock_cmp(const struct btree_iter *l,
+ const struct btree_iter *r)
{
return cmp_int(l->btree_id, r->btree_id) ?:
- -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?:
+ -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
bkey_cmp(l->pos, r->pos);
}
};
__flatten
-static inline struct bkey_cached *
-btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
+inline struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
{
struct bkey_cached_key key = {
.btree_id = btree_id,
!bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1;
}
+__flatten
int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
goto fill;
}
retry:
- ck = btree_key_cache_find(c, iter->btree_id, iter->pos);
+ ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
if (!ck) {
if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
iter->l[0].b = NULL;
enum six_lock_type lock_want = __btree_lock_want(iter, 0);
if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
- bkey_cached_check_fn, iter)) {
+ bkey_cached_check_fn, iter, _THIS_IP_)) {
if (ck->key.btree_id != iter->btree_id ||
bkey_cmp(ck->key.pos, iter->pos)) {
goto retry;
struct bkey_cached_key key = { id, pos };
/* Fastpath - assume it won't be found: */
- if (!btree_key_cache_find(c, id, pos))
+ if (!bch2_btree_key_cache_find(c, id, pos))
return 0;
return btree_key_cache_flush_pos(trans, key, 0, true);
void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
enum btree_id id, struct bpos pos)
{
- BUG_ON(btree_key_cache_find(trans->c, id, pos));
+ BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
}
#endif
#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
#define _BCACHEFS_BTREE_KEY_CACHE_H
+struct bkey_cached *
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
+
int bch2_btree_iter_traverse_cached(struct btree_iter *);
bool bch2_btree_insert_key_cached(struct btree_trans *,
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
struct btree_iter *, enum six_lock_type,
- six_lock_should_sleep_fn, void *);
+ six_lock_should_sleep_fn, void *,
+ unsigned long);
static inline bool btree_node_lock(struct btree *b,
struct bpos pos, unsigned level,
struct btree_iter *iter,
enum six_lock_type type,
- six_lock_should_sleep_fn should_sleep_fn, void *p)
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
+ unsigned long ip)
{
struct btree_trans *trans = iter->trans;
bool ret;
ret = likely(six_trylock_type(&b->c.lock, type)) ||
btree_node_lock_increment(trans, b, level, type) ||
__bch2_btree_node_lock(b, pos, level, iter, type,
- should_sleep_fn, p);
+ should_sleep_fn, p, ip);
#ifdef CONFIG_BCACHEFS_DEBUG
trans->locking = NULL;
struct btree_write writes[2];
-#ifdef CONFIG_BCACHEFS_DEBUG
- bool *expensive_debug_checks;
-#endif
-
/* Key/pointer for this btree node */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
return iter->flags & BTREE_ITER_TYPE;
}
+static inline bool btree_iter_is_cached(const struct btree_iter *iter)
+{
+ return btree_iter_type(iter) == BTREE_ITER_CACHED;
+}
+
static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
{
return iter->l + iter->level;
unsigned journal_u64s;
unsigned journal_preres_u64s;
struct replicas_delta_list *fs_usage_deltas;
-
- struct btree_iter iters_onstack[2];
- struct btree_insert_entry updates_onstack[2];
- struct btree_insert_entry updates2_onstack[2];
};
#define BTREE_FLAG(flag) \
* the node the iterator points to:
*/
while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
- (bkey_cmp_packed(b, k, &insert->k) >= 0))
+ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
;
for_each_keylist_key(keys, insert)
EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && bkey_cmp_packed(b, k, &insert->k))
+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
k = NULL;
/* @k is the key being overwritten/deleted, if any: */
struct bch_fs *c = trans->c;
BUG_ON(bkey_cmp(insert->k.p, iter->pos));
- BUG_ON(debug_check_bkeys(c) &&
+ BUG_ON(bch2_debug_check_bkeys &&
bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
__btree_node_type(iter->level, iter->btree_id)));
}
*/
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
- if (journal_seq_verify(c))
+ if (bch2_journal_seq_verify)
trans_for_each_update2(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
- else if (inject_invalid_keys(c))
+ else if (bch2_inject_invalid_keys)
trans_for_each_update2(trans, i)
i->k->k.version = MAX_VERSION;
}
return 0;
}
+static inline int btree_iter_pos_cmp(const struct btree_iter *l,
+ const struct btree_iter *r)
+{
+ return cmp_int(l->btree_id, r->btree_id) ?:
+ bkey_cmp(l->pos, r->pos);
+}
+
static void bch2_trans_update2(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans_for_each_update2(trans, i) {
- if (btree_iter_cmp(n.iter, i->iter) == 0) {
+ if (btree_iter_pos_cmp(n.iter, i->iter) == 0) {
*i = n;
return;
}
- if (btree_iter_cmp(n.iter, i->iter) <= 0)
+ if (btree_iter_pos_cmp(n.iter, i->iter) <= 0)
break;
}
* Pending updates are kept sorted: first, find position of new update:
*/
trans_for_each_update(trans, i)
- if (btree_iter_cmp(iter, i->iter) <= 0)
+ if (btree_iter_pos_cmp(iter, i->iter) <= 0)
break;
/*
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
- if (!IS_ENABLED(CONFIG_HIGHMEM) &&
+ if (!PageHighMem(bio_iter_page(bio, start)) &&
bio_phys_contig(bio, start))
return (struct bbuf) {
.b = page_address(bio_iter_page(bio, start)) +
v->written = 0;
v->c.level = b->c.level;
v->c.btree_id = b->c.btree_id;
- bch2_btree_keys_init(v, &c->expensive_debug_checks);
+ bch2_btree_keys_init(v);
if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
NULL, &pick) <= 0)
struct btree;
struct bch_fs;
-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
-BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch2_##name || c->name; }
-BCH_DEBUG_PARAMS_ALWAYS()
-#undef BCH_DEBUG_PARAM
-
#ifdef CONFIG_BCACHEFS_DEBUG
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) \
- { return bch2_##name || c->name; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
void __bch2_btree_verify(struct bch_fs *, struct btree *);
-
-#define bypass_torture_test(d) ((d)->bypass_torture_test)
-
-#else /* DEBUG */
-
-#define BCH_DEBUG_PARAM(name, description) \
- static inline bool name(struct bch_fs *c) { return false; }
-BCH_DEBUG_PARAMS_DEBUG()
-#undef BCH_DEBUG_PARAM
-
+#else
static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
-
-#define bypass_torture_test(d) 0
-
#endif
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
- if (verify_btree_ondisk(c))
+ if (bch2_verify_btree_ondisk)
__bch2_btree_verify(c, b);
}
size_t i;
spin_lock(&c->ec_stripes_heap_lock);
- for (i = 0; i < min(h->used, 20UL); i++) {
+ for (i = 0; i < min_t(size_t, h->used, 20); i++) {
m = genradix_ptr(&c->stripes[0], h->data[i].idx);
pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
return bch2_rand_range(l1 + l2) > l1;
}
- if (force_reconstruct_read(c))
+ if (bch2_force_reconstruct_read)
return p1.idx > p2.idx;
return p1.idx < p2.idx;
!bch2_dev_is_readable(ca))
p.idx++;
- if (force_reconstruct_read(c) &&
+ if (bch2_force_reconstruct_read &&
!p.idx && p.has_ec)
p.idx++;
if (!name)
new_inode->bi_flags |= BCH_INODE_UNLINKED;
- ret = bch2_inode_create(trans, new_inode,
- BLOCKDEV_INODE_MAX, 0,
- &c->unused_inode_hint);
+ ret = bch2_inode_create(trans, new_inode);
if (ret)
goto err;
/* for newly allocated pages: */
static void __bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = __bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ kfree(detach_page_private(page));
}
static void bch2_page_state_release(struct page *page)
{
- struct bch_page_state *s = bch2_page_state(page);
-
- if (!s)
- return;
-
- ClearPagePrivate(page);
- set_page_private(page, 0);
- put_page(page);
- kfree(s);
+ EBUG_ON(!PageLocked(page));
+ __bch2_page_state_release(page);
}
/* for newly allocated pages: */
return NULL;
spin_lock_init(&s->lock);
- /*
- * migrate_page_move_mapping() assumes that pages with private data
- * have their count elevated by 1.
- */
- get_page(page);
- set_page_private(page, (unsigned long) s);
- SetPagePrivate(page);
+ attach_page_private(page, s);
return s;
}
if (ret != MIGRATEPAGE_SUCCESS)
return ret;
- if (PagePrivate(page)) {
- ClearPagePrivate(page);
- get_page(newpage);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- SetPagePrivate(newpage);
- }
+ if (PagePrivate(page))
+ attach_page_private(newpage, detach_page_private(page));
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
bio_put(bio);
}
-static inline void page_state_init_for_read(struct page *page)
-{
- SetPagePrivate(page);
- page->private = 0;
-}
-
struct readpages_iter {
struct address_space *mapping;
struct page **pages;
unsigned nr_pages;
- unsigned nr_added;
unsigned idx;
pgoff_t offset;
};
static int readpages_iter_init(struct readpages_iter *iter,
- struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+ struct readahead_control *ractl)
{
+ unsigned i, nr_pages = readahead_count(ractl);
+
memset(iter, 0, sizeof(*iter));
- iter->mapping = mapping;
- iter->offset = list_last_entry(pages, struct page, lru)->index;
+ iter->mapping = ractl->mapping;
+ iter->offset = readahead_index(ractl);
+ iter->nr_pages = nr_pages;
iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!iter->pages)
return -ENOMEM;
- while (!list_empty(pages)) {
- struct page *page = list_last_entry(pages, struct page, lru);
-
- __bch2_page_state_create(page, __GFP_NOFAIL);
-
- iter->pages[iter->nr_pages++] = page;
- list_del(&page->lru);
+ __readahead_batch(ractl, iter->pages, nr_pages);
+ for (i = 0; i < nr_pages; i++) {
+ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
+ put_page(iter->pages[i]);
}
return 0;
static inline struct page *readpage_iter_next(struct readpages_iter *iter)
{
- struct page *page;
- unsigned i;
- int ret;
-
- BUG_ON(iter->idx > iter->nr_added);
- BUG_ON(iter->nr_added > iter->nr_pages);
-
- if (iter->idx < iter->nr_added)
- goto out;
-
- while (1) {
- if (iter->idx == iter->nr_pages)
- return NULL;
-
- ret = add_to_page_cache_lru_vec(iter->mapping,
- iter->pages + iter->nr_added,
- iter->nr_pages - iter->nr_added,
- iter->offset + iter->nr_added,
- GFP_NOFS);
- if (ret > 0)
- break;
-
- page = iter->pages[iter->nr_added];
- iter->idx++;
- iter->nr_added++;
-
- __bch2_page_state_release(page);
- put_page(page);
- }
-
- iter->nr_added += ret;
+ if (iter->idx >= iter->nr_pages)
+ return NULL;
- for (i = iter->idx; i < iter->nr_added; i++)
- put_page(iter->pages[i]);
-out:
EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
return iter->pages[iter->idx];
bkey_on_stack_exit(&sk, c);
}
-int bch2_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+void bch2_readahead(struct readahead_control *ractl)
{
- struct bch_inode_info *inode = to_bch_ei(mapping->host);
+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
struct btree_trans trans;
struct readpages_iter readpages_iter;
int ret;
- ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages);
+ ret = readpages_iter_init(&readpages_iter, ractl);
BUG_ON(ret);
bch2_trans_init(&trans, c, 0, 0);
bch2_trans_exit(&trans);
kfree(readpages_iter.pages);
-
- return 0;
}
static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
int bch2_readpage(struct file *, struct page *);
int bch2_writepages(struct address_space *, struct writeback_control *);
-int bch2_readpages(struct file *, struct address_space *,
- struct list_head *, unsigned);
+void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page **, void **);
struct bch_inode_info *dst,
u64 journal_seq)
{
+ /*
+ * atomic64_cmpxchg has a fallback for archs that don't support it,
+ * cmpxchg does not:
+ */
+ atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
u64 old, v = READ_ONCE(dst->ei_journal_seq);
do {
if (old >= journal_seq)
break;
- } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old);
+ } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
}
return &inode->v;
}
+static int inum_test(struct inode *inode, void *p)
+{
+ unsigned long *ino = p;
+
+ return *ino == inode->i_ino;
+}
+
static struct bch_inode_info *
__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
umode_t mode, dev_t rdev, bool tmpfile)
* thread pulling the inode in and modifying it:
*/
- old = to_bch_ei(insert_inode_locked2(&inode->v));
- if (unlikely(old)) {
+ inode->v.i_state |= I_CREATING;
+ old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
+ inum_test, NULL, &inode->v.i_ino));
+ BUG_ON(!old);
+
+ if (unlikely(old != inode)) {
/*
* We raced, another process pulled the new inode into cache
* before us:
struct fiemap_extent_info *info,
struct bkey_s_c k, unsigned flags)
{
- if (bkey_extent_is_data(k.k)) {
+ if (bkey_extent_is_direct_data(k.k)) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
}
return 0;
+ } else if (bkey_extent_is_inline_data(k.k)) {
+ return fiemap_fill_next_extent(info,
+ bkey_start_offset(k.k) << 9,
+ 0, k.k->size << 9,
+ flags|
+ FIEMAP_EXTENT_DATA_INLINE);
} else if (k.k->type == KEY_TYPE_reservation) {
return fiemap_fill_next_extent(info,
bkey_start_offset(k.k) << 9,
bkey_start_offset(k.k);
sectors = k.k->size - offset_into_extent;
- bkey_on_stack_realloc(&cur, c, k.k->u64s);
- bkey_on_stack_realloc(&prev, c, k.k->u64s);
- bkey_reassemble(cur.k, k);
+ bkey_on_stack_reassemble(&cur, c, k);
ret = bch2_read_indirect_extent(&trans,
&offset_into_extent, &cur);
break;
k = bkey_i_to_s_c(cur.k);
+ bkey_on_stack_realloc(&prev, c, k.k->u64s);
sectors = min(sectors, k.k->size - offset_into_extent);
- if (offset_into_extent)
- bch2_cut_front(POS(k.k->p.inode,
- bkey_start_offset(k.k) +
- offset_into_extent),
- cur.k);
+ bch2_cut_front(POS(k.k->p.inode,
+ bkey_start_offset(k.k) +
+ offset_into_extent),
+ cur.k);
bch2_key_resize(&cur.k->k, sectors);
cur.k->k.p = iter->pos;
cur.k->k.p.offset += cur.k->k.size;
bkey_copy(prev.k, cur.k);
have_extent = true;
- if (k.k->type == KEY_TYPE_reflink_v)
- bch2_btree_iter_set_pos(iter, k.k->p);
- else
- bch2_btree_iter_next(iter);
+ bch2_btree_iter_set_pos(iter,
+ POS(iter->pos.inode, iter->pos.offset + sectors));
}
if (ret == -EINTR)
.writepage = bch2_writepage,
.readpage = bch2_readpage,
.writepages = bch2_writepages,
- .readpages = bch2_readpages,
+ .readahead = bch2_readahead,
.set_page_dirty = __set_page_dirty_nobuffers,
.write_begin = bch2_write_begin,
.write_end = bch2_write_end,
struct bch_fs *c = sb->s_fs_info;
struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
unsigned shift = sb->s_blocksize_bits - 9;
+ /*
+ * this assumes inodes take up 64 bytes, which is a decent average
+ * number:
+ */
+ u64 avail_inodes = ((usage.capacity - usage.used) << 3);
u64 fsid;
buf->f_type = BCACHEFS_STATFS_MAGIC;
buf->f_blocks = usage.capacity >> shift;
buf->f_bfree = (usage.capacity - usage.used) >> shift;
buf->f_bavail = buf->f_bfree;
- buf->f_files = 0;
- buf->f_ffree = 0;
+
+ buf->f_files = usage.nr_inodes + avail_inodes;
+ buf->f_ffree = avail_inodes;
fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
bch2_trans_unlock(&trans);
- bch2_inode_pack(&p, &w.inode);
+ bch2_inode_pack(c, &p, &w.inode);
ret = bch2_btree_insert(c, BTREE_ID_INODES,
&p.inode.k_i, NULL, NULL,
0, NULL);
root_inode->bi_inum = BCACHEFS_ROOT_INO;
- bch2_inode_pack(&packed, root_inode);
+ bch2_inode_pack(c, &packed, root_inode);
return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i,
NULL, NULL,
return ret;
}
-struct inode_bitmap {
- unsigned long *bits;
- size_t size;
-};
+typedef GENRADIX(unsigned long) inode_bitmap;
-static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr)
+static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr)
{
- return nr < b->size ? test_bit(nr, b->bits) : false;
+ unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG);
+ return w ? test_bit(nr & (BITS_PER_LONG - 1), w) : false;
}
-static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr)
+static inline int inode_bitmap_set(inode_bitmap *b, size_t nr)
{
- if (nr >= b->size) {
- size_t new_size = max_t(size_t, max_t(size_t,
- PAGE_SIZE * 8,
- b->size * 2),
- nr + 1);
- void *n;
-
- new_size = roundup_pow_of_two(new_size);
- n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO);
- if (!n) {
- return -ENOMEM;
- }
+ unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL);
- b->bits = n;
- b->size = new_size;
- }
+ if (!w)
+ return -ENOMEM;
- __set_bit(nr, b->bits);
+ *w |= 1UL << (nr & (BITS_PER_LONG - 1));
return 0;
}
static int check_directory_structure(struct bch_fs *c,
struct bch_inode_unpacked *lostfound_inode)
{
- struct inode_bitmap dirs_done = { NULL, 0 };
+ inode_bitmap dirs_done;
struct pathbuf path = { 0, 0, NULL };
struct pathbuf_entry *e;
struct btree_trans trans;
/* DFS: */
restart_dfs:
+ genradix_init(&dirs_done);
had_unreachable = false;
ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO);
if (had_unreachable) {
bch_info(c, "reattached unreachable directories, restarting pass to check for loops");
- kfree(dirs_done.bits);
+ genradix_free(&dirs_done);
kfree(path.entries);
memset(&dirs_done, 0, sizeof(dirs_done));
memset(&path, 0, sizeof(path));
err:
fsck_err:
ret = bch2_trans_exit(&trans) ?: ret;
- kfree(dirs_done.bits);
+ genradix_free(&dirs_done);
kfree(path.entries);
return ret;
}
if (do_update) {
struct bkey_inode_buf p;
- bch2_inode_pack(&p, &u);
+ bch2_inode_pack(c, &p, &u);
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
+#include "btree_key_cache.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "str_hash.h"
+#include "varint.h"
#include <linux/random.h>
return bytes;
}
-void bch2_inode_pack(struct bkey_inode_buf *packed,
- const struct bch_inode_unpacked *inode)
+static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
{
- u8 *out = packed->inode.v.fields;
+ struct bkey_i_inode *k = &packed->inode;
+ u8 *out = k->v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
unsigned bytes;
- bkey_inode_init(&packed->inode.k_i);
- packed->inode.k.p.offset = inode->bi_inum;
- packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
- packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
- packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
-
-#define x(_name, _bits) \
+#define x(_name, _bits) \
out += inode_encode_field(out, end, 0, inode->_name); \
nr_fields++; \
\
set_bkey_val_bytes(&packed->inode.k, bytes);
memset_u64s_tail(&packed->inode.v, 0, bytes);
- SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
+ SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ struct bkey_i_inode *k = &packed->inode;
+ u8 *out = k->v.fields;
+ u8 *end = (void *) &packed[1];
+ u8 *last_nonzero_field = out;
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+ unsigned bytes;
+ int ret;
+
+#define x(_name, _bits) \
+ nr_fields++; \
+ \
+ if (inode->_name) { \
+ ret = bch2_varint_encode(out, inode->_name); \
+ out += ret; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ \
+ last_nonzero_field = out; \
+ last_nonzero_fieldnr = nr_fields; \
+ } else { \
+ *out++ = 0; \
+ \
+ if (_bits > 64) \
+ *out++ = 0; \
+ }
+
+ BCH_INODE_FIELDS()
+#undef x
+ BUG_ON(out > end);
+
+ out = last_nonzero_field;
+ nr_fields = last_nonzero_fieldnr;
+
+ bytes = out - (u8 *) &packed->inode.v;
+ set_bkey_val_bytes(&packed->inode.k, bytes);
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
+
+ SET_INODE_NR_FIELDS(&k->v, nr_fields);
+}
+
+void bch2_inode_pack(struct bch_fs *c,
+ struct bkey_inode_buf *packed,
+ const struct bch_inode_unpacked *inode)
+{
+ bkey_inode_init(&packed->inode.k_i);
+ packed->inode.k.p.offset = inode->bi_inum;
+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
+ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
+
+ if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
+ SET_INODE_NEW_VARINT(&packed->inode.v, true);
+ bch2_inode_pack_v2(packed, inode);
+ } else {
+ bch2_inode_pack_v1(packed, inode);
+ }
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
BUG_ON(unpacked.bi_mode != inode->bi_mode);
-#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name);
+#define x(_name, _bits) if (unpacked._name != inode->_name) \
+ panic("unpacked %llu should be %llu", \
+ (u64) unpacked._name, (u64) inode->_name);
BCH_INODE_FIELDS()
#undef x
}
}
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
- struct bch_inode_unpacked *unpacked)
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
{
const u8 *in = inode.v->fields;
- const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
+ const u8 *end = bkey_val_end(inode);
u64 field[2];
unsigned fieldnr = 0, field_bits;
int ret;
- unpacked->bi_inum = inode.k->p.offset;
- unpacked->bi_hash_seed = inode.v->bi_hash_seed;
- unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
- unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
-
#define x(_name, _bits) \
if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
memset(&unpacked->_name, 0, \
#undef x
/* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ const u8 *in = inode.v->fields;
+ const u8 *end = bkey_val_end(inode);
+ unsigned fieldnr = 0;
+ int ret;
+ u64 v[2];
+
+#define x(_name, _bits) \
+ if (fieldnr < INODE_NR_FIELDS(inode.v)) { \
+ ret = bch2_varint_decode(in, end, &v[0]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ \
+ if (_bits > 64) { \
+ ret = bch2_varint_decode(in, end, &v[1]); \
+ if (ret < 0) \
+ return ret; \
+ in += ret; \
+ } else { \
+ v[1] = 0; \
+ } \
+ } else { \
+ v[0] = v[1] = 0; \
+ } \
+ \
+ unpacked->_name = v[0]; \
+ if (v[1] || v[0] != unpacked->_name) \
+ return -1; \
+ fieldnr++;
+
+ BCH_INODE_FIELDS()
+#undef x
+
+ /* XXX: signal if there were more fields than expected? */
+ return 0;
+}
+
+int bch2_inode_unpack(struct bkey_s_c_inode inode,
+ struct bch_inode_unpacked *unpacked)
+{
+ unpacked->bi_inum = inode.k->p.offset;
+ unpacked->bi_hash_seed = inode.v->bi_hash_seed;
+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
+
+ if (INODE_NEW_VARINT(inode.v)) {
+ return bch2_inode_unpack_v2(inode, unpacked);
+ } else {
+ return bch2_inode_unpack_v1(inode, unpacked);
+ }
return 0;
}
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum),
- BTREE_ITER_SLOTS|flags);
+ BTREE_ITER_CACHED|flags);
if (IS_ERR(iter))
return iter;
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_cached(iter);
ret = bkey_err(k);
if (ret)
goto err;
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
- bch2_inode_pack(inode_p, inode);
+ bch2_inode_pack(trans->c, inode_p, inode);
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return 0;
}
return;
}
+ pr_buf(out, "mode: %o ", unpacked.bi_mode);
+
#define x(_name, _bits) \
pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
BCH_INODE_FIELDS()
}
int bch2_inode_create(struct btree_trans *trans,
- struct bch_inode_unpacked *inode_u,
- u64 min, u64 max, u64 *hint)
+ struct bch_inode_unpacked *inode_u)
{
+ struct bch_fs *c = trans->c;
struct bkey_inode_buf *inode_p;
struct btree_iter *iter = NULL;
struct bkey_s_c k;
- u64 start;
+ u64 min, max, start, *hint;
int ret;
- if (!max)
- max = ULLONG_MAX;
+ unsigned cpu = raw_smp_processor_id();
+ unsigned bits = (c->opts.inodes_32bit
+ ? 31 : 63) - c->inode_shard_bits;
- if (trans->c->opts.inodes_32bit)
- max = min_t(u64, max, U32_MAX);
+ min = (cpu << bits);
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
+
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+ hint = c->unused_inode_hints + cpu;
start = READ_ONCE(*hint);
if (bkey_cmp(iter->pos, POS(0, max)) > 0)
break;
- if (k.k->type != KEY_TYPE_inode)
+ /*
+ * There's a potential cache coherency issue with the btree key
+ * cache code here - we're iterating over the btree, skipping
+ * that cache. We should never see an empty slot that isn't
+ * actually empty due to a pending update in the key cache
+ * because the update that creates the inode isn't done with a
+ * cached iterator, but - better safe than sorry, check the
+ * cache before using a slot:
+ */
+ if (k.k->type != KEY_TYPE_inode &&
+ !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos))
goto found_slot;
}
inode_u->bi_inum = k.k->p.offset;
inode_u->bi_generation = bkey_generation(k);
- bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
- bch2_trans_iter_put(trans, iter);
- return 0;
+ return bch2_inode_write(trans, iter, inode_u);
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
struct bkey_i_inode_generation delete;
struct bpos start = POS(inode_nr, 0);
struct bpos end = POS(inode_nr + 1, 0);
+ struct bkey_s_c k;
+ u64 bi_generation;
int ret;
/*
return ret;
bch2_trans_init(&trans, c, 0, 0);
+retry:
+ bch2_trans_begin(&trans);
+
+ bi_generation = 0;
+
+ ret = bch2_btree_key_cache_flush(&trans, BTREE_ID_INODES, POS(0, inode_nr));
+ if (ret) {
+ if (ret != -EINTR)
+ bch_err(c, "error flushing btree key cache: %i", ret);
+ goto err;
+ }
iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- do {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
- u32 bi_generation = 0;
+ k = bch2_btree_iter_peek_slot(iter);
- ret = bkey_err(k);
- if (ret)
- break;
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
- bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
- "inode %llu not found when deleting",
- inode_nr);
+ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c,
+ "inode %llu not found when deleting",
+ inode_nr);
- switch (k.k->type) {
- case KEY_TYPE_inode: {
- struct bch_inode_unpacked inode_u;
+ switch (k.k->type) {
+ case KEY_TYPE_inode: {
+ struct bch_inode_unpacked inode_u;
- if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
- bi_generation = inode_u.bi_generation + 1;
- break;
- }
- case KEY_TYPE_inode_generation: {
- struct bkey_s_c_inode_generation g =
- bkey_s_c_to_inode_generation(k);
- bi_generation = le32_to_cpu(g.v->bi_generation);
- break;
- }
- }
+ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
+ bi_generation = inode_u.bi_generation + 1;
+ break;
+ }
+ case KEY_TYPE_inode_generation: {
+ struct bkey_s_c_inode_generation g =
+ bkey_s_c_to_inode_generation(k);
+ bi_generation = le32_to_cpu(g.v->bi_generation);
+ break;
+ }
+ }
- if (!bi_generation) {
- bkey_init(&delete.k);
- delete.k.p.offset = inode_nr;
- } else {
- bkey_inode_generation_init(&delete.k_i);
- delete.k.p.offset = inode_nr;
- delete.v.bi_generation = cpu_to_le32(bi_generation);
- }
+ if (!bi_generation) {
+ bkey_init(&delete.k);
+ delete.k.p.offset = inode_nr;
+ } else {
+ bkey_inode_generation_init(&delete.k_i);
+ delete.k.p.offset = inode_nr;
+ delete.v.bi_generation = cpu_to_le32(bi_generation);
+ }
- bch2_trans_update(&trans, iter, &delete.k_i, 0);
+ bch2_trans_update(&trans, iter, &delete.k_i, 0);
- ret = bch2_trans_commit(&trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
- } while (ret == -EINTR);
+ ret = bch2_trans_commit(&trans, NULL, NULL,
+ BTREE_INSERT_NOFAIL);
+err:
+ if (ret == -EINTR)
+ goto retry;
bch2_trans_exit(&trans);
return ret;
int ret;
iter = bch2_trans_get_iter(trans, BTREE_ID_INODES,
- POS(0, inode_nr), BTREE_ITER_SLOTS);
+ POS(0, inode_nr), BTREE_ITER_CACHED);
if (IS_ERR(iter))
return PTR_ERR(iter);
- k = bch2_btree_iter_peek_slot(iter);
+ k = bch2_btree_iter_peek_cached(iter);
ret = bkey_err(k);
if (ret)
goto err;
return bch2_trans_do(c, NULL, NULL, 0,
bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void)
-{
- struct bch_inode_unpacked *u, test_inodes[] = {
- {
- .bi_atime = U64_MAX,
- .bi_ctime = U64_MAX,
- .bi_mtime = U64_MAX,
- .bi_otime = U64_MAX,
- .bi_size = U64_MAX,
- .bi_sectors = U64_MAX,
- .bi_uid = U32_MAX,
- .bi_gid = U32_MAX,
- .bi_nlink = U32_MAX,
- .bi_generation = U32_MAX,
- .bi_dev = U32_MAX,
- },
- };
-
- for (u = test_inodes;
- u < test_inodes + ARRAY_SIZE(test_inodes);
- u++) {
- struct bkey_inode_buf p;
-
- bch2_inode_pack(&p, u);
- }
-}
-#endif
.val_to_text = bch2_inode_generation_to_text, \
}
+#if 0
+typedef struct {
+ u64 lo;
+ u32 hi;
+} __packed __aligned(4) u96;
+#endif
+typedef u64 u96;
+
struct bch_inode_unpacked {
u64 bi_inum;
__le64 bi_hash_seed;
#undef x
} __attribute__((packed, aligned(8)));
-void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
+void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
+ const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
struct btree_iter *bch2_inode_peek(struct btree_trans *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
-int bch2_inode_create(struct btree_trans *,
- struct bch_inode_unpacked *,
- u64, u64, u64 *);
+int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *);
int bch2_inode_rm(struct bch_fs *, u64);
}
}
-#ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_inode_pack_test(void);
-#else
-static inline void bch2_inode_pack_test(void) {}
-#endif
-
#endif /* _BCACHEFS_INODE_H */
while (size) {
struct page *page = __bio_alloc_page_pool(c, &using_mempool);
- unsigned len = min(PAGE_SIZE, size);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
BUG_ON(!bio_add_page(bio, page, len, 0));
size -= len;
inode_u.bi_sectors += delta;
if (delta || new_i_size) {
- bch2_inode_pack(&inode_p, &inode_u);
+ bch2_inode_pack(trans->c, &inode_p, &inode_u);
bch2_trans_update(trans, inode_iter,
&inode_p.inode.k_i, 0);
}
wait_event(j->wait, journal_entry_close(j));
- /* do we need to write another journal entry? */
- if (test_bit(JOURNAL_NOT_EMPTY, &j->flags))
- bch2_journal_meta(j);
+ /*
+ * Always write a new journal entry, to make sure the clock hands are up
+ * to date (and match the superblock)
+ */
+ bch2_journal_meta(j);
journal_quiesce(j);
return ret;
}
-/**
- * bch2_journal_reclaim - free up journal buckets
- *
- * Background journal reclaim writes out btree nodes. It should be run
- * early enough so that we never completely run out of journal buckets.
- *
- * High watermarks for triggering background reclaim:
- * - FIFO has fewer than 512 entries left
- * - fewer than 25% journal buckets free
- *
- * Background reclaim runs until low watermarks are reached:
- * - FIFO has more than 1024 entries left
- * - more than 50% journal buckets free
- *
- * As long as a reclaim can complete in the time it takes to fill up
- * 512 journal entries or 25% of all journal buckets, then
- * journal_next_bucket() should not stall.
- */
-void bch2_journal_reclaim(struct journal *j)
+static u64 journal_seq_to_flush(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
- unsigned iter, min_nr = 0;
u64 seq_to_flush = 0;
-
- lockdep_assert_held(&j->reclaim_lock);
-
- bch2_journal_do_discards(j);
+ unsigned iter;
spin_lock(&j->lock);
(j->pin.size >> 1));
spin_unlock(&j->lock);
- /*
- * If it's been longer than j->reclaim_delay_ms since we last flushed,
- * make sure to flush at least one journal pin:
- */
- if (time_after(jiffies, j->last_flushed +
- msecs_to_jiffies(j->reclaim_delay_ms)))
- min_nr = 1;
+ return seq_to_flush;
+}
- if (j->prereserved.reserved * 2 > j->prereserved.remaining) {
- seq_to_flush = max(seq_to_flush, journal_last_seq(j));
- min_nr = 1;
- }
+/**
+ * bch2_journal_reclaim - free up journal buckets
+ *
+ * Background journal reclaim writes out btree nodes. It should be run
+ * early enough so that we never completely run out of journal buckets.
+ *
+ * High watermarks for triggering background reclaim:
+ * - FIFO has fewer than 512 entries left
+ * - fewer than 25% journal buckets free
+ *
+ * Background reclaim runs until low watermarks are reached:
+ * - FIFO has more than 1024 entries left
+ * - more than 50% journal buckets free
+ *
+ * As long as a reclaim can complete in the time it takes to fill up
+ * 512 journal entries or 25% of all journal buckets, then
+ * journal_next_bucket() should not stall.
+ */
+void bch2_journal_reclaim(struct journal *j)
+{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ unsigned min_nr = 0;
+ u64 seq_to_flush = 0;
+
+ lockdep_assert_held(&j->reclaim_lock);
+
+ do {
+ bch2_journal_do_discards(j);
+
+ seq_to_flush = journal_seq_to_flush(j);
+ min_nr = 0;
+
+ /*
+ * If it's been longer than j->reclaim_delay_ms since we last flushed,
+ * make sure to flush at least one journal pin:
+ */
+ if (time_after(jiffies, j->last_flushed +
+ msecs_to_jiffies(j->reclaim_delay_ms)))
+ min_nr = 1;
- journal_flush_pins(j, seq_to_flush, min_nr);
+ if (j->prereserved.reserved * 2 > j->prereserved.remaining)
+ min_nr = 1;
+ } while (journal_flush_pins(j, seq_to_flush, min_nr));
if (!bch2_journal_error(j))
queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
- bch2_inode_pack(&packed_inode, &root_inode);
+ bch2_inode_pack(c, &packed_inode, &root_inode);
err = "error creating root directory";
ret = bch2_btree_insert(c, BTREE_ID_INODES,
static void __bch2_fs_free(struct bch_fs *c)
{
unsigned i;
+ int cpu;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
free_percpu(c->usage[1]);
free_percpu(c->usage[0]);
kfree(c->usage_base);
+
+ if (c->btree_iters_bufs)
+ for_each_possible_cpu(cpu)
+ kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+
+ free_percpu(c->btree_iters_bufs);
free_percpu(c->pcpu);
mempool_exit(&c->large_bkey_pool);
mempool_exit(&c->btree_bounce_pool);
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
+ kfree(c->unused_inode_hints);
free_heap(&c->copygc_heap);
if (c->journal_reclaim_wq)
(btree_blocks(c) + 1) * 2 *
sizeof(struct sort_iter_set);
+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
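+	/*
+	 * Sketch (assumption, not shown in this hunk): the per-cpu sharded
+	 * allocation hints added above would be indexed along the lines of
+	 *
+	 *	u64 *hint = c->unused_inode_hints +
+	 *		(raw_smp_processor_id() & ((1U << c->inode_shard_bits) - 1));
+	 *
+	 * so that inode creation on different cpus doesn't contend on a
+	 * single hint.
+	 */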
+
if (!(c->wq = alloc_workqueue("bcachefs",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->copygc_wq = alloc_workqueue("bcache_copygc",
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
- !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal",
+ !(c->journal_reclaim_wq = alloc_workqueue("bcachefs_journal_reclaim",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
+ !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
+ sizeof(u64), GFP_KERNEL)) ||
bch2_io_clock_init(&c->io_clock[READ]) ||
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
static int __init bcachefs_init(void)
{
bch2_bkey_pack_test();
- bch2_inode_pack_test();
if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
bch2_chardev_init() ||
write_attribute(perf_test);
#endif /* CONFIG_BCACHEFS_TESTS */
-#define BCH_DEBUG_PARAM(name, description) \
- rw_attribute(name);
-
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
#define x(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = S_IRUGO };
return out.pos - buf;
}
-#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
return 0;
}
/* Debugging: */
-#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EPERM;
&sysfs_io_timers_write,
&sysfs_internal_uuid,
-
-#define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
- BCH_DEBUG_PARAMS()
-#undef BCH_DEBUG_PARAM
-
NULL
};
{
while (size) {
struct page *page = alloc_page(gfp_mask);
- unsigned len = min(PAGE_SIZE, size);
+ unsigned len = min_t(size_t, PAGE_SIZE, size);
if (!page)
return -ENOMEM;
#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0)
#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0)
-#define memcpy(dst, src, len) \
-({ \
- void *_dst = (dst); \
- const void *_src = (src); \
- size_t _len = (len); \
- \
- BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \
- (void *) (_dst) + (_len) <= (void *) (_src))); \
- memcpy(_dst, _src, _len); \
-})
-
#else /* DEBUG */
#define EBUG_ON(cond)
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#include "varint.h"
+
+/*
+ * bch2_varint_encode(): encode @v into @out, returning the number of bytes
+ * written (1-9).
+ *
+ * For an N byte encoding (N <= 8) the low N bits of the first byte are
+ * (N - 1) ones followed by a zero, and the value fills the remaining 7 * N
+ * bits, little endian. Values needing more than 56 bits get a 0xff prefix
+ * byte followed by the raw 8 byte little endian value. Note that
+ * put_unaligned_le64() always writes 8 bytes, so @out needs that much slack
+ * even for short encodings.
+ */
+int bch2_varint_encode(u8 *out, u64 v)
+{
+	unsigned bits = fls64(v|1);
+	unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+	if (likely(bytes < 9)) {
+		/* make room for the length marker: (bytes - 1) ones, then a zero */
+		v <<= bytes;
+		v |= ~(~0 << (bytes - 1));
+	} else {
+		/* 9 byte encoding: 0xff prefix, then the full 64 bit value */
+		*out++ = 255;
+		bytes = 9;
+	}
+
+	put_unaligned_le64(v, out);
+	return bytes;
+}
+
+/*
+ * bch2_varint_decode(): decode a varint from @in into @out, returning the
+ * number of bytes consumed, or -1 if decoding would read past @end.
+ *
+ * Note that the initial get_unaligned_le64() reads 8 bytes from @in before
+ * the bounds check, so the buffer is expected to have slack past @end.
+ */
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+	u64 v = get_unaligned_le64(in);
+	/* the position of the first zero bit in the low byte gives the length: */
+	unsigned bytes = ffz(v & 255) + 1;
+
+	if (unlikely(in + bytes > end))
+		return -1;
+
+	if (likely(bytes < 9)) {
+		/* strip the length marker, keep the 7 * bytes value bits: */
+		v >>= bytes;
+		v &= ~(~0ULL << (7 * bytes));
+	} else {
+		/* 9 byte encoding: the value is the raw 64 bits after the 0xff prefix */
+		v = get_unaligned_le64(++in);
+	}
+
+	*out = v;
+	return bytes;
+}
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_VARINT_H
+#define _BCACHEFS_VARINT_H
+
+int bch2_varint_encode(u8 *, u64);
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
+
+#endif /* _BCACHEFS_VARINT_H */
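
/*
 * Illustrative round trip for the new varint helpers (editorial sketch, not
 * part of this patch; the function name is hypothetical). The oversized
 * buffer provides the 8 byte slack both helpers assume regardless of the
 * encoded length:
 */
static void __maybe_unused varint_roundtrip_example(void)
{
	u8 buf[16];
	u64 in = 1234567, out;	/* 21 significant bits -> 3 byte encoding */
	int encoded, decoded;

	encoded = bch2_varint_encode(buf, in);
	decoded = bch2_varint_decode(buf, buf + encoded, &out);

	BUG_ON(decoded != encoded);
	BUG_ON(out != in);
}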