return kobj;
}
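+/* shim: kset_create_and_add() below kzallocs the kset, so free it here: */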
-static inline void kset_unregister(struct kset *kset) {}
+static inline void kset_unregister(struct kset *kset)
+{
+ kfree(kset);
+}
#define kset_create_and_add(_name, _u, _parent) \
((struct kset *) kzalloc(sizeof(struct kset), GFP_KERNEL))
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
struct btree_trans trans;
- struct btree_iter *iter;
+ struct btree_and_journal_iter iter;
struct bkey_s_c k;
struct bch_dev *ca;
- struct journal_key *j;
unsigned i;
- int ret;
+ int ret = 0;
bch2_trans_init(&trans, c, 0, 0);
- for_each_btree_key(&trans, iter, BTREE_ID_ALLOC, POS_MIN, 0, k, ret)
+ bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys,
+ BTREE_ID_ALLOC, POS_MIN);
+
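+ /* walk the alloc btree, with keys from the journal overlaid: */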
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
bch2_mark_key(c, k, 0, 0, NULL, 0,
BTREE_TRIGGER_ALLOC_READ|
BTREE_TRIGGER_NOATOMIC);
+ bch2_btree_and_journal_iter_advance(&iter);
+ }
+
ret = bch2_trans_exit(&trans) ?: ret;
if (ret) {
bch_err(c, "error reading alloc info: %i", ret);
return ret;
}
- for_each_journal_key(*journal_keys, j)
- if (j->btree_id == BTREE_ID_ALLOC)
- bch2_mark_key(c, bkey_i_to_s_c(j->k),
- 0, 0, NULL, 0,
- BTREE_TRIGGER_ALLOC_READ|
- BTREE_TRIGGER_NOATOMIC);
-
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
x(stripe, 14) \
x(reflink_p, 15) \
x(reflink_v, 16) \
- x(inline_data, 17)
+ x(inline_data, 17) \
+ x(btree_ptr_v2, 18)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
__u64 _data[0];
} __attribute__((packed, aligned(8)));
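+/*
+ * v2 btree pointers also record the node's sequence number and min_key, so
+ * they can be validated against the node contents on read:
+ */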
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ /* In case we ever decide to do variable size btree nodes: */
+ __le16 sectors;
+ struct bpos min_key;
+ struct bch_extent_ptr start[0];
+ __u64 _data[0];
+} __attribute__((packed, aligned(8)));
+
struct bch_extent {
struct bch_val v;
/* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_extent_ptr)) / sizeof(u64) * BCH_REPLICAS_MAX)
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64))
#define BKEY_BTREE_PTR_U64s_MAX \
(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
x(reflink, 6) \
x(new_siphash, 7) \
x(inline_data, 8) \
- x(new_extent_overwrite, 9)
+ x(new_extent_overwrite, 9) \
+ x(incompressible, 10) \
+ x(btree_ptr_v2, 11)
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
};
#define BCH_COMPRESSION_TYPES() \
- x(none, 0) \
- x(lz4_old, 1) \
- x(gzip, 2) \
- x(lz4, 3) \
- x(zstd, 4)
+ x(none, 0) \
+ x(lz4_old, 1) \
+ x(gzip, 2) \
+ x(lz4, 3) \
+ x(zstd, 4) \
+ x(incompressible, 5)
enum bch_compression_type {
#define x(t, n) BCH_COMPRESSION_TYPE_##t,
BKEY_VAL_ACCESSORS(reflink_p);
BKEY_VAL_ACCESSORS(reflink_v);
BKEY_VAL_ACCESSORS(inline_data);
+BKEY_VAL_ACCESSORS(btree_ptr_v2);
/* byte order helpers */
bch2_val_to_text(out, c, k);
}
-void bch2_bkey_swab(const struct bkey_format *f,
- struct bkey_packed *k)
+void bch2_bkey_swab_val(struct bkey_s k)
{
- const struct bkey_ops *ops = &bch2_bkey_ops[k->type];
-
- bch2_bkey_swab_key(f, k);
+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
if (ops->swab)
- ops->swab(f, k);
+ ops->swab(k);
}
bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c);
void (*val_to_text)(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
- void (*swab)(const struct bkey_format *, struct bkey_packed *);
+ void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
enum merge_result (*key_merge)(struct bch_fs *,
struct bkey_s, struct bkey_s);
void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-void bch2_bkey_swab(const struct bkey_format *, struct bkey_packed *);
+void bch2_bkey_swab_val(struct bkey_s);
bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
bool filter_whiteouts)
{
struct bkey_packed *prev = NULL, *k_packed;
- struct bkey_s k;
+ struct bkey_on_stack k;
struct btree_nr_keys nr;
- struct bkey unpacked;
memset(&nr, 0, sizeof(nr));
+ bkey_on_stack_init(&k);
while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
if (filter_whiteouts && bkey_whiteout(k_packed))
continue;
- k = __bkey_disassemble(src, k_packed, &unpacked);
+ /*
+ * NOTE:
+ * bch2_bkey_normalize may modify the key we pass it (dropping
+ * stale pointers) and we don't have a write lock on the src
+ * node; we have to make a copy of the entire key before calling
+ * normalize
+ */
+ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s);
+ bch2_bkey_unpack(src, k.k, k_packed);
if (filter_whiteouts &&
- bch2_bkey_normalize(c, k))
+ bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
continue;
- extent_sort_append(c, out_f, &nr, vstruct_last(dst), &prev, k);
+ extent_sort_append(c, out_f, &nr, vstruct_last(dst),
+ &prev, bkey_i_to_s(k.k));
}
extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+ bkey_on_stack_exit(&k, c);
return nr;
}
sort_iter_sort(iter, sort_keys_cmp);
while ((in = sort_iter_next(iter, sort_keys_cmp))) {
+ bool needs_whiteout = false;
+
if (bkey_whiteout(in) &&
(filter_whiteouts || !in->needs_whiteout))
continue;
- if (bkey_whiteout(in) &&
- (next = sort_iter_peek(iter)) &&
- !bkey_cmp_packed(iter->b, in, next)) {
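+ /*
+ * collapse duplicate keys, accumulating needs_whiteout locally instead
+ * of modifying keys in place:
+ */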
+ while ((next = sort_iter_peek(iter)) &&
+ !bkey_cmp_packed(iter->b, in, next)) {
BUG_ON(in->needs_whiteout &&
next->needs_whiteout);
- /*
- * XXX racy, called with read lock from write path
- *
- * leads to spurious BUG_ON() in bkey_unpack_key() in
- * debug mode
- */
- next->needs_whiteout |= in->needs_whiteout;
- continue;
+ needs_whiteout |= in->needs_whiteout;
+ in = sort_iter_next(iter, sort_keys_cmp);
}
if (bkey_whiteout(in)) {
} else {
bkey_copy(out, in);
}
+ out->needs_whiteout |= needs_whiteout;
out = bkey_next(out);
}
memcpy_u64s(bkeyp_val(f, where), &insert->v,
bkeyp_val_u64s(f, src));
- bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
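+ /* nothing in the bset moved if the key's size didn't change: */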
+ if (src->u64s != clobber_u64s)
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
bch2_verify_btree_nr_keys(b);
}
{
if (lossy_packed_search)
while (m != btree_bkey_last(b, t) &&
- bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
- m) > 0)
+ bkey_iter_cmp_p_or_unp(b, m,
+ lossy_packed_search, search) < 0)
m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
if (!packed_search)
while (m != btree_bkey_last(b, t) &&
- bkey_iter_pos_cmp(b, search, m) > 0)
+ bkey_iter_pos_cmp(b, m, search) < 0)
m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
if (btree_keys_expensive_checks(b)) {
struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
BUG_ON(prev &&
- bkey_iter_cmp_p_or_unp(b, search, packed_search,
- prev) <= 0);
+ bkey_iter_cmp_p_or_unp(b, prev,
+ packed_search, search) >= 0);
}
return m;
if (btree_keys_expensive_checks(b)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
- /*
- * hack around a harmless race when compacting whiteouts
- * for a write:
- */
- dst2.needs_whiteout = dst->needs_whiteout;
-
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
}
}
static inline int bkey_cmp_p_or_unp(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r_packed,
- struct bpos *r)
+ const struct bpos *r)
{
EBUG_ON(r_packed && !bkey_packed(r_packed));
* XXX: only need to compare pointers for keys that are both within a
* btree_node_iterator - we need to break ties for prev() to work correctly
*/
-static inline int bkey_iter_cmp(struct btree *b,
+static inline int bkey_iter_cmp(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r)
{
?: cmp_int(l, r);
}
-static inline int btree_node_iter_cmp(struct btree *b,
+static inline int btree_node_iter_cmp(const struct btree *b,
struct btree_node_iter_set l,
struct btree_node_iter_set r)
{
__btree_node_offset_to_key(b, r.k));
}
-/* These assume l (the search key) is not a deleted key: */
-static inline int bkey_iter_pos_cmp(struct btree *b,
- struct bpos *l,
- const struct bkey_packed *r)
+/* These assume r (the search key) is not a deleted key: */
+static inline int bkey_iter_pos_cmp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bpos *r)
{
- return -bkey_cmp_left_packed(b, r, l)
- ?: (int) bkey_deleted(r);
+ return bkey_cmp_left_packed(b, l, r)
+ ?: -((int) bkey_deleted(l));
}
-static inline int bkey_iter_cmp_p_or_unp(struct btree *b,
- struct bpos *l,
- const struct bkey_packed *l_packed,
- const struct bkey_packed *r)
+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
+ const struct bkey_packed *l,
+ const struct bkey_packed *r_packed,
+ const struct bpos *r)
{
- return -bkey_cmp_p_or_unp(b, r, l_packed, l)
- ?: (int) bkey_deleted(r);
+ return bkey_cmp_p_or_unp(b, l, r_packed, r)
+ ?: -((int) bkey_deleted(l));
}
static inline struct bkey_packed *
const struct btree *b = obj;
const u64 *v = arg->key;
- return PTR_HASH(&b->key) == *v ? 0 : 1;
+ return b->hash_val == *v ? 0 : 1;
}
static const struct rhashtable_params bch_btree_cache_params = {
.head_offset = offsetof(struct btree, hash),
- .key_offset = offsetof(struct btree, key.v),
- .key_len = sizeof(struct bch_extent_ptr),
+ .key_offset = offsetof(struct btree, hash_val),
+ .key_len = sizeof(u64),
.obj_cmpfn = bch2_btree_cache_cmp_fn,
};
rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
/* Cause future lookups for this node to fail: */
- PTR_HASH(&b->key) = 0;
+ b->hash_val = 0;
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
+ BUG_ON(b->hash_val);
+ b->hash_val = btree_ptr_hash_val(&b->key);
+
return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
}
static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
{
- return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
- bch_btree_cache_params);
+ u64 v = btree_ptr_hash_val(k);
+
+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
}
/*
btree_node_wait_on_io(b);
}
out:
- if (PTR_HASH(&b->key) && !ret)
+ if (b->hash_val && !ret)
trace_btree_node_reap(c, b);
return ret;
out_unlock:
/* raced with another fill: */
/* mark as unhashed... */
- PTR_HASH(&b->key) = 0;
+ b->hash_val = 0;
mutex_lock(&bc->lock);
list_add(&b->list, &bc->freeable);
* free it:
*
* To guard against this, btree nodes are evicted from the cache
- * when they're freed - and PTR_HASH() is zeroed out, which we
+ * when they're freed - and b->hash_val is zeroed out, which we
* check for after we lock the node.
*
* Then, bch2_btree_node_relock() on the parent will fail - because
if (!btree_node_lock(b, k->k.p, level, iter, lock_type))
return ERR_PTR(-EINTR);
- if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
b->level != level ||
race_fault())) {
six_unlock_type(&b->lock, lock_type);
int bch2_fs_btree_cache_init(struct bch_fs *);
void bch2_fs_btree_cache_init_early(struct btree_cache *);
-#define PTR_HASH(_k) *((u64 *) &bkey_i_to_btree_ptr_c(_k)->v)
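+/*
+ * btree nodes are hashed on the first pointer for v1 btree pointer keys, and
+ * on the node's sequence number for v2:
+ */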
+static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
+{
+ switch (k->k.type) {
+ case KEY_TYPE_btree_ptr:
+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
+ case KEY_TYPE_btree_ptr_v2:
+ return bkey_i_to_btree_ptr_v2_c(k)->v.seq;
+ default:
+ return 0;
+ }
+}
/* is btree node in hash table? */
static inline bool btree_node_hashed(struct btree *b)
{
- return b->key.k.type == KEY_TYPE_btree_ptr &&
- PTR_HASH(&b->key);
+ return b->hash_val != 0;
}
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
return c->opts.btree_node_size >> c->block_bits;
}
-#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 3 / 4)
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
BUG_ON(journal_seq_verify(c) &&
k.k->version.lo > journal_cur_seq(&c->journal));
- if (k.k->version.lo > atomic64_read(&c->key_version))
+ /* XXX change to fsck check */
+ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
+ "key version number higher than recorded: %llu > %llu",
+ k.k->version.lo,
+ atomic64_read(&c->key_version)))
atomic64_set(&c->key_version, k.k->version.lo);
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
bch2_bpos_swab(&b->data->max_key);
}
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
+ BTREE_ERR_MUST_RETRY, c, b, NULL,
+ "incorrect min_key");
+ }
+
btree_err_on(bkey_cmp(b->data->max_key, b->key.k.p),
BTREE_ERR_MUST_RETRY, c, b, i,
"incorrect max key");
for (k = i->start;
k != vstruct_last(i);) {
- struct bkey_s_c u;
+ struct bkey_s u;
struct bkey tmp;
const char *invalid;
}
if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(&b->format, k);
+ bch2_bkey_swab_key(&b->format, k);
if (!write &&
version < bcachefs_metadata_version_bkey_renumber)
bch2_bkey_renumber(btree_node_type(b), k, write);
- u = bkey_disassemble(b, k, &tmp);
+ u = __bkey_disassemble(b, k, &tmp);
- invalid = __bch2_bkey_invalid(c, u, btree_node_type(b)) ?:
- bch2_bkey_in_btree_node(b, u) ?:
- (write ? bch2_bkey_val_invalid(c, u) : NULL);
+ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN)
+ bch2_bkey_swab_val(u);
+
+ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?:
+ bch2_bkey_in_btree_node(b, u.s_c) ?:
+ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
if (invalid) {
char buf[160];
- bch2_bkey_val_to_text(&PBUF(buf), c, u);
+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey:\n%s\n%s", invalid, buf);
BTREE_ERR_MUST_RETRY, c, b, NULL,
"bad btree header");
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bch_btree_ptr_v2 *bp =
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
+
+ btree_err_on(b->data->keys.seq != bp->seq,
+ BTREE_ERR_MUST_RETRY, c, b, NULL,
+ "got wrong btree node");
+ }
+
while (b->written < c->opts.btree_node_size) {
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
i = &b->data->keys;
for (k = i->start; k != vstruct_last(i);) {
struct bkey tmp;
- struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
- const char *invalid = bch2_bkey_val_invalid(c, u);
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+ const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
if (invalid ||
(inject_invalid_keys(c) &&
!bversion_cmp(u.k->version, MAX_VERSION))) {
char buf[160];
- bch2_bkey_val_to_text(&PBUF(buf), c, u);
+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
"invalid bkey %s: %s", buf, invalid);
continue;
}
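+ /* mem_ptr is in-memory state only - clear whatever was on disk: */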
+ if (u.k->type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
+
+ bp.v->mem_ptr = 0;
+ }
+
k = bkey_next_skip_noops(k, vstruct_last(i));
}
{
struct btree *b = wbio->wbio.bio.bi_private;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct bkey_i_btree_ptr *new_key;
- struct bkey_s_btree_ptr bp;
struct bch_extent_ptr *ptr;
struct btree_trans trans;
struct btree_iter *iter;
bkey_copy(&tmp.k, &b->key);
- new_key = bkey_i_to_btree_ptr(&tmp.k);
- bp = btree_ptr_i_to_s(new_key);
-
bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- if (!bch2_bkey_nr_ptrs(bp.s_c))
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k)))
goto err;
- ret = bch2_btree_node_update_key(c, iter, b, new_key);
+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
if (ret == -EINTR)
goto retry;
if (ret)
#include <linux/prefetch.h>
#include <trace/events/bcachefs.h>
-static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *,
- struct btree_iter_level *,
- struct bkey *);
-
#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
(unsigned long) iter->l[l].b >= 128;
}
-/* Returns < 0 if @k is before iter pos, > 0 if @k is after */
-static inline int __btree_iter_pos_cmp(struct btree_iter *iter,
- const struct btree *b,
- const struct bkey_packed *k,
- bool interior_node)
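+/*
+ * For extents the iterator wants the first key strictly greater than its
+ * position, hence searching from the successor:
+ */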
+static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
- int cmp = bkey_cmp_left_packed(b, k, &iter->pos);
-
- if (cmp)
- return cmp;
- if (bkey_deleted(k))
- return -1;
+ struct bpos pos = iter->pos;
- /*
- * Normally, for extents we want the first key strictly greater than
- * the iterator position - with the exception that for interior nodes,
- * we don't want to advance past the last key if the iterator position
- * is POS_MAX:
- */
- if (iter->flags & BTREE_ITER_IS_EXTENTS &&
- (!interior_node ||
- bkey_cmp_left_packed_byval(b, k, POS_MAX)))
- return -1;
- return 1;
-}
-
-static inline int btree_iter_pos_cmp(struct btree_iter *iter,
- const struct btree *b,
- const struct bkey_packed *k)
-{
- return __btree_iter_pos_cmp(iter, b, k, b->level != 0);
+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
+ bkey_cmp(pos, POS_MAX))
+ pos = bkey_successor(pos);
+ return pos;
}
/* Btree node locking: */
static void __bch2_btree_iter_verify(struct btree_iter *iter,
struct btree *b)
{
+ struct bpos pos = btree_iter_search_key(iter);
struct btree_iter_level *l = &iter->l[b->level];
struct btree_node_iter tmp = l->iter;
struct bkey_packed *k;
* For extents, the iterator may have skipped past deleted keys (but not
* whiteouts)
*/
- k = b->level || iter->flags & BTREE_ITER_IS_EXTENTS
+ k = b->level || btree_node_type_is_extents(iter->btree_id)
? bch2_btree_node_iter_prev_filter(&tmp, b, KEY_TYPE_discard)
: bch2_btree_node_iter_prev_all(&tmp, b);
- if (k && btree_iter_pos_cmp(iter, b, k) > 0) {
+ if (k && bkey_iter_pos_cmp(b, k, &pos) >= 0) {
char buf[100];
struct bkey uk = bkey_unpack_key(b, k);
bch2_bkey_to_text(&PBUF(buf), &uk);
- panic("prev key should be before iter pos:\n%s\n%llu:%llu\n",
+ panic("iterator should be before prev key:\n%s\n%llu:%llu\n",
buf, iter->pos.inode, iter->pos.offset);
}
k = bch2_btree_node_iter_peek_all(&l->iter, b);
- if (k && btree_iter_pos_cmp(iter, b, k) < 0) {
+ if (k && bkey_iter_pos_cmp(b, k, &pos) < 0) {
char buf[100];
struct bkey uk = bkey_unpack_key(b, k);
}
static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
- struct btree *b,
- struct bkey_packed *where)
+ struct btree *b,
+ struct bkey_packed *where)
{
- struct btree_node_iter *node_iter = &iter->l[0].iter;
+ struct btree_iter_level *l = &iter->l[b->level];
+ struct bpos pos = btree_iter_search_key(iter);
- if (where == bch2_btree_node_iter_peek_all(node_iter, b)) {
- bkey_disassemble(b, where, &iter->k);
- btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
- }
+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
+ return;
+
+ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
+ bch2_btree_node_iter_advance(&l->iter, l->b);
+
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
bool iter_current_key_modified =
orig_iter_pos >= offset &&
orig_iter_pos <= offset + clobber_u64s;
+ struct bpos iter_pos = btree_iter_search_key(iter);
btree_node_iter_for_each(node_iter, set)
if (set->end == old_end)
/* didn't find the bset in the iterator - might have to readd it: */
if (new_u64s &&
- btree_iter_pos_cmp(iter, b, where) > 0) {
+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
bch2_btree_node_iter_push(node_iter, b, where, end);
goto fixup_done;
} else {
return;
if (new_u64s &&
- btree_iter_pos_cmp(iter, b, where) > 0) {
+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
set->k = offset;
} else if (set->k < offset + clobber_u64s) {
set->k = offset + new_u64s;
if (!bch2_btree_node_iter_end(node_iter) &&
iter_current_key_modified &&
(b->level ||
- (iter->flags & BTREE_ITER_IS_EXTENTS))) {
+ btree_node_type_is_extents(iter->btree_id))) {
struct bset_tree *t;
struct bkey_packed *k, *k2, *p;
struct btree_iter_level *l,
int max_advance)
{
+ struct bpos pos = btree_iter_search_key(iter);
struct bkey_packed *k;
int nr_advanced = 0;
while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
- btree_iter_pos_cmp(iter, l->b, k) < 0) {
+ bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
if (max_advance > 0 && nr_advanced >= max_advance)
return false;
static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
struct btree *b)
{
- int cmp = bkey_cmp(b->key.k.p, iter->pos);
-
- if (!cmp &&
- (iter->flags & BTREE_ITER_IS_EXTENTS) &&
- bkey_cmp(b->key.k.p, POS_MAX))
- cmp = -1;
- return cmp < 0;
+ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
}
static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
static inline void __btree_iter_init(struct btree_iter *iter,
unsigned level)
{
+ struct bpos pos = btree_iter_search_key(iter);
struct btree_iter_level *l = &iter->l[level];
- bch2_btree_node_iter_init(&l->iter, l->b, &iter->pos);
-
- if (iter->flags & BTREE_ITER_IS_EXTENTS)
- btree_iter_advance_to_pos(iter, l, -1);
-
- /* Skip to first non whiteout: */
- if (level)
- bch2_btree_node_iter_peek(&l->iter, l->b);
+ bch2_btree_node_iter_init(&l->iter, l->b, &pos);
btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}
return l;
}
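+/*
+ * Set the iterator's position, using BTREE_ITER_IS_EXTENTS to control whether
+ * lookups search for the first key strictly greater than it; only re-traverse
+ * if the search key actually changed:
+ */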
+void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
+ bool strictly_greater)
+{
+ struct bpos old = btree_iter_search_key(iter);
+ unsigned l;
+ int cmp;
+
+ iter->flags &= ~BTREE_ITER_IS_EXTENTS;
+ iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;
+ iter->pos = new_pos;
+
+ cmp = bkey_cmp(btree_iter_search_key(iter), old);
+ if (!cmp)
+ return;
+
+ l = btree_iter_pos_changed(iter, cmp);
+
+ if (l != iter->level)
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+ else
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
+}
+
void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
int cmp = bkey_cmp(new_pos, iter->pos);
if (debug_check_iterators(iter->trans->c)) {
struct bkey k = bkey_unpack_key(l->b, _k);
- /*
- * this flag is internal to the btree code,
- * we don't care if it doesn't match - if it's now set
- * it just means the key has been written out to disk:
- */
- k.needs_whiteout = iter->k.needs_whiteout;
BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
}
int ret;
recheck:
- while ((k = __btree_iter_peek_all(iter, l, &iter->k)).k &&
- bkey_cmp(k.k->p, iter->pos) <= 0)
- bch2_btree_node_iter_advance(&l->iter, l->b);
+ btree_iter_advance_to_pos(iter, l, -1);
/*
* iterator is now at the correct position for inserting at iter->pos,
*/
node_iter = l->iter;
- if (k.k && bkey_whiteout(k.k))
- k = __btree_iter_unpack(iter, l, &iter->k,
- bch2_btree_node_iter_peek(&node_iter, l->b));
+ k = __btree_iter_unpack(iter, l, &iter->k,
+ bch2_btree_node_iter_peek(&node_iter, l->b));
+
+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
+ /*
+ * If there wasn't actually a hole, want the iterator to be
+ * pointed at the key we found:
+ *
+ * XXX: actually, we shouldn't be changing the iterator here:
+ * the iterator needs to be correct for inserting at iter->pos,
+ * and there may be whiteouts between iter->pos and what this
+ * iterator points at:
+ */
+ l->iter = node_iter;
+
+ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
+ iter->uptodate = BTREE_ITER_UPTODATE;
+
+ __bch2_btree_iter_verify(iter, l->b);
+ return k;
+ }
/*
* If we got to the end of the node, check if we need to traverse to the
goto recheck;
}
- if (k.k &&
- !bkey_whiteout(k.k) &&
- bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
- /*
- * if we skipped forward to find the first non whiteout and
- * there _wasn't_ actually a hole, we want the iterator to be
- * pointed at the key we found:
- */
- l->iter = node_iter;
-
- EBUG_ON(bkey_cmp(k.k->p, iter->pos) < 0);
- EBUG_ON(bkey_deleted(k.k));
- iter->uptodate = BTREE_ITER_UPTODATE;
-
- __bch2_btree_iter_verify(iter, l->b);
- return k;
- }
-
/* hole */
/* holes can't span inode numbers: */
iter->nodes_locked = 0;
iter->nodes_intent_locked = 0;
for (i = 0; i < ARRAY_SIZE(iter->l); i++)
- iter->l[i].b = NULL;
- iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
+ iter->l[i].b = BTREE_ITER_NO_NODE_INIT;
prefetch(c->btree_roots[btree_id].b);
}
int bch2_trans_iter_put(struct btree_trans *trans,
struct btree_iter *iter)
{
- int ret = btree_iter_err(iter);
+ int ret;
+
+ if (IS_ERR_OR_NULL(iter))
+ return 0;
+
+ ret = btree_iter_err(iter);
if (!(trans->iters_touched & (1ULL << iter->idx)) &&
!(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
int bch2_trans_iter_free(struct btree_trans *trans,
struct btree_iter *iter)
{
+ if (IS_ERR_OR_NULL(iter))
+ return 0;
+
trans->iters_touched &= ~(1ULL << iter->idx);
return bch2_trans_iter_put(trans, iter);
__btree_trans_get_iter(trans, btree_id, pos, flags);
if (!IS_ERR(iter))
- bch2_btree_iter_set_pos(iter, pos);
+ __bch2_btree_iter_set_pos(iter, pos,
+ btree_node_type_is_extents(btree_id));
return iter;
}
trans->iters_live |= 1ULL << iter->idx;
/*
- * Don't mark it as touched, we don't need to preserve this iter since
- * it's cheap to copy it again:
+ * We don't need to preserve this iter since it's cheap to copy it
+ * again - this will cause trans_iter_put() to free it right away:
*/
trans->iters_touched &= ~(1ULL << iter->idx);
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
+void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
static inline struct bpos btree_type_successor(enum btree_id id,
struct btree {
/* Hottest entries first */
struct rhash_head hash;
-
- /* Key/pointer for this btree node */
- __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+ u64 hash_val;
struct six_lock lock;
#ifdef CONFIG_BCACHEFS_DEBUG
bool *expensive_debug_checks;
#endif
+
+ /* Key/pointer for this btree node */
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
struct btree_cache {
#define BTREE_ITER_TYPE ((1 << 2) - 1)
+/*
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
+ */
#define BTREE_ITER_SLOTS (1 << 2)
+/*
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
+ * to be doing updates:
+ */
#define BTREE_ITER_INTENT (1 << 3)
+/*
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
+ */
#define BTREE_ITER_PREFETCH (1 << 4)
+/*
+ * Indicates that this iterator should not be reused until transaction commit,
+ * either because a pending update references it or because the update depends
+ * on that particular key being locked (e.g. by the str_hash code, for hash
+ * table consistency)
+ */
#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
- struct btree *, struct bkey_i_btree_ptr *);
+ struct btree *, struct bkey_i *);
int bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_trigger_flags);
goto retry;
}
- bkey_btree_ptr_init(&tmp.k);
+ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
+ bkey_btree_ptr_v2_init(&tmp.k);
+ else
+ bkey_btree_ptr_init(&tmp.k);
+
bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
bch2_open_bucket_get(c, wp, &ob);
{
struct bch_fs *c = as->c;
struct btree *b;
+ int ret;
BUG_ON(level >= BTREE_MAX_DEPTH);
BUG_ON(!as->reserve->nr);
b = as->reserve->b[--as->reserve->nr];
- BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id));
-
set_btree_node_accessed(b);
set_btree_node_dirty(b);
set_btree_node_need_write(b);
b->data->flags = 0;
SET_BTREE_NODE_ID(b->data, as->btree_id);
SET_BTREE_NODE_LEVEL(b->data, level);
- b->data->ptr = bkey_i_to_btree_ptr(&b->key)->v.start[0];
+ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr;
+
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
+
+ bp->v.mem_ptr = 0;
+ bp->v.seq = b->data->keys.seq;
+ bp->v.sectors_written = 0;
+ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size);
+ }
if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
btree_node_will_make_reachable(as, b);
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
+ BUG_ON(ret);
+
trace_btree_node_alloc(c, b);
return b;
}
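+/* keep the node's bounds in sync between b->key and the node header: */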
+static void btree_set_min(struct btree *b, struct bpos pos)
+{
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
+ b->data->min_key = pos;
+}
+
+static void btree_set_max(struct btree *b, struct bpos pos)
+{
+ b->key.k.p = pos;
+ b->data->max_key = pos;
+}
+
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
struct btree *b,
struct bkey_format format)
n = bch2_btree_node_alloc(as, b->level);
- n->data->min_key = b->data->min_key;
- n->data->max_key = b->data->max_key;
- n->data->format = format;
SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
+ btree_set_min(n, b->data->min_key);
+ btree_set_max(n, b->data->max_key);
+
+ n->data->format = format;
btree_node_set_format(n, format);
bch2_btree_sort_into(as->c, n, b);
{
struct btree *b = bch2_btree_node_alloc(as, level);
- b->data->min_key = POS_MIN;
- b->data->max_key = POS_MAX;
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, POS_MAX);
b->data->format = bch2_btree_calc_format(b);
- b->key.k.p = POS_MAX;
btree_node_set_format(b, b->data->format);
bch2_btree_build_aux_trees(b);
BTREE_TRIGGER_GC);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
- bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
bch2_btree_node_iter_advance(node_iter, b);
/*
BUG_ON(!prev);
- n1->key.k.p = bkey_unpack_pos(n1, prev);
- n1->data->max_key = n1->key.k.p;
- n2->data->min_key =
- btree_type_successor(n1->btree_id, n1->key.k.p);
+ btree_set_max(n1, bkey_unpack_pos(n1, prev));
+ btree_set_min(n2, btree_type_successor(n1->btree_id, n1->key.k.p));
set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k);
set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s));
if (keys)
btree_split_insert_keys(as, n1, iter, keys);
- if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) {
+ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
trace_btree_split(c, b);
n2 = __btree_split_node(as, n1, iter);
n = bch2_btree_node_alloc(as, b->level);
- n->data->min_key = prev->data->min_key;
- n->data->max_key = next->data->max_key;
+ btree_set_min(n, prev->data->min_key);
+ btree_set_max(n, next->data->max_key);
n->data->format = new_f;
- n->key.k.p = next->key.k.p;
btree_node_set_format(n, new_f);
struct btree_update *as,
struct btree_iter *iter,
struct btree *b, struct btree *new_hash,
- struct bkey_i_btree_ptr *new_key)
+ struct bkey_i *new_key)
{
struct btree *parent;
int ret;
*/
ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
c->opts.btree_node_size *
- bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
+ bch2_bkey_nr_ptrs(bkey_i_to_s_c(new_key)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
parent = btree_node_parent(iter, b);
if (parent) {
if (new_hash) {
- bkey_copy(&new_hash->key, &new_key->k_i);
+ bkey_copy(&new_hash->key, new_key);
ret = bch2_btree_node_hash_insert(&c->btree_cache,
new_hash, b->level, b->btree_id);
BUG_ON(ret);
}
- bch2_keylist_add(&as->parent_keys, &new_key->k_i);
+ bch2_keylist_add(&as->parent_keys, new_key);
bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
if (new_hash) {
bch2_btree_node_hash_remove(&c->btree_cache, b);
- bkey_copy(&b->key, &new_key->k_i);
+ bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
- bkey_copy(&b->key, &new_key->k_i);
+ bkey_copy(&b->key, new_key);
}
} else {
struct bch_fs_usage *fs_usage;
percpu_down_read(&c->mark_lock);
fs_usage = bch2_fs_usage_scratch_get(c);
- bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
+ bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
0, 0, fs_usage, 0,
BTREE_TRIGGER_INSERT);
if (gc_visited(c, gc_pos_btree_root(b->btree_id)))
- bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
+ bch2_mark_key_locked(c, bkey_i_to_s_c(new_key),
0, 0, NULL, 0,
BTREE_TRIGGER_INSERT|
BTREE_TRIGGER_GC);
percpu_up_read(&c->mark_lock);
mutex_unlock(&c->btree_interior_update_lock);
- if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ if (btree_ptr_hash_val(new_key) != b->hash_val) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- bkey_copy(&b->key, &new_key->k_i);
+ bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
BUG_ON(ret);
mutex_unlock(&c->btree_cache.lock);
} else {
- bkey_copy(&b->key, &new_key->k_i);
+ bkey_copy(&b->key, new_key);
}
btree_update_updated_root(as);
int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
struct btree *b,
- struct bkey_i_btree_ptr *new_key)
+ struct bkey_i *new_key)
{
struct btree *parent = btree_node_parent(iter, b);
struct btree_update *as = NULL;
}
}
- /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */
- if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) {
+ /*
+ * check btree_ptr_hash_val() after @b is locked by
+ * btree_iter_traverse():
+ */
+ if (btree_ptr_hash_val(new_key) != b->hash_val) {
/* bch2_btree_reserve_get will unlock */
ret = bch2_btree_cache_cannibalize_lock(c, &cl);
if (ret) {
goto err;
}
- ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&new_key->k_i));
+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
if (ret)
goto err_free_update;
bkey_btree_ptr_init(&b->key);
b->key.k.p = POS_MAX;
- PTR_HASH(&b->key) = U64_MAX - id;
+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
bch2_bset_init_first(b, &b->data->keys);
bch2_btree_build_aux_trees(b);
b->data->flags = 0;
- b->data->min_key = POS_MIN;
- b->data->max_key = POS_MAX;
+ btree_set_min(b, POS_MIN);
+ btree_set_max(b, POS_MAX);
b->data->format = bch2_btree_calc_format(b);
btree_node_set_format(b, b->data->format);
struct btree_node_iter *node_iter,
struct bkey_i *insert)
{
- const struct bkey_format *f = &b->format;
struct bkey_packed *k;
- unsigned clobber_u64s;
+ unsigned clobber_u64s = 0, new_u64s = 0;
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
bkey_cmp(insert->k.p, b->data->max_key) > 0);
k = bch2_btree_node_iter_peek_all(node_iter, b);
- if (k && !bkey_cmp_packed(b, k, &insert->k)) {
- BUG_ON(bkey_whiteout(k));
-
- if (!bkey_written(b, k) &&
- bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
- !bkey_whiteout(&insert->k)) {
- k->type = insert->k.type;
- memcpy_u64s(bkeyp_val(f, k), &insert->v,
- bkey_val_u64s(&insert->k));
- return true;
- }
+ if (k && bkey_cmp_packed(b, k, &insert->k))
+ k = NULL;
- btree_account_key_drop(b, k);
+ /* @k is the key being overwritten/deleted, if any: */
+ EBUG_ON(k && bkey_whiteout(k));
- if (bkey_whiteout(&insert->k)) {
- unsigned clobber_u64s = k->u64s, new_u64s = k->u64s;
+ /* Deleting, but not found? nothing to do: */
+ if (bkey_whiteout(&insert->k) && !k)
+ return false;
- k->type = KEY_TYPE_deleted;
+ if (bkey_whiteout(&insert->k)) {
+ /* Deleting: */
+ btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
- if (k->needs_whiteout) {
- push_whiteout(iter->trans->c, b, k);
- k->needs_whiteout = false;
- }
+ if (k->needs_whiteout)
+ push_whiteout(iter->trans->c, b, k);
+ k->needs_whiteout = false;
- if (k >= btree_bset_last(b)->start) {
- bch2_bset_delete(b, k, clobber_u64s);
- new_u64s = 0;
- }
+ if (k >= btree_bset_last(b)->start) {
+ clobber_u64s = k->u64s;
+ bch2_bset_delete(b, k, clobber_u64s);
+ goto fix_iter;
+ } else {
+ bch2_btree_iter_fix_key_modified(iter, b, k);
+ }
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
- clobber_u64s, new_u64s);
- return true;
+ return true;
+ }
- }
+ if (k) {
+ /* Overwriting: */
+ btree_account_key_drop(b, k);
+ k->type = KEY_TYPE_deleted;
insert->k.needs_whiteout = k->needs_whiteout;
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
clobber_u64s = k->u64s;
goto overwrite;
+ } else {
+ bch2_btree_iter_fix_key_modified(iter, b, k);
}
-
- k->type = KEY_TYPE_deleted;
- /*
- * XXX: we should be able to do this without two calls to
- * bch2_btree_node_iter_fix:
- */
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
- k->u64s, k->u64s);
- } else {
- /*
- * Deleting, but the key to delete wasn't found - nothing to do:
- */
- if (bkey_whiteout(&insert->k))
- return false;
-
- insert->k.needs_whiteout = false;
}
k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
- clobber_u64s = 0;
overwrite:
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
- bch2_btree_node_iter_fix(iter, b, node_iter, k,
- clobber_u64s, k->u64s);
+ new_u64s = k->u64s;
+fix_iter:
+ if (clobber_u64s != new_u64s)
+ bch2_btree_node_iter_fix(iter, b, node_iter, k,
+ clobber_u64s, new_u64s);
return true;
}
trans_trigger_run = false;
trans_for_each_update(trans, i) {
- /* we know trans->nounlock won't be set here: */
- if (unlikely(!(i->iter->locks_want < 1
- ? __bch2_btree_iter_upgrade(i->iter, 1)
- : i->iter->uptodate <= BTREE_ITER_NEED_PEEK))) {
+ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) {
+ trace_trans_restart_traverse(trans->ip);
+ ret = -EINTR;
+ goto out;
+ }
+
+ /*
+ * We're not using bch2_btree_iter_upgrade here because
+ * we know trans->nounlock can't be set:
+ */
+ if (unlikely(i->iter->locks_want < 1 &&
+ !__bch2_btree_iter_upgrade(i->iter, 1))) {
trace_trans_restart_upgrade(trans->ip);
ret = -EINTR;
goto out;
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ if (btree_node_type_is_extents(iter->btree_id)) {
iter->pos_after_commit = k->k.p;
iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
}
*/
delete.k.p = iter->pos;
- if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+ if (btree_node_type_is_extents(iter->btree_id)) {
unsigned max_sectors =
KEY_SIZE_MAX & (~0 << trans->c->block_bits);
ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
? c->opts.btree_node_size
: -c->opts.btree_node_size;
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
? c->opts.btree_node_size
: -c->opts.btree_node_size;
static inline enum bch_data_type ptr_data_type(const struct bkey *k,
const struct bch_extent_ptr *ptr)
{
- if (k->type == KEY_TYPE_btree_ptr)
+ if (k->type == KEY_TYPE_btree_ptr ||
+ k->type == KEY_TYPE_btree_ptr_v2)
return BCH_DATA_BTREE;
return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER;
BUG_ON(len_a + len_b > bio_sectors(bio));
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
- BUG_ON(crc_old.compression_type);
+ BUG_ON(crc_is_compressed(crc_old));
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
bch2_csum_type_is_encryption(new_csum_type));
if (i->crc)
*i->crc = (struct bch_extent_crc_unpacked) {
.csum_type = i->csum_type,
+ .compression_type = crc_old.compression_type,
.compressed_size = i->len,
.uncompressed_size = i->len,
.offset = 0,
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
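+ /*
+ * incompressible extents are stored uncompressed, so they get the same
+ * nonce as plain uncompressed data:
+ */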
- unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
+ unsigned compression_type = crc_is_compressed(crc)
+ ? crc.compression_type
+ : 0;
+ unsigned size = compression_type ? crc.uncompressed_size : 0;
struct nonce nonce = (struct nonce) {{
[0] = cpu_to_le32(size << 22),
[1] = cpu_to_le32(version.lo),
[2] = cpu_to_le32(version.lo >> 32),
[3] = cpu_to_le32(version.hi|
- (crc.compression_type << 24))^BCH_NONCE_EXTENT,
+ (compression_type << 24))^BCH_NONCE_EXTENT,
}};
return nonce_add(nonce, crc.nonce << 9);
bio_unmap_or_unbounce(c, dst_data);
return compression_type;
err:
- compression_type = 0;
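+ /* mark data that didn't compress, so we don't keep trying to recompress it: */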
+ compression_type = BCH_COMPRESSION_TYPE_incompressible;
goto out;
}
int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
struct btree_trans trans;
- struct btree_iter *btree_iter;
- struct journal_iter journal_iter;
- struct bkey_s_c btree_k, journal_k;
+ struct btree_and_journal_iter iter;
+ struct bkey_s_c k;
int ret;
ret = bch2_fs_ec_start(c);
bch2_trans_init(&trans, c, 0, 0);
- btree_iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, 0);
- journal_iter = bch2_journal_iter_init(journal_keys, BTREE_ID_EC);
+ bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys,
+ BTREE_ID_EC, POS_MIN);
- btree_k = bch2_btree_iter_peek(btree_iter);
- journal_k = bch2_journal_iter_peek(&journal_iter);
- while (1) {
- bool btree;
-
- if (btree_k.k && journal_k.k) {
- int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
-
- if (!cmp)
- btree_k = bch2_btree_iter_next(btree_iter);
- btree = cmp < 0;
- } else if (btree_k.k) {
- btree = true;
- } else if (journal_k.k) {
- btree = false;
- } else {
- break;
- }
-
- bch2_mark_key(c, btree ? btree_k : journal_k,
- 0, 0, NULL, 0,
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+ bch2_mark_key(c, k, 0, 0, NULL, 0,
BTREE_TRIGGER_ALLOC_READ|
BTREE_TRIGGER_NOATOMIC);
- if (btree)
- btree_k = bch2_btree_iter_next(btree_iter);
- else
- journal_k = bch2_journal_iter_next(&journal_iter);
+ bch2_btree_and_journal_iter_advance(&iter);
}
ret = bch2_trans_exit(&trans) ?: ret;
#define bch2_bkey_ops_stripe (struct bkey_ops) { \
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
+ .swab = bch2_ptr_swab, \
}
static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
if (!bch2_checksum_mergeable(crc_l.csum_type))
return BCH_MERGE_NOMERGE;
- if (crc_l.compression_type)
+ if (crc_is_compressed(crc_l))
return BCH_MERGE_NOMERGE;
if (crc_l.csum_type &&
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
struct bch_extent_crc_unpacked n)
{
- return !u.compression_type &&
+ return !crc_is_compressed(u) &&
u.csum_type &&
u.uncompressed_size > u.live_size &&
bch2_csum_type_is_encryption(u.csum_type) ==
/* Find a checksum entry that covers only live data: */
if (!n.csum_type) {
bkey_for_each_crc(&k->k, ptrs, u, i)
- if (!u.compression_type &&
+ if (!crc_is_compressed(u) &&
u.csum_type &&
u.live_size == u.uncompressed_size) {
n = u;
return false;
}
found:
- BUG_ON(n.compression_type);
+ BUG_ON(crc_is_compressed(n));
BUG_ON(n.offset);
BUG_ON(n.live_size != k->k.size);
struct extent_ptr_decoded p;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- ret += !p.ptr.cached &&
- p.crc.compression_type == BCH_COMPRESSION_TYPE_none;
+ ret += !p.ptr.cached && !crc_is_compressed(p.crc);
}
return ret;
unsigned ret = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (!p.ptr.cached &&
- p.crc.compression_type != BCH_COMPRESSION_TYPE_none)
+ if (!p.ptr.cached && crc_is_compressed(p.crc))
ret += p.crc.compressed_size;
return ret;
}
+bool bch2_bkey_is_incompressible(struct bkey_s_c k)
+{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct bch_extent_crc_unpacked crc;
+
+ bkey_for_each_crc(k.k, ptrs, crc, entry)
+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
+ return true;
+ return false;
+}
+
bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
unsigned nr_replicas)
{
switch (k->k.type) {
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
case KEY_TYPE_extent:
EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
if (k.k->type == KEY_TYPE_btree_ptr)
size_ondisk = c->opts.btree_node_size;
+ if (k.k->type == KEY_TYPE_btree_ptr_v2)
+ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors);
bkey_extent_entry_for_each(ptrs, entry) {
if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
return NULL;
}
-void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+void bch2_ptr_swab(struct bkey_s k)
{
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
- u64 *d = (u64 *) bkeyp_val(f, k);
- unsigned i;
+ u64 *d;
- for (i = 0; i < bkeyp_val_u64s(f, k); i++)
- d[i] = swab64(d[i]);
+ for (d = (u64 *) ptrs.start;
+ d != (u64 *) ptrs.end;
+ d++)
+ *d = swab64(*d);
- for (entry = (union bch_extent_entry *) d;
- entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+ for (entry = ptrs.start;
+ entry < ptrs.end;
entry = extent_entry_next(entry)) {
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
#undef common_fields
}
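+/*
+ * true only for data actually stored compressed - incompressible extents are
+ * stored as-is:
+ */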
+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
+{
+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
+}
+
/* bkey_ptrs: generically over any key type that has ptrs */
struct bkey_ptrs_c {
bkey_val_end(r),
};
}
+ case KEY_TYPE_btree_ptr_v2: {
+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
+ return (struct bkey_ptrs_c) {
+ to_entry(&e.v->start[0]),
+ to_entry(extent_entry_last(e))
+ };
+ }
default:
return (struct bkey_ptrs_c) { NULL, NULL };
}
.swab = bch2_ptr_swab, \
}
+#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \
+ .key_invalid = bch2_btree_ptr_invalid, \
+ .key_debugcheck = bch2_btree_ptr_debugcheck, \
+ .val_to_text = bch2_btree_ptr_to_text, \
+ .swab = bch2_ptr_swab, \
+}
+
/* KEY_TYPE_extent: */
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
{
switch (k->type) {
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
return true;
unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+bool bch2_bkey_is_incompressible(struct bkey_s_c);
unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
struct bkey_s_c);
const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
-void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+void bch2_ptr_swab(struct bkey_s);
/* Generic extent code: */
struct posix_acl *acl)
{
struct bch_fs *c = trans->c;
- struct btree_iter *dir_iter;
+ struct btree_iter *dir_iter = NULL;
struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
u64 now = bch2_current_time(trans->c);
int ret;
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
- if (IS_ERR(dir_iter))
- return PTR_ERR(dir_iter);
+ ret = PTR_ERR_OR_ZERO(dir_iter);
+ if (ret)
+ goto err;
bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
BLOCKDEV_INODE_MAX, 0,
&c->unused_inode_hint);
if (ret)
- return ret;
+ goto err;
if (default_acl) {
ret = bch2_set_acl_trans(trans, new_inode, &hash,
default_acl, ACL_TYPE_DEFAULT);
if (ret)
- return ret;
+ goto err;
}
if (acl) {
ret = bch2_set_acl_trans(trans, new_inode, &hash,
acl, ACL_TYPE_ACCESS);
if (ret)
- return ret;
+ goto err;
}
if (name) {
ret = bch2_inode_write(trans, dir_iter, dir_u);
if (ret)
- return ret;
+ goto err;
ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(new_inode->bi_mode),
name, new_inode->bi_inum,
BCH_HASH_SET_MUST_CREATE);
if (ret)
- return ret;
+ goto err;
}
-
- return 0;
+err:
+ bch2_trans_iter_put(trans, dir_iter);
+ return ret;
}
int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
u64 inum, struct bch_inode_unpacked *dir_u,
struct bch_inode_unpacked *inode_u, const struct qstr *name)
{
- struct btree_iter *dir_iter, *inode_iter;
+ struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
struct bch_hash_info dir_hash;
u64 now = bch2_current_time(trans->c);
+ int ret;
inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
- if (IS_ERR(inode_iter))
- return PTR_ERR(inode_iter);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
+ if (ret)
+ goto err;
inode_u->bi_ctime = now;
bch2_inode_nlink_inc(inode_u);
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
- if (IS_ERR(dir_iter))
- return PTR_ERR(dir_iter);
+ ret = PTR_ERR_OR_ZERO(dir_iter);
+ if (ret)
+ goto err;
dir_u->bi_mtime = dir_u->bi_ctime = now;
dir_hash = bch2_hash_info_init(trans->c, dir_u);
- bch2_trans_iter_put(trans, dir_iter);
- return bch2_dirent_create(trans, dir_inum, &dir_hash,
+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
mode_to_type(inode_u->bi_mode),
name, inum, BCH_HASH_SET_MUST_CREATE) ?:
bch2_inode_write(trans, dir_iter, dir_u) ?:
bch2_inode_write(trans, inode_iter, inode_u);
+err:
+ bch2_trans_iter_put(trans, dir_iter);
+ bch2_trans_iter_put(trans, inode_iter);
+ return ret;
}
int bch2_unlink_trans(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
const struct qstr *name)
{
- struct btree_iter *dir_iter, *dirent_iter, *inode_iter;
+ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
+ *inode_iter = NULL;
struct bch_hash_info dir_hash;
u64 inum, now = bch2_current_time(trans->c);
struct bkey_s_c k;
+ int ret;
dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
- if (IS_ERR(dir_iter))
- return PTR_ERR(dir_iter);
+ ret = PTR_ERR_OR_ZERO(dir_iter);
+ if (ret)
+ goto err;
dir_hash = bch2_hash_info_init(trans->c, dir_u);
dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
name, BTREE_ITER_INTENT);
- if (IS_ERR(dirent_iter))
- return PTR_ERR(dirent_iter);
+ ret = PTR_ERR_OR_ZERO(dirent_iter);
+ if (ret)
+ goto err;
k = bch2_btree_iter_peek_slot(dirent_iter);
inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
- if (IS_ERR(inode_iter))
- return PTR_ERR(inode_iter);
+ ret = PTR_ERR_OR_ZERO(inode_iter);
+ if (ret)
+ goto err;
dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
bch2_inode_nlink_dec(inode_u);
- return (S_ISDIR(inode_u->bi_mode)
+ ret = (S_ISDIR(inode_u->bi_mode)
? bch2_empty_dir_trans(trans, inum)
: 0) ?:
bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
bch2_inode_write(trans, dir_iter, dir_u) ?:
bch2_inode_write(trans, inode_iter, inode_u);
+err:
+ bch2_trans_iter_put(trans, inode_iter);
+ bch2_trans_iter_put(trans, dirent_iter);
+ bch2_trans_iter_put(trans, dir_iter);
+ return ret;
}
bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
const struct qstr *dst_name,
enum bch_rename_mode mode)
{
- struct btree_iter *src_dir_iter, *dst_dir_iter = NULL;
- struct btree_iter *src_inode_iter, *dst_inode_iter = NULL;
+ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
+ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
struct bch_hash_info src_hash, dst_hash;
u64 src_inode, dst_inode, now = bch2_current_time(trans->c);
int ret;
src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
BTREE_ITER_INTENT);
- if (IS_ERR(src_dir_iter))
- return PTR_ERR(src_dir_iter);
+ ret = PTR_ERR_OR_ZERO(src_dir_iter);
+ if (ret)
+ goto err;
src_hash = bch2_hash_info_init(trans->c, src_dir_u);
if (dst_dir != src_dir) {
dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
BTREE_ITER_INTENT);
- if (IS_ERR(dst_dir_iter))
- return PTR_ERR(dst_dir_iter);
+ ret = PTR_ERR_OR_ZERO(dst_dir_iter);
+ if (ret)
+ goto err;
dst_hash = bch2_hash_info_init(trans->c, dst_dir_u);
} else {
dst_name, &dst_inode,
mode);
if (ret)
- return ret;
+ goto err;
src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
BTREE_ITER_INTENT);
- if (IS_ERR(src_inode_iter))
- return PTR_ERR(src_inode_iter);
+ ret = PTR_ERR_OR_ZERO(src_inode_iter);
+ if (ret)
+ goto err;
if (dst_inode) {
dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
BTREE_ITER_INTENT);
- if (IS_ERR(dst_inode_iter))
- return PTR_ERR(dst_inode_iter);
+ ret = PTR_ERR_OR_ZERO(dst_inode_iter);
+ if (ret)
+ goto err;
}
if (mode == BCH_RENAME_OVERWRITE) {
if (S_ISDIR(src_inode_u->bi_mode) !=
- S_ISDIR(dst_inode_u->bi_mode))
- return -ENOTDIR;
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -ENOTDIR;
+ goto err;
+ }
if (S_ISDIR(dst_inode_u->bi_mode) &&
- bch2_empty_dir_trans(trans, dst_inode))
- return -ENOTEMPTY;
+ bch2_empty_dir_trans(trans, dst_inode)) {
+ ret = -ENOTEMPTY;
+ goto err;
+ }
}
if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
- S_ISDIR(src_inode_u->bi_mode))
- return -EXDEV;
+ S_ISDIR(src_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
if (mode == BCH_RENAME_EXCHANGE &&
bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
- S_ISDIR(dst_inode_u->bi_mode))
- return -EXDEV;
+ S_ISDIR(dst_inode_u->bi_mode)) {
+ ret = -EXDEV;
+ goto err;
+ }
if (S_ISDIR(src_inode_u->bi_mode)) {
src_dir_u->bi_nlink--;
if (dst_inode)
dst_inode_u->bi_ctime = now;
- return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
+ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
(src_dir != dst_dir
? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
: 0 ) ?:
(dst_inode
? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
: 0 );
+err:
+ bch2_trans_iter_put(trans, dst_inode_iter);
+ bch2_trans_iter_put(trans, src_inode_iter);
+ bch2_trans_iter_put(trans, dst_dir_iter);
+ bch2_trans_iter_put(trans, src_dir_iter);
+ return ret;
}
struct bch_inode_unpacked *inode_u,
u64 min, u64 max, u64 *hint)
{
- struct bch_fs *c = trans->c;
struct bkey_inode_buf *inode_p;
- struct btree_iter *iter;
+ struct btree_iter *iter = NULL;
+ struct bkey_s_c k;
u64 start;
int ret;
if (!max)
max = ULLONG_MAX;
- if (c->opts.inodes_32bit)
+ if (trans->c->opts.inodes_32bit)
max = min_t(u64, max, U32_MAX);
start = READ_ONCE(*hint);
inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
-
- iter = bch2_trans_get_iter(trans,
- BTREE_ID_INODES, POS(start, 0),
- BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- if (IS_ERR(iter))
- return PTR_ERR(iter);
again:
- while (1) {
- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-
- ret = bkey_err(k);
- if (ret)
- return ret;
+ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(start, 0),
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+ if (iter->pos.inode > max)
+ break;
- switch (k.k->type) {
- case KEY_TYPE_inode:
- /* slot used */
- if (iter->pos.inode >= max)
- goto out;
+ if (k.k->type != KEY_TYPE_inode)
+ goto found_slot;
+ }
- bch2_btree_iter_next_slot(iter);
- break;
+ bch2_trans_iter_put(trans, iter);
- default:
- *hint = k.k->p.inode;
- inode_u->bi_inum = k.k->p.inode;
- inode_u->bi_generation = bkey_generation(k);
+ if (ret)
+ return ret;
- bch2_inode_pack(inode_p, inode_u);
- bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
- return 0;
- }
- }
-out:
if (start != min) {
/* Retry from start */
start = min;
- bch2_btree_iter_set_pos(iter, POS(start, 0));
goto again;
}
return -ENOSPC;
+found_slot:
+ *hint = k.k->p.inode;
+ inode_u->bi_inum = k.k->p.inode;
+ inode_u->bi_generation = bkey_generation(k);
+
+ bch2_inode_pack(inode_p, inode_u);
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+ bch2_trans_iter_put(trans, iter);
+ return 0;
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (ret)
- return ret;
+ goto err;
ret = k.k->type == KEY_TYPE_inode
? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
: -ENOENT;
-
+err:
bch2_trans_iter_put(trans, iter);
-
return ret;
}
* particularly want to plumb io_opts all the way through the btree
* update stack right now
*/
- for_each_keylist_key(keys, k)
+ for_each_keylist_key(keys, k) {
bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
+ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
+ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible);
+
+ }
+
if (!bch2_keylist_empty(keys)) {
u64 sectors_start = keylist_sectors(keys);
int ret = op->index_update_fn(op);
/* Can we just write the entire extent as is? */
if (op->crc.uncompressed_size == op->crc.live_size &&
op->crc.compressed_size <= wp->sectors_free &&
- op->crc.compression_type == op->compression_type) {
- if (!op->crc.compression_type &&
+ (op->crc.compression_type == op->compression_type ||
+ op->incompressible)) {
+ if (!crc_is_compressed(op->crc) &&
op->csum_type != op->crc.csum_type &&
bch2_write_rechecksum(c, op, op->csum_type))
return PREP_ENCODED_CHECKSUM_ERR;
* If the data is compressed and we couldn't write the entire extent as
* is, we have to decompress it:
*/
- if (op->crc.compression_type) {
+ if (crc_is_compressed(op->crc)) {
struct bch_csum csum;
if (bch2_write_decrypt(op))
ret = -EIO;
goto err;
case PREP_ENCODED_CHECKSUM_ERR:
goto csum_err;
case PREP_ENCODED_DO_WRITE:
bch2_csum_type_is_encryption(op->crc.csum_type));
BUG_ON(op->compression_type && !bounce);
- crc.compression_type = op->compression_type
- ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
- op->compression_type)
+ crc.compression_type = op->incompressible
+ ? BCH_COMPRESSION_TYPE_incompressible
+ : op->compression_type
+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+ op->compression_type)
: 0;
- if (!crc.compression_type) {
+ if (!crc_is_compressed(crc)) {
dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
if (bch2_csum_type_is_encryption(op->csum_type)) {
if (bversion_zero(version)) {
- version.lo = atomic64_inc_return(&c->key_version) + 1;
+ version.lo = atomic64_inc_return(&c->key_version);
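+ /* atomic64_inc_return() already returns the incremented value;
+ * the old "+ 1" needlessly skipped a version number */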
} else {
crc.nonce = op->nonce;
op->nonce += src_len >> 9;
}
if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
- !crc.compression_type &&
+ !crc_is_compressed(crc) &&
bch2_csum_type_is_encryption(op->crc.csum_type) ==
bch2_csum_type_is_encryption(op->csum_type)) {
/*
static struct promote_op *__promote_alloc(struct bch_fs *c,
enum btree_id btree_id,
+ struct bkey_s_c k,
struct bpos pos,
struct extent_ptr_decoded *pick,
struct bch_io_opts opts,
(struct data_opts) {
.target = opts.promote_target
},
- btree_id,
- bkey_s_c_null);
+ btree_id, k);
BUG_ON(ret);
return op;
k.k->type == KEY_TYPE_reflink_v
? BTREE_ID_REFLINK
: BTREE_ID_EXTENTS,
- pos, pick, opts, sectors, rbio);
+ k, pos, pick, opts, sectors, rbio);
if (!promote)
return NULL;
u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
int ret;
- if (rbio->pick.crc.compression_type)
+ if (crc_is_compressed(rbio->pick.crc))
return;
bkey_on_stack_init(&new);
crc.offset += rbio->offset_into_extent;
crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
- if (crc.compression_type != BCH_COMPRESSION_TYPE_none) {
+ if (crc_is_compressed(crc)) {
bch2_encrypt_bio(c, crc.csum_type, nonce, src);
if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
goto decompression_err;
}
if (rbio->narrow_crcs ||
- rbio->pick.crc.compression_type ||
+ crc_is_compressed(rbio->pick.crc) ||
bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
else if (rbio->pick.crc.csum_type)
EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
- if (pick.crc.compression_type != BCH_COMPRESSION_TYPE_none ||
+ if (crc_is_compressed(pick.crc) ||
(pick.crc.csum_type != BCH_CSUM_NONE &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
(bch2_csum_type_is_encryption(pick.crc.csum_type) &&
&rbio, &bounce, &read_full);
if (!read_full) {
- EBUG_ON(pick.crc.compression_type);
+ EBUG_ON(crc_is_compressed(pick.crc));
EBUG_ON(pick.crc.csum_type &&
(bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
bvec_iter_sectors(iter) != pick.crc.live_size ||
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
op->alloc_reserve = RESERVE_NONE;
+ op->incompressible = 0;
op->open_buckets.nr = 0;
op->devs_have.nr = 0;
op->target = 0;
unsigned compression_type:4;
unsigned nr_replicas:4;
unsigned nr_replicas_required:4;
- unsigned alloc_reserve:4;
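+ /* one bit borrowed from alloc_reserve for the new flag: */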
+ unsigned alloc_reserve:3;
+ unsigned incompressible:1;
struct bch_devs_list devs_have;
u16 target;
return 0;
}
- if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(NULL, bkey_to_packed(k));
+ if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) {
+ bch2_bkey_swab_key(NULL, bkey_to_packed(k));
+ bch2_bkey_swab_val(bkey_i_to_s(k));
+ }
if (!write &&
version < bcachefs_metadata_version_bkey_renumber)
for_each_btree_node(&trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH, b) {
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
- struct bkey_i_btree_ptr *new_key;
retry:
if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
dev_idx))
continue;
bkey_copy(&tmp.k, &b->key);
- new_key = bkey_i_to_btree_ptr(&tmp.k);
- ret = drop_dev_ptrs(c, bkey_i_to_s(&new_key->k_i),
+ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k),
dev_idx, flags, true);
if (ret) {
bch_err(c, "Cannot drop device without losing data");
goto err;
}
- ret = bch2_btree_node_update_key(c, iter, b, new_key);
+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(iter);
goto retry;
enum btree_id btree_id,
struct bkey_s_c k)
{
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
int ret;
m->btree_id = btree_id;
m->nr_ptrs_reserved = 0;
bch2_write_op_init(&m->op, c, io_opts);
- m->op.compression_type =
- bch2_compression_opt_to_type[io_opts.background_compression ?:
- io_opts.compression];
+
+ if (!bch2_bkey_is_incompressible(k))
+ m->op.compression_type =
+ bch2_compression_opt_to_type[io_opts.background_compression ?:
+ io_opts.compression];
+ else
+ m->op.incompressible = true;
+
m->op.target = data_opts.target,
m->op.write_point = wp;
break;
}
case DATA_REWRITE: {
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
unsigned compressed_sectors = 0;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached &&
- p.crc.compression_type != BCH_COMPRESSION_TYPE_none &&
+ crc_is_compressed(p.crc) &&
bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
compressed_sectors += p.crc.compressed_size;
#include <linux/sched/cputime.h>
#include <trace/events/bcachefs.h>
-static inline bool rebalance_ptr_pred(struct bch_fs *c,
- struct extent_ptr_decoded p,
- struct bch_io_opts *io_opts)
+/*
+ * Check if an extent should be moved:
+ * returns -1 if it should not be moved, or the device of the pointer
+ * that should be moved, if known, or INT_MAX if it should be moved but
+ * the device is unknown (callers account that case to work_unknown_dev)
+ */
+static int __bch2_rebalance_pred(struct bch_fs *c,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts)
{
- if (io_opts->background_target &&
- !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target) &&
- !p.ptr.cached)
- return true;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
if (io_opts->background_compression &&
- p.crc.compression_type !=
- bch2_compression_opt_to_type[io_opts->background_compression])
- return true;
-
- return false;
+ !bch2_bkey_is_incompressible(k))
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached &&
+ p.crc.compression_type !=
+ bch2_compression_opt_to_type[io_opts->background_compression])
+ return p.ptr.dev;
+
+ if (io_opts->background_target)
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+ if (!p.ptr.cached &&
+ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
+ return p.ptr.dev;
+
+ return -1;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
+ atomic64_t *counter;
+ int dev;
- if (!io_opts->background_target &&
- !io_opts->background_compression)
+ dev = __bch2_rebalance_pred(c, k, io_opts);
+ if (dev < 0)
return;
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
- if (rebalance_ptr_pred(c, p, io_opts)) {
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ counter = dev < INT_MAX
+ ? &bch_dev_bkey_exists(c, dev)->rebalance_work
+ : &c->rebalance.work_unknown_dev;
- if (atomic64_add_return(p.crc.compressed_size,
- &ca->rebalance_work) ==
- p.crc.compressed_size)
- rebalance_wakeup(c);
- }
-}
-
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
-{
- if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
- sectors)
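+ /*
+ * add_return == amount added means the counter was previously zero,
+ * i.e. this is the first pending work, so wake the rebalance thread:
+ */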
+ if (atomic64_add_return(k.k->size, counter) == k.k->size)
rebalance_wakeup(c);
}
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- unsigned nr_replicas = 0;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- nr_replicas += !p.ptr.cached;
-
- if (rebalance_ptr_pred(c, p, io_opts))
- goto found;
+ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
+ data_opts->target = io_opts->background_target;
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
+ } else {
+ return DATA_SKIP;
}
+}
- if (nr_replicas < io_opts->data_replicas)
- goto found;
-
- return DATA_SKIP;
-found:
- data_opts->target = io_opts->background_target;
- data_opts->btree_insert_flags = 0;
- return DATA_ADD_REPLICAS;
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+{
+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
+ sectors)
+ rebalance_wakeup(c);
}
struct rebalance_work {
prev_cputime = curr_cputime();
while (!kthread_wait_freezable(r->enabled)) {
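+ /* make sure we give up the cpu periodically even when busy: */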
+ cond_resched();
+
start = jiffies;
cputime = curr_cputime();
/* iterate over keys read from the journal: */
-struct journal_iter bch2_journal_iter_init(struct journal_keys *keys,
- enum btree_id id)
-{
- return (struct journal_iter) {
- .keys = keys,
- .k = keys->d,
- .btree_id = id,
- };
-}
-
struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
- while (1) {
- if (iter->k == iter->keys->d + iter->keys->nr)
- return bkey_s_c_null;
-
+ while (iter->k) {
if (iter->k->btree_id == iter->btree_id)
return bkey_i_to_s_c(iter->k->k);
iter->k++;
+ if (iter->k == iter->keys->d + iter->keys->nr)
+ iter->k = NULL;
}
return bkey_s_c_null;
struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
{
- if (iter->k == iter->keys->d + iter->keys->nr)
+ if (!iter->k)
return bkey_s_c_null;
iter->k++;
+ if (iter->k == iter->keys->d + iter->keys->nr)
+ iter->k = NULL;
+
return bch2_journal_iter_peek(iter);
}
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
+{
+ switch (iter->last) {
+ case none:
+ break;
+ case btree:
+ bch2_btree_iter_next(iter->btree);
+ break;
+ case journal:
+ bch2_journal_iter_next(&iter->journal);
+ break;
+ }
+
+ iter->last = none;
+}
+
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+{
+ struct bkey_s_c ret;
+
+ while (1) {
+ struct bkey_s_c btree_k = bch2_btree_iter_peek(iter->btree);
+ struct bkey_s_c journal_k = bch2_journal_iter_peek(&iter->journal);
+
+ if (btree_k.k && journal_k.k) {
+ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
+
+ if (!cmp)
+ bch2_btree_iter_next(iter->btree);
+
+ iter->last = cmp < 0 ? btree : journal;
+ } else if (btree_k.k) {
+ iter->last = btree;
+ } else if (journal_k.k) {
+ iter->last = journal;
+ } else {
+ iter->last = none;
+ return bkey_s_c_null;
+ }
+
+ ret = iter->last == journal ? journal_k : btree_k;
+ if (!bkey_deleted(ret.k))
+ break;
+
+ bch2_btree_and_journal_iter_advance(iter);
+ }
+
+ return ret;
+}
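+
+/*
+ * Example of the merge rules above: with btree keys at positions 1 and 3
+ * and journal keys at positions 2 and 3, peek()/advance() returns the
+ * btree key at 1, the journal key at 2, then the journal key at 3 - on
+ * equal positions the journal key wins and the btree key is skipped, so
+ * a deleted journal key hides a live btree key at the same position.
+ */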
+
+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter)
+{
+ bch2_btree_and_journal_iter_advance(iter);
+
+ return bch2_btree_and_journal_iter_peek(iter);
+}
+
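+/*
+ * Binary search for the first journal key at or after (id, pos),
+ * ordered by (btree_id, key position); returns NULL if none exists:
+ */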
+struct journal_key *journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, struct bpos pos)
+{
+ size_t l = 0, r = journal_keys->nr, m;
+
+ while (l < r) {
+ m = l + ((r - l) >> 1);
+ if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
+ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ BUG_ON(l < journal_keys->nr &&
+ (cmp_int(id, journal_keys->d[l].btree_id) ?:
+ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
+
+ BUG_ON(l &&
+ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
+ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
+
+ return l < journal_keys->nr ? journal_keys->d + l : NULL;
+}
+
+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
+ struct btree_trans *trans,
+ struct journal_keys *journal_keys,
+ enum btree_id id, struct bpos pos)
+{
+ iter->journal.keys = journal_keys;
+ iter->journal.k = journal_key_search(journal_keys, id, pos);
+ iter->journal.btree_id = id;
+
+ iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
+}
+
/* sort and dedup all keys in the journal: */
static void journal_entries_free(struct list_head *list)
c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_siphash;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_ptr_v2;
write_sb = true;
}
enum btree_id btree_id;
};
-struct journal_iter bch2_journal_iter_init(struct journal_keys *,
- enum btree_id);
-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *);
-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *);
+struct btree_and_journal_iter {
+ enum btree_id btree_id;
+
+ struct btree_iter *btree;
+ struct journal_iter journal;
+
+ enum last_key_returned {
+ none,
+ btree,
+ journal,
+ } last;
+};
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
+struct journal_key *journal_key_search(struct journal_keys *,
+ enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
+ struct btree_trans *,
+ struct journal_keys *,
+ enum btree_id, struct bpos);
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
err:
- if (!IS_ERR(reflink_iter)) {
+ if (!IS_ERR(reflink_iter))
c->reflink_hint = reflink_iter->pos.offset;
- bch2_trans_iter_put(trans, reflink_iter);
- }
+ bch2_trans_iter_put(trans, reflink_iter);
return ret;
}
#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
+ .swab = bch2_ptr_swab, \
}
s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
switch (k.k->type) {
case KEY_TYPE_btree_ptr:
+ case KEY_TYPE_btree_ptr_v2:
e->data_type = BCH_DATA_BTREE;
extent_to_replicas(k, e);
break;
if (!ret)
ret = -ENOSPC;
out:
- if (!IS_ERR_OR_NULL(slot))
- bch2_trans_iter_put(trans, slot);
- if (!IS_ERR_OR_NULL(iter))
- bch2_trans_iter_put(trans, iter);
+ bch2_trans_iter_put(trans, slot);
+ bch2_trans_iter_put(trans, iter);
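+ /* bch2_trans_iter_put() is assumed here to tolerate NULL and
+ * error pointers, which is why the guards above could go away */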
return ret;
found:
u64 inode, const void *key)
{
struct btree_iter *iter;
+ int ret;
iter = bch2_hash_lookup(trans, desc, info, inode, key,
BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
- return bch2_hash_delete_at(trans, desc, info, iter);
+ ret = bch2_hash_delete_at(trans, desc, info, iter);
+ bch2_trans_iter_put(trans, iter);
+ return ret;
}
#endif /* _BCACHEFS_STR_HASH_H */
struct extent_ptr_decoded p;
extent_for_each_ptr_decode(e, p, entry) {
- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_none) {
+ if (!crc_is_compressed(p.crc)) {
nr_uncompressed_extents++;
uncompressed_sectors += e.k->size;
} else {