-fad6d13aa55f96e01cc6ff516cdfea53b2fc9eb1
+600598598b7c6d2069a374a14ad4925f39a30faa
x(data_usage, 6) \
x(clock, 7) \
x(dev_usage, 8) \
- x(log, 9)
+ x(log, 9) \
+ x(overwrite, 10)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
trans_for_each_update(trans, i) {
struct bkey_s_c old = { &i->old_k, i->old_v };
- pr_buf(buf, "update: btree %s %pS",
+ pr_buf(buf, "update: btree=%s cached=%u %pS",
bch2_btree_ids[i->btree_id],
+ i->cached,
(void *) i->ip_allocated);
pr_newline(buf);
struct bpos pos)
{
struct btree_insert_entry *i;
+ struct bkey_i *ret = NULL;
- trans_for_each_update(trans, i)
- if ((cmp_int(btree_id, i->btree_id) ?:
- bpos_cmp(pos, i->k->k.p)) <= 0) {
- if (btree_id == i->btree_id)
- return i->k;
+ trans_for_each_update(trans, i) {
+ if (i->btree_id < btree_id)
+ continue;
+ if (i->btree_id > btree_id)
break;
- }
+ if (bpos_cmp(i->k->k.p, pos) < 0)
+ continue;
+ if (i->key_cache_already_flushed)
+ continue;
+ if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0)
+ ret = i->k;
+ }
- return NULL;
+ return ret;
}
struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
return true;
}
+/*
+ * bch2_btree_key_cache_drop - invalidate the key cache entry under @path
+ *
+ * Treats path->l[0].b as the struct bkey_cached for this path and clears its
+ * ->valid flag. @trans is not used by this function.
+ *
+ * The entry is expected to be clean: BKEY_CACHED_DIRTY being set is a BUG().
+ * NOTE(review): the dirty check runs after ->valid has already been cleared.
+ */
+void bch2_btree_key_cache_drop(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct bkey_cached *ck = (void *) path->l[0].b;
+
+ ck->valid = false;
+
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
+}
+
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct btree_path *, struct bkey_i *);
int bch2_btree_key_cache_flush(struct btree_trans *,
enum btree_id, struct bpos);
+void bch2_btree_key_cache_drop(struct btree_trans *,
+ struct btree_path *);
void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
bool cached:1;
bool insert_trigger_run:1;
bool overwrite_trigger_run:1;
+ bool key_cache_already_flushed:1;
/*
* @old_k may be a key from the journal; @old_btree_u64s always refers
* to the size of the key being overwritten in the btree:
const struct btree_insert_entry *r)
{
return cmp_int(l->btree_id, r->btree_id) ?:
+ cmp_int(l->cached, r->cached) ?:
-cmp_int(l->level, r->level) ?:
bpos_cmp(l->k->k.p, r->k->k.p);
}
+/*
+ * Record the name of the function that started this transaction (trans->fn)
+ * in the journal: adds a BCH_JSET_ENTRY_log entry with a payload of
+ * JSET_ENTRY_LOG_U64s u64s to the transaction's journal reservation and
+ * copies the name into it.
+ */
static void journal_transaction_name(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res);
- struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
- unsigned u64s = JSET_ENTRY_LOG_U64s - 1;
- unsigned b, buflen = u64s * sizeof(u64);
-
- l->entry.u64s = cpu_to_le16(u64s);
- l->entry.btree_id = 0;
- l->entry.level = 0;
- l->entry.type = BCH_JSET_ENTRY_log;
- l->entry.pad[0] = 0;
- l->entry.pad[1] = 0;
- l->entry.pad[2] = 0;
- b = min_t(unsigned, strlen(trans->fn), buflen);
- memcpy(l->d, trans->fn, b);
- while (b < buflen)
- l->d[b++] = '\0';
-
- trans->journal_res.offset += JSET_ENTRY_LOG_U64s;
- trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s;
+ struct journal *j = &c->journal;
+ struct jset_entry *entry =
+ bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_log, 0, 0,
+ JSET_ENTRY_LOG_U64s);
+ struct jset_entry_log *l =
+ container_of(entry, struct jset_entry_log, entry);
+
+ /*
+ * strncpy() is used for its padding behaviour: a short name is
+ * zero-filled up to the end of the fixed-size payload (this replaces
+ * the explicit padding loop), and a long name is truncated without a
+ * terminating NUL - so l->d must only be read with an explicit length.
+ */
+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
}
static inline enum btree_insert_ret
return -EINTR;
}
-static inline void do_btree_insert_one(struct btree_trans *trans,
- struct btree_insert_entry *i)
-{
- struct bch_fs *c = trans->c;
- struct journal *j = &c->journal;
-
- EBUG_ON(trans->journal_res.ref !=
- !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
-
- i->k->k.needs_whiteout = false;
-
- if (!i->cached)
- btree_insert_key_leaf(trans, i);
- else
- bch2_btree_insert_key_cached(trans, i->path, i->k);
-
- if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
- bch2_journal_add_keys(j, &trans->journal_res,
- i->btree_id,
- i->level,
- i->k);
-
- if (trans->journal_seq)
- *trans->journal_seq = trans->journal_res.seq;
- }
-}
-
/* Triggers: */
static int run_one_mem_trigger(struct btree_trans *trans,
return ret;
}
- trans_for_each_update(trans, i)
- do_btree_insert_one(trans, i);
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ trans_for_each_update(trans, i) {
+ struct journal *j = &c->journal;
+ struct jset_entry *entry;
+
+ if (i->key_cache_already_flushed)
+ continue;
+
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_overwrite,
+ i->btree_id, i->level,
+ i->old_k.u64s);
+ bkey_reassemble(&entry->start[0],
+ (struct bkey_s_c) { &i->old_k, i->old_v });
+
+ entry = bch2_journal_add_entry(j, &trans->journal_res,
+ BCH_JSET_ENTRY_btree_keys,
+ i->btree_id, i->level,
+ i->k->k.u64s);
+ bkey_copy(&entry->start[0], i->k);
+ }
+
+ if (trans->journal_seq)
+ *trans->journal_seq = trans->journal_res.seq;
+ }
+
+ trans_for_each_update(trans, i) {
+ i->k->k.needs_whiteout = false;
+
+ if (!i->cached)
+ btree_insert_key_leaf(trans, i);
+ else if (!i->key_cache_already_flushed)
+ bch2_btree_insert_key_cached(trans, i->path, i->k);
+ else
+ bch2_btree_key_cache_drop(trans, i->path);
+ }
return ret;
}
trans->journal_preres_u64s = 0;
/* For journalling transaction name: */
- trans->journal_u64s += JSET_ENTRY_LOG_U64s;
+ trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
trans_for_each_update(trans, i) {
BUG_ON(!i->path->should_be_locked);
BUG_ON(!btree_node_intent_locked(i->path, i->level));
+ if (i->key_cache_already_flushed)
+ continue;
+
+ /* we're going to journal the key being updated: */
u64s = jset_u64s(i->k->k.u64s);
if (i->cached &&
likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
trans->journal_preres_u64s += u64s;
trans->journal_u64s += u64s;
+
+ /* and we're also going to log the overwrite: */
+ trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
if (trans->extra_journal_res) {
}
static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
- struct bkey_i *k, enum btree_update_flags flags)
+bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags,
+ unsigned long ip)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
+ int ret = 0;
BUG_ON(!path->should_be_locked);
.cached = path->cached,
.path = path,
.k = k,
- .ip_allocated = _RET_IP_,
+ .ip_allocated = ip,
};
#ifdef CONFIG_BCACHEFS_DEBUG
}
}
- __btree_path_get(n.path, true);
- return 0;
+ __btree_path_get(i->path, true);
+
+ /*
+ * If a key is present in the key cache, it must also exist in the
+ * btree - this is necessary for cache coherency. When iterating over
+ * a btree that's cached in the key cache, the btree iter code checks
+ * the key cache - but the key has to exist in the btree for that to
+ * work:
+ */
+ if (path->cached &&
+ bkey_deleted(&i->old_k)) {
+ struct btree_path *btree_path;
+
+ i->key_cache_already_flushed = true;
+ i->flags |= BTREE_TRIGGER_NORUN;
+
+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+ BTREE_ITER_INTENT, _THIS_IP_);
+
+ ret = bch2_btree_path_traverse(trans, btree_path, 0);
+ if (ret)
+ goto err;
+
+ btree_path->should_be_locked = true;
+ ret = bch2_trans_update_by_path_trace(trans, btree_path, k, flags, ip);
+err:
+ bch2_path_put(trans, btree_path, true);
+ }
+
+ return ret;
+}
+
+/*
+ * Wrapper around bch2_trans_update_by_path_trace() that supplies _RET_IP_ as
+ * the @ip argument, which the _trace variant stores in the update's
+ * ->ip_allocated field. All other arguments are passed through unchanged.
+ */
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+ struct bkey_i *k, enum btree_update_flags flags)
+{
+ return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_);
}
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
k->k.type = KEY_TYPE_whiteout;
}
+ /*
+ * Ensure that updates to cached btrees go to the key cache:
+ */
if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
!path->cached &&
!path->level &&
struct nonce nonce,
void *buf, size_t len)
{
- struct scatterlist sg;
-
- sg_init_table(&sg, 1);
- sg_set_page(&sg,
- is_vmalloc_addr(buf)
- ? vmalloc_to_page(buf)
- : virt_to_page(buf),
- len, offset_in_page(buf));
- return do_encrypt_sg(tfm, nonce, &sg, len);
+ if (!is_vmalloc_addr(buf)) {
+ /*
+ * Not a vmalloc address: the whole buffer is described by a
+ * single scatterlist entry, starting at buf's page.
+ */
+ struct scatterlist sg;
+
+ sg_init_table(&sg, 1);
+ sg_set_page(&sg, virt_to_page(buf),
+ len, offset_in_page(buf));
+ return do_encrypt_sg(tfm, nonce, &sg, len);
+ } else {
+ /*
+ * vmalloc address: each page has to be looked up individually
+ * with vmalloc_to_page(), so build one scatterlist entry per
+ * page the buffer touches.
+ */
+ unsigned pages = buf_pages(buf, len);
+ struct scatterlist *sg;
+ size_t orig_len = len;
+ unsigned i;
+ int ret;
+
+ /* kmalloc_array() takes (count, element size, flags) */
+ sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL);
+ if (!sg)
+ return -ENOMEM;
+
+ sg_init_table(sg, pages);
+
+ for (i = 0; i < pages; i++) {
+ /* only the first entry can start mid-page */
+ unsigned offset = offset_in_page(buf);
+ unsigned pg_len = min(len, PAGE_SIZE - offset);
+
+ sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
+ buf += pg_len;
+ len -= pg_len;
+ }
+
+ /* len was consumed by the loop; encrypt the original length */
+ ret = do_encrypt_sg(tfm, nonce, sg, orig_len);
+ kfree(sg);
+ return ret;
+ }
}
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
return vstruct_idx(j->buf[res->idx].data, res->offset);
}
-static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+/*
+ * Fill in the header of @entry without touching its payload: stores @u64s
+ * (little endian) and @id, and zeroes the padding bytes.
+ *
+ * Returns jset_u64s(@u64s).
+ *
+ * NOTE(review): no assignment of entry->type / entry->level from @type /
+ * @level is visible in this hunk - confirm they are still set here.
+ */
+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type,
enum btree_id id, unsigned level,
- const void *data, unsigned u64s)
+ unsigned u64s)
{
entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
entry->pad[0] = 0;
entry->pad[1] = 0;
entry->pad[2] = 0;
- memcpy_u64s_small(entry->_data, data, u64s);
-
return jset_u64s(u64s);
}
-static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
- unsigned type, enum btree_id id,
- unsigned level,
+/*
+ * Initialise the header of @entry via journal_entry_init() and then copy
+ * @u64s u64s from @data into the entry's payload (entry->_data).
+ *
+ * Returns the value journal_entry_init() returned.
+ */
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
+ enum btree_id id, unsigned level,
const void *data, unsigned u64s)
{
- unsigned actual = journal_entry_set(journal_res_entry(j, res),
- type, id, level, data, u64s);
+ unsigned ret = journal_entry_init(entry, type, id, level, u64s);
+
+ memcpy_u64s_small(entry->_data, data, u64s);
+ return ret;
+}
+
+/*
+ * Append a new entry to the journal reservation @res.
+ *
+ * The entry located by journal_res_entry() gets its header initialised with
+ * journal_entry_init(); the reservation is then advanced past it (res->offset
+ * grows and res->u64s shrinks by the value journal_entry_init() returned).
+ *
+ * No payload is copied here: the entry is returned so that the caller can
+ * fill it in.
+ *
+ * Debug builds assert that @res holds a ref and that the entry fits in what
+ * is left of the reservation.
+ */
+static inline struct jset_entry *
+bch2_journal_add_entry(struct journal *j, struct journal_res *res,
+ unsigned type, enum btree_id id,
+ unsigned level, unsigned u64s)
+{
+ struct jset_entry *entry = journal_res_entry(j, res);
+ unsigned actual = journal_entry_init(entry, type, id, level, u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
res->offset += actual;
res->u64s -= actual;
-}
-
-static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
- enum btree_id id, unsigned level,
- const struct bkey_i *k)
-{
- bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
- id, level, k, k->k.u64s);
+ return entry;
}
static inline bool journal_entry_empty(struct jset *j)
while (res->u64s)
bch2_journal_add_entry(j, res,
BCH_JSET_ENTRY_btree_keys,
- 0, 0, NULL, 0);
+ 0, 0, 0);
bch2_journal_buf_put(j, res->idx);
static int journal_validate_key(struct bch_fs *c, const char *where,
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
- struct bkey_i *k, const char *type,
+ struct bkey_i *k,
unsigned version, int big_endian, int write)
{
void *next = vstruct_next(entry);
int ret = 0;
if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in %s entry offset %zi/%u: k->u64s 0",
- type, where,
+ "invalid key in %s at %s offset %zi/%u: k->u64s 0",
+ bch2_jset_entry_types[entry->type], where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry), c,
- "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
- type, where,
+ "invalid key in %s at %s offset %zi/%u: extends past end of journal entry",
+ bch2_jset_entry_types[entry->type], where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
}
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in %s entry offset %zi/%u: bad format %u",
- type, where,
+ "invalid key in %s at %s offset %zi/%u: bad format %u",
+ bch2_jset_entry_types[entry->type], where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s),
k->k.format)) {
if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id), write, &buf)) {
printbuf_reset(&buf);
- pr_buf(&buf, "invalid %s in %s entry offset %zi/%u:",
- type, where,
+ pr_buf(&buf, "invalid key in %s at %s offset %zi/%u:",
+ bch2_jset_entry_types[entry->type], where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s));
pr_newline(&buf);
int ret = journal_validate_key(c, where, entry,
entry->level,
entry->btree_id,
- k, "key", version, big_endian, write);
+ k, version, big_endian, write);
if (ret == FSCK_DELETED_KEY)
continue;
}
return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
- "btree root", version, big_endian, write);
+ version, big_endian, write);
fsck_err:
return ret;
}
pr_buf(out, "%.*s", bytes, l->d);
}
+/*
+ * Validate a BCH_JSET_ENTRY_overwrite entry. All arguments are forwarded
+ * unchanged to journal_entry_btree_keys_validate(), whose result is returned.
+ */
+static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ return journal_entry_btree_keys_validate(c, where, entry, version, big_endian, write);
+}
+
+/*
+ * Print a BCH_JSET_ENTRY_overwrite entry to @out by delegating to
+ * journal_entry_btree_keys_to_text() with the same arguments.
+ */
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, const char *,
struct jset_entry *, unsigned, int, int);
switch (opt->type) {
case BCH_OPT_BOOL:
ret = kstrtou64(val, 10, res);
- if (ret < 0)
- return ret;
+ /* a bool option must parse as a number and be exactly 0 or 1 */
+ if (ret < 0 || (*res != 0 && *res != 1)) {
+ /* err is optional, as in the other option types */
+ if (err)
+ pr_buf(err, "%s: must be bool",
+ opt->attr.name);
+ /*
+ * kstrtou64() succeeds for values other than 0/1, leaving
+ * ret == 0; report an error rather than success.
+ */
+ return ret < 0 ? ret : -EINVAL;
+ }
break;
case BCH_OPT_UINT:
ret = opt->flags & OPT_HUMAN_READABLE
? bch2_strtou64_h(val, res)
: kstrtou64(val, 10, res);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ pr_buf(err, "%s: must be a number",
+ opt->attr.name);
return ret;
+ }
break;
case BCH_OPT_STR:
ret = match_string(opt->choices, -1, val);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ pr_buf(err, "%s: invalid selection",
+ opt->attr.name);
return ret;
+ }
*res = ret;
break;
return 0;
ret = opt->parse(c, val, res);
- if (ret < 0)
+ if (ret < 0) {
+ if (err)
+ pr_buf(err, "%s: parse error",
+ opt->attr.name);
return ret;
+ }
}
return bch2_opt_validate(opt, *res, err);