From 3bc3dc1e26429cc20719b24ec4cc5201b91dca5c Mon Sep 17 00:00:00 2001 From: Jonathan Carter Date: Wed, 27 Dec 2023 19:51:21 +0200 Subject: [PATCH] Upload to experimental --- Makefile | 3 +- debian/changelog | 11 + debian/patches/do-not-install-mount-symlink | 16 + debian/patches/series | 1 + debian/rules | 3 +- libbcachefs/bbpos_types.h | 18 + libbcachefs/btree_journal_iter.c | 531 ++++++ libbcachefs/btree_journal_iter.h | 57 + libbcachefs/btree_key_cache_types.h | 34 + libbcachefs/btree_trans_commit.c | 1172 +++++++++++++ libbcachefs/btree_update.c | 910 ++++++++++ libbcachefs/darray.c | 21 + libbcachefs/disk_groups_types.h | 18 + libbcachefs/fs-io-buffered.c | 1106 ++++++++++++ libbcachefs/fs-io-buffered.h | 27 + libbcachefs/fs-io-direct.c | 680 ++++++++ libbcachefs/fs-io-direct.h | 16 + libbcachefs/fs-io-pagecache.c | 791 +++++++++ libbcachefs/fs-io-pagecache.h | 176 ++ libbcachefs/io_misc.c | 524 ++++++ libbcachefs/io_misc.h | 34 + libbcachefs/io_read.c | 1210 +++++++++++++ libbcachefs/io_read.h | 158 ++ libbcachefs/io_write.c | 1675 ++++++++++++++++++ libbcachefs/io_write.h | 110 ++ libbcachefs/io_write_types.h | 96 ++ libbcachefs/logged_ops.c | 112 ++ libbcachefs/logged_ops.h | 20 + libbcachefs/sb-clean.c | 398 +++++ libbcachefs/sb-clean.h | 16 + libbcachefs/sb-errors.c | 172 ++ libbcachefs/sb-errors.h | 270 +++ libbcachefs/sb-errors_types.h | 16 + libbcachefs/sb-members.c | 420 +++++ libbcachefs/sb-members.h | 227 +++ libbcachefs/six.c | 867 ++++++++++ libbcachefs/six.h | 386 +++++ libbcachefs/snapshot.c | 1713 +++++++++++++++++++ libbcachefs/snapshot.h | 268 +++ make-release-tarball.sh | 60 + rust-src/src/cmd_completions.rs | 24 + 41 files changed, 14365 insertions(+), 2 deletions(-) create mode 100644 debian/patches/do-not-install-mount-symlink create mode 100644 debian/patches/series create mode 100644 libbcachefs/bbpos_types.h create mode 100644 libbcachefs/btree_journal_iter.c create mode 100644 libbcachefs/btree_journal_iter.h create mode 100644 libbcachefs/btree_key_cache_types.h create mode 100644 libbcachefs/btree_trans_commit.c create mode 100644 libbcachefs/btree_update.c create mode 100644 libbcachefs/darray.c create mode 100644 libbcachefs/disk_groups_types.h create mode 100644 libbcachefs/fs-io-buffered.c create mode 100644 libbcachefs/fs-io-buffered.h create mode 100644 libbcachefs/fs-io-direct.c create mode 100644 libbcachefs/fs-io-direct.h create mode 100644 libbcachefs/fs-io-pagecache.c create mode 100644 libbcachefs/fs-io-pagecache.h create mode 100644 libbcachefs/io_misc.c create mode 100644 libbcachefs/io_misc.h create mode 100644 libbcachefs/io_read.c create mode 100644 libbcachefs/io_read.h create mode 100644 libbcachefs/io_write.c create mode 100644 libbcachefs/io_write.h create mode 100644 libbcachefs/io_write_types.h create mode 100644 libbcachefs/logged_ops.c create mode 100644 libbcachefs/logged_ops.h create mode 100644 libbcachefs/sb-clean.c create mode 100644 libbcachefs/sb-clean.h create mode 100644 libbcachefs/sb-errors.c create mode 100644 libbcachefs/sb-errors.h create mode 100644 libbcachefs/sb-errors_types.h create mode 100644 libbcachefs/sb-members.c create mode 100644 libbcachefs/sb-members.h create mode 100644 libbcachefs/six.c create mode 100644 libbcachefs/six.h create mode 100644 libbcachefs/snapshot.c create mode 100644 libbcachefs/snapshot.h create mode 100755 make-release-tarball.sh create mode 100644 rust-src/src/cmd_completions.rs diff --git a/Makefile b/Makefile index 61a6245..4e4de54 100644 --- a/Makefile +++ b/Makefile @@ -159,7 +159,8 @@ install: bcachefs $(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_HOOK) $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.bcachefs $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.bcachefs - $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs + # See: #1057295 + # $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.fuse.bcachefs $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.fuse.bcachefs $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.fuse.bcachefs diff --git a/debian/changelog b/debian/changelog index f1e4e5e..e25d809 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,14 @@ +bcachefs-tools (24+really1.3.4-2~exp1) experimental; urgency=medium + + [ Chris Hofstaedtler ] + * Non-maintainer upload. + * Install files into /usr instead of /. (Closes: #1059373) + + [ Jonathan Carter ] + * Do not install bcachefs.mount symlink (Closes: #1057295) + + -- Jonathan Carter Wed, 27 Dec 2023 19:22:06 +0200 + bcachefs-tools (24+really1.3.4-1) unstable; urgency=medium * New upstream release diff --git a/debian/patches/do-not-install-mount-symlink b/debian/patches/do-not-install-mount-symlink new file mode 100644 index 0000000..cbbbf62 --- /dev/null +++ b/debian/patches/do-not-install-mount-symlink @@ -0,0 +1,16 @@ +Description: do not install mount.bcachefs symlink +Bug-Debian: https://bugs.debian.org/1059373 +Last-Update: 2023-12-27 + +--- bcachefs-tools-24+really1.3.4.orig/Makefile ++++ bcachefs-tools-24+really1.3.4/Makefile +@@ -159,7 +159,8 @@ install: bcachefs + $(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_HOOK) + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.bcachefs +- $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs ++ # See: #1057295 ++ # $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.fuse.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.fuse.bcachefs + $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.fuse.bcachefs diff --git a/debian/patches/series b/debian/patches/series new file mode 100644 index 0000000..adfba3c --- /dev/null +++ b/debian/patches/series @@ -0,0 +1 @@ +do-not-install-mount-symlink diff --git a/debian/rules b/debian/rules index a202318..fd52350 100755 --- a/debian/rules +++ b/debian/rules @@ -5,6 +5,7 @@ export DEB_BUILD_MAINT_OPTIONS=hardening=+all export NO_RUST=-true PREFIX := /usr +ROOT_SBINDIR := /usr/sbin DEB_BUILD_ARCH ?= $(shell dpkg-architecture -qDEB_BUILD_ARCH) @@ -17,6 +18,6 @@ endif dh $@ override_dh_auto_install: - dh_auto_install -- "PREFIX=$(PREFIX)" + dh_auto_install -- "PREFIX=$(PREFIX)" "ROOT_SBINDIR=$(ROOT_SBINDIR)" override_dh_auto_test: diff --git a/libbcachefs/bbpos_types.h b/libbcachefs/bbpos_types.h new file mode 100644 index 0000000..5198e94 --- /dev/null +++ b/libbcachefs/bbpos_types.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BBPOS_TYPES_H +#define _BCACHEFS_BBPOS_TYPES_H + +struct bbpos { + enum btree_id btree; + struct bpos pos; +}; + +static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) +{ + return (struct bbpos) { btree, pos }; +} + +#define BBPOS_MIN BBPOS(0, POS_MIN) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) + +#endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c new file mode 100644 index 0000000..58a981b --- /dev/null +++ b/libbcachefs/btree_journal_iter.c @@ -0,0 +1,531 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bset.h" +#include "btree_journal_iter.h" +#include "journal_io.h" + +#include + +/* + * For managing keys we read from the journal: until journal replay works normal + * btree lookups need to be able to find and return keys from the journal where + * they overwrite what's in the btree, so we have a special iterator and + * operations for the regular btree iter code to use: + */ + +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + +static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) +{ + size_t gap_size = keys->size - keys->nr; + + if (idx >= keys->gap) + idx += gap_size; + return idx; +} + +static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) +{ + return keys->d + idx_to_pos(keys, idx); +} + +static size_t __bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + size_t l = 0, r = keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); + if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < keys->nr && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); + + BUG_ON(l && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + + return l; +} + +static size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + return NULL; + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && + !k->overwritten) + return k->k; + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + size_t idx = 0; + + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); +} + +static void journal_iters_fix(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + /* The key we just inserted is immediately before the gap: */ + size_t gap_end = keys->gap + (keys->size - keys->nr); + struct btree_and_journal_iter *iter; + + /* + * If an iterator points one after the key we just inserted, decrement + * the iterator so it points at the key we just inserted - if the + * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will + * handle that: + */ + list_for_each_entry(iter, &c->journal_iters, journal.list) + if (iter->journal.idx == gap_end) + iter->journal.idx = keys->gap - 1; +} + +static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + size_t gap_size = keys->size - keys->nr; + + list_for_each_entry(iter, &c->journal_iters, list) { + if (iter->idx > old_gap) + iter->idx -= gap_size; + if (iter->idx >= new_gap) + iter->idx += gap_size; + } +} + +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + + if (idx < keys->size && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (idx > keys->gap) + idx -= keys->size - keys->nr; + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = max_t(size_t, keys->size, 8) * 2, + }; + + new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); + return -BCH_ERR_ENOMEM_journal_key_insert; + } + + /* Since @keys was full, there was no gap: */ + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; + + /* And now the gap is at the end: */ + keys->gap = keys->nr; + } + + journal_iters_move_gap(c, keys->gap, idx); + + move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); + keys->gap = idx; + + keys->nr++; + keys->d[keys->gap++] = n; + + journal_iters_fix(c); + + return 0; +} + +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct bkey_i *n; + int ret; + + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_journal_key_insert; + + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, n); + if (ret) + kfree(n); + return ret; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i whiteout; + + bkey_init(&whiteout.k); + whiteout.k.p = pos; + + return bch2_journal_key_insert(c, id, level, &whiteout); +} + +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->size && + keys->d[idx].btree_id == btree && + keys->d[idx].level == level && + bpos_eq(keys->d[idx].k->k.p, pos)) + keys->d[idx].overwritten = true; +} + +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->idx < iter->keys->size) { + iter->idx++; + if (iter->idx == iter->keys->gap) + iter->idx += iter->keys->size - iter->keys->nr; + } +} + +static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +{ + struct journal_key *k = iter->keys->d + iter->idx; + + while (k < iter->keys->d + iter->keys->size && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return bkey_i_to_s_c(k->k); + + bch2_journal_iter_advance(iter); + k = iter->keys->d + iter->idx; + } + + return bkey_s_c_null; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); +} + +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, + enum btree_id id, unsigned level, + struct bpos pos) +{ + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +{ + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); +} + +static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) +{ + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); +} + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +{ + if (bpos_eq(iter->pos, SPOS_MAX)) + iter->at_end = true; + else + iter->pos = bpos_successor(iter->pos); +} + +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +{ + struct bkey_s_c btree_k, journal_k, ret; +again: + if (iter->at_end) + return bkey_s_c_null; + + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_lt(btree_k.k->p, iter->pos)) + bch2_journal_iter_advance_btree(iter); + + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); + + ret = journal_k.k && + (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) + ? journal_k + : btree_k; + + if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) + ret = bkey_s_c_null; + + if (ret.k) { + iter->pos = ret.k->p; + if (bkey_deleted(ret.k)) { + bch2_btree_and_journal_iter_advance(iter); + goto again; + } + } else { + iter->pos = SPOS_MAX; + iter->at_end = true; + } + + return ret; +} + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) +{ + bch2_journal_iter_exit(&iter->journal); +} + +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->b = b; + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); + iter->pos = b->data->min_key; + iter->at_end = false; +} + +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) +{ + struct btree_node_iter node_iter; + + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); +} + +/* sort and dedup all keys in the journal: */ + +void bch2_journal_entries_free(struct bch_fs *c) +{ + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + if (*i) + kvpfree(*i, offsetof(struct journal_replay, j) + + vstruct_bytes(&(*i)->j)); + genradix_free(&c->journal_entries); +} + +/* + * When keys compare equal, oldest compares first: + */ +static int journal_sort_key_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +void bch2_journal_keys_free(struct journal_keys *keys) +{ + struct journal_key *i; + + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + + kvfree(keys->d); + keys->d = NULL; + keys->nr = keys->gap = keys->size = 0; +} + +static void __journal_keys_sort(struct journal_keys *keys) +{ + struct journal_key *src, *dst; + + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + bpos_eq(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; + } + + keys->nr = dst - keys->d; +} + +int bch2_journal_keys_sort(struct bch_fs *c) +{ + struct genradix_iter iter; + struct journal_replay *i, **_i; + struct jset_entry *entry; + struct bkey_i *k; + struct journal_keys *keys = &c->journal_keys; + size_t nr_keys = 0, nr_read = 0; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + for_each_jset_key(k, entry, &i->j) + nr_keys++; + } + + if (!nr_keys) + return 0; + + keys->size = roundup_pow_of_two(nr_keys); + + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + if (!keys->d) { + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", + nr_keys); + + do { + keys->size >>= 1; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + } while (!keys->d && keys->size > nr_keys / 8); + + if (!keys->d) { + bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", + keys->size); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + cond_resched(); + + for_each_jset_key(k, entry, &i->j) { + if (keys->nr == keys->size) { + __journal_keys_sort(keys); + + if (keys->nr > keys->size * 7 / 8) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", + keys->nr, keys->size, nr_read, nr_keys); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + keys->d[keys->nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + nr_read++; + } + } + + __journal_keys_sort(keys); + keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + return 0; +} diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h new file mode 100644 index 0000000..5d64e7e --- /dev/null +++ b/libbcachefs/btree_journal_iter.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_H + +struct journal_iter { + struct list_head list; + enum btree_id btree_id; + unsigned level; + size_t idx; + struct journal_keys *keys; +}; + +/* + * Iterate over keys in the btree, with keys from the journal overlaid on top: + */ + +struct btree_and_journal_iter { + struct btree *b; + struct btree_node_iter node_iter; + struct bkey unpacked; + + struct journal_iter journal; + struct bpos pos; + bool at_end; +}; + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct bch_fs *); + +int bch2_journal_keys_sort(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/libbcachefs/btree_key_cache_types.h b/libbcachefs/btree_key_cache_types.h new file mode 100644 index 0000000..cfd09f5 --- /dev/null +++ b/libbcachefs/btree_key_cache_types.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H +#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H + +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + + struct list_head freed_pcpu; + size_t nr_freed_pcpu; + struct list_head freed_nonpcpu; + size_t nr_freed_nonpcpu; + + struct shrinker shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed __aligned(4); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */ diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c new file mode 100644 index 0000000..70077ef --- /dev/null +++ b/libbcachefs/btree_trans_commit.c @@ -0,0 +1,1172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_gc.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_journal_iter.h" +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "snapshot.h" + +#include + +static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); + + if (j_k) + k = bkey_i_to_s_c(j_k); + } + + u = *k.k; + u.needs_whiteout = i->old_k.needs_whiteout; + + BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); + BUG_ON(i->old_v != k.v); +#endif +} + +static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +{ + return i->path->l + i->level; +} + +static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i != trans->updates && + insert_l(&i[0])->b == insert_l(&i[-1])->b; +} + +static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i + 1 < trans->updates + trans->nr_updates && + insert_l(&i[0])->b == insert_l(&i[1])->b; +} + +inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; + + if (unlikely(btree_node_just_written(b)) && + bch2_btree_post_write_cleanup(c, b)) + bch2_trans_node_reinit_iter(trans, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(trans, b); +} + +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int bch2_trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + EBUG_ON(trans->write_locked); + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + trans->write_locked = true; + return 0; +} + +static inline void bch2_trans_unlock_write(struct btree_trans *trans) +{ + if (likely(trans->write_locked)) { + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + trans->write_locked = false; + } +} + +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ +bool bch2_btree_bset_insert_key(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + struct bkey_packed *k; + unsigned clobber_u64s = 0, new_u64s = 0; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); + EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ + EBUG_ON(k && bkey_deleted(k)); + + /* Deleting, but not found? nothing to do: */ + if (bkey_deleted(&insert->k) && !k) + return false; + + if (bkey_deleted(&insert->k)) { + /* Deleting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + if (k->needs_whiteout) + push_whiteout(trans->c, b, insert->k.p); + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + bch2_bset_delete(b, k, clobber_u64s); + goto fix_iter; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + + return true; + } + + if (k) { + /* Overwriting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + goto overwrite; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + } + + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); +overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + new_u64s = k->u64s; +fix_iter: + if (clobber_u64s != new_u64s) + bch2_btree_node_iter_fix(trans, path, b, node_iter, k, + clobber_u64s, new_u64s); + return true; +} + +static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + struct btree_trans *trans = bch2_trans_get(c); + unsigned long old, new, v; + unsigned idx = w - b->writes; + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + v = READ_ONCE(b->flags); + + do { + old = new = v; + + if (!(old & (1 << BTREE_NODE_dirty)) || + !!(old & (1 << BTREE_NODE_write_idx)) != idx || + w->journal.seq != seq) + break; + + new &= ~BTREE_WRITE_TYPE_MASK; + new |= BTREE_WRITE_journal_reclaim; + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + + bch2_trans_put(trans); + return 0; +} + +int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 0, seq); +} + +int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 1, seq); +} + +inline void bch2_btree_add_journal_pin(struct bch_fs *c, + struct btree *b, u64 seq) +{ + struct btree_write *w = btree_current_write(b); + + bch2_journal_pin_add(&c->journal, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? bch2_btree_node_flush0 + : bch2_btree_node_flush1); +} + +/** + * bch2_btree_insert_key_leaf() - insert a key one key into a leaf node + * @trans: btree transaction object + * @path: path pointing to @insert's pos + * @insert: key to insert + * @journal_seq: sequence number of journal reservation + */ +inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, + struct btree_path *path, + struct bkey_i *insert, + u64 journal_seq) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; + struct bset_tree *t = bset_tree_last(b); + struct bset *i = bset(b, t); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, + &path_l(path)->iter, insert))) + return; + + i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + + if (unlikely(!btree_node_dirty(b))) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + set_btree_node_dirty_acct(c, b); + } + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); +} + +/* Cached btree updates: */ + +/* Normal update interface: */ + +static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + btree_type_has_snapshots(i->btree_id) && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); +} + +static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) +{ + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, + trans->journal_u64s, flags); +} + +#define JSET_ENTRY_LOG_U64s 4 + +static noinline void journal_transaction_name(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct jset_entry *entry = + bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_log, 0, 0, + JSET_ENTRY_LOG_U64s); + struct jset_entry_log *l = + container_of(entry, struct jset_entry_log, entry); + + strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); +} + +static inline int btree_key_can_insert(struct btree_trans *trans, + struct btree *b, unsigned u64s) +{ + struct bch_fs *c = trans->c; + + if (!bch2_btree_node_insert_fits(c, b, u64s)) + return -BCH_ERR_btree_insert_btree_node_full; + + return 0; +} + +noinline static int +btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned new_u64s) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_i *new_k; + int ret; + + bch2_trans_unlock_write(trans); + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(path->btree_id), new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + ret = bch2_trans_relock(trans) ?: + bch2_trans_lock_write(trans); + if (unlikely(ret)) { + kfree(new_k); + return ret; + } + + memcpy(new_k, ck->k, ck->u64s * sizeof(u64)); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + +static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned u64s) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + struct btree_insert_entry *i; + unsigned new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(c) && + !(flags & BCH_TRANS_COMMIT_journal_reclaim)) + return -BCH_ERR_btree_insert_need_journal_reclaim; + + /* + * bch2_varint_decode can read past the end of the buffer by at most 7 + * bytes (it won't be used): + */ + u64s += 1; + + if (u64s <= ck->u64s) + return 0; + + new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT); + if (unlikely(!new_k)) + return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s); + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + +/* Triggers: */ + +static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) +{ + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + int ret; + + verify_update_old_key(trans, i); + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + + if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id))) + return 0; + + if (old_ops->atomic_trigger == new_ops->atomic_trigger) { + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + + _deleted.p = i->path->pos; + + ret = bch2_mark_key(trans, i->btree_id, i->level, + deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, i->btree_id, i->level, + old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +} + +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) +{ + /* + * Transactional triggers create new btree_insert_entries, so we can't + * pass them a pointer to a btree_insert_entry, that memory is going to + * move: + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && + old_ops->trans_trigger == new_ops->trans_trigger) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, + BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE| + i->flags) ?: 1; + } else if (overwrite && !i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + } else if (!overwrite && !i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + } else { + return 0; + } +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 1; overwrite >= 0; --overwrite) { + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->btree_id != btree_id) + continue; + + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + } + + return 0; +} + +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; + + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + if (btree_id == BTREE_ID_alloc) + continue; + + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; + } + + trans_for_each_update(trans, i) { + if (i->btree_id > BTREE_ID_alloc) + break; + if (i->btree_id == BTREE_ID_alloc) { + ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + if (ret) + return ret; + break; + } + } + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); +#endif + return 0; +} + +static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + /* + * XXX: synchronization of cached update triggers with gc + * XXX: synchronization of interior node updates with gc + */ + BUG_ON(i->cached || i->level); + + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } + } + + return ret; +} + +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + struct btree_trans_commit_hook *h; + unsigned u64s = 0; + int ret; + + if (race_fault()) { + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); + return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + } + + /* + * Check if the insert will fit in the leaf node with the write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ + + prefetch(&trans->c->journal.flags); + + trans_for_each_update(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + u64s += i->k->k.u64s; + ret = !i->cached + ? btree_key_can_insert(trans, insert_l(i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, i->path, u64s); + if (ret) { + *stopped_at = i; + return ret; + } + + i->k->k.needs_whiteout = false; + } + + if (trans->nr_wb_updates && + trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) + return -BCH_ERR_btree_insert_need_flush_buffer; + + /* + * Don't get journal reservation until after we know insert will + * succeed: + */ + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { + ret = bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_NONBLOCK); + if (ret) + return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); + } + + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + !(flags & BCH_TRANS_COMMIT_no_journal_res)) { + if (bch2_journal_seq_verify) + trans_for_each_update(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (bch2_inject_invalid_keys) + trans_for_each_update(trans, i) + i->k->k.version = MAX_VERSION; + } + + if (trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + if (trans->nr_wb_updates) { + EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res); + + ret = bch2_btree_insert_keys_write_buffer(trans); + if (ret) + goto revert_fs_usage; + } + + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + goto revert_fs_usage; + h = h->next; + } + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, i->flags); + if (ret) + goto fatal_err; + } + + if (unlikely(c->gc_pos.phase)) { + ret = bch2_trans_commit_run_gc_triggers(trans); + if (ret) + goto fatal_err; + } + + if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); + + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { + struct journal *j = &c->journal; + struct jset_entry *entry; + + trans_for_each_update(trans, i) { + if (i->key_cache_already_flushed) + continue; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + verify_update_old_key(trans, i); + + if (trans->journal_transaction_names) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble((struct bkey_i *) entry->start, + (struct bkey_s_c) { &i->old_k, i->old_v }); + } + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); + bkey_copy((struct bkey_i *) entry->start, i->k); + } + + trans_for_each_wb_update(trans, wb) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + wb->btree, 0, + wb->k.k.u64s); + bkey_copy((struct bkey_i *) entry->start, &wb->k); + } + + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; + } + + trans_for_each_update(trans, i) { + if (!i->cached) { + bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq); + } else if (!i->key_cache_already_flushed) + bch2_btree_insert_key_cached(trans, flags, i); + else { + bch2_btree_key_cache_drop(trans, i->path); + btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + } + } + + return 0; +fatal_err: + bch2_fatal_error(c); +revert_fs_usage: + if (trans->fs_usage_deltas) + bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + return ret; +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + + trans_for_each_update(trans, i) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + + trans_for_each_wb_update(trans, wb) + bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); +} + +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, + enum bkey_invalid_flags flags, + struct btree_insert_entry *i, + struct printbuf *err) +{ + struct bch_fs *c = trans->c; + + printbuf_reset(err); + prt_printf(err, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + prt_newline(err); + printbuf_indent_add(err, 2); + + bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); + prt_newline(err); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err); + bch2_print_string_as_lines(KERN_ERR, err->buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + + return -EINVAL; +} + +static int bch2_trans_commit_journal_pin_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + return 0; +} + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0, u64s_delta = 0; + + trans_for_each_update(trans, i) { + if (i->cached) + continue; + + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= i->old_btree_u64s; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->path, + i->level, flags); + if (unlikely(ret)) + return ret; + } + + u64s_delta = 0; + } + } + + ret = bch2_trans_lock_write(trans); + if (unlikely(ret)) + return ret; + + ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); + + if (!ret && unlikely(trans->journal_replay_not_finished)) + bch2_drop_overwrites_from_journal(trans); + + bch2_trans_unlock_write(trans); + + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + trans->journal_pin, + bch2_trans_commit_journal_pin_flush); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + bch2_journal_res_put(&c->journal, &trans->journal_res); + + return ret; +} + +static int journal_reclaim_wait_done(struct bch_fs *c) +{ + int ret = bch2_journal_error(&c->journal) ?: + !bch2_btree_key_cache_must_wait(c); + + if (!ret) + journal_reclaim_kick(&c->journal); + return ret; +} + +static noinline +int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry *i, + int ret, unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + break; + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + break; + case -BCH_ERR_journal_res_get_blocked: + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + + ret = drop_locks_do(trans, + bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_CHECK)); + break; + case -BCH_ERR_btree_insert_need_journal_reclaim: + bch2_trans_unlock(trans); + + trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) + break; + + ret = bch2_trans_relock(trans); + break; + case -BCH_ERR_btree_insert_need_flush_buffer: { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + ret = 0; + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_unlock(trans); + mutex_lock(&wb->flush_lock); + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_begin(trans); + ret = __bch2_btree_write_buffer_flush(trans, + flags|BCH_TRANS_COMMIT_no_check_rw, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } else { + mutex_unlock(&wb->flush_lock); + ret = bch2_trans_relock(trans); + } + } + break; + } + default: + BUG_ON(ret >= 0); + break; + } + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && + (flags & BCH_TRANS_COMMIT_no_enospc), c, + "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); + + return ret; +} + +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + int ret; + + if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || + test_bit(BCH_FS_STARTED, &c->flags)) + return -BCH_ERR_erofs_trans_commit; + + ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); + if (ret) + return ret; + + bch2_write_ref_get(c, BCH_WRITE_REF_trans); + return 0; +} + +/* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to + * do. + */ +static noinline int +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + if (ret) + break; + } + + return ret; +} + +int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i = NULL; + struct btree_write_buffered_key *wb; + int ret = 0; + + if (!trans->nr_updates && + !trans->nr_wb_updates && + !trans->extra_journal_entries.nr) + goto out_reset; + + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out_reset; + + trans_for_each_update(trans, i) { + struct printbuf buf = PRINTBUF; + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags, &buf))) + ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf); + btree_insert_entry_checks(trans, i); + printbuf_exit(&buf); + + if (ret) + return ret; + } + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + ret = do_bch2_trans_commit_to_journal_replay(trans); + goto out_reset; + } + + if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + ret = bch2_trans_commit_get_rw_cold(trans, flags); + if (ret) + goto out_reset; + } + + if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && + mutex_trylock(&c->btree_write_buffer.flush_lock)) { + bch2_trans_begin(trans); + bch2_trans_unlock(trans); + + ret = __bch2_btree_write_buffer_flush(trans, + flags|BCH_TRANS_COMMIT_no_check_rw, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + goto out; + } + + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + + trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + + trans_for_each_update(trans, i) { + EBUG_ON(!i->path->should_be_locked); + + ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + if (unlikely(ret)) + goto out; + + EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + + if (i->key_cache_already_flushed) + continue; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + /* we're going to journal the key being updated: */ + trans->journal_u64s += jset_u64s(i->k->k.u64s); + + /* and we're also going to log the overwrite: */ + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(i->old_k.u64s); + } + + trans_for_each_wb_update(trans, wb) + trans->journal_u64s += jset_u64s(wb->k.k.u64s); + + if (trans->extra_journal_res) { + ret = bch2_disk_reservation_add(c, trans->disk_res, + trans->extra_journal_res, + (flags & BCH_TRANS_COMMIT_no_enospc) + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto err; + } +retry: + bch2_trans_verify_not_in_restart(trans); + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); + + /* make sure we didn't drop or screw up locks: */ + bch2_trans_verify_locks(trans); + + if (ret) + goto err; + + trace_and_count(c, transaction_commit, trans, _RET_IP_); +out: + if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) + bch2_write_ref_put(c, BCH_WRITE_REF_trans); +out_reset: + if (!ret) + bch2_trans_downgrade(trans); + bch2_trans_reset_updates(trans); + + return ret; +err: + ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); + if (ret) + goto out; + + /* + * We might have done another transaction commit in the error path - + * i.e. btree write buffer flush - which will have made use of + * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is + * how the journal sequence number to pin is passed in - so we must + * restart: + */ + if (flags & BCH_TRANS_COMMIT_no_journal_res) { + ret = -BCH_ERR_transaction_restart_nested; + goto out; + } + + goto retry; +} diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c new file mode 100644 index 0000000..1837f84 --- /dev/null +++ b/libbcachefs/btree_update.c @@ -0,0 +1,910 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_iter.h" +#include "btree_journal_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "debug.h" +#include "errcode.h" +#include "error.h" +#include "extents.h" +#include "keylist.h" +#include "snapshot.h" +#include "trace.h" + +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->cached, r->cached) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->k->k.p, r->k->k.p); +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags, + unsigned long ip); + +static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bkey_i **insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *update; + int ret; + + update = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) + return 0; + + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: + bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); + if (ret < 0) + return ret; + if (ret) + return 0; + + ret = bch2_btree_delete_at(trans, iter, flags); + if (ret) + return ret; + + *insert = update; + return 0; +} + +static noinline int extent_back_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret; + + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: + bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); + if (ret < 0) + return ret; + if (ret) + return 0; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + return 0; +} + +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter old_iter, new_iter = { NULL }; + struct bkey_s_c old_k, new_k; + snapshot_id_list s; + struct bkey_i *update; + int ret = 0; + + if (!bch2_snapshot_has_children(c, old_pos.snapshot)) + return 0; + + darray_init(&s); + + bch2_trans_iter_init(trans, &old_iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + !(ret = bkey_err(old_k)) && + bkey_eq(old_pos, old_k.k->p)) { + struct bpos whiteout_pos = + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + + if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || + snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) + continue; + + new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bkey_err(new_k); + if (ret) + break; + + if (new_k.k->type == KEY_TYPE_deleted) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = whiteout_pos; + update->k.type = KEY_TYPE_whiteout; + + ret = bch2_trans_update(trans, &new_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } + bch2_trans_iter_exit(trans, &new_iter); + + ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &new_iter); + bch2_trans_iter_exit(trans, &old_iter); + darray_exit(&s); + + return ret; +} + +int bch2_trans_update_extent_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_update_flags flags, + struct bkey_s_c old, + struct bkey_s_c new) +{ + enum btree_id btree_id = iter->btree_id; + struct bkey_i *update; + struct bpos new_start = bkey_start_pos(new.k); + bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); + bool back_split = bkey_gt(old.k->p, new.k->p); + int ret = 0, compressed_sectors; + + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(old))) + trans->extra_journal_res += compressed_sectors; + + if (front_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_back(new_start, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + /* If we're overwriting in a different snapshot - middle split: */ + if (old.k->p.snapshot != new.k->p.snapshot && + (front_split || back_split)) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new_start, update); + bch2_cut_back(new.k->p, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (bkey_le(old.k->p, new.k->p)) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bkey_init(&update->k); + update->k.p = old.k->p; + update->k.p.snapshot = new.k->p.snapshot; + + if (new.k->p.snapshot != old.k->p.snapshot) { + update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + return ret; + if (ret) + update->k.type = KEY_TYPE_whiteout; + } + + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (back_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new.k->p, update); + + ret = bch2_trans_update_by_path(trans, iter->path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags, _RET_IP_); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + + if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + ret = extent_front_merge(trans, &iter, k, &insert, flags); + if (ret) + goto err; + } + + goto next; + } + + while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { + bool done = bkey_lt(insert->k.p, k.k->p); + + ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); + if (ret) + goto err; + + if (done) + goto out; +next: + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + } + + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = extent_back_merge(trans, &iter, insert, k); + if (ret) + goto err; + } +out: + if (!bkey_deleted(&insert->k)) + ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); +err: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_path *path, + struct btree_insert_entry *i, + enum btree_update_flags flags, + unsigned long ip) +{ + struct btree_path *btree_path; + struct bkey k; + int ret; + + btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, btree_path, 0); + if (ret) + goto out; + + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key. + */ + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; + + btree_path_set_should_be_locked(btree_path); + ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); +out: + bch2_path_put(trans, btree_path, true); + return ret; +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i, n; + int cmp; + + EBUG_ON(!path->should_be_locked); + EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(!bpos_eq(k->k.p, path->pos)); + + n = (struct btree_insert_entry) { + .flags = flags, + .bkey_type = __btree_node_type(path->level, path->btree_id), + .btree_id = path->btree_id, + .level = path->level, + .cached = path->cached, + .path = path, + .k = k, + .ip_allocated = ip, + }; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif + + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: + */ + trans_for_each_update(trans, i) { + cmp = btree_insert_entry_cmp(&n, i); + if (cmp <= 0) + break; + } + + if (!cmp && i < trans->updates + trans->nr_updates) { + EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + + bch2_path_put(trans, i->path, true); + i->flags = n.flags; + i->cached = n.cached; + i->k = n.k; + i->path = n.path; + i->ip_allocated = n.ip_allocated; + } else { + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + + i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } + + __btree_path_get(i->path, true); + + /* + * If a key is present in the key cache, it must also exist in the + * btree - this is necessary for cache coherency. When iterating over + * a btree that's cached in the key cache, the btree iter code checks + * the key cache - but the key has to exist in the btree for that to + * work: + */ + if (path->cached && bkey_deleted(&i->old_k)) + return flush_new_cached_update(trans, path, i, flags, ip); + + return 0; +} + +static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_path *path) +{ + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + !bpos_eq(iter->key_cache_path->pos, iter->pos)) { + struct bkey_cached *ck; + int ret; + + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + btree_path_set_should_be_locked(iter->key_cache_path); + } + + return 0; +} + +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_path *path = iter->update_path ?: iter->path; + int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + + /* + * Ensure that updates to cached btrees go to the key cache: + */ + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + ret = bch2_trans_update_get_key_cache(trans, iter, path); + if (ret) + return ret; + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); +} + +int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct btree_write_buffered_key *i; + int ret; + + EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + trans_for_each_wb_update(trans, i) { + if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { + bkey_copy(&i->k, k); + return 0; + } + } + + if (!trans->wb_updates || + trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_write_buffered_key *u; + + if (trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_transaction_stats *s = btree_trans_stats(trans); + + BUG_ON(trans->wb_updates_size > U8_MAX / 2); + trans->wb_updates_size = max(1, trans->wb_updates_size * 2); + if (s) + s->wb_updates_size = trans->wb_updates_size; + } + + u = bch2_trans_kmalloc_nomemzero(trans, + trans->wb_updates_size * + sizeof(struct btree_write_buffered_key)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + if (trans->nr_wb_updates) + memcpy(u, trans->wb_updates, trans->nr_wb_updates * + sizeof(struct btree_write_buffered_key)); + trans->wb_updates = u; + } + + trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { + .btree = btree, + }; + + bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); + trans->nr_wb_updates++; + + return 0; +} + +int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos end) +{ + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + k = bch2_btree_iter_prev(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_btree_iter_advance(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + BUG_ON(k.k->type != KEY_TYPE_deleted); + + if (bkey_gt(k.k->p, end)) { + ret = -BCH_ERR_ENOSPC_btree_slot; + goto err; + } + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + h->next = trans->hooks; + trans->hooks = h; +} + +int bch2_btree_insert_nonextent(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k, + enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, k->k.p, + BTREE_ITER_CACHED| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/** + * bch2_btree_insert - insert keys into the extent btree + * @c: pointer to struct bch_fs + * @id: btree to insert into + * @k: key to insert + * @disk_res: must be non-NULL whenever inserting or potentially + * splitting data extents + * @flags: transaction commit flags + * + * Returns: 0 on success, error code on failure + */ +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, + struct disk_reservation *disk_res, int flags) +{ + return bch2_trans_do(c, disk_res, NULL, flags, + bch2_btree_insert_trans(trans, id, k, 0)); +} + +int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, + unsigned len, unsigned update_flags) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = iter->pos; + bch2_key_resize(&k->k, len); + return bch2_trans_update(trans, iter, k, update_flags); +} + +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) +{ + return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); +} + +int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = pos; + return bch2_trans_update_buffered(trans, btree, k); +} + +int bch2_btree_delete(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + unsigned update_flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, pos, + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, update_flags); + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + u32 restart_count = trans->restart_count; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; + + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + + /* + * This could probably be more efficient for extents: + */ + + /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the + * same). It's important that we delete starting from iter.pos + * because the range we want to delete could start in the middle + * of k. + * + * (bch2_btree_iter_peek() does guarantee that iter.pos >= + * bkey_start_pos(k.k)). + */ + delete.k.p = iter.pos; + + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + + ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, + BCH_TRANS_COMMIT_no_enospc); + bch2_disk_reservation_put(trans->c, &disk_res); +err: + /* + * the bch2_trans_begin() call is in a weird place because we + * need to call it after every transaction commit, to avoid path + * overflow, but don't want to call it if the delete operation + * is a no-op and we have no work to do: + */ + bch2_trans_begin(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ret ?: trans_was_restarted(trans, restart_count); +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + int ret = bch2_trans_run(c, + bch2_btree_delete_range_trans(trans, id, start, end, + update_flags, journal_seq)); + if (ret == -BCH_ERR_transaction_restart_nested) + ret = 0; + return ret; +} + +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ + struct bkey_i *k; + int ret = 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + return bch2_trans_update_buffered(trans, btree, k); +} + +__printf(2, 0) +static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; + struct jset_entry_log *l; + unsigned u64s; + int ret; + + prt_vprintf(&buf, fmt, args); + ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; + + u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + + ret = darray_make_room(entries, jset_u64s(u64s)); + if (ret) + goto err; + + l = (void *) &darray_top(*entries); + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 1; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + memcpy(l->d, buf.buf, buf.pos); + while (buf.pos & 7) + l->d[buf.pos++] = '\0'; + + entries->nr += jset_u64s(u64s); +err: + printbuf_exit(&buf); + return ret; +} + +__printf(3, 0) +static int +__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) +{ + int ret; + + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + } else { + ret = bch2_trans_do(c, NULL, NULL, + BCH_TRANS_COMMIT_lazy_rw|commit_flags, + __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args)); + } + + return ret; +} + +__printf(2, 3) +int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, 0, fmt, args); + va_end(args); + return ret; +} + +/* + * Use for logging messages during recovery to enable reserved space and avoid + * blocking. + */ +__printf(2, 3) +int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); + va_end(args); + return ret; +} diff --git a/libbcachefs/darray.c b/libbcachefs/darray.c new file mode 100644 index 0000000..aae07be --- /dev/null +++ b/libbcachefs/darray.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "darray.h" + +int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, gfp_t gfp) +{ + if (new_size > d->size) { + new_size = roundup_pow_of_two(new_size); + + void *data = krealloc_array(d->data, new_size, element_size, gfp); + if (!data) + return -ENOMEM; + + d->data = data; + d->size = new_size; + } + + return 0; +} diff --git a/libbcachefs/disk_groups_types.h b/libbcachefs/disk_groups_types.h new file mode 100644 index 0000000..a54ef08 --- /dev/null +++ b/libbcachefs/disk_groups_types.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H +#define _BCACHEFS_DISK_GROUPS_TYPES_H + +struct bch_disk_group_cpu { + bool deleted; + u16 parent; + u8 label[BCH_SB_LABEL_SIZE]; + struct bch_devs_mask devs; +}; + +struct bch_disk_groups_cpu { + struct rcu_head rcu; + unsigned nr; + struct bch_disk_group_cpu entries[] __counted_by(nr); +}; + +#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */ diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c new file mode 100644 index 0000000..52f0e7a --- /dev/null +++ b/libbcachefs/fs-io-buffered.c @@ -0,0 +1,1106 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "fs-io.h" +#include "fs-io-buffered.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "io_read.h" +#include "io_write.h" + +#include +#include +#include + +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + +/* readpage(s): */ + +static void bch2_readpages_end_io(struct bio *bio) +{ + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) { + if (!bio->bi_status) { + folio_mark_uptodate(fi.folio); + } else { + folio_clear_uptodate(fi.folio); + folio_set_error(fi.folio); + } + folio_unlock(fi.folio); + } + + bio_put(bio); +} + +struct readpages_iter { + struct address_space *mapping; + unsigned idx; + folios folios; +}; + +static int readpages_iter_init(struct readpages_iter *iter, + struct readahead_control *ractl) +{ + struct folio **fi; + int ret; + + memset(iter, 0, sizeof(*iter)); + + iter->mapping = ractl->mapping; + + ret = bch2_filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, mapping_gfp_mask(iter->mapping), + &iter->folios); + if (ret) + return ret; + + darray_for_each(iter->folios, fi) { + ractl->_nr_pages -= 1U << folio_order(*fi); + __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); + folio_put(*fi); + folio_put(*fi); + } + + return 0; +} + +static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) +{ + if (iter->idx >= iter->folios.nr) + return NULL; + return iter->folios.data[iter->idx]; +} + +static inline void readpage_iter_advance(struct readpages_iter *iter) +{ + iter->idx++; +} + +static bool extent_partial_reads_expensive(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc.csum_type || crc.compression_type) + return true; + return false; +} + +static int readpage_bio_extend(struct btree_trans *trans, + struct readpages_iter *iter, + struct bio *bio, + unsigned sectors_this_extent, + bool get_more) +{ + /* Don't hold btree locks while allocating memory: */ + bch2_trans_unlock(trans); + + while (bio_sectors(bio) < sectors_this_extent && + bio->bi_vcnt < bio->bi_max_vecs) { + struct folio *folio = readpage_iter_peek(iter); + int ret; + + if (folio) { + readpage_iter_advance(iter); + } else { + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + + if (!get_more) + break; + + folio = xa_load(&iter->mapping->i_pages, folio_offset); + if (folio && !xa_is_value(folio)) + break; + + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + if (!folio) + break; + + if (!__bch2_folio_create(folio, GFP_KERNEL)) { + folio_put(folio); + break; + } + + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); + if (ret) { + __bch2_folio_release(folio); + folio_put(folio); + break; + } + + folio_put(folio); + } + + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); + } + + return bch2_trans_relock(trans); +} + +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, + struct readpages_iter *readpages_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_buf sk; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; + u32 snapshot; + int ret = 0; + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + sectors = min(sectors, k.k->size - offset_into_extent); + + if (readpages_iter) { + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); + if (ret) + break; + } + + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + iter.pos.inode, + iter.pos.offset << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); + } + + bch2_bkey_buf_exit(&sk, c); +} + +void bch2_readahead(struct readahead_control *ractl) +{ + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct btree_trans *trans = bch2_trans_get(c); + struct folio *folio; + struct readpages_iter readpages_iter; + int ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + ret = readpages_iter_init(&readpages_iter, ractl); + BUG_ON(ret); + + bch2_pagecache_add_get(inode); + + while ((folio = readpage_iter_peek(&readpages_iter))) { + unsigned n = min_t(unsigned, + readpages_iter.folios.nr - + readpages_iter.idx, + BIO_MAX_VECS); + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_KERNEL, &c->bio_read), + opts); + + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bchfs_read(trans, rbio, inode_inum(inode), + &readpages_iter); + bch2_trans_unlock(trans); + } + + bch2_pagecache_add_put(inode); + + bch2_trans_put(trans); + darray_exit(&readpages_iter.folios); +} + +static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum, struct folio *folio) +{ + bch2_folio_create(folio, __GFP_NOFAIL); + + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0)); +} + +static void bch2_read_single_folio_end_io(struct bio *bio) +{ + complete(bio->bi_private); +} + +int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_read_bio *rbio; + struct bch_io_opts opts; + int ret; + DECLARE_COMPLETION_ONSTACK(done); + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), + opts); + rbio->bio.bi_private = &done; + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; + + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); + wait_for_completion(&done); + + ret = blk_status_to_errno(rbio->bio.bi_status); + bio_put(&rbio->bio); + + if (ret < 0) + return ret; + + folio_mark_uptodate(folio); + return 0; +} + +int bch2_read_folio(struct file *file, struct folio *folio) +{ + int ret; + + ret = bch2_read_single_folio(folio, folio->mapping); + folio_unlock(folio); + return bch2_err_class(ret); +} + +/* writepages: */ + +struct bch_writepage_io { + struct bch_inode_info *inode; + + /* must be last: */ + struct bch_write_op op; +}; + +struct bch_writepage_state { + struct bch_writepage_io *io; + struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; +}; + +static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct bch_writepage_state ret = { 0 }; + + bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); + return ret; +} + +/* + * Determine when a writepage io is full. We have to limit writepage bios to a + * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to + * what the bounce path in bch2_write_extent() can handle. In theory we could + * loosen this restriction for non-bounce I/O, but we don't have that context + * here. Ideally, we can up this limit and make it configurable in the future + * when the bounce path can be enhanced to accommodate larger source bios. + */ +static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len) +{ + struct bio *bio = &io->op.wbio.bio; + return bio_full(bio, len) || + (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE); +} + +static void bch2_writepage_io_done(struct bch_write_op *op) +{ + struct bch_writepage_io *io = + container_of(op, struct bch_writepage_io, op); + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; + struct folio_iter fi; + unsigned i; + + if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, -EIO); + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + /* + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ + WARN_ON_ONCE(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up + * slightly) + * XXX wtf? + BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); + */ + + /* + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ + bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(fi.folio); + } + + bio_put(&io->op.wbio.bio); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) +{ + struct bch_writepage_io *io = w->io; + + w->io = NULL; + closure_call(&io->op.cl, bch2_write, NULL, NULL); +} + +/* + * Get a bch_writepage_io and add @page to it - appending to an existing one if + * possible, else allocating a new one: + */ +static void bch2_writepage_io_alloc(struct bch_fs *c, + struct writeback_control *wbc, + struct bch_writepage_state *w, + struct bch_inode_info *inode, + u64 sector, + unsigned nr_replicas) +{ + struct bch_write_op *op; + + w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, + REQ_OP_WRITE, + GFP_KERNEL, + &c->writepage_bioset), + struct bch_writepage_io, op.wbio.bio); + + w->io->inode = inode; + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; + op->nr_replicas = nr_replicas; + op->res.nr_replicas = nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; + op->pos = POS(inode->v.i_ino, sector); + op->end_io = bch2_writepage_io_done; + op->devs_need_flush = &inode->ei_devs_need_flush; + op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); +} + +static int __bch2_writepage(struct folio *folio, + struct writeback_control *wbc, + void *data) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_writepage_state *w = data; + struct bch_folio *s; + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; + loff_t i_size = i_size_read(&inode->v); + int ret; + + EBUG_ON(!folio_test_uptodate(folio)); + + /* Is the folio fully inside i_size? */ + if (folio_end_pos(folio) <= i_size) + goto do_io; + + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); + return 0; + } + + /* + * The folio straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the folio size. For a file that is not a multiple of + * the folio size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + folio_zero_segment(folio, + i_size - folio_pos(folio), + folio_size(folio)); +do_io: + f_sectors = folio_sectors(folio); + s = bch2_folio(folio); + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); + w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } + + /* + * Things get really hairy with errors during writeback: + */ + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); + BUG_ON(ret); + + /* Before unlocking the page, get copy of reservations: */ + spin_lock(&s->lock); + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + nr_replicas_this_write = + min_t(unsigned, nr_replicas_this_write, + s->s[i].nr_replicas + + s->s[i].replicas_reserved); + } + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + s->s[i].nr_replicas = w->opts.compression + ? 0 : nr_replicas_this_write; + + s->s[i].replicas_reserved = 0; + bch2_folio_sector_set(folio, s, i, SECTOR_allocated); + } + spin_unlock(&s->lock); + + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); + + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + + folio_unlock(folio); + + offset = 0; + while (1) { + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; + + while (offset < f_sectors && + w->tmp[offset].state < SECTOR_dirty) + offset++; + + if (offset == f_sectors) + break; + + while (offset + sectors < f_sectors && + w->tmp[offset + sectors].state >= SECTOR_dirty) { + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; + sectors++; + } + BUG_ON(!sectors); + + sector = folio_sector(folio) + offset; + + if (w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bch_io_full(w->io, sectors << 9) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, wbc, w, inode, sector, + nr_replicas_this_write); + + atomic_inc(&s->write_count); + + BUG_ON(inode != w->io->inode); + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ + WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + "writing past i_size: %llu > %llu (unrounded %llu)\n", + bio_end_sector(&w->io->op.wbio.bio) << 9, + round_up(i_size, block_bytes(c)), + i_size); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; + w->io->op.new_i_size = i_size; + + offset += sectors; + } + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(folio); + + return 0; +} + +int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct bch_fs *c = mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(mapping->host)); + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); + if (w.io) + bch2_writepage_do_io(&w); + blk_finish_plug(&plug); + kfree(w.tmp); + return bch2_err_class(ret); +} + +/* buffered writes: */ + +int bch2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res; + struct folio *folio; + unsigned offset; + int ret = -ENOMEM; + + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + bch2_folio_reservation_init(c, inode, res); + *fsdata = res; + + bch2_pagecache_add_get(inode); + + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (IS_ERR_OR_NULL(folio)) + goto err_unlock; + + offset = pos - folio_pos(folio); + len = min_t(size_t, len, folio_end_pos(folio) - pos); + + if (folio_test_uptodate(folio)) + goto out; + + /* If we're writing entire folio, don't need to read it in first: */ + if (!offset && len == folio_size(folio)) + goto out; + + if (!offset && pos + len >= inode->v.i_size) { + folio_zero_segment(folio, len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } + + if (folio_pos(folio) >= inode->v.i_size) { + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } +readpage: + ret = bch2_read_single_folio(folio, mapping); + if (ret) + goto err; +out: + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto err; + + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); + if (ret) { + if (!folio_test_uptodate(folio)) { + /* + * If the folio hasn't been read in, we won't know if we + * actually need a reservation - we don't actually need + * to read here, we just need to check if the folio is + * fully backed by uncompressed data: + */ + goto readpage; + } + + goto err; + } + + *pagep = &folio->page; + return 0; +err: + folio_unlock(folio); + folio_put(folio); + *pagep = NULL; +err_unlock: + bch2_pagecache_add_put(inode); + kfree(res); + *fsdata = NULL; + return bch2_err_class(ret); +} + +int bch2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res = fsdata; + struct folio *folio = page_folio(page); + unsigned offset = pos - folio_pos(folio); + + lockdep_assert_held(&inode->v.i_rwsem); + BUG_ON(offset + copied > folio_size(folio)); + + if (unlikely(copied < len && !folio_test_uptodate(folio))) { + /* + * The folio needs to be read in, but that would destroy + * our partial write - simplest thing is to just force + * userspace to redo the write: + */ + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + copied = 0; + } + + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + + if (copied) { + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); + + inode->ei_last_dirtied = (unsigned long) current; + } + + folio_unlock(folio); + folio_put(folio); + bch2_pagecache_add_put(inode); + + bch2_folio_reservation_put(c, inode, res); + kfree(res); + + return copied; +} + +static noinline void folios_trunc(folios *fs, struct folio **fi) +{ + while (fs->data + fs->nr > fi) { + struct folio *f = darray_pop(fs); + + folio_unlock(f); + folio_put(f); + } +} + +static int __bch2_buffered_write(struct bch_inode_info *inode, + struct address_space *mapping, + struct iov_iter *iter, + loff_t pos, unsigned len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + folios fs; + struct folio **fi, *f; + unsigned copied = 0, f_offset, f_copied; + u64 end = pos + len, f_pos, f_len; + loff_t last_folio_pos = inode->v.i_size; + int ret = 0; + + BUG_ON(!len); + + bch2_folio_reservation_init(c, inode, &res); + darray_init(&fs); + + ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &fs); + if (ret) + goto out; + + BUG_ON(!fs.nr); + + f = darray_first(fs); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + + f = darray_last(fs); + end = min(end, folio_end_pos(f)); + last_folio_pos = folio_pos(f); + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { + if (end >= inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + } else { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + } + + ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr); + if (ret) + goto out; + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; + + /* + * XXX: per POSIX and fstests generic/275, on -ENOSPC we're + * supposed to write as much as we have disk space for. + * + * On failure here we should still write out a partial page if + * we aren't completely out of disk space - we don't do that + * yet: + */ + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); + if (unlikely(ret)) { + folios_trunc(&fs, fi); + if (!fs.nr) + goto out; + + end = min(end, folio_end_pos(darray_last(fs))); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (mapping_writably_mapped(mapping)) + darray_for_each(fs, fi) + flush_dcache_folio(*fi); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; + f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + if (!f_copied) { + folios_trunc(&fs, fi); + break; + } + + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { + iov_iter_revert(iter, f_copied); + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&fs, fi); + break; + } + + flush_dcache_folio(f); + copied += f_copied; + + if (f_copied != f_len) { + folios_trunc(&fs, fi + 1); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (!copied) + goto out; + + end = pos + copied; + + spin_lock(&inode->v.i_lock); + if (end > inode->v.i_size) + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(fs)); + darray_for_each(fs, fi) { + f = *fi; + f_len = min(end, folio_end_pos(f)) - f_pos; + + if (!folio_test_uptodate(f)) + folio_mark_uptodate(f); + + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + inode->ei_last_dirtied = (unsigned long) current; +out: + darray_for_each(fs, fi) { + folio_unlock(*fi); + folio_put(*fi); + } + + /* + * If the last folio added to the mapping starts beyond current EOF, we + * performed a short write but left around at least one post-EOF folio. + * Clean up the mapping before we return. + */ + if (last_folio_pos >= inode->v.i_size) + truncate_pagecache(&inode->v, inode->v.i_size); + + darray_exit(&fs); + bch2_folio_reservation_put(c, inode, &res); + + return copied ?: ret; +} + +static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; + + bch2_pagecache_add_get(inode); + + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = iov_iter_count(iter); +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE - offset); + + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + ret = -EFAULT; + break; + } + } + + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + + cond_resched(); + + if (unlikely(ret == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(iter)); + goto again; + } + pos += ret; + written += ret; + ret = 0; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); + + bch2_pagecache_add_put(inode); + + return written ? written : ret; +} + +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: + return bch2_err_class(ret); +} + +void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) +{ + bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fs_io_buffered_init(struct bch_fs *c) +{ + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_writepage_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io-buffered.h b/libbcachefs/fs-io-buffered.h new file mode 100644 index 0000000..a6126ff --- /dev/null +++ b/libbcachefs/fs-io-buffered.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_BUFFERED_H +#define _BCACHEFS_FS_IO_BUFFERED_H + +#ifndef NO_BCACHEFS_FS + +int bch2_read_single_folio(struct folio *, struct address_space *); +int bch2_read_folio(struct file *, struct folio *); + +int bch2_writepages(struct address_space *, struct writeback_control *); +void bch2_readahead(struct readahead_control *); + +int bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, struct page **, void **); +int bch2_write_end(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page *, void *); + +ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); + +void bch2_fs_fs_io_buffered_exit(struct bch_fs *); +int bch2_fs_fs_io_buffered_init(struct bch_fs *); +#else +static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} +static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c new file mode 100644 index 0000000..5b42a76 --- /dev/null +++ b/libbcachefs/fs-io-direct.c @@ -0,0 +1,680 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "fs.h" +#include "fs-io.h" +#include "fs-io-direct.h" +#include "fs-io-pagecache.h" +#include "io_read.h" +#include "io_write.h" + +#include +#include +#include +#include + +/* O_DIRECT reads */ + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + bool should_dirty; + struct bch_read_bio rbio; +}; + +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static void bch2_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret); + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); +} + +static void bch2_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_status) + dio->ret = blk_status_to_errno(bio->bi_status); + + closure_put(&dio->cl); +} + +static void bch2_direct_IO_read_split_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + bch2_direct_IO_read_endio(bio); + bio_check_or_release(bio, should_dirty); +} + +static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct dio_read *dio; + struct bio *bio; + loff_t offset = req->ki_pos; + bool sync = is_sync_kiocb(req); + size_t shorten; + ssize_t ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + if ((offset|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + ret = min_t(loff_t, iter->count, + max_t(loff_t, 0, i_size_read(&inode->v) - offset)); + + if (!ret) + return ret; + + shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + iter->count -= shorten; + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; + + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + + /* + * this is a _really_ horrible hack just to avoid an atomic sub at the + * end: + */ + if (!sync) { + set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_RUNNING + + CLOSURE_DESTRUCTOR); + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); + dio->cl.closure_get_happened = true; + } + + dio->req = req; + dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. + */ + dio->should_dirty = iter_is_iovec(iter); + + goto start; + while (iter->count) { + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_private = dio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (ret < 0) { + /* XXX: fault inject this path */ + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + break; + } + + offset += bio->bi_iter.bi_size; + + if (dio->should_dirty) + bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); + + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + } + + iter->count += shorten; + + if (sync) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); + return ret; + } else { + return -EIOCBQUEUED; + } +} + +ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + + if (unlikely(mapping->nrpages)) { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + goto out; + } + + file_accessed(file); + + blk_start_plug(&plug); + ret = bch2_direct_IO_read(iocb, iter); + blk_finish_plug(&plug); + + if (ret >= 0) + iocb->ki_pos += ret; + } else { + bch2_pagecache_add_get(inode); + ret = generic_file_read_iter(iocb, iter); + bch2_pagecache_add_put(inode); + } +out: + return bch2_err_class(ret); +} + +/* O_DIRECT writes */ + +struct dio_write { + struct kiocb *req; + struct address_space *mapping; + struct bch_inode_info *inode; + struct mm_struct *mm; + unsigned loop:1, + extending:1, + sync:1, + flush:1, + free_iov:1; + struct quota_res quota_res; + u64 written; + + struct iov_iter iter; + struct iovec inline_vecs[2]; + + /* must be last: */ + struct bch_write_op op; +}; + +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; +retry: + bch2_trans_begin(trans); + + err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_put(trans); + + return err ? false : ret; +} + +static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + return bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio); + +/* + * We're going to return -EIOCBQUEUED, but we haven't finished consuming the + * iov_iter yet, so we need to stash a copy of the iovec: it might be on the + * caller's stack, we're not guaranteed that it will live for the duration of + * the IO: + */ +static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ + struct iovec *iov = dio->inline_vecs; + + /* + * iov_iter has a single embedded iovec - nothing to do: + */ + if (iter_is_ubuf(&dio->iter)) + return 0; + + /* + * We don't currently handle non-iovec iov_iters here - return an error, + * and we'll fall back to doing the IO synchronously: + */ + if (!iter_is_iovec(&dio->iter)) + return -1; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.__iov = iov; + return 0; +} + +static void bch2_dio_write_flush_done(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + struct bch_fs *c = dio->op.c; + + closure_debug_destroy(cl); + + dio->op.error = bch2_journal_error(&c->journal); + + bch2_dio_write_done(dio); +} + +static noinline void bch2_dio_write_flush(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_unpacked inode; + int ret; + + dio->flush = 0; + + closure_init(&dio->op.cl, NULL); + + if (!dio->op.error) { + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); + if (ret) { + dio->op.error = ret; + } else { + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, + &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } + } + + if (dio->sync) { + closure_sync(&dio->op.cl); + closure_debug_destroy(&dio->op.cl); + } else { + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); + } +} + +static __always_inline long bch2_dio_write_done(struct dio_write *dio) +{ + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + bool sync = dio->sync; + long ret; + + if (unlikely(dio->flush)) { + bch2_dio_write_flush(dio); + if (!sync) + return -EIOCBQUEUED; + } + + bch2_pagecache_block_put(inode); + + if (dio->free_iov) + kfree(dio->iter.__iov); + + ret = dio->op.error ?: ((long) dio->written << 9); + bio_put(&dio->op.wbio.bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (ret < 0) + ret = bch2_err_class(ret); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static __always_inline void bch2_dio_write_end(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; + + if (dio->extending) { + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + } + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); + mutex_unlock(&inode->ei_quota_lock); + } + + bio_release_pages(bio, false); + + if (unlikely(dio->op.error)) + set_bit(EI_INODE_ERROR, &inode->ei_flags); +} + +static __always_inline long bch2_dio_write_loop(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct address_space *mapping = dio->mapping; + struct bch_inode_info *inode = dio->inode; + struct bch_io_opts opts; + struct bio *bio = &dio->op.wbio.bio; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; + long ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + while (1) { + iter_count = dio->iter.count; + + EBUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + + dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; + + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; + + if (unlikely(ret < 0)) + goto err; + + if (unlikely(dropped_locks)) { + ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); + + if (!bio->bi_iter.bi_size) { + /* + * bio_iov_iter_get_pages was only able to get < + * blocksize worth of pages: + */ + ret = -EFAULT; + goto err; + } + + bch2_write_op_init(&dio->op, c, opts); + dio->op.end_io = sync + ? NULL + : bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + dio->op.devs_need_flush = &inode->ei_devs_need_flush; + + if (sync) + dio->op.flags |= BCH_WRITE_SYNC; + dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); + if (unlikely(ret)) + goto err; + + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_dio_write_check_allocated(dio)) + goto err; + + task_io_account_write(bio->bi_iter.bi_size); + + if (unlikely(dio->iter.count) && + !dio->sync && + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; + + dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); + + if (!sync) + return -EIOCBQUEUED; + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + break; + + bio_reset(bio, NULL, REQ_OP_WRITE); + } +out: + return bch2_dio_write_done(dio); +err: + dio->op.error = ret; + + bio_release_pages(bio, false); + + bch2_quota_reservation_put(c, inode, &dio->quota_res); + goto out; +} + +static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) +{ + struct mm_struct *mm = dio->mm; + + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + + if (mm) + kthread_use_mm(mm); + bch2_dio_write_loop(dio); + if (mm) + kthread_unuse_mm(mm); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + bch2_dio_write_done(dio); + else + bch2_dio_write_continue(dio); +} + +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct dio_write *dio; + struct bio *bio; + bool locked = true, extending; + ssize_t ret; + + prefetch(&c->opts); + prefetch((void *) &c->opts + 64); + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); + + inode_lock(&inode->v); + + ret = generic_write_checks(req, iter); + if (unlikely(ret <= 0)) + goto err; + + ret = file_remove_privs(file); + if (unlikely(ret)) + goto err; + + ret = file_update_time(file); + if (unlikely(ret)) + goto err; + + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + goto err; + + inode_dio_begin(&inode->v); + bch2_pagecache_block_get(inode); + + extending = req->ki_pos + iter->count > inode->v.i_size; + if (!extending) { + inode_unlock(&inode->v); + locked = false; + } + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_WRITE, + GFP_KERNEL, + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + dio->req = req; + dio->mapping = mapping; + dio->inode = inode; + dio->mm = current->mm; + dio->loop = false; + dio->extending = extending; + dio->sync = is_sync_kiocb(req) || extending; + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; + dio->free_iov = false; + dio->quota_res.sectors = 0; + dio->written = 0; + dio->iter = *iter; + dio->op.c = c; + + if (unlikely(mapping->nrpages)) { + ret = bch2_write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + } + + ret = bch2_dio_write_loop(dio); +err: + if (locked) + inode_unlock(&inode->v); + return ret; +err_put_bio: + bch2_pagecache_block_put(inode); + bio_put(bio); + inode_dio_end(&inode->v); + goto err; +} + +void bch2_fs_fs_io_direct_exit(struct bch_fs *c) +{ + bioset_exit(&c->dio_write_bioset); + bioset_exit(&c->dio_read_bioset); +} + +int bch2_fs_fs_io_direct_init(struct bch_fs *c) +{ + if (bioset_init(&c->dio_read_bioset, + 4, offsetof(struct dio_read, rbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_read_bioset_init; + + if (bioset_init(&c->dio_write_bioset, + 4, offsetof(struct dio_write, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_write_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io-direct.h b/libbcachefs/fs-io-direct.h new file mode 100644 index 0000000..814621e --- /dev/null +++ b/libbcachefs/fs-io-direct.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_DIRECT_H +#define _BCACHEFS_FS_IO_DIRECT_H + +#ifndef NO_BCACHEFS_FS +ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); +ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); + +void bch2_fs_fs_io_direct_exit(struct bch_fs *); +int bch2_fs_fs_io_direct_init(struct bch_fs *); +#else +static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} +static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_DIRECT_H */ diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c new file mode 100644 index 0000000..ff664fd --- /dev/null +++ b/libbcachefs/fs-io-pagecache.c @@ -0,0 +1,791 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "btree_iter.h" +#include "extents.h" +#include "fs-io.h" +#include "fs-io-pagecache.h" +#include "subvolume.h" + +#include +#include + +int bch2_filemap_get_contig_folios_d(struct address_space *mapping, + loff_t start, u64 end, + fgf_t fgp_flags, gfp_t gfp, + folios *fs) +{ + struct folio *f; + u64 pos = start; + int ret = 0; + + while (pos < end) { + if ((u64) pos >= (u64) start + (1ULL << 20)) + fgp_flags &= ~FGP_CREAT; + + ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL); + if (ret) + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); + if (IS_ERR_OR_NULL(f)) + break; + + BUG_ON(fs->nr && folio_pos(f) != pos); + + pos = folio_end_pos(f); + darray_push(fs, f); + } + + if (!fs->nr && !ret && (fgp_flags & FGP_CREAT)) + ret = -ENOMEM; + + return fs->nr ? 0 : ret; +} + +/* pagecache_block must be held */ +int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; + + /* + * XXX: the way this is currently implemented, we can spin if a process + * is continually redirtying a specific page + */ + do { + if (!mapping->nrpages) + return 0; + + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + break; + + if (!mapping->nrpages) + return 0; + + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + } while (ret == -EBUSY); + + return ret; +} + +#if 0 +/* Useful for debug tracing: */ +static const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x + NULL +}; +#endif + +static inline enum bch_folio_sector_state +folio_sector_dirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_dirty; + case SECTOR_reserved: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_undirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_dirty: + return SECTOR_unallocated; + case SECTOR_dirty_reserved: + return SECTOR_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_reserve(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_reserved; + case SECTOR_dirty: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +/* for newly allocated folios: */ +struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + struct bch_folio *s; + + s = kzalloc(sizeof(*s) + + sizeof(struct bch_folio_sector) * + folio_sectors(folio), gfp); + if (!s) + return NULL; + + spin_lock_init(&s->lock); + folio_attach_private(folio, s); + return s; +} + +struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); +} + +static unsigned bkey_to_sector_state(struct bkey_s_c k) +{ + if (bkey_extent_is_reservation(k)) + return SECTOR_reserved; + if (bkey_extent_is_allocation(k.k)) + return SECTOR_allocated; + return SECTOR_unallocated; +} + +static void __bch2_folio_set(struct folio *folio, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + bch2_folio_sector_set(folio, s, i, state); + } + + if (i == sectors) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +/* + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the + * extents btree: + */ +int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + struct folio **fs, unsigned nr_folios) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_folio *s; + u64 offset = folio_sector(fs[0]); + unsigned folio_idx; + u32 snapshot; + bool need_set = false; + int ret; + + for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + s = bch2_folio_create(fs[folio_idx], GFP_KERNEL); + if (!s) + return -ENOMEM; + + need_set |= !s->uptodate; + } + + if (!need_set) + return 0; + + folio_idx = 0; + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + while (folio_idx < nr_folios) { + struct folio *folio = fs[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - + folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - + folio_offset - folio_start; + + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); + + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } + + if (folio_idx == nr_folios) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_put(trans); + + return ret; +} + +void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct folio_vec fv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + bio_for_each_folio(fv, bio, iter) + __bch2_folio_set(fv.fv_folio, + fv.fv_offset >> 9, + fv.fv_len >> 9, + nr_ptrs, state); +} + +void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + s64 i_sectors_delta = 0; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + bch2_folio_sector_set(folio, s, j, + folio_sector_reserve(s->s[j].state)); + } + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + +static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, + unsigned nr_replicas) +{ + return max(0, (int) nr_replicas - + s->nr_replicas - + s->replicas_reserved); +} + +int bch2_get_folio_disk_reservation(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, bool check_enospc) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned nr_replicas = inode_nr_replicas(c, inode); + struct disk_reservation disk_res = { 0 }; + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + for (i = 0; i < sectors; i++) + disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); + + if (!disk_res_sectors) + return 0; + + ret = bch2_disk_reservation_get(c, &disk_res, + disk_res_sectors, 1, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + + for (i = 0; i < sectors; i++) + s->s[i].replicas_reserved += + sectors_to_reserve(&s->s[i], nr_replicas); + + return 0; +} + +void bch2_folio_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + bch2_disk_reservation_put(c, &res->disk); + bch2_quota_reservation_put(c, inode, &res->quota); +} + +int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned i, disk_sectors = 0, quota_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + BUG_ON(!s->uptodate); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + quota_sectors += s->s[i].state == SECTOR_unallocated; + } + + if (disk_sectors) { + ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + if (unlikely(ret)) + return ret; + } + + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, + quota_sectors, true); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors + }; + + bch2_disk_reservation_put(c, &tmp); + res->disk.sectors -= disk_sectors; + return ret; + } + } + + return 0; +} + +static void bch2_clear_folio_bits(struct folio *folio) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_folio *s = bch2_folio(folio); + struct disk_reservation disk_res = { 0 }; + int i, sectors = folio_sectors(folio), dirty_sectors = 0; + + if (!s) + return; + + EBUG_ON(!folio_test_locked(folio)); + EBUG_ON(folio_test_writeback(folio)); + + for (i = 0; i < sectors; i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + + dirty_sectors -= s->s[i].state == SECTOR_dirty; + bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); + } + + bch2_disk_reservation_put(c, &disk_res); + + bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_folio_release(folio); +} + +void bch2_set_folio_dirty(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, dirty_sectors = 0; + + WARN_ON((u64) folio_pos(folio) + offset + len > + round_up((u64) i_size_read(&inode->v), block_bytes(c))); + + BUG_ON(!s->uptodate); + + spin_lock(&s->lock); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + unsigned sectors = sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + + /* + * This can happen if we race with the error path in + * bch2_writepage_io_done(): + */ + sectors = min_t(unsigned, sectors, res->disk.sectors); + + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; + + dirty_sectors += s->s[i].state == SECTOR_unallocated; + + bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); + } + + spin_unlock(&s->lock); + + bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); + + if (!folio_test_dirty(folio)) + filemap_dirty_folio(inode->v.i_mapping, folio); +} + +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); + struct bch_inode_info *inode = file_bch_inode(file); + vm_fault_t ret; + + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(inode)) + goto got_lock; + + bch2_pagecache_block_put(fdm_host); + + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); + + bch2_pagecache_block_get(fdm_host); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + + bch2_pagecache_add_get(inode); +got_lock: + ret = filemap_fault(vmf); + bch2_pagecache_add_put(inode); + + return ret; +} + +vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + unsigned len; + loff_t isize; + vm_fault_t ret; + + bch2_folio_reservation_init(c, inode, &res); + + sb_start_pagefault(inode->v.i_sb); + file_update_time(file); + + /* + * Not strictly necessary, but helps avoid dio writes livelocking in + * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get + * a bch2_write_invalidate_inode_pages_range() that works without dropping + * page lock before invalidating page + */ + bch2_pagecache_add_get(inode); + + folio_lock(folio); + isize = i_size_read(&inode->v); + + if (folio->mapping != mapping || folio_pos(folio) >= isize) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } + + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); + + if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: + bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + folio_unlock(folio); + ret = VM_FAULT_SIGBUS; + goto out; + } + + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_folio_reservation_put(c, inode, &res); + + folio_wait_stable(folio); + ret = VM_FAULT_LOCKED; +out: + bch2_pagecache_add_put(inode); + sb_end_pagefault(inode->v.i_sb); + + return ret; +} + +void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + if (offset || length < folio_size(folio)) + return; + + bch2_clear_folio_bits(folio); +} + +bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) +{ + if (folio_test_dirty(folio) || folio_test_writeback(folio)) + return false; + + bch2_clear_folio_bits(folio); + return true; +} + +/* fseek: */ + +static int folio_data_offset(struct folio *folio, loff_t pos, + unsigned min_replicas) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + if (s) + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) + if (s->s[i].state >= SECTOR_dirty && + s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) + return i << SECTOR_SHIFT; + + return -1; +} + +loff_t bch2_seek_pagecache_data(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas, + bool nonblock) +{ + struct folio_batch fbatch; + pgoff_t start_index = start_offset >> PAGE_SHIFT; + pgoff_t end_index = end_offset >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned i; + loff_t ret; + int offset; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(vinode->i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + if (!nonblock) { + folio_lock(folio); + } else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + return -EAGAIN; + } + + offset = folio_data_offset(folio, + max(folio_pos(folio), start_offset), + min_replicas); + if (offset >= 0) { + ret = clamp(folio_pos(folio) + offset, + start_offset, end_offset); + folio_unlock(folio); + folio_batch_release(&fbatch); + return ret; + } + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return end_offset; +} + +/* + * Search for a hole in a folio. + * + * The filemap layer returns -ENOENT if no folio exists, so reuse the same error + * code to indicate a pagecache hole exists at the returned offset. Otherwise + * return 0 if the folio is filled with data, or an error code. This function + * can return -EAGAIN if nonblock is specified. + */ +static int folio_hole_offset(struct address_space *mapping, loff_t *offset, + unsigned min_replicas, bool nonblock) +{ + struct folio *folio; + struct bch_folio *s; + unsigned i, sectors; + int ret = -ENOENT; + + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, + FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0); + if (IS_ERR(folio)) + return PTR_ERR(folio); + + s = bch2_folio(folio); + if (!s) + goto unlock; + + sectors = folio_sectors(folio); + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) + if (s->s[i].state < SECTOR_dirty || + s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { + *offset = max(*offset, + folio_pos(folio) + (i << SECTOR_SHIFT)); + goto unlock; + } + + *offset = folio_end_pos(folio); + ret = 0; +unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + +loff_t bch2_seek_pagecache_hole(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas, + bool nonblock) +{ + struct address_space *mapping = vinode->i_mapping; + loff_t offset = start_offset; + loff_t ret = 0; + + while (!ret && offset < end_offset) + ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); + + if (ret && ret != -ENOENT) + return ret; + return min(offset, end_offset); +} + +int bch2_clamp_data_hole(struct inode *inode, + u64 *hole_start, + u64 *hole_end, + unsigned min_replicas, + bool nonblock) +{ + loff_t ret; + + ret = bch2_seek_pagecache_hole(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_start = ret; + + if (*hole_start == *hole_end) + return 0; + + ret = bch2_seek_pagecache_data(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_end = ret; + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h new file mode 100644 index 0000000..27f712a --- /dev/null +++ b/libbcachefs/fs-io-pagecache.h @@ -0,0 +1,176 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_PAGECACHE_H +#define _BCACHEFS_FS_IO_PAGECACHE_H + +#include + +typedef DARRAY(struct folio *) folios; + +int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, + u64, fgf_t, gfp_t, folios *); +int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); + +/* + * Use u64 for the end pos and sector helpers because if the folio covers the + * max supported range of the mapping, the start offset of the next folio + * overflows loff_t. This breaks much of the range based processing in the + * buffered write path. + */ +static inline u64 folio_end_pos(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + +static inline size_t folio_sectors(struct folio *folio) +{ + return PAGE_SECTORS << folio_order(folio); +} + +static inline loff_t folio_sector(struct folio *folio) +{ + return folio_pos(folio) >> 9; +} + +static inline u64 folio_end_sector(struct folio *folio) +{ + return folio_end_pos(folio) >> 9; +} + +#define BCH_FOLIO_SECTOR_STATE() \ + x(unallocated) \ + x(reserved) \ + x(dirty) \ + x(dirty_reserved) \ + x(allocated) + +enum bch_folio_sector_state { +#define x(n) SECTOR_##n, + BCH_FOLIO_SECTOR_STATE() +#undef x +}; + +struct bch_folio_sector { + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; + + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; + + /* i_sectors: */ + enum bch_folio_sector_state state:8; +}; + +struct bch_folio { + spinlock_t lock; + atomic_t write_count; + /* + * Is the sector state up to date with the btree? + * (Not the data itself) + */ + bool uptodate; + struct bch_folio_sector s[]; +}; + +/* Helper for when we need to add debug instrumentation: */ +static inline void bch2_folio_sector_set(struct folio *folio, + struct bch_folio *s, + unsigned i, unsigned n) +{ + s->s[i].state = n; +} + +/* file offset (to folio offset) to bch_folio_sector index */ +static inline int folio_pos_to_s(struct folio *folio, loff_t pos) +{ + u64 f_offset = pos - folio_pos(folio); + + BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); + return f_offset >> SECTOR_SHIFT; +} + +/* for newly allocated folios: */ +static inline void __bch2_folio_release(struct folio *folio) +{ + kfree(folio_detach_private(folio)); +} + +static inline void bch2_folio_release(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + __bch2_folio_release(folio); +} + +static inline struct bch_folio *__bch2_folio(struct folio *folio) +{ + return folio_has_private(folio) + ? (struct bch_folio *) folio_get_private(folio) + : NULL; +} + +static inline struct bch_folio *bch2_folio(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + + return __bch2_folio(folio); +} + +struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); +struct bch_folio *bch2_folio_create(struct folio *, gfp_t); + +struct bch2_folio_reservation { + struct disk_reservation disk; + struct quota_res quota; +}; + +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ + /* XXX: this should not be open coded */ + return inode->ei_inode.bi_data_replicas + ? inode->ei_inode.bi_data_replicas - 1 + : c->opts.data_replicas; +} + +static inline void bch2_folio_reservation_init(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + memset(res, 0, sizeof(*res)); + + res->disk.nr_replicas = inode_nr_replicas(c, inode); +} + +int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); +void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); + +void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); +void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); + +int bch2_get_folio_disk_reservation(struct bch_fs *, + struct bch_inode_info *, + struct folio *, bool); + +void bch2_folio_reservation_put(struct bch_fs *, + struct bch_inode_info *, + struct bch2_folio_reservation *); +int bch2_folio_reservation_get(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + unsigned, unsigned); + +void bch2_set_folio_dirty(struct bch_fs *, + struct bch_inode_info *, + struct folio *, + struct bch2_folio_reservation *, + unsigned, unsigned); + +vm_fault_t bch2_page_fault(struct vm_fault *); +vm_fault_t bch2_page_mkwrite(struct vm_fault *); +void bch2_invalidate_folio(struct folio *, size_t, size_t); +bool bch2_release_folio(struct folio *, gfp_t); + +loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); +loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); +int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); + +#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c new file mode 100644 index 0000000..eab0c8c --- /dev/null +++ b/libbcachefs/io_misc.c @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_misc.c - fallocate, fpunch, truncate: + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" +#include "inode.h" +#include "io_misc.h" +#include "io_write.h" +#include "logged_ops.h" +#include "rebalance.h" +#include "subvolume.h" + +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + u64 sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) +{ + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; + struct open_buckets open_buckets = { 0 }; + struct bkey_s_c k; + struct bkey_buf old, new; + unsigned sectors_allocated = 0; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; + + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto err; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto err; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; + + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + ret = -BCH_ERR_transaction_restart_nested; + if (ret) + goto err; + + sectors = min_t(u64, sectors, wp->sectors_free); + sectors_allocated = sectors; + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + } + + have_reservation = true; + + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); +err: + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + + bch2_open_buckets_put(c, &open_buckets); + bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + + if (closure_nr_remaining(&cl) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + return ret; +} + +/* + * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + u32 snapshot; + + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + if (ret) + ret2 = ret; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) + break; + + ret = bkey_err(k); + if (ret) + continue; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end_pos, &delete); + + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); + + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + + return ret; +} + +/* truncate: */ + +void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k); + + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); + prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size)); +} + +static int truncate_set_isize(struct btree_trans *trans, + subvol_inum inum, + u64 new_i_size) +{ + struct btree_iter iter = { NULL }; + struct bch_inode_unpacked inode_u; + int ret; + + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + (inode_u.bi_size = new_i_size, 0) ?: + bch2_inode_write(trans, &iter, &inode_u); + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, + struct bkey_i *op_k, + u64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter fpunch_iter; + struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + u64 new_i_size = le64_to_cpu(op->v.new_i_size); + int ret; + + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + truncate_set_isize(trans, inum, new_i_size)); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, + POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), + BTREE_ITER_INTENT); + ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); + bch2_trans_iter_exit(trans, &fpunch_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; +err: + bch2_logged_op_finish(trans, op_k); + return ret; +} + +int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k) +{ + return __bch2_resume_logged_op_truncate(trans, op_k, NULL); +} + +int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta) +{ + struct bkey_i_logged_op_truncate op; + + bkey_logged_op_truncate_init(&op.k_i); + op.v.subvol = cpu_to_le32(inum.subvol); + op.v.inum = cpu_to_le64(inum.inum); + op.v.new_i_size = cpu_to_le64(new_i_size); + + /* + * Logged ops aren't atomic w.r.t. snapshot creation: creating a + * snapshot while they're in progress, then crashing, will result in the + * resume only proceeding in one of the snapshots + */ + down_read(&c->snapshot_create_lock); + int ret = bch2_trans_run(c, + bch2_logged_op_start(trans, &op.k_i) ?: + __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta)); + up_read(&c->snapshot_create_lock); + + return ret; +} + +/* finsert/fcollapse: */ + +void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k); + + prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol)); + prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum)); + prt_printf(out, " dst_offset=%lli", le64_to_cpu(op.v->dst_offset)); + prt_printf(out, " src_offset=%llu", le64_to_cpu(op.v->src_offset)); +} + +static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len) +{ + struct btree_iter iter; + struct bch_inode_unpacked inode_u; + int ret; + + offset <<= 9; + len <<= 9; + + ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; + + if (len > 0) { + if (MAX_LFS_FILESIZE - inode_u.bi_size < len) { + ret = -EFBIG; + goto err; + } + + if (offset >= inode_u.bi_size) { + ret = -EINVAL; + goto err; + } + } + + inode_u.bi_size += len; + inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c); + + ret = bch2_inode_write(trans, &iter, &inode_u); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __bch2_resume_logged_op_finsert(struct btree_trans *trans, + struct bkey_i *op_k, + u64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + struct bch_io_opts opts; + u64 dst_offset = le64_to_cpu(op->v.dst_offset); + u64 src_offset = le64_to_cpu(op->v.src_offset); + s64 shift = dst_offset - src_offset; + u64 len = abs(shift); + u64 pos = le64_to_cpu(op->v.pos); + bool insert = shift > 0; + int ret = 0; + + ret = bch2_inum_opts_get(trans, inum, &opts); + if (ret) + return ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + POS(inum.inum, 0), + BTREE_ITER_INTENT); + + switch (op->v.state) { +case LOGGED_OP_FINSERT_start: + op->v.state = LOGGED_OP_FINSERT_shift_extents; + + if (insert) { + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + adjust_i_size(trans, inum, src_offset, len) ?: + bch2_logged_op_update(trans, &op->k_i)); + if (ret) + goto err; + } else { + bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); + + ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_logged_op_update(trans, &op->k_i)); + } + + fallthrough; +case LOGGED_OP_FINSERT_shift_extents: + while (1) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete, *copy; + struct bkey_s_c k; + struct bpos src_pos = POS(inum.inum, src_offset); + u32 snapshot; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto btree_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); + + k = insert + ? bch2_btree_iter_peek_prev(&iter) + : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + if ((ret = bkey_err(k))) + goto btree_err; + + if (!k.k || + k.k->p.inode != inum.inum || + bkey_le(k.k->p, POS(inum.inum, src_offset))) + break; + + copy = bch2_bkey_make_mut_noupdate(trans, k); + if ((ret = PTR_ERR_OR_ZERO(copy))) + goto btree_err; + + if (insert && + bkey_lt(bkey_start_pos(k.k), src_pos)) { + bch2_cut_front(src_pos, copy); + + /* Splitting compressed extent? */ + bch2_disk_reservation_add(c, &disk_res, + copy->k.size * + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)), + BCH_DISK_RESERVATION_NOFAIL); + } + + bkey_init(&delete.k); + delete.k.p = copy->k.p; + delete.k.p.snapshot = snapshot; + delete.k.size = copy->k.size; + + copy->k.p.offset += shift; + copy->k.p.snapshot = snapshot; + + op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); + + ret = bch2_bkey_set_needs_rebalance(c, copy, + opts.background_target, + opts.background_compression) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: + bch2_logged_op_update(trans, &op->k_i) ?: + bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc); +btree_err: + bch2_disk_reservation_put(c, &disk_res); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + + pos = le64_to_cpu(op->v.pos); + } + + op->v.state = LOGGED_OP_FINSERT_finish; + + if (!insert) { + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + adjust_i_size(trans, inum, src_offset, shift) ?: + bch2_logged_op_update(trans, &op->k_i)); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + adjust_i_size(trans, inum, 0, 0) ?: + bch2_logged_op_update(trans, &op->k_i)); + } + + break; +case LOGGED_OP_FINSERT_finish: + break; + } +err: + bch2_logged_op_finish(trans, op_k); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k) +{ + return __bch2_resume_logged_op_finsert(trans, op_k, NULL); +} + +int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 len, bool insert, + s64 *i_sectors_delta) +{ + struct bkey_i_logged_op_finsert op; + s64 shift = insert ? len : -len; + + bkey_logged_op_finsert_init(&op.k_i); + op.v.subvol = cpu_to_le32(inum.subvol); + op.v.inum = cpu_to_le64(inum.inum); + op.v.dst_offset = cpu_to_le64(offset + shift); + op.v.src_offset = cpu_to_le64(offset); + op.v.pos = cpu_to_le64(insert ? U64_MAX : offset); + + /* + * Logged ops aren't atomic w.r.t. snapshot creation: creating a + * snapshot while they're in progress, then crashing, will result in the + * resume only proceeding in one of the snapshots + */ + down_read(&c->snapshot_create_lock); + int ret = bch2_trans_run(c, + bch2_logged_op_start(trans, &op.k_i) ?: + __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta)); + up_read(&c->snapshot_create_lock); + + return ret; +} diff --git a/libbcachefs/io_misc.h b/libbcachefs/io_misc.h new file mode 100644 index 0000000..9cb44a7 --- /dev/null +++ b/libbcachefs/io_misc.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_MISC_H +#define _BCACHEFS_IO_MISC_H + +int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, + u64, struct bch_io_opts, s64 *, + struct write_point_specifier); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + +void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) { \ + .val_to_text = bch2_logged_op_truncate_to_text, \ + .min_val_size = 24, \ +}) + +int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *); + +int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *); + +void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) { \ + .val_to_text = bch2_logged_op_finsert_to_text, \ + .min_val_size = 24, \ +}) + +int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *); + +int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *); + +#endif /* _BCACHEFS_IO_MISC_H */ diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c new file mode 100644 index 0000000..b833409 --- /dev/null +++ b/libbcachefs/io_read.c @@ -0,0 +1,1210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "data_update.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io_read.h" +#include "io_misc.h" +#include "io_write.h" +#include "subvolume.h" +#include "trace.h" + +#include + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + +/* Cache promotion on read */ + +struct promote_op { + struct rcu_head rcu; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + + struct data_update write; + struct bio_vec bi_inline_vecs[0]; /* must be last */ +}; + +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) +{ + BUG_ON(!opts.promote_target); + + if (!(flags & BCH_READ_MAY_PROMOTE)) + return -BCH_ERR_nopromote_may_not; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return -BCH_ERR_nopromote_already_promoted; + + if (bkey_extent_is_unwritten(k)) + return -BCH_ERR_nopromote_unwritten; + + if (bch2_target_congested(c, opts.promote_target)) + return -BCH_ERR_nopromote_congested; + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return -BCH_ERR_nopromote_in_flight; + + return 0; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + bch2_data_update_exit(&op->write); + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); +} + +static void promote_done(struct bch_write_op *wop) +{ + struct promote_op *op = + container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bio *bio = &op->write.op.wbio.bio; + + trace_and_count(op->write.op.c, read_promote, &rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_data_update_read_done(&op->write, rbio->pick.crc); +} + +static struct promote_op *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned sectors, + struct bch_read_bio **rbio) +{ + struct bch_fs *c = trans->c; + struct promote_op *op = NULL; + struct bio *bio; + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: + */ + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOFS); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOFS)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + + ret = bch2_data_update_init(trans, NULL, &op->write, + writepoint_hashed((unsigned long) current), + opts, + (struct data_update_opts) { + .target = opts.promote_target, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, + }, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ + if (ret) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + + op->write.op.end_io = promote_done; + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return NULL; +} + +noinline +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) +{ + struct bch_fs *c = trans->c; + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ + unsigned sectors = promote_full + ? max(pick->crc.compressed_size, pick->crc.live_size) + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + int ret; + + ret = should_promote(c, k, pos, opts, flags); + if (ret) + goto nopromote; + + promote = __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); + if (!promote) { + ret = -BCH_ERR_nopromote_enomem; + goto nopromote; + } + + *bounce = true; + *read_full = promote_full; + return promote; +nopromote: + trace_read_nopromote(c, ret); + return NULL; +} + +/* Read */ + +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) +{ + return rbio->split ? rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, + struct workqueue_struct *wq) +{ + if (context <= rbio->context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->context = context; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + BUG_ON(rbio->bounce && !rbio->split); + + if (rbio->promote) + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; +} + +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); + bio_endio(&rbio->bio); +} + +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + bch2_bkey_buf_init(&sk); + + bch2_trans_iter_init(trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (bkey_err(k)) + goto err; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(trans); + + if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, + rbio->data_pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; +out: + bch2_rbio_done(rbio); + bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; + struct bch_io_failures failed = { .nr = 0 }; + + trace_and_count(c, read_retry, &rbio->bio); + + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + + rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + + flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; + + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inum, &failed, flags); + } +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + struct bch_extent_crc_unpacked new_crc; + struct btree_iter iter; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; + + if (crc_is_compressed(rbio->pick.crc)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if ((ret = bkey_err(k))) + goto out; + + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) + goto out; + + /* Extent was merged? */ + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) + goto out; + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; + goto out; + } + + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_rbio_narrow_crcs(trans, rbio)); +} + +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; + int ret; + + nofs_flags = memalloc_nofs_save(); + + /* Reset iterator for checksumming and copying bounced data: */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; + } else { + src->bi_iter = rbio->bvec_iter; + } + + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + goto csum_err; + + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. + */ + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; + + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); + + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + + if (rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } +nodecode: + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +out: + memalloc_nofs_restore(nofs_flags); + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } + + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; +decompression_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +decrypt_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +} + +static void bch2_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } + + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } + + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { + trace_and_count(c, read_reuse_race, &rbio->bio); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + return; + } + + if (rbio->narrow_crcs || + rbio->promote || + crc_is_compressed(rbio->pick.crc) || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; + + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); +} + +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_buf *orig_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), 0); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_offset_ratelimited(trans->c, + orig_k->k->k.p.inode, + orig_k->k->k.p.offset << 9, + "%llu len %u points to nonexistent indirect extent %llu", + orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); + bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + PTR_BUCKET_POS(c, &ptr), + BTREE_ITER_CACHED); + + prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + printbuf_indent_add(&buf, 2); + prt_newline(&buf); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + + bch2_fs_inconsistent(c, "%s", buf.buf); + + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); +} + +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; + + if (bkey_extent_is_inline_data(k.k)) { + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_inline_data_bytes(k.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } +retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "no device to read from"); + goto err; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + /* + * Stale dirty pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && + unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_size = pick.crc.compressed_size << 9; + goto get_bio; + } + + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || + (flags & BCH_READ_MUST_BOUNCE)))) { + read_full = true; + bounce = true; + } + + if (orig->opts.promote_target) + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); + + if (!read_full) { + EBUG_ON(crc_is_compressed(pick.crc)); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || + offset_into_extent)); + + data_pos.offset += offset_into_extent; + pick.ptr.offset += pick.crc.offset + + offset_into_extent; + offset_into_extent = 0; + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + } +get_bio: + if (rbio) { + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + EBUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + + rbio = rbio_init(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), + orig->opts); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; + rbio->split = true; + } else if (flags & BCH_READ_MUST_CLONE) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't + * work, when it reports the error to its parent (us) we don't + * know if the error was from our bio, and we should retry, or + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), + orig->opts); + rbio->bio.bi_iter = iter; + rbio->split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + + rbio->c = c; + rbio->submit_time = local_clock(); + if (rbio->split) + rbio->parent = orig; + else + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; + rbio->flags = flags; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->version; + rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + + if (rbio->bounce) + trace_and_count(c, read_bounce, &rbio->bio); + + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); + trace_and_count(c, read_split, &orig->bio); + } + + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, + read_pos.offset << 9, + "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (unlikely(c->opts.no_data_io)) { + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(trans, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { + return 0; + } else { + int ret; + + rbio->context = RBIO_CONTEXT_UNBOUND; + bch2_read_endio(&rbio->bio); + + ret = rbio->retry; + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + bch2_mark_io_failure(failed, &pick); + ret = READ_RETRY; + } + + if (!ret) + goto out_read_done; + + return ret; + } + +err: + if (flags & BCH_READ_IN_RETRY) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; + +hole: + /* + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); +out_read_done: + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; +} + +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + u32 snapshot; + int ret; + + BUG_ON(flags & BCH_READ_NODECODE); + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == READ_RETRY || + ret == READ_RETRY_AVOID) + goto retry; + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + + if (ret) { + bch_err_inum_offset_ratelimited(c, inum.inum, + bvec_iter.bi_sector << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } +} + +void bch2_fs_io_read_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_read_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_init; + + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_split_init; + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) + return -BCH_ERR_ENOMEM_promote_table_init; + + return 0; +} diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h new file mode 100644 index 0000000..d9c18bb --- /dev/null +++ b/libbcachefs/io_read.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_READ_H +#define _BCACHEFS_IO_READ_H + +#include "bkey_buf.h" + +struct bch_read_bio { + struct bch_fs *c; + u64 start_time; + u64 submit_time; + + /* + * Reads will often have to be split, and if the extent being read from + * was checksummed or compressed we'll also have to allocate bounce + * buffers and copy the data back into the original bio. + * + * If we didn't have to split, we have to save and restore the original + * bi_end_io - @split below indicates which: + */ + union { + struct bch_read_bio *parent; + bio_end_io_t *end_io; + }; + + /* + * Saved copy of bio->bi_iter, from submission time - allows us to + * resubmit on IO error, and also to copy data back to the original bio + * when we're bouncing: + */ + struct bvec_iter bvec_iter; + + unsigned offset_into_extent; + + u16 flags; + union { + struct { + u16 bounce:1, + split:1, + kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, + retry:2, + context:2; + }; + u16 _state; + }; + + struct bch_devs_list devs_have; + + struct extent_ptr_decoded pick; + + /* + * pos we read from - different from data_pos for indirect extents: + */ + u32 subvol; + struct bpos read_pos; + + /* + * start pos of data we read (may not be pos of data we want) - for + * promote, narrow extents paths: + */ + enum btree_id data_btree; + struct bpos data_pos; + struct bversion version; + + struct promote_op *promote; + + struct bch_io_opts opts; + + struct work_struct work; + + struct bio bio; +}; + +#define to_rbio(_bio) container_of((_bio), struct bch_read_bio, bio) + +struct bch_devs_mask; +struct cache_promote_op; +struct extent_ptr_decoded; + +int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_buf *); + +static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, + unsigned *offset_into_extent, + struct bkey_buf *k) +{ + if (k->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; + return __bch2_read_indirect_extent(trans, offset_into_extent, k); +} + +enum bch_read_flags { + BCH_READ_RETRY_IF_STALE = 1 << 0, + BCH_READ_MAY_PROMOTE = 1 << 1, + BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, + + /* internal: */ + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, +}; + +int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + +static inline void bch2_read_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) +{ + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags); +} + +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); + +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum) +{ + struct bch_io_failures failed = { .nr = 0 }; + + BUG_ON(rbio->_state); + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} + +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; + return rbio; +} + +void bch2_fs_io_read_exit(struct bch_fs *); +int bch2_fs_io_read_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_READ_H */ diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c new file mode 100644 index 0000000..75376f0 --- /dev/null +++ b/libbcachefs/io_write.c @@ -0,0 +1,1675 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "debug.h" +#include "ec.h" +#include "error.h" +#include "extent_update.h" +#include "inode.h" +#include "io_write.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "nocow_locking.h" +#include "rebalance.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include +#include + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) +{ + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) +{ + atomic64_t *latency = &ca->cur_latency[rw]; + u64 now = local_clock(); + u64 io_latency = time_after64(now, submit_time) + ? now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); + + do { + old = v; + + /* + * If the io latency was reasonably close to the current + * latency, skip doing the update and atomic operation - most of + * the time: + */ + if (abs((int) (old - io_latency)) < (old >> 1) && + now & ~(~0U << 5)) + break; + + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); +} + +#endif + +/* Allocate, free from mempool: */ + +void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) +{ + struct bvec_iter_all iter; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, iter) + if (bv->bv_page != ZERO_PAGE(0)) + mempool_free(bv->bv_page, &c->bio_bounce_pages); + bio->bi_vcnt = 0; +} + +static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) +{ + struct page *page; + + if (likely(!*using_mempool)) { + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) { + mutex_lock(&c->bio_bounce_pages_lock); + *using_mempool = true; + goto pool_alloc; + + } + } else { +pool_alloc: + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); + } + + return page; +} + +void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + size_t size) +{ + bool using_mempool = false; + + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; + } + + if (using_mempool) + mutex_unlock(&c->bio_bounce_pages_lock); +} + +/* Extent update path: */ + +int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *usage_increasing, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); + int ret = 0; + + *usage_increasing = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; + + bch2_trans_copy_iter(&iter, extent_iter); + + for_each_btree_key_upto_continue_norestart(iter, + new->k.p, BTREE_ITER_SLOTS, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); + + *i_sectors_delta += sectors * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; + + if (!*usage_increasing && + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + + if (bkey_ge(old.k->p, new->k.p)) + break; + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + struct btree_iter *extent_iter, + u64 new_i_size, + s64 i_sectors_delta) +{ + struct btree_iter iter; + struct bkey_i *k; + struct bkey_i_inode_v3 *inode; + /* + * Crazy performance optimization: + * Every extent update needs to also update the inode: the inode trigger + * will set bi->journal_seq to the journal sequence number of this + * transaction - for fsync. + * + * But if that's the only reason we're updating the inode (we're not + * updating bi_size or bi_sectors), then we don't need the inode update + * to be journalled - if we crash, the bi_journal_seq update will be + * lost, but that's fine. + */ + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_CACHED); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { + k = bch2_inode_to_v3(trans, k); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + goto err; + } + + inode = bkey_i_to_inode_v3(k); + + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) && + new_i_size > le64_to_cpu(inode->v.bi_size)) { + inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; + } + + if (i_sectors_delta) { + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } + + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } + + ret = bch2_trans_update(trans, &iter, &inode->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + inode_update_flags); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 new_i_size, + s64 *i_sectors_delta_total, + bool check_enospc) +{ + struct bpos next_pos; + bool usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + ret = bch2_extent_trim_atomic(trans, iter, k); + if (ret) + return ret; + + next_pos = k->k.p; + + ret = bch2_sum_sector_overwrites(trans, iter, k, + &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + return ret; + + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + !check_enospc || !usage_increasing + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + + /* + * Note: + * We always have to do an inode update - even when i_size/i_sectors + * aren't changing - for fsync to work properly; fsync relies on + * inode->bi_journal_seq which is updated by the trigger code: + */ + ret = bch2_extent_update_i_size_sectors(trans, iter, + min(k->k.p.offset << 9, new_i_size), + i_sectors_delta) ?: + bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc); + if (unlikely(ret)) + return ret; + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); + return 0; +} + +static int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct bkey_buf sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; + int ret; + + BUG_ON(!inum.subvol); + + bch2_bkey_buf_init(&sk); + + do { + bch2_trans_begin(trans); + + k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, + &sk.k->k.p.snapshot); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = bch2_bkey_set_needs_rebalance(c, sk.k, + op->opts.background_target, + op->opts.background_compression) ?: + bch2_extent_update(trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_ge(iter.pos, k->k.p)) + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} + +/* Writes */ + +void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + enum bch_data_type type, + const struct bkey_i *k, + bool nocow) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + const struct bch_extent_ptr *ptr; + struct bch_write_bio *n; + struct bch_dev *ca; + + BUG_ON(c->opts.nochanges); + + bkey_for_each_ptr(ptrs, ptr) { + BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || + !c->devs[ptr->dev]); + + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (to_entry(ptr + 1) < ptrs.end) { + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, + GFP_NOFS, &ca->replica_set)); + + n->bio.bi_end_io = wbio->bio.bi_end_io; + n->bio.bi_private = wbio->bio.bi_private; + n->parent = wbio; + n->split = true; + n->bounce = false; + n->put_bio = true; + n->bio.bi_opf = wbio->bio.bi_opf; + bio_inc_remaining(&wbio->bio); + } else { + n = wbio; + n->split = false; + } + + n->c = c; + n->dev = ptr->dev; + n->have_ioref = nocow || bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? READ : WRITE); + n->nocow = nocow; + n->submit_time = local_clock(); + n->inode_offset = bkey_start_offset(&k->k); + n->bio.bi_iter.bi_sector = ptr->offset; + + if (likely(n->have_ioref)) { + this_cpu_add(ca->io_done->sectors[WRITE][type], + bio_sectors(&n->bio)); + + bio_set_dev(&n->bio, ca->disk_sb.bdev); + + if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { + bio_endio(&n->bio); + continue; + } + + submit_bio(&n->bio); + } else { + n->bio.bi_status = BLK_STS_REMOVED; + bio_endio(&n->bio); + } + } +} + +static void __bch2_write(struct bch_write_op *); + +static void bch2_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + + EBUG_ON(op->open_buckets.nr); + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_disk_reservation_put(c, &op->res); + + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + if (op->end_io) + op->end_io(op); +} + +static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) +{ + struct keylist *keys = &op->insert_keys; + struct bch_extent_ptr *ptr; + struct bkey_i *src, *dst = keys->keys, *n; + + for (src = keys->keys; src != keys->top; src = n) { + n = bkey_next(src); + + if (bkey_extent_is_direct_data(&src->k)) { + bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, + test_bit(ptr->dev, op->failed.d)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) + return -EIO; + } + + if (dst != src) + memmove_u64s_down(dst, src, src->k.u64s); + dst = bkey_next(dst); + } + + keys->top = dst; + return 0; +} + +/** + * __bch2_write_index - after a write, update index to point to new data + * @op: bch_write_op to process + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + unsigned dev; + int ret = 0; + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; + } + + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + + ret = !(op->flags & BCH_WRITE_MOVE) + ? bch2_write_index_default(op) + : bch2_data_update_index_update(op); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + BUG_ON(keylist_sectors(keys) && !ret); + + op->written += sectors_start - keylist_sectors(keys); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + insert->k.p.inode, insert->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) + goto err; + } +out: + /* If some a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + + bch2_open_buckets_put(c, &op->open_buckets); + return; +err: + keys->top = keys->keys; + op->error = ret; + op->flags |= BCH_WRITE_DONE; + goto out; +} + +static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) +{ + if (state != wp->state) { + u64 now = ktime_get_ns(); + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) + wp->time[wp->state] += now - wp->last_state_change; + wp->state = state; + wp->last_state_change = now; + } +} + +static inline void wp_update_state(struct write_point *wp, bool running) +{ + enum write_point_state state; + + state = running ? WRITE_POINT_running : + !list_empty(&wp->writes) ? WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + + __wp_update_state(wp, state); +} + +static void bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct write_point *wp = op->wp; + struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; + + if ((op->flags & BCH_WRITE_DONE) && + (op->flags & BCH_WRITE_MOVE)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + + spin_lock_irqsave(&wp->writes_lock, flags); + if (wp->state == WRITE_POINT_waiting_io) + __wp_update_state(wp, WRITE_POINT_waiting_work); + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock_irqrestore (&wp->writes_lock, flags); + + queue_work(wq, &wp->index_update_work); +} + +static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) +{ + op->wp = wp; + + if (wp->state == WRITE_POINT_stopped) { + spin_lock_irq(&wp->writes_lock); + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock_irq(&wp->writes_lock); + } +} + +void bch2_write_point_do_index_updates(struct work_struct *work) +{ + struct write_point *wp = + container_of(work, struct write_point, index_update_work); + struct bch_write_op *op; + + while (1) { + spin_lock_irq(&wp->writes_lock); + op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); + if (op) + list_del(&op->wp_list); + wp_update_state(wp, op != NULL); + spin_unlock_irq(&wp->writes_lock); + + if (!op) + break; + + op->flags |= BCH_WRITE_IN_WORKER; + + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + __bch2_write(op); + else + bch2_write_done(&op->cl); + } +} + +static void bch2_write_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + set_bit(wbio->dev, op->failed.d); + op->flags |= BCH_WRITE_IO_ERROR; + } + + if (wbio->nocow) + set_bit(wbio->dev, op->devs_need_flush->d); + + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); + percpu_ref_put(&ca->io_ref); + } + + if (wbio->bounce) + bch2_bio_free_pages_pool(c, bio); + + if (wbio->put_bio) + bio_put(bio); + + if (parent) + bio_endio(&parent->bio); + else + closure_put(cl); +} + +static void init_append_extent(struct bch_write_op *op, + struct write_point *wp, + struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + struct bkey_i_extent *e; + + op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; + + if (crc.csum_type || + crc.compression_type || + crc.nonce) + bch2_extent_crc_append(&e->k_i, crc); + + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_CACHED); + + bch2_keylist_push(&op->insert_keys); +} + +static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + struct write_point *wp, + struct bio *src, + bool *page_alloc_failed, + void *buf) +{ + struct bch_write_bio *wbio; + struct bio *bio; + unsigned output_available = + min(wp->sectors_free << 9, src->bi_iter.bi_size); + unsigned pages = DIV_ROUND_UP(output_available + + (buf + ? ((unsigned long) buf & (PAGE_SIZE - 1)) + : 0), PAGE_SIZE); + + pages = min(pages, BIO_MAX_VECS); + + bio = bio_alloc_bioset(NULL, pages, 0, + GFP_NOFS, &c->bio_write); + wbio = wbio_init(bio); + wbio->put_bio = true; + /* copy WRITE_SYNC flag */ + wbio->bio.bi_opf = src->bi_opf; + + if (buf) { + bch2_bio_map(bio, buf, output_available); + return bio; + } + + wbio->bounce = true; + + /* + * We can't use mempool for more than c->sb.encoded_extent_max + * worth of pages, but we'd like to allocate more if we can: + */ + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, + c->opts.encoded_extent_max)); + + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = + bch2_bio_alloc_pages(bio, + output_available - + bio->bi_iter.bi_size, + GFP_NOFS) != 0; + + return bio; +} + +static int bch2_write_rechecksum(struct bch_fs *c, + struct bch_write_op *op, + unsigned new_csum_type) +{ + struct bio *bio = &op->wbio.bio; + struct bch_extent_crc_unpacked new_crc; + int ret; + + /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ + + if (bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)) + new_csum_type = op->crc.csum_type; + + ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); + if (ret) + return ret; + + bio_advance(bio, op->crc.offset << 9); + bio->bi_iter.bi_size = op->crc.live_size << 9; + op->crc = new_crc; + return 0; +} + +static int bch2_write_decrypt(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; + int ret; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; + + /* + * If we need to decrypt data in the write path, we'll no longer be able + * to verify the existing checksum (poly1305 mac, in this case) after + * it's decrypted - this is the last point we'll be able to reverify the + * checksum: + */ + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + return -EIO; + + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + return ret; +} + +static enum prep_encoded_ret { + PREP_ENCODED_OK, + PREP_ENCODED_ERR, + PREP_ENCODED_CHECKSUM_ERR, + PREP_ENCODED_DO_WRITE, +} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; + + if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + return PREP_ENCODED_OK; + + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); + + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 && + op->crc.compressed_size <= wp->sectors_free && + (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && + op->csum_type != op->crc.csum_type && + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_DO_WRITE; + } + + /* + * If the data is compressed and we couldn't write the entire extent as + * is, we have to decompress it: + */ + if (crc_is_compressed(op->crc)) { + struct bch_csum csum; + + if (bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + /* Last point we can still verify checksum: */ + csum = bch2_checksum_bio(c, op->crc.csum_type, + extent_nonce(op->version, op->crc), + bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + return PREP_ENCODED_ERR; + } + + /* + * No longer have compressed data after this point - data might be + * encrypted: + */ + + /* + * If the data is checksummed and we're only writing a subset, + * rechecksum and adjust bio to point to currently live data: + */ + if ((op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) && + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + + /* + * If we want to compress the data, it has to be decrypted: + */ + if ((op->compression_opt || + bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(op->csum_type)) && + bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_OK; +} + +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio **_dst) +{ + struct bch_fs *c = op->c; + struct bio *src = &op->wbio.bio, *dst = src; + struct bvec_iter saved_iter; + void *ec_buf; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; + int ret, more = 0; + + BUG_ON(!bio_sectors(src)); + + ec_buf = bch2_writepoint_ec_buf(c, wp); + + switch (bch2_write_prep_encoded_data(op, wp)) { + case PREP_ENCODED_OK: + break; + case PREP_ENCODED_ERR: + ret = -EIO; + goto err; + case PREP_ENCODED_CHECKSUM_ERR: + goto csum_err; + case PREP_ENCODED_DO_WRITE: + /* XXX look for bug here */ + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; + } + + if (ec_buf || + op->compression_opt || + (op->csum_type && + !(op->flags & BCH_WRITE_PAGES_STABLE)) || + (bch2_csum_type_is_encryption(op->csum_type) && + !(op->flags & BCH_WRITE_PAGES_OWNED))) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } + + saved_iter = dst->bi_iter; + + do { + struct bch_extent_crc_unpacked crc = { 0 }; + struct bversion version = op->version; + size_t dst_len = 0, src_len = 0; + + if (page_alloc_failed && + dst->bi_iter.bi_size < (wp->sectors_free << 9) && + dst->bi_iter.bi_size < c->opts.encoded_extent_max) + break; + + BUG_ON(op->compression_opt && + (op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_opt && !bounce); + + crc.compression_type = op->incompressible + ? BCH_COMPRESSION_TYPE_incompressible + : op->compression_opt + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_opt) + : 0; + if (!crc_is_compressed(crc)) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, + c->opts.encoded_extent_max); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); + bio_copy_data(dst, src); + swap(dst->bi_iter.bi_size, dst_len); + } + + src_len = dst_len; + } + + BUG_ON(!src_len || !dst_len); + + if (bch2_csum_type_is_encryption(op->csum_type)) { + if (bversion_zero(version)) { + version.lo = atomic64_inc_return(&c->key_version); + } else { + crc.nonce = op->nonce; + op->nonce += src_len >> 9; + } + } + + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { + u8 compression_type = crc.compression_type; + u16 nonce = crc.nonce; + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our + * existing checksum covers - if we bounced (because we + * were trying to compress), @dst will only have the + * part of the data the new checksum will cover. + * + * But normally we want to be checksumming post bounce, + * because part of the reason for bouncing is so the + * data can't be modified (by userspace) while it's in + * flight. + */ + if (bch2_rechecksum_bio(c, src, version, op->crc, + &crc, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; + /* + * rchecksum_bio sets compression_type on crc from op->crc, + * this isn't always correct as sometimes we're changing + * an extent from uncompressed to incompressible. + */ + crc.compression_type = compression_type; + crc.nonce = nonce; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->crc.csum_type)) + goto csum_err; + + crc.compressed_size = dst_len >> 9; + crc.uncompressed_size = src_len >> 9; + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); + ret = bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + if (ret) + goto err; + + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; + swap(dst->bi_iter.bi_size, dst_len); + } + + init_append_extent(op, wp, version, crc); + + if (dst != src) + bio_advance(dst, dst_len); + bio_advance(src, src_len); + total_output += dst_len; + total_input += src_len; + } while (dst->bi_iter.bi_size && + src->bi_iter.bi_size && + wp->sectors_free && + !bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)); + + more = src->bi_iter.bi_size != 0; + + dst->bi_iter = saved_iter; + + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, + GFP_NOFS, &c->bio_write); + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; + } + + dst->bi_iter.bi_size = total_output; +do_write: + *_dst = dst; + return more; +csum_err: + bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + ret = -EIO; +err: + if (to_wbio(dst)->bounce) + bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) + bio_put(dst); + + return ret; +} + +static bool bch2_extent_is_writeable(struct bch_write_op *op, + struct bkey_s_c k) +{ + struct bch_fs *c = op->c; + struct bkey_s_c_extent e; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + unsigned replicas = 0; + + if (k.k->type != KEY_TYPE_extent) + return false; + + e = bkey_s_c_to_extent(k); + extent_for_each_ptr_decode(e, p, entry) { + if (crc_is_encoded(p.crc) || p.has_ec) + return false; + + replicas += bch2_extent_ptr_durability(c, &p); + } + + return replicas >= op->opts.data_replicas; +} + +static inline void bch2_nocow_write_unlock(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + const struct bch_extent_ptr *ptr; + struct bkey_i *k; + + for_each_keylist_key(&op->insert_keys, k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); + } +} + +static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *orig, + struct bkey_s_c k, + u64 new_i_size) +{ + struct bkey_i *new; + struct bkey_ptrs ptrs; + struct bch_extent_ptr *ptr; + int ret; + + if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { + /* trace this */ + return 0; + } + + new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bch2_cut_front(bkey_start_pos(&orig->k), new); + bch2_cut_back(orig->k.p, new); + + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) + ptr->unwritten = 0; + + /* + * Note that we're not calling bch2_subvol_get_snapshot() in this path - + * that was done when we kicked off the write, and here it's important + * that we update the extent that we wrote to - even if a snapshot has + * since been created. The write is still outstanding, so we're ok + * w.r.t. snapshot atomicity: + */ + return bch2_extent_update_i_size_sectors(trans, iter, + min(new->k.p.offset << 9, new_i_size), 0) ?: + bch2_trans_update(trans, iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_i *orig; + struct bkey_s_c k; + int ret; + + for_each_keylist_key(&op->insert_keys, orig) { + ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_INTENT, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); + })); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + insert->k.p.inode, insert->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) { + op->error = ret; + break; + } + } + + bch2_trans_put(trans); +} + +static void __bch2_nocow_write_done(struct bch_write_op *op) +{ + bch2_nocow_write_unlock(op); + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + op->error = -EIO; + } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + bch2_nocow_write_convert_unwritten(op); +} + +static void bch2_nocow_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + + __bch2_nocow_write_done(op); + bch2_write_done(cl); +} + +static void bch2_nocow_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + struct { + struct bpos b; + unsigned gen; + struct nocow_lock_bucket *l; + } buckets[BCH_REPLICAS_MAX]; + unsigned nr_buckets = 0; + u32 snapshot; + int ret, i; + + if (op->flags & BCH_WRITE_MOVE) + return; + + trans = bch2_trans_get(c); +retry: + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot); + if (unlikely(ret)) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(op->pos.inode, op->pos.offset, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bio *bio = &op->wbio.bio; + + nr_buckets = 0; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + /* fall back to normal cow write path? */ + if (unlikely(k.k->p.snapshot != snapshot || + !bch2_extent_is_writeable(op, k))) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + k.k->u64s)) + break; + + /* Get iorefs before dropping btree locks: */ + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); + buckets[nr_buckets].gen = ptr->gen; + buckets[nr_buckets].l = + bucket_nocow_lock(&c->nocow_locks, + bucket_to_u64(buckets[nr_buckets].b)); + + prefetch(buckets[nr_buckets].l); + + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) + goto err_get_ioref; + + nr_buckets++; + + if (ptr->unwritten) + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + } + + /* Unlock before taking nocow locks, doing IO: */ + bkey_reassemble(op->insert_keys.top, k); + bch2_trans_unlock(trans); + + bch2_cut_front(op->pos, op->insert_keys.top); + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + for (i = 0; i < nr_buckets; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); + struct nocow_lock_bucket *l = buckets[i].l; + bool stale; + + __bch2_bucket_nocow_lock(&c->nocow_locks, l, + bucket_to_u64(buckets[i].b), + BUCKET_NOCOW_LOCK_UPDATE); + + rcu_read_lock(); + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + rcu_read_unlock(); + + if (unlikely(stale)) + goto err_bucket_stale; + } + + bio = &op->wbio.bio; + if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { + bio = bio_split(bio, k.k->p.offset - op->pos.offset, + GFP_KERNEL, &c->bio_write); + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { + op->flags |= BCH_WRITE_DONE; + } + + op->pos.offset += bio_sectors(bio); + op->written += bio_sectors(bio); + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); + if (op->flags & BCH_WRITE_DONE) + break; + bch2_btree_iter_advance(&iter); + } +out: + bch2_trans_iter_exit(trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s: btree lookup error %s", + __func__, bch2_err_str(ret)); + op->error = ret; + op->flags |= BCH_WRITE_DONE; + } + + bch2_trans_put(trans); + + /* fallback to cow write path? */ + if (!(op->flags & BCH_WRITE_DONE)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; + } else if (op->flags & BCH_WRITE_SYNC) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl); + } else { + /* + * XXX + * needs to run out of process context because ei_quota_lock is + * a mutex + */ + continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); + } + return; +err_get_ioref: + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* Fall back to COW path: */ + goto out; +err_bucket_stale: + while (i >= 0) { + bch2_bucket_nocow_unlock(&c->nocow_locks, + buckets[i].b, + BUCKET_NOCOW_LOCK_UPDATE); + --i; + } + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + goto out; +} + +static void __bch2_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct write_point *wp = NULL; + struct bio *bio = NULL; + unsigned nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { + bch2_nocow_write(op); + if (op->flags & BCH_WRITE_DONE) + goto out_nofs_restore; + } +again: + memset(&op->failed, 0, sizeof(op->failed)); + + do { + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + + /* +1 for possible cache device: */ + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)) + break; + + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_alloc_sectors_start_trans(trans, + op->target, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, + op->nr_replicas_required, + op->watermark, + op->flags, + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ? NULL : &op->cl, &wp)); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; + + goto err; + } + + EBUG_ON(!wp); + + bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); + + bch2_alloc_sectors_done_inlined(c, wp); +err: + if (ret <= 0) { + op->flags |= BCH_WRITE_DONE; + + if (ret < 0) { + op->error = ret; + break; + } + } + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + + closure_get(bio->bi_private); + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + key_to_write, false); + } while (ret); + + /* + * Sync or no? + * + * If we're running asynchronously, wne may still want to block + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. + */ + if ((op->flags & BCH_WRITE_SYNC) || + (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER))) { + closure_sync(&op->cl); + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + goto again; + bch2_write_done(&op->cl); + } else { + bch2_write_queue(op, wp); + continue_at(&op->cl, bch2_write_index, NULL); + } +out_nofs_restore: + memalloc_nofs_restore(nofs_flags); +} + +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; + unsigned sectors; + int ret; + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); + if (ret) { + op->error = ret; + goto err; + } + + sectors = bio_sectors(bio); + op->pos.offset += sectors; + + id = bkey_inline_data_init(op->insert_keys.top); + id->k.p = op->pos; + id->k.version = op->version; + id->k.size = sectors; + + iter = bio->bi_iter; + iter.bi_size = data_len; + memcpy_from_bio(id->v.data, bio, iter); + + while (data_len & 7) + id->v.data[data_len++] = '\0'; + set_bkey_val_bytes(&id->k, data_len); + bch2_keylist_push(&op->insert_keys); + + __bch2_write_index(op); +err: + bch2_write_done(&op->cl); +} + +/** + * bch2_write() - handle a write to a cache device or flash only volume + * @cl: &bch_write_op->cl + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data won't fit in a single open bucket, there will be multiple keys); + * after the data is written it calls bch_journal, and after the keys have been + * added to the next journal write they're inserted into the btree. + * + * If op->discard is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. + */ +void bch2_write(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->wbio.bio; + struct bch_fs *c = op->c; + unsigned data_len; + + EBUG_ON(op->cl.parent); + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(bkey_eq(op->pos, POS_MAX)); + + op->start_time = local_clock(); + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + + if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "misaligned write"); + op->error = -EIO; + goto err; + } + + if (c->opts.nochanges) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + if (!(op->flags & BCH_WRITE_MOVE) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + + if (c->opts.inline_data && + data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } + + __bch2_write(op); + return; +err: + bch2_disk_reservation_put(c, &op->res); + + closure_debug_destroy(&op->cl); + if (op->end_io) + op->end_io(op); +} + +static const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x + NULL +}; + +void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) +{ + prt_str(out, "pos: "); + bch2_bpos_to_text(out, op->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "started: "); + bch2_pr_time_units(out, local_clock() - op->start_time); + prt_newline(out); + + prt_str(out, "flags: "); + prt_bitflags(out, bch2_write_flags, op->flags); + prt_newline(out); + + prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +void bch2_fs_io_write_exit(struct bch_fs *c) +{ + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); +} + +int bch2_fs_io_write_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_write_init; + + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + + return 0; +} diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h new file mode 100644 index 0000000..9323167 --- /dev/null +++ b/libbcachefs/io_write.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_WRITE_H +#define _BCACHEFS_IO_WRITE_H + +#include "checksum.h" +#include "io_write_types.h" + +#define to_wbio(_bio) \ + container_of((_bio), struct bch_write_bio, bio) + +void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *, bool); + +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->watermark == BCH_WATERMARK_copygc + ? op->c->copygc_wq + : op->c->btree_update_wq; +} + +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, s64 *, s64 *); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64, s64 *, bool); + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ + op->c = c; + op->end_io = NULL; + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c, opts); + op->compression_opt = opts.compression; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->watermark = BCH_WATERMARK_normal; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; + op->subvol = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; + op->devs_need_flush = NULL; +} + +void bch2_write(struct closure *); + +void bch2_write_point_do_index_updates(struct work_struct *); + +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + + memset(&wbio->wbio, 0, sizeof(wbio->wbio)); + return wbio; +} + +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + +void bch2_fs_io_write_exit(struct bch_fs *); +int bch2_fs_io_write_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_WRITE_H */ diff --git a/libbcachefs/io_write_types.h b/libbcachefs/io_write_types.h new file mode 100644 index 0000000..c7f97c2 --- /dev/null +++ b/libbcachefs/io_write_types.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_WRITE_TYPES_H +#define _BCACHEFS_IO_WRITE_TYPES_H + +#include "alloc_types.h" +#include "btree_types.h" +#include "buckets_types.h" +#include "extents_types.h" +#include "keylist_types.h" +#include "opts.h" +#include "super_types.h" + +#include +#include + +struct bch_write_bio { + struct_group(wbio, + struct bch_fs *c; + struct bch_write_bio *parent; + + u64 submit_time; + u64 inode_offset; + + struct bch_devs_list failed; + u8 dev; + + unsigned split:1, + bounce:1, + put_bio:1, + have_ioref:1, + nocow:1, + used_mempool:1, + first_btree_write:1; + ); + + struct bio bio; +}; + +struct bch_write_op { + struct closure cl; + struct bch_fs *c; + void (*end_io)(struct bch_write_op *); + u64 start_time; + + unsigned written; /* sectors */ + u16 flags; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ + + unsigned compression_opt:8; + unsigned csum_type:4; + unsigned nr_replicas:4; + unsigned nr_replicas_required:4; + unsigned watermark:3; + unsigned incompressible:1; + unsigned stripe_waited:1; + + struct bch_devs_list devs_have; + u16 target; + u16 nonce; + struct bch_io_opts opts; + + u32 subvol; + struct bpos pos; + struct bversion version; + + /* For BCH_WRITE_DATA_ENCODED: */ + struct bch_extent_crc_unpacked crc; + + struct write_point_specifier write_point; + + struct write_point *wp; + struct list_head wp_list; + + struct disk_reservation res; + + struct open_buckets open_buckets; + + u64 new_i_size; + s64 i_sectors_delta; + + struct bch_devs_mask failed; + + struct keylist insert_keys; + u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; + + /* + * Bitmask of devices that have had nocow writes issued to them since + * last flush: + */ + struct bch_devs_mask *devs_need_flush; + + /* Must be last: */ + struct bch_write_bio wbio; +}; + +#endif /* _BCACHEFS_IO_WRITE_TYPES_H */ diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c new file mode 100644 index 0000000..9a76a9a --- /dev/null +++ b/libbcachefs/logged_ops.c @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "error.h" +#include "io_misc.h" +#include "logged_ops.h" +#include "super.h" + +struct bch_logged_op_fn { + u8 type; + int (*resume)(struct btree_trans *, struct bkey_i *); +}; + +static const struct bch_logged_op_fn logged_op_fns[] = { +#define x(n) { \ + .type = KEY_TYPE_logged_op_##n, \ + .resume = bch2_resume_logged_op_##n, \ +}, + BCH_LOGGED_OPS() +#undef x +}; + +static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type) +{ + for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++) + if (logged_op_fns[i].type == type) + return logged_op_fns + i; + return NULL; +} + +static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type); + struct bkey_buf sk; + u32 restart_count = trans->restart_count; + int ret; + + if (!fn) + return 0; + + bch2_bkey_buf_init(&sk); + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?: + fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count); + + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +int bch2_resume_logged_ops(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, + BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k, + resume_logged_op(trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +{ + struct btree_iter iter; + int ret; + + ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + if (ret) + return ret; + + k->k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +{ + return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_logged_op_start(trans, k)); +} + +void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) +{ + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0)); + /* + * This needs to be a fatal error because we've left an unfinished + * operation in the logged ops btree. + * + * We should only ever see an error here if the filesystem has already + * been shut down, but make sure of that here: + */ + if (ret) { + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s", + __func__, buf.buf, bch2_err_str(ret)); + printbuf_exit(&buf); + } +} diff --git a/libbcachefs/logged_ops.h b/libbcachefs/logged_ops.h new file mode 100644 index 0000000..4d1e786 --- /dev/null +++ b/libbcachefs/logged_ops.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LOGGED_OPS_H +#define _BCACHEFS_LOGGED_OPS_H + +#include "bkey.h" + +#define BCH_LOGGED_OPS() \ + x(truncate) \ + x(finsert) + +static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) +{ + return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); +} + +int bch2_resume_logged_ops(struct bch_fs *); +int bch2_logged_op_start(struct btree_trans *, struct bkey_i *); +void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *); + +#endif /* _BCACHEFS_LOGGED_OPS_H */ diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c new file mode 100644 index 0000000..e151ada --- /dev/null +++ b/libbcachefs/sb-clean.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "error.h" +#include "journal_io.h" +#include "replicas.h" +#include "sb-clean.h" +#include "super-io.h" + +/* + * BCH_SB_FIELD_clean: + * + * Btree roots, and a few other things, are recovered from the journal after an + * unclean shutdown - but after a clean shutdown, to avoid having to read the + * journal, we can store them in the superblock. + * + * bch_sb_field_clean simply contains a list of journal entries, stored exactly + * as they would be in the journal: + */ + +int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, + int write) +{ + struct jset_entry *entry; + int ret; + + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = bch2_journal_entry_validate(c, NULL, entry, + le16_to_cpu(c->disk_sb.sb->version), + BCH_SB_BIG_ENDIAN(c->disk_sb.sb), + write); + if (ret) + return ret; + } + + return 0; +} + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +int bch2_verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + sb_clean_journal_seq_mismatch, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + if (k1) + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); + else + prt_printf(&buf1, "(none)"); + + if (k2) + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); + else + prt_printf(&buf2, "(none)"); + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(&k1->k)) || + l1 != l2, c, + sb_clean_btree_root_mismatch, + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, + l1, buf1.buf, + l2, buf2.buf); + } +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); + + if (fsck_err_on(!sb_clean, c, + sb_clean_missing, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); + } + + ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. + */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) +{ + struct bch_dev *ca; + unsigned i, dev; + + percpu_down_read(&c->mark_lock); + + if (!journal_seq) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_inodes; + u->v = cpu_to_le64(c->usage_base->nr_inodes); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_key_version; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + } + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_reserved; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + struct jset_entry_data_usage *u = + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_data_usage; + u->v = cpu_to_le64(c->usage_base->replicas[i]); + unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), + "embedded variable length struct"); + } + + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + + for (i = 0; i < BCH_DATA_NR; i++) { + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } + } + + percpu_up_read(&c->mark_lock); + + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); + } +} + +static int bch2_sb_clean_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -BCH_ERR_invalid_sb_clean; + } + + return 0; +} + +static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + struct jset_entry *entry; + + prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); + prt_newline(out); + prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + prt_newline(out); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + + bch2_journal_entry_to_text(out, NULL, entry); + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_clean = { + .validate = bch2_sb_clean_validate, + .to_text = bch2_sb_clean_to_text, +}; + +int bch2_fs_mark_dirty(struct bch_fs *c) +{ + int ret; + + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_sb_maybe_downgrade(c); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +void bch2_fs_mark_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *sb_clean; + struct jset_entry *entry; + unsigned u64s; + int ret; + + mutex_lock(&c->sb_lock); + if (BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; + + sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting filesystem clean"); + goto out; + } + + sb_clean->flags = 0; + sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); + + entry = sb_clean->start; + bch2_journal_super_entries_add_common(c, &entry, 0); + entry = bch2_btree_roots_to_journal_entries(c, entry, 0); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + + /* + * this should be in the write path, and we should be validating every + * superblock section: + */ + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; + } + + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); +} diff --git a/libbcachefs/sb-clean.h b/libbcachefs/sb-clean.h new file mode 100644 index 0000000..71caef2 --- /dev/null +++ b/libbcachefs/sb-clean.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_CLEAN_H +#define _BCACHEFS_SB_CLEAN_H + +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); +int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, + struct jset *); +struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); +void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); + +extern const struct bch_sb_field_ops bch_sb_field_ops_clean; + +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + +#endif /* _BCACHEFS_SB_CLEAN_H */ diff --git a/libbcachefs/sb-errors.c b/libbcachefs/sb-errors.c new file mode 100644 index 0000000..f0930ab --- /dev/null +++ b/libbcachefs/sb-errors.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "sb-errors.h" +#include "super-io.h" + +static const char * const bch2_sb_error_strs[] = { +#define x(t, n, ...) [n] = #t, + BCH_SB_ERRS() + NULL +}; + +static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id) +{ + if (id < BCH_SB_ERR_MAX) + prt_str(out, bch2_sb_error_strs[id]); + else + prt_printf(out, "(unknown error %u)", id); +} + +static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e) +{ + return e + ? (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0]) + : 0; +} + +static inline unsigned bch2_sb_field_errors_u64s(unsigned nr) +{ + return (sizeof(struct bch_sb_field_errors) + + sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64); +} + +static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_errors *e = field_to_type(f, errors); + unsigned i, nr = bch2_sb_field_errors_nr_entries(e); + + for (i = 0; i < nr; i++) { + if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) { + prt_printf(err, "entry with count 0 (id "); + bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); + prt_printf(err, ")"); + return -BCH_ERR_invalid_sb_errors; + } + + if (i + 1 < nr && + BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >= + BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) { + prt_printf(err, "entries out of order"); + return -BCH_ERR_invalid_sb_errors; + } + } + + return 0; +} + +static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_errors *e = field_to_type(f, errors); + unsigned i, nr = bch2_sb_field_errors_nr_entries(e); + + if (out->nr_tabstops <= 1) + printbuf_tabstop_push(out, 16); + + for (i = 0; i < nr; i++) { + bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i])); + prt_tab(out); + prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i])); + prt_tab(out); + bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time)); + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_errors = { + .validate = bch2_sb_errors_validate, + .to_text = bch2_sb_errors_to_text, +}; + +void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) +{ + bch_sb_errors_cpu *e = &c->fsck_error_counts; + struct bch_sb_error_entry_cpu n = { + .id = err, + .nr = 1, + .last_error_time = ktime_get_real_seconds() + }; + unsigned i; + + mutex_lock(&c->fsck_error_counts_lock); + for (i = 0; i < e->nr; i++) { + if (err == e->data[i].id) { + e->data[i].nr++; + e->data[i].last_error_time = n.last_error_time; + goto out; + } + if (err < e->data[i].id) + break; + } + + if (darray_make_room(e, 1)) + goto out; + + darray_insert_item(e, i, n); +out: + mutex_unlock(&c->fsck_error_counts_lock); +} + +void bch2_sb_errors_from_cpu(struct bch_fs *c) +{ + bch_sb_errors_cpu *src = &c->fsck_error_counts; + struct bch_sb_field_errors *dst = + bch2_sb_field_resize(&c->disk_sb, errors, + bch2_sb_field_errors_u64s(src->nr)); + unsigned i; + + if (!dst) + return; + + for (i = 0; i < src->nr; i++) { + SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); + SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); + dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); + } +} + +static int bch2_sb_errors_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors); + bch_sb_errors_cpu *dst = &c->fsck_error_counts; + unsigned i, nr = bch2_sb_field_errors_nr_entries(src); + int ret; + + if (!nr) + return 0; + + mutex_lock(&c->fsck_error_counts_lock); + ret = darray_make_room(dst, nr); + if (ret) + goto err; + + dst->nr = nr; + + for (i = 0; i < nr; i++) { + dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); + dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); + dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); + } +err: + mutex_unlock(&c->fsck_error_counts_lock); + + return ret; +} + +void bch2_fs_sb_errors_exit(struct bch_fs *c) +{ + darray_exit(&c->fsck_error_counts); +} + +void bch2_fs_sb_errors_init_early(struct bch_fs *c) +{ + mutex_init(&c->fsck_error_counts_lock); + darray_init(&c->fsck_error_counts); +} + +int bch2_fs_sb_errors_init(struct bch_fs *c) +{ + return bch2_sb_errors_to_cpu(c); +} diff --git a/libbcachefs/sb-errors.h b/libbcachefs/sb-errors.h new file mode 100644 index 0000000..5a09a53 --- /dev/null +++ b/libbcachefs/sb-errors.h @@ -0,0 +1,270 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_H +#define _BCACHEFS_SB_ERRORS_H + +#include "sb-errors_types.h" + +#define BCH_SB_ERRS() \ + x(clean_but_journal_not_empty, 0) \ + x(dirty_but_no_journal_entries, 1) \ + x(dirty_but_no_journal_entries_post_drop_nonflushes, 2) \ + x(sb_clean_journal_seq_mismatch, 3) \ + x(sb_clean_btree_root_mismatch, 4) \ + x(sb_clean_missing, 5) \ + x(jset_unsupported_version, 6) \ + x(jset_unknown_csum, 7) \ + x(jset_last_seq_newer_than_seq, 8) \ + x(jset_past_bucket_end, 9) \ + x(jset_seq_blacklisted, 10) \ + x(journal_entries_missing, 11) \ + x(journal_entry_replicas_not_marked, 12) \ + x(journal_entry_past_jset_end, 13) \ + x(journal_entry_replicas_data_mismatch, 14) \ + x(journal_entry_bkey_u64s_0, 15) \ + x(journal_entry_bkey_past_end, 16) \ + x(journal_entry_bkey_bad_format, 17) \ + x(journal_entry_bkey_invalid, 18) \ + x(journal_entry_btree_root_bad_size, 19) \ + x(journal_entry_blacklist_bad_size, 20) \ + x(journal_entry_blacklist_v2_bad_size, 21) \ + x(journal_entry_blacklist_v2_start_past_end, 22) \ + x(journal_entry_usage_bad_size, 23) \ + x(journal_entry_data_usage_bad_size, 24) \ + x(journal_entry_clock_bad_size, 25) \ + x(journal_entry_clock_bad_rw, 26) \ + x(journal_entry_dev_usage_bad_size, 27) \ + x(journal_entry_dev_usage_bad_dev, 28) \ + x(journal_entry_dev_usage_bad_pad, 29) \ + x(btree_node_unreadable, 30) \ + x(btree_node_fault_injected, 31) \ + x(btree_node_bad_magic, 32) \ + x(btree_node_bad_seq, 33) \ + x(btree_node_unsupported_version, 34) \ + x(btree_node_bset_older_than_sb_min, 35) \ + x(btree_node_bset_newer_than_sb, 36) \ + x(btree_node_data_missing, 37) \ + x(btree_node_bset_after_end, 38) \ + x(btree_node_replicas_sectors_written_mismatch, 39) \ + x(btree_node_replicas_data_mismatch, 40) \ + x(bset_unknown_csum, 41) \ + x(bset_bad_csum, 42) \ + x(bset_past_end_of_btree_node, 43) \ + x(bset_wrong_sector_offset, 44) \ + x(bset_empty, 45) \ + x(bset_bad_seq, 46) \ + x(bset_blacklisted_journal_seq, 47) \ + x(first_bset_blacklisted_journal_seq, 48) \ + x(btree_node_bad_btree, 49) \ + x(btree_node_bad_level, 50) \ + x(btree_node_bad_min_key, 51) \ + x(btree_node_bad_max_key, 52) \ + x(btree_node_bad_format, 53) \ + x(btree_node_bkey_past_bset_end, 54) \ + x(btree_node_bkey_bad_format, 55) \ + x(btree_node_bad_bkey, 56) \ + x(btree_node_bkey_out_of_order, 57) \ + x(btree_root_bkey_invalid, 58) \ + x(btree_root_read_error, 59) \ + x(btree_root_bad_min_key, 50) \ + x(btree_root_bad_max_key, 61) \ + x(btree_node_read_error, 62) \ + x(btree_node_topology_bad_min_key, 63) \ + x(btree_node_topology_bad_max_key, 64) \ + x(btree_node_topology_overwritten_by_prev_node, 65) \ + x(btree_node_topology_overwritten_by_next_node, 66) \ + x(btree_node_topology_interior_node_empty, 67) \ + x(fs_usage_hidden_wrong, 68) \ + x(fs_usage_btree_wrong, 69) \ + x(fs_usage_data_wrong, 70) \ + x(fs_usage_cached_wrong, 71) \ + x(fs_usage_reserved_wrong, 72) \ + x(fs_usage_persistent_reserved_wrong, 73) \ + x(fs_usage_nr_inodes_wrong, 74) \ + x(fs_usage_replicas_wrong, 75) \ + x(dev_usage_buckets_wrong, 76) \ + x(dev_usage_sectors_wrong, 77) \ + x(dev_usage_fragmented_wrong, 78) \ + x(dev_usage_buckets_ec_wrong, 79) \ + x(bkey_version_in_future, 80) \ + x(bkey_u64s_too_small, 81) \ + x(bkey_invalid_type_for_btree, 82) \ + x(bkey_extent_size_zero, 83) \ + x(bkey_extent_size_greater_than_offset, 84) \ + x(bkey_size_nonzero, 85) \ + x(bkey_snapshot_nonzero, 86) \ + x(bkey_snapshot_zero, 87) \ + x(bkey_at_pos_max, 88) \ + x(bkey_before_start_of_btree_node, 89) \ + x(bkey_after_end_of_btree_node, 90) \ + x(bkey_val_size_nonzero, 91) \ + x(bkey_val_size_too_small, 92) \ + x(alloc_v1_val_size_bad, 93) \ + x(alloc_v2_unpack_error, 94) \ + x(alloc_v3_unpack_error, 95) \ + x(alloc_v4_val_size_bad, 96) \ + x(alloc_v4_backpointers_start_bad, 97) \ + x(alloc_key_data_type_bad, 98) \ + x(alloc_key_empty_but_have_data, 99) \ + x(alloc_key_dirty_sectors_0, 100) \ + x(alloc_key_data_type_inconsistency, 101) \ + x(alloc_key_to_missing_dev_bucket, 102) \ + x(alloc_key_cached_inconsistency, 103) \ + x(alloc_key_cached_but_read_time_zero, 104) \ + x(alloc_key_to_missing_lru_entry, 105) \ + x(alloc_key_data_type_wrong, 106) \ + x(alloc_key_gen_wrong, 107) \ + x(alloc_key_dirty_sectors_wrong, 108) \ + x(alloc_key_cached_sectors_wrong, 109) \ + x(alloc_key_stripe_wrong, 110) \ + x(alloc_key_stripe_redundancy_wrong, 111) \ + x(bucket_sector_count_overflow, 112) \ + x(bucket_metadata_type_mismatch, 113) \ + x(need_discard_key_wrong, 114) \ + x(freespace_key_wrong, 115) \ + x(freespace_hole_missing, 116) \ + x(bucket_gens_val_size_bad, 117) \ + x(bucket_gens_key_wrong, 118) \ + x(bucket_gens_hole_wrong, 119) \ + x(bucket_gens_to_invalid_dev, 120) \ + x(bucket_gens_to_invalid_buckets, 121) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123) \ + x(need_discard_freespace_key_bad, 124) \ + x(backpointer_pos_wrong, 125) \ + x(backpointer_to_missing_device, 126) \ + x(backpointer_to_missing_alloc, 127) \ + x(backpointer_to_missing_ptr, 128) \ + x(lru_entry_at_time_0, 129) \ + x(lru_entry_to_invalid_bucket, 130) \ + x(lru_entry_bad, 131) \ + x(btree_ptr_val_too_big, 132) \ + x(btree_ptr_v2_val_too_big, 133) \ + x(btree_ptr_has_non_ptr, 134) \ + x(extent_ptrs_invalid_entry, 135) \ + x(extent_ptrs_no_ptrs, 136) \ + x(extent_ptrs_too_many_ptrs, 137) \ + x(extent_ptrs_redundant_crc, 138) \ + x(extent_ptrs_redundant_stripe, 139) \ + x(extent_ptrs_unwritten, 140) \ + x(extent_ptrs_written_and_unwritten, 141) \ + x(ptr_to_invalid_device, 142) \ + x(ptr_to_duplicate_device, 143) \ + x(ptr_after_last_bucket, 144) \ + x(ptr_before_first_bucket, 145) \ + x(ptr_spans_multiple_buckets, 146) \ + x(ptr_to_missing_backpointer, 147) \ + x(ptr_to_missing_alloc_key, 148) \ + x(ptr_to_missing_replicas_entry, 149) \ + x(ptr_to_missing_stripe, 150) \ + x(ptr_to_incorrect_stripe, 151) \ + x(ptr_gen_newer_than_bucket_gen, 152) \ + x(ptr_too_stale, 153) \ + x(stale_dirty_ptr, 154) \ + x(ptr_bucket_data_type_mismatch, 155) \ + x(ptr_cached_and_erasure_coded, 156) \ + x(ptr_crc_uncompressed_size_too_small, 157) \ + x(ptr_crc_csum_type_unknown, 158) \ + x(ptr_crc_compression_type_unknown, 159) \ + x(ptr_crc_redundant, 160) \ + x(ptr_crc_uncompressed_size_too_big, 161) \ + x(ptr_crc_nonce_mismatch, 162) \ + x(ptr_stripe_redundant, 163) \ + x(reservation_key_nr_replicas_invalid, 164) \ + x(reflink_v_refcount_wrong, 165) \ + x(reflink_p_to_missing_reflink_v, 166) \ + x(stripe_pos_bad, 167) \ + x(stripe_val_size_bad, 168) \ + x(stripe_sector_count_wrong, 169) \ + x(snapshot_tree_pos_bad, 170) \ + x(snapshot_tree_to_missing_snapshot, 171) \ + x(snapshot_tree_to_missing_subvol, 172) \ + x(snapshot_tree_to_wrong_subvol, 173) \ + x(snapshot_tree_to_snapshot_subvol, 174) \ + x(snapshot_pos_bad, 175) \ + x(snapshot_parent_bad, 176) \ + x(snapshot_children_not_normalized, 177) \ + x(snapshot_child_duplicate, 178) \ + x(snapshot_child_bad, 179) \ + x(snapshot_skiplist_not_normalized, 180) \ + x(snapshot_skiplist_bad, 181) \ + x(snapshot_should_not_have_subvol, 182) \ + x(snapshot_to_bad_snapshot_tree, 183) \ + x(snapshot_bad_depth, 184) \ + x(snapshot_bad_skiplist, 185) \ + x(subvol_pos_bad, 186) \ + x(subvol_not_master_and_not_snapshot, 187) \ + x(subvol_to_missing_root, 188) \ + x(subvol_root_wrong_bi_subvol, 189) \ + x(bkey_in_missing_snapshot, 190) \ + x(inode_pos_inode_nonzero, 191) \ + x(inode_pos_blockdev_range, 192) \ + x(inode_unpack_error, 193) \ + x(inode_str_hash_invalid, 194) \ + x(inode_v3_fields_start_bad, 195) \ + x(inode_snapshot_mismatch, 196) \ + x(inode_unlinked_but_clean, 197) \ + x(inode_unlinked_but_nlink_nonzero, 198) \ + x(inode_checksum_type_invalid, 199) \ + x(inode_compression_type_invalid, 200) \ + x(inode_subvol_root_but_not_dir, 201) \ + x(inode_i_size_dirty_but_clean, 202) \ + x(inode_i_sectors_dirty_but_clean, 203) \ + x(inode_i_sectors_wrong, 204) \ + x(inode_dir_wrong_nlink, 205) \ + x(inode_dir_multiple_links, 206) \ + x(inode_multiple_links_but_nlink_0, 207) \ + x(inode_wrong_backpointer, 208) \ + x(inode_wrong_nlink, 209) \ + x(inode_unreachable, 210) \ + x(deleted_inode_but_clean, 211) \ + x(deleted_inode_missing, 212) \ + x(deleted_inode_is_dir, 213) \ + x(deleted_inode_not_unlinked, 214) \ + x(extent_overlapping, 215) \ + x(extent_in_missing_inode, 216) \ + x(extent_in_non_reg_inode, 217) \ + x(extent_past_end_of_inode, 218) \ + x(dirent_empty_name, 219) \ + x(dirent_val_too_big, 220) \ + x(dirent_name_too_long, 221) \ + x(dirent_name_embedded_nul, 222) \ + x(dirent_name_dot_or_dotdot, 223) \ + x(dirent_name_has_slash, 224) \ + x(dirent_d_type_wrong, 225) \ + x(dirent_d_parent_subvol_wrong, 226) \ + x(dirent_in_missing_dir_inode, 227) \ + x(dirent_in_non_dir_inode, 228) \ + x(dirent_to_missing_inode, 229) \ + x(dirent_to_missing_subvol, 230) \ + x(dirent_to_itself, 231) \ + x(quota_type_invalid, 232) \ + x(xattr_val_size_too_small, 233) \ + x(xattr_val_size_too_big, 234) \ + x(xattr_invalid_type, 235) \ + x(xattr_name_invalid_chars, 236) \ + x(xattr_in_missing_inode, 237) \ + x(root_subvol_missing, 238) \ + x(root_dir_missing, 239) \ + x(root_inode_not_dir, 240) \ + x(dir_loop, 241) \ + x(hash_table_key_duplicate, 242) \ + x(hash_table_key_wrong_offset, 243) + +enum bch_sb_error_id { +#define x(t, n) BCH_FSCK_ERR_##t = n, + BCH_SB_ERRS() +#undef x + BCH_SB_ERR_MAX +}; + +extern const struct bch_sb_field_ops bch_sb_field_ops_errors; + +void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id); + +void bch2_sb_errors_from_cpu(struct bch_fs *); + +void bch2_fs_sb_errors_exit(struct bch_fs *); +void bch2_fs_sb_errors_init_early(struct bch_fs *); +int bch2_fs_sb_errors_init(struct bch_fs *); + +#endif /* _BCACHEFS_SB_ERRORS_H */ diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h new file mode 100644 index 0000000..b1c0998 --- /dev/null +++ b/libbcachefs/sb-errors_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_ERRORS_TYPES_H +#define _BCACHEFS_SB_ERRORS_TYPES_H + +#include "darray.h" + +struct bch_sb_error_entry_cpu { + u64 id:16, + nr:48; + u64 last_error_time; +}; + +typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu; + +#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */ + diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c new file mode 100644 index 0000000..bed0f85 --- /dev/null +++ b/libbcachefs/sb-members.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "disk_groups.h" +#include "opts.h" +#include "replicas.h" +#include "sb-members.h" +#include "super-io.h" + +#define x(t, n, ...) [n] = #t, +static const char * const bch2_iops_measurements[] = { + BCH_IOPS_MEASUREMENTS() + NULL +}; + +char * const bch2_member_error_strs[] = { + BCH_MEMBER_ERROR_TYPES() + NULL +}; +#undef x + +/* Code for bch_sb_field_members_v1: */ + +struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) +{ + return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); +} + +static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) +{ + struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); + return ret; +} + +static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) +{ + return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); +} + +static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i) +{ + struct bch_member ret, *p = members_v1_get_mut(mi, i); + memset(&ret, 0, sizeof(ret)); + memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); + return ret; +} + +struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) +{ + struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2); + if (mi2) + return members_v2_get(mi2, i); + struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1); + return members_v1_get(mi1, i); +} + +static int sb_members_v2_resize_entries(struct bch_fs *c) +{ + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + + if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) { + unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) * + c->disk_sb.sb->nr_devices), 8); + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); + if (!mi) + return -BCH_ERR_ENOSPC_sb_members_v2; + + for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { + void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); + memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes)); + memset(dst + le16_to_cpu(mi->member_bytes), + 0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes))); + } + mi->member_bytes = cpu_to_le16(sizeof(struct bch_member)); + } + return 0; +} + +int bch2_sb_members_v2_init(struct bch_fs *c) +{ + struct bch_sb_field_members_v1 *mi1; + struct bch_sb_field_members_v2 *mi2; + + if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) { + mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2, + DIV_ROUND_UP(sizeof(*mi2) + + sizeof(struct bch_member) * c->sb.nr_devices, + sizeof(u64))); + mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1); + memcpy(&mi2->_members[0], &mi1->_members[0], + BCH_MEMBER_V1_BYTES * c->sb.nr_devices); + memset(&mi2->pad[0], 0, sizeof(mi2->pad)); + mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES); + } + + return sb_members_v2_resize_entries(c); +} + +int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) +{ + struct bch_sb_field_members_v1 *mi1; + struct bch_sb_field_members_v2 *mi2; + + mi1 = bch2_sb_field_resize(disk_sb, members_v1, + DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * + disk_sb->sb->nr_devices, sizeof(u64))); + if (!mi1) + return -BCH_ERR_ENOSPC_sb_members; + + mi2 = bch2_sb_field_get(disk_sb->sb, members_v2); + + for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++) + memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES); + + return 0; +} + +static int validate_member(struct printbuf *err, + struct bch_member m, + struct bch_sb *sb, + int i) +{ + if (le64_to_cpu(m.nbuckets) > LONG_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m.nbuckets), LONG_MAX); + return -BCH_ERR_invalid_sb_members; + } + + if (le64_to_cpu(m.nbuckets) - + le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) { + prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", + i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m.bucket_size) < + le16_to_cpu(sb->block_size)) { + prt_printf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size)); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m.bucket_size) < + BCH_SB_BTREE_NODE_SIZE(sb)) { + prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", + i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -BCH_ERR_invalid_sb_members; + } + + return 0; +} + +static void member_to_text(struct printbuf *out, + struct bch_member m, + struct bch_sb_field_disk_groups *gi, + struct bch_sb *sb, + int i) +{ + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m.bucket_size); + u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; + + if (!bch2_member_exists(&m)) + return; + + prt_printf(out, "Device:"); + prt_tab(out); + prt_printf(out, "%u", i); + prt_newline(out); + + printbuf_indent_add(out, 2); + + prt_printf(out, "Label:"); + prt_tab(out); + if (BCH_MEMBER_GROUP(&m)) { + unsigned idx = BCH_MEMBER_GROUP(&m) - 1; + + if (idx < disk_groups_nr(gi)) + prt_printf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + prt_printf(out, "(bad disk labels section)"); + } else { + prt_printf(out, "(none)"); + } + prt_newline(out); + + prt_printf(out, "UUID:"); + prt_tab(out); + pr_uuid(out, m.uuid.b); + prt_newline(out); + + prt_printf(out, "Size:"); + prt_tab(out); + prt_units_u64(out, device_size << 9); + prt_newline(out); + + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s errors:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, le64_to_cpu(m.errors[i])); + prt_newline(out); + } + + for (unsigned i = 0; i < BCH_IOPS_NR; i++) { + prt_printf(out, "%s iops:", bch2_iops_measurements[i]); + prt_tab(out); + prt_printf(out, "%u", le32_to_cpu(m.iops[i])); + prt_newline(out); + } + + prt_printf(out, "Bucket size:"); + prt_tab(out); + prt_units_u64(out, bucket_size << 9); + prt_newline(out); + + prt_printf(out, "First bucket:"); + prt_tab(out); + prt_printf(out, "%u", le16_to_cpu(m.first_bucket)); + prt_newline(out); + + prt_printf(out, "Buckets:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(m.nbuckets)); + prt_newline(out); + + prt_printf(out, "Last mount:"); + prt_tab(out); + if (m.last_mount) + bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); + + prt_printf(out, "State:"); + prt_tab(out); + prt_printf(out, "%s", + BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR + ? bch2_member_states[BCH_MEMBER_STATE(&m)] + : "unknown"); + prt_newline(out); + + prt_printf(out, "Data allowed:"); + prt_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(&m)) + prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Has data:"); + prt_tab(out); + if (data_have) + prt_bitflags(out, bch2_data_types, data_have); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Discard:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m)); + prt_newline(out); + + prt_printf(out, "Freespace initialized:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static int bch2_sb_members_v1_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); + unsigned i; + + if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) { + prt_printf(err, "too many devices for section size"); + return -BCH_ERR_invalid_sb_members; + } + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member m = members_v1_get(mi, i); + + int ret = validate_member(err, m, sb, i); + if (ret) + return ret; + } + + return 0; +} + +static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) + member_to_text(out, members_v1_get(mi, i), gi, sb, i); +} + +const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { + .validate = bch2_sb_members_v1_validate, + .to_text = bch2_sb_members_v1_to_text, +}; + +static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) + member_to_text(out, members_v2_get(mi, i), gi, sb, i); +} + +static int bch2_sb_members_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); + size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) - + (void *) mi; + + if (mi_bytes > vstruct_bytes(&mi->field)) { + prt_printf(err, "section too small (%zu > %zu)", + mi_bytes, vstruct_bytes(&mi->field)); + return -BCH_ERR_invalid_sb_members; + } + + for (unsigned i = 0; i < sb->nr_devices; i++) { + int ret = validate_member(err, members_v2_get(mi, i), sb, i); + if (ret) + return ret; + } + + return 0; +} + +const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = { + .validate = bch2_sb_members_v2_validate, + .to_text = bch2_sb_members_v2_to_text, +}; + +void bch2_sb_members_from_cpu(struct bch_fs *c) +{ + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + struct bch_dev *ca; + unsigned i, e; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + struct bch_member *m = __bch2_members_v2_get_mut(mi, i); + + for (e = 0; e < BCH_MEMBER_ERROR_NR; e++) + m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); + } + rcu_read_unlock(); +} + +void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_member m; + + mutex_lock(&ca->fs->sb_lock); + m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + mutex_unlock(&ca->fs->sb_lock); + + printbuf_tabstop_push(out, 12); + + prt_str(out, "IO errors since filesystem creation"); + prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, atomic64_read(&ca->errors[i])); + prt_newline(out); + } + printbuf_indent_sub(out, 2); + + prt_str(out, "IO errors since "); + bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC); + prt_str(out, " ago"); + prt_newline(out); + + printbuf_indent_add(out, 2); + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) { + prt_printf(out, "%s:", bch2_member_error_strs[i]); + prt_tab(out); + prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i])); + prt_newline(out); + } + printbuf_indent_sub(out, 2); +} + +void bch2_dev_errors_reset(struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_member *m; + + mutex_lock(&c->sb_lock); + m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) + m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); + m->errors_reset_time = ktime_get_real_seconds(); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h new file mode 100644 index 0000000..03613e3 --- /dev/null +++ b/libbcachefs/sb-members.h @@ -0,0 +1,227 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SB_MEMBERS_H +#define _BCACHEFS_SB_MEMBERS_H + +extern char * const bch2_member_error_strs[]; + +static inline struct bch_member * +__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) +{ + return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); +} + +int bch2_sb_members_v2_init(struct bch_fs *c); +int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); +struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); +struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); + +static inline bool bch2_dev_is_online(struct bch_dev *ca) +{ + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_failed; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; +} + +static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) +{ + return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); +} + +static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs.nr; i++) + if (devs.devs[i] == dev) + return true; + + return false; +} + +static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs->nr; i++) + if (devs->devs[i] == dev) { + array_remove_item(devs->devs, devs->nr, i); + return; + } +} + +static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, + unsigned dev) +{ + if (!bch2_dev_list_has_dev(*devs, dev)) { + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); + devs->devs[devs->nr++] = dev; + } +} + +static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) +{ + return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; +} + +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, + const struct bch_devs_mask *mask) +{ + struct bch_dev *ca = NULL; + + while ((*iter = mask + ? find_next_bit(mask->d, c->sb.nr_devices, *iter) + : *iter) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) + (*iter)++; + + return ca; +} + +#define for_each_member_device_rcu(ca, c, iter, mask) \ + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +{ + struct bch_dev *ca; + + rcu_read_lock(); + if ((ca = __bch2_next_dev(c, iter, NULL))) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + return ca; +} + +/* + * If you break early, you must drop your ref on the current device + */ +#define for_each_member_device(ca, c, iter) \ + for ((iter) = 0; \ + (ca = bch2_get_next_dev(c, &(iter))); \ + percpu_ref_put(&ca->ref), (iter)++) + +static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + unsigned *iter, + int state_mask) +{ + struct bch_dev *ca; + + rcu_read_lock(); + while ((ca = __bch2_next_dev(c, iter, NULL)) && + (!((1 << ca->mi.state) & state_mask) || + !percpu_ref_tryget(&ca->io_ref))) + (*iter)++; + rcu_read_unlock(); + + return ca; +} + +#define __for_each_online_member(ca, c, iter, state_mask) \ + for ((iter) = 0; \ + (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ + percpu_ref_put(&ca->io_ref), (iter)++) + +#define for_each_online_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, ~0) + +#define for_each_rw_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) + +#define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ + (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) + +/* + * If a key exists that references a device, the device won't be going away and + * we can omit rcu_read_lock(): + */ +static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_check(c->devs[idx], 1); +} + +static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_protected(c->devs[idx], + lockdep_is_held(&c->sb_lock) || + lockdep_is_held(&c->state_lock)); +} + +/* XXX kill, move to struct bch_fs */ +static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) +{ + struct bch_devs_mask devs; + struct bch_dev *ca; + unsigned i; + + memset(&devs, 0, sizeof(devs)); + for_each_online_member(ca, c, i) + __set_bit(ca->dev_idx, devs.d); + return devs; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; +extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; + +static inline bool bch2_member_exists(struct bch_member *m) +{ + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); +} + +static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev) +{ + if (dev < sb->nr_devices) { + struct bch_member m = bch2_sb_member_get(sb, dev); + return bch2_member_exists(&m); + } + return false; +} + +static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) +{ + return (struct bch_member_cpu) { + .nbuckets = le64_to_cpu(mi->nbuckets), + .first_bucket = le16_to_cpu(mi->first_bucket), + .bucket_size = le16_to_cpu(mi->bucket_size), + .group = BCH_MEMBER_GROUP(mi), + .state = BCH_MEMBER_STATE(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), + .durability = BCH_MEMBER_DURABILITY(mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .valid = bch2_member_exists(mi), + }; +} + +void bch2_sb_members_from_cpu(struct bch_fs *); + +void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); +void bch2_dev_errors_reset(struct bch_dev *); + +#endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/libbcachefs/six.c b/libbcachefs/six.c new file mode 100644 index 0000000..f37d5ad --- /dev/null +++ b/libbcachefs/six.c @@ -0,0 +1,867 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "six.h" + +#ifdef DEBUG +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) do {} while (0) +#endif + +#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) +#define six_release(l, ip) lock_release(l, ip) + +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); + +#define SIX_LOCK_HELD_read_OFFSET 0 +#define SIX_LOCK_HELD_read ~(~0U << 26) +#define SIX_LOCK_HELD_intent (1U << 26) +#define SIX_LOCK_HELD_write (1U << 27) +#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) +#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) +#define SIX_LOCK_NOSPIN (1U << 31) + +struct six_lock_vals { + /* Value we add to the lock in order to take the lock: */ + u32 lock_val; + + /* If the lock has this value (used as a mask), taking the lock fails: */ + u32 lock_fail; + + /* Mask that indicates lock is held for this type: */ + u32 held_mask; + + /* Waitlist we wakeup when releasing the lock: */ + enum six_lock_type unlock_wakeup; +}; + +static const struct six_lock_vals l[] = { + [SIX_LOCK_read] = { + .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, + .lock_fail = SIX_LOCK_HELD_write, + .held_mask = SIX_LOCK_HELD_read, + .unlock_wakeup = SIX_LOCK_write, + }, + [SIX_LOCK_intent] = { + .lock_val = SIX_LOCK_HELD_intent, + .lock_fail = SIX_LOCK_HELD_intent, + .held_mask = SIX_LOCK_HELD_intent, + .unlock_wakeup = SIX_LOCK_intent, + }, + [SIX_LOCK_write] = { + .lock_val = SIX_LOCK_HELD_write, + .lock_fail = SIX_LOCK_HELD_read, + .held_mask = SIX_LOCK_HELD_write, + .unlock_wakeup = SIX_LOCK_read, + }, +}; + +static inline void six_set_bitmask(struct six_lock *lock, u32 mask) +{ + if ((atomic_read(&lock->state) & mask) != mask) + atomic_or(mask, &lock->state); +} + +static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) +{ + if (atomic_read(&lock->state) & mask) + atomic_and(~mask, &lock->state); +} + +static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, + u32 old, struct task_struct *owner) +{ + if (type != SIX_LOCK_intent) + return; + + if (!(old & SIX_LOCK_HELD_intent)) { + EBUG_ON(lock->owner); + lock->owner = owner; + } else { + EBUG_ON(lock->owner != current); + } +} + +static inline unsigned pcpu_read_count(struct six_lock *lock) +{ + unsigned read_count = 0; + int cpu; + + for_each_possible_cpu(cpu) + read_count += *per_cpu_ptr(lock->readers, cpu); + return read_count; +} + +/* + * __do_six_trylock() - main trylock routine + * + * Returns 1 on success, 0 on failure + * + * In percpu reader mode, a failed trylock may cause a spurious trylock failure + * for anoter thread taking the competing lock type, and we may havve to do a + * wakeup: when a wakeup is required, we return -1 - wakeup_type. + */ +static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, + struct task_struct *task, bool try) +{ + int ret; + u32 old; + + EBUG_ON(type == SIX_LOCK_write && lock->owner != task); + EBUG_ON(type == SIX_LOCK_write && + (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); + + /* + * Percpu reader mode: + * + * The basic idea behind this algorithm is that you can implement a lock + * between two threads without any atomics, just memory barriers: + * + * For two threads you'll need two variables, one variable for "thread a + * has the lock" and another for "thread b has the lock". + * + * To take the lock, a thread sets its variable indicating that it holds + * the lock, then issues a full memory barrier, then reads from the + * other thread's variable to check if the other thread thinks it has + * the lock. If we raced, we backoff and retry/sleep. + * + * Failure to take the lock may cause a spurious trylock failure in + * another thread, because we temporarily set the lock to indicate that + * we held it. This would be a problem for a thread in six_lock(), when + * they are calling trylock after adding themself to the waitlist and + * prior to sleeping. + * + * Therefore, if we fail to get the lock, and there were waiters of the + * type we conflict with, we will have to issue a wakeup. + * + * Since we may be called under wait_lock (and by the wakeup code + * itself), we return that the wakeup has to be done instead of doing it + * here. + */ + if (type == SIX_LOCK_read && lock->readers) { + preempt_disable(); + this_cpu_inc(*lock->readers); /* signal that we own lock */ + + smp_mb(); + + old = atomic_read(&lock->state); + ret = !(old & l[type].lock_fail); + + this_cpu_sub(*lock->readers, !ret); + preempt_enable(); + + if (!ret) { + smp_mb(); + if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write) + ret = -1 - SIX_LOCK_write; + } + } else if (type == SIX_LOCK_write && lock->readers) { + if (try) { + atomic_add(SIX_LOCK_HELD_write, &lock->state); + smp_mb__after_atomic(); + } + + ret = !pcpu_read_count(lock); + + if (try && !ret) { + old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); + if (old & SIX_LOCK_WAITING_read) + ret = -1 - SIX_LOCK_read; + } + } else { + old = atomic_read(&lock->state); + do { + ret = !(old & l[type].lock_fail); + if (!ret || (type == SIX_LOCK_write && !try)) { + smp_mb(); + break; + } + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); + + EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); + } + + if (ret > 0) + six_set_owner(lock, type, old, task); + + EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && + (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); + + return ret; +} + +static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +{ + struct six_lock_waiter *w, *next; + struct task_struct *task; + bool saw_one; + int ret; +again: + ret = 0; + saw_one = false; + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, &lock->wait_list, list) { + if (w->lock_want != lock_type) + continue; + + if (saw_one && lock_type != SIX_LOCK_read) + goto unlock; + saw_one = true; + + ret = __do_six_trylock(lock, lock_type, w->task, false); + if (ret <= 0) + goto unlock; + + /* + * Similar to percpu_rwsem_wake_function(), we need to guard + * against the wakee noticing w->lock_acquired, returning, and + * then exiting before we do the wakeup: + */ + task = get_task_struct(w->task); + __list_del(w->list.prev, w->list.next); + /* + * The release barrier here ensures the ordering of the + * __list_del before setting w->lock_acquired; @w is on the + * stack of the thread doing the waiting and will be reused + * after it sees w->lock_acquired with no other locking: + * pairs with smp_load_acquire() in six_lock_slowpath() + */ + smp_store_release(&w->lock_acquired, true); + wake_up_process(task); + put_task_struct(task); + } + + six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); +unlock: + raw_spin_unlock(&lock->wait_lock); + + if (ret < 0) { + lock_type = -ret - 1; + goto again; + } +} + +__always_inline +static void six_lock_wakeup(struct six_lock *lock, u32 state, + enum six_lock_type lock_type) +{ + if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) + return; + + if (!(state & (SIX_LOCK_WAITING_read << lock_type))) + return; + + __six_lock_wakeup(lock, lock_type); +} + +__always_inline +static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) +{ + int ret; + + ret = __do_six_trylock(lock, type, current, try); + if (ret < 0) + __six_lock_wakeup(lock, -ret - 1); + + return ret > 0; +} + +/** + * six_trylock_ip - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) +{ + if (!do_six_trylock(lock, type, true)) + return false; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); + return true; +} +EXPORT_SYMBOL_GPL(six_trylock_ip); + +/** + * six_relock_ip - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip) +{ + if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) + return false; + + if (six_lock_seq(lock) != seq) { + six_unlock_ip(lock, type, ip); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(six_relock_ip); + +#ifdef CONFIG_LOCK_SPIN_ON_OWNER + +static inline bool six_owner_running(struct six_lock *lock) +{ + /* + * When there's no owner, we might have preempted between the owner + * acquiring the lock and setting the owner field. If we're an RT task + * that will live-lock because we won't let the owner complete. + */ + rcu_read_lock(); + struct task_struct *owner = READ_ONCE(lock->owner); + bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); + rcu_read_unlock(); + + return ret; +} + +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) +{ + unsigned loop = 0; + u64 end_time; + + if (type == SIX_LOCK_write) + return false; + + if (lock->wait_list.next != &wait->list) + return false; + + if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) + return false; + + preempt_disable(); + end_time = sched_clock() + 10 * NSEC_PER_USEC; + + while (!need_resched() && six_owner_running(lock)) { + /* + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup + */ + if (smp_load_acquire(&wait->lock_acquired)) { + preempt_enable(); + return true; + } + + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); + break; + } + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + + preempt_enable(); + return false; +} + +#else /* CONFIG_LOCK_SPIN_ON_OWNER */ + +static inline bool six_optimistic_spin(struct six_lock *lock, + struct six_lock_waiter *wait, + enum six_lock_type type) +{ + return false; +} + +#endif + +noinline +static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + int ret = 0; + + if (type == SIX_LOCK_write) { + EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + atomic_add(SIX_LOCK_HELD_write, &lock->state); + smp_mb__after_atomic(); + } + + trace_contention_begin(lock, 0); + lock_contended(&lock->dep_map, ip); + + wait->task = current; + wait->lock_want = type; + wait->lock_acquired = false; + + raw_spin_lock(&lock->wait_lock); + six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); + /* + * Retry taking the lock after taking waitlist lock, in case we raced + * with an unlock: + */ + ret = __do_six_trylock(lock, type, current, false); + if (ret <= 0) { + wait->start_time = local_clock(); + + if (!list_empty(&lock->wait_list)) { + struct six_lock_waiter *last = + list_last_entry(&lock->wait_list, + struct six_lock_waiter, list); + + if (time_before_eq64(wait->start_time, last->start_time)) + wait->start_time = last->start_time + 1; + } + + list_add_tail(&wait->list, &lock->wait_list); + } + raw_spin_unlock(&lock->wait_lock); + + if (unlikely(ret > 0)) { + ret = 0; + goto out; + } + + if (unlikely(ret < 0)) { + __six_lock_wakeup(lock, -ret - 1); + ret = 0; + } + + if (six_optimistic_spin(lock, wait, type)) + goto out; + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + /* + * Ensures that writes to the waitlist entry happen after we see + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup + */ + if (smp_load_acquire(&wait->lock_acquired)) + break; + + ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; + if (unlikely(ret)) { + bool acquired; + + /* + * If should_sleep_fn() returns an error, we are + * required to return that error even if we already + * acquired the lock - should_sleep_fn() might have + * modified external state (e.g. when the deadlock cycle + * detector in bcachefs issued a transaction restart) + */ + raw_spin_lock(&lock->wait_lock); + acquired = wait->lock_acquired; + if (!acquired) + list_del(&wait->list); + raw_spin_unlock(&lock->wait_lock); + + if (unlikely(acquired)) + do_six_unlock_type(lock, type); + break; + } + + schedule(); + } + + __set_current_state(TASK_RUNNING); +out: + if (ret && type == SIX_LOCK_write) { + six_clear_bitmask(lock, SIX_LOCK_HELD_write); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); + } + trace_contention_end(lock, 0); + + return ret; +} + +/** + * six_lock_ip_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * This is the most general six_lock() variant, with parameters to support full + * cycle detection for deadlock avoidance. + * + * The code calling this function must implement tracking of held locks, and the + * @wait object should be embedded into the struct that tracks held locks - + * which must also be accessible in a thread-safe way. + * + * @should_sleep_fn should invoke the cycle detector; it should walk each + * lock's waiters, and for each waiter recursively walk their held locks. + * + * When this function must block, @wait will be added to @lock's waitlist before + * calling trylock, and before calling @should_sleep_fn, and @wait will not be + * removed from the lock waitlist until the lock has been successfully acquired, + * or we abort. + * + * @wait.start_time will be monotonically increasing for any given waitlist, and + * thus may be used as a loop cursor. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + int ret; + + wait->start_time = 0; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); + + ret = do_six_trylock(lock, type, true) ? 0 + : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); + + if (ret && type != SIX_LOCK_write) + six_release(&lock->dep_map, ip); + if (!ret) + lock_acquired(&lock->dep_map, ip); + + return ret; +} +EXPORT_SYMBOL_GPL(six_lock_ip_waiter); + +__always_inline +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + u32 state; + + if (type == SIX_LOCK_intent) + lock->owner = NULL; + + if (type == SIX_LOCK_read && + lock->readers) { + smp_mb(); /* unlock barrier */ + this_cpu_dec(*lock->readers); + smp_mb(); /* between unlocking and checking for waiters */ + state = atomic_read(&lock->state); + } else { + u32 v = l[type].lock_val; + + if (type != SIX_LOCK_read) + v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; + + EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); + state = atomic_sub_return_release(v, &lock->state); + } + + six_lock_wakeup(lock, state, l[type].unlock_wakeup); +} + +/** + * six_unlock_ip - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. + * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) +{ + EBUG_ON(type == SIX_LOCK_write && + !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); + EBUG_ON((type == SIX_LOCK_write || + type == SIX_LOCK_intent) && + lock->owner != current); + + if (type != SIX_LOCK_write) + six_release(&lock->dep_map, ip); + else + lock->seq++; + + if (type == SIX_LOCK_intent && + lock->intent_lock_recurse) { + --lock->intent_lock_recurse; + return; + } + + do_six_unlock_type(lock, type); +} +EXPORT_SYMBOL_GPL(six_unlock_ip); + +/** + * six_lock_downgrade - convert an intent lock to a read lock + * @lock: lock to dowgrade + * + * @lock will have read count incremented and intent count decremented + */ +void six_lock_downgrade(struct six_lock *lock) +{ + six_lock_increment(lock, SIX_LOCK_read); + six_unlock_intent(lock); +} +EXPORT_SYMBOL_GPL(six_lock_downgrade); + +/** + * six_lock_tryupgrade - attempt to convert read lock to an intent lock + * @lock: lock to upgrade + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ +bool six_lock_tryupgrade(struct six_lock *lock) +{ + u32 old = atomic_read(&lock->state), new; + + do { + new = old; + + if (new & SIX_LOCK_HELD_intent) + return false; + + if (!lock->readers) { + EBUG_ON(!(new & SIX_LOCK_HELD_read)); + new -= l[SIX_LOCK_read].lock_val; + } + + new |= SIX_LOCK_HELD_intent; + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); + + if (lock->readers) + this_cpu_dec(*lock->readers); + + six_set_owner(lock, SIX_LOCK_intent, old, current); + + return true; +} +EXPORT_SYMBOL_GPL(six_lock_tryupgrade); + +/** + * six_trylock_convert - attempt to convert a held lock from one type to another + * @lock: lock to upgrade + * @from: SIX_LOCK_read or SIX_LOCK_intent + * @to: SIX_LOCK_read or SIX_LOCK_intent + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ +bool six_trylock_convert(struct six_lock *lock, + enum six_lock_type from, + enum six_lock_type to) +{ + EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); + + if (to == from) + return true; + + if (to == SIX_LOCK_read) { + six_lock_downgrade(lock); + return true; + } else { + return six_lock_tryupgrade(lock); + } +} +EXPORT_SYMBOL_GPL(six_trylock_convert); + +/** + * six_lock_increment - increase held lock count on a lock that is already held + * @lock: lock to increment + * @type: SIX_LOCK_read or SIX_LOCK_intent + * + * @lock must already be held, with a lock type that is greater than or equal to + * @type + * + * A corresponding six_unlock_type() call will be required for @lock to be fully + * unlocked. + */ +void six_lock_increment(struct six_lock *lock, enum six_lock_type type) +{ + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); + + /* XXX: assert already locked, and that we don't overflow: */ + + switch (type) { + case SIX_LOCK_read: + if (lock->readers) { + this_cpu_inc(*lock->readers); + } else { + EBUG_ON(!(atomic_read(&lock->state) & + (SIX_LOCK_HELD_read| + SIX_LOCK_HELD_intent))); + atomic_add(l[type].lock_val, &lock->state); + } + break; + case SIX_LOCK_intent: + EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); + lock->intent_lock_recurse++; + break; + case SIX_LOCK_write: + BUG(); + break; + } +} +EXPORT_SYMBOL_GPL(six_lock_increment); + +/** + * six_lock_wakeup_all - wake up all waiters on @lock + * @lock: lock to wake up waiters for + * + * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then + * abort the lock operation. + * + * This function is never needed in a bug-free program; it's only useful in + * debug code, e.g. to determine if a cycle detector is at fault. + */ +void six_lock_wakeup_all(struct six_lock *lock) +{ + u32 state = atomic_read(&lock->state); + struct six_lock_waiter *w; + + six_lock_wakeup(lock, state, SIX_LOCK_read); + six_lock_wakeup(lock, state, SIX_LOCK_intent); + six_lock_wakeup(lock, state, SIX_LOCK_write); + + raw_spin_lock(&lock->wait_lock); + list_for_each_entry(w, &lock->wait_list, list) + wake_up_process(w->task); + raw_spin_unlock(&lock->wait_lock); +} +EXPORT_SYMBOL_GPL(six_lock_wakeup_all); + +/** + * six_lock_counts - return held lock counts, for each lock type + * @lock: lock to return counters for + * + * Return: the number of times a lock is held for read, intent and write. + */ +struct six_lock_count six_lock_counts(struct six_lock *lock) +{ + struct six_lock_count ret; + + ret.n[SIX_LOCK_read] = !lock->readers + ? atomic_read(&lock->state) & SIX_LOCK_HELD_read + : pcpu_read_count(lock); + ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + + lock->intent_lock_recurse; + ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + + return ret; +} +EXPORT_SYMBOL_GPL(six_lock_counts); + +/** + * six_lock_readers_add - directly manipulate reader count of a lock + * @lock: lock to add/subtract readers for + * @nr: reader count to add/subtract + * + * When an upper layer is implementing lock reentrency, we may have both read + * and intent locks on the same lock. + * + * When we need to take a write lock, the read locks will cause self-deadlock, + * because six locks themselves do not track which read locks are held by the + * current thread and which are held by a different thread - it does no + * per-thread tracking of held locks. + * + * The upper layer that is tracking held locks may however, if trylock() has + * failed, count up its own read locks, subtract them, take the write lock, and + * then re-add them. + * + * As in any other situation when taking a write lock, @lock must be held for + * intent one (or more) times, so @lock will never be left unlocked. + */ +void six_lock_readers_add(struct six_lock *lock, int nr) +{ + if (lock->readers) { + this_cpu_add(*lock->readers, nr); + } else { + EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); + /* reader count starts at bit 0 */ + atomic_add(nr, &lock->state); + } +} +EXPORT_SYMBOL_GPL(six_lock_readers_add); + +/** + * six_lock_exit - release resources held by a lock prior to freeing + * @lock: lock to exit + * + * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is + * required to free the percpu read counts. + */ +void six_lock_exit(struct six_lock *lock) +{ + WARN_ON(lock->readers && pcpu_read_count(lock)); + WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); + + free_percpu(lock->readers); + lock->readers = NULL; +} +EXPORT_SYMBOL_GPL(six_lock_exit); + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags) +{ + atomic_set(&lock->state, 0); + raw_spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *) lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + + /* + * Don't assume that we have real percpu variables available in + * userspace: + */ +#ifdef __KERNEL__ + if (flags & SIX_LOCK_INIT_PCPU) { + /* + * We don't return an error here on memory allocation failure + * since percpu is an optimization, and locks will work with the + * same semantics in non-percpu mode: callers can check for + * failure if they wish by checking lock->readers, but generally + * will not want to treat it as an error. + */ + lock->readers = alloc_percpu(unsigned); + } +#endif +} +EXPORT_SYMBOL_GPL(__six_lock_init); diff --git a/libbcachefs/six.h b/libbcachefs/six.h new file mode 100644 index 0000000..a7104ac --- /dev/null +++ b/libbcachefs/six.h @@ -0,0 +1,386 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_SIX_H +#define _LINUX_SIX_H + +/** + * DOC: SIX locks overview + * + * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores + * but with an additional state: read/shared, intent, exclusive/write + * + * The purpose of the intent state is to allow for greater concurrency on tree + * structures without deadlocking. In general, a read can't be upgraded to a + * write lock without deadlocking, so an operation that updates multiple nodes + * will have to take write locks for the full duration of the operation. + * + * But by adding an intent state, which is exclusive with other intent locks but + * not with readers, we can take intent locks at thte start of the operation, + * and then take write locks only for the actual update to each individual + * nodes, without deadlocking. + * + * Example usage: + * six_lock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * + * An intent lock must be held before taking a write lock: + * six_lock_intent(&foo->lock); + * six_lock_write(&foo->lock); + * six_unlock_write(&foo->lock); + * six_unlock_intent(&foo->lock); + * + * Other operations: + * six_trylock_read() + * six_trylock_intent() + * six_trylock_write() + * + * six_lock_downgrade() convert from intent to read + * six_lock_tryupgrade() attempt to convert from read to intent, may fail + * + * There are also interfaces that take the lock type as an enum: + * + * six_lock_type(&foo->lock, SIX_LOCK_read); + * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) + * six_lock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_intent); + * + * Lock sequence numbers - unlock(), relock(): + * + * Locks embed sequences numbers, which are incremented on write lock/unlock. + * This allows locks to be dropped and the retaken iff the state they protect + * hasn't changed; this makes it much easier to avoid holding locks while e.g. + * doing IO or allocating memory. + * + * Example usage: + * six_lock_read(&foo->lock); + * u32 seq = six_lock_seq(&foo->lock); + * six_unlock_read(&foo->lock); + * + * some_operation_that_may_block(); + * + * if (six_relock_read(&foo->lock, seq)) { ... } + * + * If the relock operation succeeds, it is as if the lock was never unlocked. + * + * Reentrancy: + * + * Six locks are not by themselves reentrent, but have counters for both the + * read and intent states that can be used to provide reentrency by an upper + * layer that tracks held locks. If a lock is known to already be held in the + * read or intent state, six_lock_increment() can be used to bump the "lock + * held in this state" counter, increasing the number of unlock calls that + * will be required to fully unlock it. + * + * Example usage: + * six_lock_read(&foo->lock); + * six_lock_increment(&foo->lock, SIX_LOCK_read); + * six_unlock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * foo->lock is now fully unlocked. + * + * Since the intent state supercedes read, it's legal to increment the read + * counter when holding an intent lock, but not the reverse. + * + * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) + * is not legal. + * + * should_sleep_fn: + * + * There is a six_lock() variant that takes a function pointer that is called + * immediately prior to schedule() when blocking, and may return an error to + * abort. + * + * One possible use for this feature is when objects being locked are part of + * a cache and may reused, and lock ordering is based on a property of the + * object that will change when the object is reused - i.e. logical key order. + * + * If looking up an object in the cache may race with object reuse, and lock + * ordering is required to prevent deadlock, object reuse may change the + * correct lock order for that object and cause a deadlock. should_sleep_fn + * can be used to check if the object is still the object we want and avoid + * this deadlock. + * + * Wait list entry interface: + * + * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a + * wait list entry. By embedding six_lock_waiter into another object, and by + * traversing lock waitlists, it is then possible for an upper layer to + * implement full cycle detection for deadlock avoidance. + * + * should_sleep_fn should be used for invoking the cycle detector, walking the + * graph of held locks to check for a deadlock. The upper layer must track + * held locks for each thread, and each thread's held locks must be reachable + * from its six_lock_waiter object. + * + * six_lock_waiter() will add the wait object to the waitlist re-trying taking + * the lock, and before calling should_sleep_fn, and the wait object will not + * be removed from the waitlist until either the lock has been successfully + * acquired, or we aborted because should_sleep_fn returned an error. + * + * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will + * have timestamps in strictly ascending order - this is so the timestamp can + * be used as a cursor for lock graph traverse. + */ + +#include +#include +#include + +enum six_lock_type { + SIX_LOCK_read, + SIX_LOCK_intent, + SIX_LOCK_write, +}; + +struct six_lock { + atomic_t state; + u32 seq; + unsigned intent_lock_recurse; + struct task_struct *owner; + unsigned __percpu *readers; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; + enum six_lock_type lock_want; + bool lock_acquired; + u64 start_time; +}; + +typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); + +void six_lock_exit(struct six_lock *lock); + +enum six_lock_init_flags { + SIX_LOCK_INIT_PCPU = 1U << 0, +}; + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags); + +/** + * six_lock_init - initialize a six lock + * @lock: lock to initialize + * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU + */ +#define six_lock_init(lock, flags) \ +do { \ + static struct lock_class_key __key; \ + \ + __six_lock_init((lock), #lock, &__key, flags); \ +} while (0) + +/** + * six_lock_seq - obtain current lock sequence number + * @lock: six_lock to obtain sequence number for + * + * @lock should be held for read or intent, and not write + * + * By saving the lock sequence number, we can unlock @lock and then (typically + * after some blocking operation) attempt to relock it: the relock will succeed + * if the sequence number hasn't changed, meaning no write locks have been taken + * and state corresponding to what @lock protects is still valid. + */ +static inline u32 six_lock_seq(const struct six_lock *lock) +{ + return lock->seq; +} + +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); + +/** + * six_trylock_type - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * Return: true on success, false on failure. + */ +static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + return six_trylock_ip(lock, type, _THIS_IP_); +} + +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip); + +/** + * six_lock_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * This is a convenience wrapper around six_lock_ip_waiter(), see that function + * for full documentation. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); +} + +/** + * six_lock_ip - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + struct six_lock_waiter wait; + + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); +} + +/** + * six_lock_type - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + struct six_lock_waiter wait; + + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); +} + +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip); + +/** + * six_relock_type - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * + * Return: true on success, false on failure. + */ +static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + return six_relock_ip(lock, type, seq, _THIS_IP_); +} + +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); + +/** + * six_unlock_type - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. + * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ +static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + six_unlock_ip(lock, type, _THIS_IP_); +} + +#define __SIX_LOCK(type) \ +static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ +{ \ + return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ +} \ + \ +static inline bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ +} \ + \ +static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn should_sleep_fn, void *p,\ + unsigned long ip) \ +{ \ + return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ +} \ + \ +static inline int six_lock_ip_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn should_sleep_fn, void *p, \ + unsigned long ip) \ +{ \ + return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ +} \ + \ +static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ +{ \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ +} \ + \ +static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ +{ \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ +} \ + \ +static inline int six_lock_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn fn, void *p)\ +{ \ + return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ +} \ + \ +static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ +{ \ + six_unlock_ip(lock, SIX_LOCK_##type, ip); \ +} \ + \ +static inline void six_unlock_##type(struct six_lock *lock) \ +{ \ + six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ +} + +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) +#undef __SIX_LOCK + +void six_lock_downgrade(struct six_lock *); +bool six_lock_tryupgrade(struct six_lock *); +bool six_trylock_convert(struct six_lock *, enum six_lock_type, + enum six_lock_type); + +void six_lock_increment(struct six_lock *, enum six_lock_type); + +void six_lock_wakeup_all(struct six_lock *); + +struct six_lock_count { + unsigned n[3]; +}; + +struct six_lock_count six_lock_counts(struct six_lock *); +void six_lock_readers_add(struct six_lock *, int); + +#endif /* _LINUX_SIX_H */ diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c new file mode 100644 index 0000000..b23550b --- /dev/null +++ b/libbcachefs/snapshot.c @@ -0,0 +1,1713 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "buckets.h" +#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "snapshot.h" + +#include + +/* + * Snapshot trees: + * + * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they + * exist to provide a stable identifier for the whole lifetime of a snapshot + * tree. + */ + +void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); + + prt_printf(out, "subvol %u root snapshot %u", + le32_to_cpu(t.v->master_subvol), + le32_to_cpu(t.v->root_snapshot)); +} + +int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1)), c, err, + snapshot_tree_pos_bad, + "bad pos"); +fsck_err: + return ret; +} + +int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot_tree *s) +{ + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + + if (bch2_err_matches(ret, ENOENT)) + ret = -BCH_ERR_ENOENT_snapshot_tree; + return ret; +} + +struct bkey_i_snapshot_tree * +__bch2_snapshot_tree_create(struct btree_trans *trans) +{ + struct btree_iter iter; + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, U32_MAX)); + struct bkey_i_snapshot_tree *s_t; + + if (ret == -BCH_ERR_ENOSPC_btree_slot) + ret = -BCH_ERR_ENOSPC_snapshot_tree; + if (ret) + return ERR_PTR(ret); + + s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + bch2_trans_iter_exit(trans, &iter); + return ret ? ERR_PTR(ret) : s_t; +} + +static int bch2_snapshot_tree_create(struct btree_trans *trans, + u32 root_id, u32 subvol_id, u32 *tree_id) +{ + struct bkey_i_snapshot_tree *n_tree = + __bch2_snapshot_tree_create(trans); + + if (IS_ERR(n_tree)) + return PTR_ERR(n_tree); + + n_tree->v.master_subvol = cpu_to_le32(subvol_id); + n_tree->v.root_snapshot = cpu_to_le32(root_id); + *tree_id = n_tree->k.p.offset; + return 0; +} + +/* Snapshot nodes: */ + +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor) + id = __snapshot_t(t, id)->parent; + rcu_read_unlock(); + + return id == ancestor; +} + +static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + + if (s->skip[2] <= ancestor) + return s->skip[2]; + if (s->skip[1] <= ancestor) + return s->skip[1]; + if (s->skip[0] <= ancestor) + return s->skip[0]; + return s->parent; +} + +bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + bool ret; + + EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); + + if (id && id < ancestor) { + ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); + + EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); + } else { + ret = id == ancestor; + } + + rcu_read_unlock(); + + return ret; +} + +static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + size_t new_size; + struct snapshot_table *new, *old; + + new_size = max(16UL, roundup_pow_of_two(idx + 1)); + + new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + if (!new) + return NULL; + + old = rcu_dereference_protected(c->snapshots, true); + if (old) + memcpy(new->s, + rcu_dereference_protected(c->snapshots, true)->s, + sizeof(new->s[0]) * c->snapshot_table_size); + + rcu_assign_pointer(c->snapshots, new); + c->snapshot_table_size = new_size; + kvfree_rcu_mightsleep(old); + + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; +} + +static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + + lockdep_assert_held(&c->snapshot_table_lock); + + if (likely(idx < c->snapshot_table_size)) + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + + return __snapshot_t_mut(c, id); +} + +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + + prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), + le32_to_cpu(s.v->subvol), + le32_to_cpu(s.v->tree)); + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) + prt_printf(out, " depth %u skiplist %u %u %u", + le32_to_cpu(s.v->depth), + le32_to_cpu(s.v->skip[0]), + le32_to_cpu(s.v->skip[1]), + le32_to_cpu(s.v->skip[2])); +} + +int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_snapshot s; + u32 i, id; + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1)), c, err, + snapshot_pos_bad, + "bad pos"); + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); + bkey_fsck_err_on(id && id <= k.k->p.offset, c, err, + snapshot_parent_bad, + "bad parent node (%u <= %llu)", + id, k.k->p.offset); + + bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err, + snapshot_children_not_normalized, + "children not normalized"); + + bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err, + snapshot_child_duplicate, + "duplicate child nodes"); + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + + bkey_fsck_err_on(id >= k.k->p.offset, c, err, + snapshot_child_bad, + "bad child node (%u >= %llu)", + id, k.k->p.offset); + } + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { + bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err, + snapshot_skiplist_not_normalized, + "skiplist not normalized"); + + for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { + id = le32_to_cpu(s.v->skip[i]); + + bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err, + snapshot_skiplist_bad, + "bad skiplist node %u", id); + } + } +fsck_err: + return ret; +} + +static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +{ + struct snapshot_t *t = snapshot_t_mut(c, id); + u32 parent = id; + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); +} + +static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +{ + mutex_lock(&c->snapshot_table_lock); + __set_is_ancestor_bitmap(c, id); + mutex_unlock(&c->snapshot_table_lock); +} + +int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct snapshot_t *t; + u32 id = new.k->p.offset; + int ret = 0; + + mutex_lock(&c->snapshot_table_lock); + + t = snapshot_t_mut(c, id); + if (!t) { + ret = -BCH_ERR_ENOMEM_mark_snapshot; + goto err; + } + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; + t->tree = le32_to_cpu(s.v->tree); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { + t->depth = le32_to_cpu(s.v->depth); + t->skip[0] = le32_to_cpu(s.v->skip[0]); + t->skip[1] = le32_to_cpu(s.v->skip[1]); + t->skip[2] = le32_to_cpu(s.v->skip[2]); + } else { + t->depth = 0; + t->skip[0] = 0; + t->skip[1] = 0; + t->skip[2] = 0; + } + + __set_is_ancestor_bitmap(c, id); + + if (BCH_SNAPSHOT_DELETED(s.v)) { + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + bch2_delete_dead_snapshots_async(c); + } + } else { + memset(t, 0, sizeof(*t)); + } +err: + mutex_unlock(&c->snapshot_table_lock); + return ret; +} + +int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) +{ + return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot, s); +} + +static int bch2_snapshot_live(struct btree_trans *trans, u32 id) +{ + struct bch_snapshot v; + int ret; + + if (!id) + return 0; + + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) + return ret; + + return !BCH_SNAPSHOT_DELETED(&v); +} + +/* + * If @k is a snapshot with just one live child, it's part of a linear chain, + * which we consider to be an equivalence class: and then after snapshot + * deletion cleanup, there should only be a single key at a given position in + * this equivalence class. + * + * This sets the equivalence class of @k to be the child's equivalence class, if + * it's part of such a linear chain: this correctly sets equivalence classes on + * startup if we run leaf to root (i.e. in natural key order). + */ +static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + unsigned i, nr_live = 0, live_idx = 0; + struct bkey_s_c_snapshot snap; + u32 id = k.k->p.offset, child[2]; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); + + for (i = 0; i < 2; i++) { + int ret = bch2_snapshot_live(trans, child[i]); + + if (ret < 0) + return ret; + + if (ret) + live_idx = i; + nr_live += ret; + } + + mutex_lock(&c->snapshot_table_lock); + + snapshot_t_mut(c, id)->equiv = nr_live == 1 + ? snapshot_t_mut(c, child[live_idx])->equiv + : id; + + mutex_unlock(&c->snapshot_table_lock); + + return 0; +} + +/* fsck: */ + +static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) +{ + return snapshot_t(c, id)->children[child]; +} + +static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 0); +} + +static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 1); +} + +static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) +{ + u32 n, parent; + + n = bch2_snapshot_left_child(c, id); + if (n) + return n; + + while ((parent = bch2_snapshot_parent(c, id))) { + n = bch2_snapshot_right_child(c, parent); + if (n && n != id) + return n; + id = parent; + } + + return 0; +} + +static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +{ + u32 id = snapshot_root; + u32 subvol = 0, s; + + while (id) { + s = snapshot_t(c, id)->subvol; + + if (s && (!subvol || s < subvol)) + subvol = s; + + id = bch2_snapshot_tree_next(c, id); + } + + return subvol; +} + +static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + u32 snapshot_root, u32 *subvol_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume s; + bool found = false; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + s = bkey_s_c_to_subvolume(k); + if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) + continue; + if (!BCH_SUBVOLUME_SNAP(s.v)) { + *subvol_id = s.k->p.offset; + found = true; + break; + } + } + + bch2_trans_iter_exit(trans, &iter); + + if (!ret && !found) { + struct bkey_i_subvolume *u; + + *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + + u = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, *subvol_id), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&u->v, false); + } + + return ret; +} + +static int check_snapshot_tree(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot_tree st; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct printbuf buf = PRINTBUF; + u32 root_id; + int ret; + + if (k.k->type != KEY_TYPE_snapshot_tree) + return 0; + + st = bkey_s_c_to_snapshot_tree(k); + root_id = le32_to_cpu(st.v->root_snapshot); + + ret = bch2_snapshot_lookup(trans, root_id, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret || + root_id != bch2_snapshot_root(c, root_id) || + st.k->p.offset != le32_to_cpu(s.tree), + c, snapshot_tree_to_missing_snapshot, + "snapshot tree points to missing/incorrect snapshot:\n %s", + (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto err; + } + + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), + false, 0, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, + c, snapshot_tree_to_missing_subvol, + "snapshot tree points to missing subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + le32_to_cpu(subvol.snapshot), + root_id), + c, snapshot_tree_to_wrong_subvol, + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), + c, snapshot_tree_to_snapshot_subvol, + "snapshot tree points to snapshot subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + struct bkey_i_snapshot_tree *u; + u32 subvol_id; + + ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + if (ret) + goto err; + + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.master_subvol = cpu_to_le32(subvol_id); + st = snapshot_tree_i_to_s_c(u); + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * For each snapshot_tree, make sure it points to the root of a snapshot tree + * and that snapshot entry points back to it, or delete it. + * + * And, make sure it points to a subvolume within that snapshot tree, or correct + * it to point to the oldest subvolume within that snapshot tree. + */ +int bch2_check_snapshot_trees(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_snapshot_trees, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, + check_snapshot_tree(trans, &iter, k))); + + if (ret) + bch_err(c, "error %i checking snapshot trees", ret); + return ret; +} + +/* + * Look up snapshot tree for @tree_id and find root, + * make sure @snap_id is a descendent: + */ +static int snapshot_tree_ptr_good(struct btree_trans *trans, + u32 snap_id, u32 tree_id) +{ + struct bch_snapshot_tree s_t; + int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + + if (bch2_err_matches(ret, ENOENT)) + return 0; + if (ret) + return ret; + + return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); +} + +u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + + if (!id) + return 0; + + rcu_read_lock(); + s = snapshot_t(c, id); + if (s->parent) + id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); + rcu_read_unlock(); + + return id; +} + +static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) +{ + unsigned i; + + for (i = 0; i < 3; i++) + if (!s.parent) { + if (s.skip[i]) + return false; + } else { + if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) + return false; + } + + return true; +} + +/* + * snapshot_tree pointer was incorrect: look up root snapshot node, make sure + * its snapshot_tree pointer is correct (allocate new one if necessary), then + * update this node's pointer to root node's pointer: + */ +static int snapshot_tree_ptr_repair(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_snapshot *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter root_iter; + struct bch_snapshot_tree s_t; + struct bkey_s_c_snapshot root; + struct bkey_i_snapshot *u; + u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; + int ret; + + root = bch2_bkey_get_iter_typed(trans, &root_iter, + BTREE_ID_snapshots, POS(0, root_id), + BTREE_ITER_WITH_UPDATES, snapshot); + ret = bkey_err(root); + if (ret) + goto err; + + tree_id = le32_to_cpu(root.v->tree); + + ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { + u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u) ?: + bch2_snapshot_tree_create(trans, root_id, + bch2_snapshot_tree_oldest_subvol(c, root_id), + &tree_id); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + if (k.k->p.offset == root_id) + *s = u->v; + } + + if (k.k->p.offset != root_id) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + *s = u->v; + } +err: + bch2_trans_iter_exit(trans, &root_iter); + return ret; +} + +static int check_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct bch_snapshot v; + struct bkey_i_snapshot *u; + u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); + u32 real_depth; + struct printbuf buf = PRINTBUF; + bool should_have_subvol; + u32 i, id; + int ret = 0; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + memset(&s, 0, sizeof(s)); + memcpy(&s, k.v, bkey_val_bytes(k.k)); + + id = le32_to_cpu(s.parent); + if (id) { + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot with nonexistent parent:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + if (le32_to_cpu(v.children[0]) != k.k->p.offset && + le32_to_cpu(v.children[1]) != k.k->p.offset) { + bch_err(c, "snapshot parent %u missing pointer to child %llu", + id, k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + + for (i = 0; i < 2 && s.children[i]; i++) { + id = le32_to_cpu(s.children[i]); + + ret = bch2_snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot node %llu has nonexistent child %u", + k.k->p.offset, id); + if (ret) + goto err; + + if (le32_to_cpu(v.parent) != k.k->p.offset) { + bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + + should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + !BCH_SNAPSHOT_DELETED(&s); + + if (should_have_subvol) { + id = le32_to_cpu(s.subvol); + ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot points to nonexistent subvolume:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { + bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + k.k->p.offset); + ret = -EINVAL; + goto err; + } + } else { + if (fsck_err_on(s.subvol, + c, snapshot_should_not_have_subvol, + "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.subvol = 0; + s = u->v; + } + } + + ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree, + "snapshot points to missing/incorrect tree:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, k, &s); + if (ret) + goto err; + } + ret = 0; + + real_depth = bch2_snapshot_depth(c, parent_id); + + if (le32_to_cpu(s.depth) != real_depth && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, snapshot_bad_depth, + "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.depth = cpu_to_le32(real_depth); + s = u->v; + } + + ret = snapshot_skiplist_good(trans, k.k->p.offset, s); + if (ret < 0) + goto err; + + if (!ret && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, snapshot_bad_skiplist, + "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) + u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); + + bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); + s = u->v; + } + ret = 0; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_snapshots(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + /* + * We iterate backwards as checking/fixing the depth field requires that + * the parent's depth already be correct: + */ + ret = bch2_trans_run(c, + for_each_btree_key_reverse_commit(trans, iter, + BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, + check_snapshot(trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* + * Mark a snapshot as deleted, for future cleanup: + */ +int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_i_snapshot *s; + int ret = 0; + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + trans->c, "missing snapshot %u", id); + return ret; + } + + /* already deleted? */ + if (BCH_SNAPSHOT_DELETED(&s->v)) + goto err; + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); + s->v.subvol = 0; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) +{ + if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) + swap(s->children[0], s->children[1]); +} + +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct btree_iter c_iter = (struct btree_iter) { NULL }; + struct btree_iter tree_iter = (struct btree_iter) { NULL }; + struct bkey_s_c_snapshot s; + u32 parent_id, child_id; + unsigned i; + int ret = 0; + + s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT, snapshot); + ret = bkey_err(s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", id); + + if (ret) + goto err; + + BUG_ON(s.v->children[1]); + + parent_id = le32_to_cpu(s.v->parent); + child_id = le32_to_cpu(s.v->children[0]); + + if (parent_id) { + struct bkey_i_snapshot *parent; + + parent = bch2_bkey_get_mut_typed(trans, &p_iter, + BTREE_ID_snapshots, POS(0, parent_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(parent); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); + if (unlikely(ret)) + goto err; + + /* find entry in parent->children for node being deleted */ + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (bch2_fs_inconsistent_on(i == 2, c, + "snapshot %u missing child pointer to %u", + parent_id, id)) + goto err; + + parent->v.children[i] = cpu_to_le32(child_id); + + normalize_snapshot_child_pointers(&parent->v); + } + + if (child_id) { + struct bkey_i_snapshot *child; + + child = bch2_bkey_get_mut_typed(trans, &c_iter, + BTREE_ID_snapshots, POS(0, child_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(child); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", child_id); + if (unlikely(ret)) + goto err; + + child->v.parent = cpu_to_le32(parent_id); + + if (!child->v.parent) { + child->v.skip[0] = 0; + child->v.skip[1] = 0; + child->v.skip[2] = 0; + } + } + + if (!parent_id) { + /* + * We're deleting the root of a snapshot tree: update the + * snapshot_tree entry to point to the new root, or delete it if + * this is the last snapshot ID in this tree: + */ + struct bkey_i_snapshot_tree *s_t; + + BUG_ON(s.v->children[1]); + + s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + if (ret) + goto err; + + if (s.v->children[0]) { + s_t->v.root_snapshot = s.v->children[0]; + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); + } + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &tree_iter); + bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &c_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i, j; + u32 depth = bch2_snapshot_depth(c, parent); + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS_MIN, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -BCH_ERR_ENOSPC_snapshot_create; + goto err; + } + + n = bch2_bkey_alloc(trans, &iter, 0, snapshot); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.tree = cpu_to_le32(tree); + n->v.depth = cpu_to_le32(depth); + + for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) + goto err; + + new_snapids[i] = iter.pos.offset; + + mutex_lock(&c->snapshot_table_lock); + snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; + mutex_unlock(&c->snapshot_table_lock); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create new snapshot IDs as children of an existing snapshot ID: + */ +static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n_parent; + int ret = 0; + + n_parent = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, parent), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(n_parent); + if (unlikely(ret)) { + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", parent); + return ret; + } + + if (n_parent->v.children[0] || n_parent->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + goto err; + + n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); + n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); + n_parent->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create a snapshot node that is the root of a new tree: + */ +static int bch2_snapshot_node_create_tree(struct btree_trans *trans, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bkey_i_snapshot_tree *n_tree; + int ret; + + n_tree = __bch2_snapshot_tree_create(trans); + ret = PTR_ERR_OR_ZERO(n_tree) ?: + create_snapids(trans, 0, n_tree->k.p.offset, + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + return ret; + + n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); + n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); + return 0; +} + +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + BUG_ON((parent == 0) != (nr_snapids == 1)); + BUG_ON((parent != 0) != (nr_snapids == 2)); + + return parent + ? bch2_snapshot_node_create_children(trans, parent, + new_snapids, snapshot_subvols, nr_snapids) + : bch2_snapshot_node_create_tree(trans, + new_snapids, snapshot_subvols, nr_snapids); + +} + +/* + * If we have an unlinked inode in an internal snapshot node, and the inode + * really has been deleted in all child snapshots, how does this get cleaned up? + * + * first there is the problem of how keys that have been overwritten in all + * child snapshots get deleted (unimplemented?), but inodes may perhaps be + * special? + * + * also: unlinked inode in internal snapshot appears to not be getting deleted + * correctly if inode doesn't exist in leaf snapshots + * + * solution: + * + * for a key in an interior snapshot node that needs work to be done that + * requires it to be mutated: iterate over all descendent leaf nodes and copy + * that key to snapshot leaf nodes, where we can mutate it + */ + +static int snapshot_delete_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *deleted, + snapshot_id_list *equiv_seen, + struct bpos *last_pos) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + if (!bkey_eq(k.k->p, *last_pos)) + equiv_seen->nr = 0; + *last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(equiv_seen, equiv)) { + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } else { + return snapshot_list_add(c, equiv_seen, equiv); + } +} + +static int move_key_to_correct_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + /* + * When we have a linear chain of snapshot nodes, we consider + * those to form an equivalence class: we're going to collapse + * them all down to a single node, and keep the leaf-most node - + * which has the same id as the equivalence class id. + * + * If there are multiple keys in different snapshots at the same + * position, we're only going to keep the one in the newest + * snapshot - the rest have been overwritten and are redundant, + * and for the key we're going to keep we need to move it to the + * equivalance class ID if it's not there already. + */ + if (equiv != k.k->p.snapshot) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + struct btree_iter new_iter; + int ret; + + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + new->k.p.snapshot = equiv; + + bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&new_iter) ?: + bch2_trans_update(trans, &new_iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &new_iter); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_s_c_snapshot snap; + u32 children[2]; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + return 0; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = bch2_snapshot_live(trans, children[0]) ?: + bch2_snapshot_live(trans, children[1]); + if (ret < 0) + return ret; + return !ret; +} + +/* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. + */ +static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) +{ + int ret = bch2_snapshot_needs_delete(trans, k); + + return ret <= 0 + ? ret + : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); +} + +static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, + snapshot_id_list *skip) +{ + rcu_read_lock(); + while (snapshot_list_has_id(skip, id)) + id = __bch2_snapshot_parent(c, id); + + while (n--) { + do { + id = __bch2_snapshot_parent(c, id); + } while (snapshot_list_has_id(skip, id)); + } + rcu_read_unlock(); + + return id; +} + +static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k, + snapshot_id_list *deleted) +{ + struct bch_fs *c = trans->c; + u32 nr_deleted_ancestors = 0; + struct bkey_i_snapshot *s; + u32 *i; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + if (snapshot_list_has_id(deleted, k.k->p.offset)) + return 0; + + s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + darray_for_each(*deleted, i) + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + + if (!nr_deleted_ancestors) + return 0; + + le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); + + if (!s->v.depth) { + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + u32 depth = le32_to_cpu(s->v.depth); + u32 parent = bch2_snapshot_parent(c, s->k.p.offset); + + for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { + u32 id = le32_to_cpu(s->v.skip[j]); + + if (snapshot_list_has_id(deleted, id)) { + id = bch2_snapshot_nth_parent_skip(c, + parent, + depth > 1 + ? get_random_u32_below(depth - 1) + : 0, + deleted); + s->v.skip[j] = cpu_to_le32(id); + } + } + + bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); + } + + return bch2_trans_update(trans, iter, &s->k_i, 0); +} + +int bch2_delete_dead_snapshots(struct bch_fs *c) +{ + struct btree_trans *trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + snapshot_id_list deleted = { 0 }; + snapshot_id_list deleted_interior = { 0 }; + u32 *i, id; + int ret = 0; + + if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) + return 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) { + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); + return ret; + } + } + + trans = bch2_trans_get(c); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + NULL, NULL, 0, + bch2_delete_redundant_snapshot(trans, k)); + if (ret) { + bch_err_msg(c, ret, "deleting redundant snapshots"); + goto err; + } + + ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_snapshot_set_equiv(trans, k)); + if (ret) { + bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); + goto err; + } + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_list_add(c, &deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) { + bch_err_msg(c, ret, "walking snapshots"); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + struct bpos last_pos = POS_MIN; + snapshot_id_list equiv_seen = { 0 }; + struct disk_reservation res = { 0 }; + + if (!btree_type_has_snapshots(id)) + continue; + + /* + * deleted inodes btree is maintained by a trigger on the inodes + * btree - no work for us to do here, and it's not safe to scan + * it because we'll see out of date keys due to the btree write + * buffer: + */ + if (id == BTREE_ID_deleted_inodes) + continue; + + ret = for_each_btree_key_commit(trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, + snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: + for_each_btree_key_commit(trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, + move_key_to_correct_snapshot(trans, &iter, k)); + + bch2_disk_reservation_put(c, &res); + darray_exit(&equiv_seen); + + if (ret) { + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + goto err; + } + } + + bch2_trans_unlock(trans); + down_write(&c->snapshot_create_lock); + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 snapshot = k.k->p.offset; + u32 equiv = bch2_snapshot_equiv(c, snapshot); + + if (equiv != snapshot) + snapshot_list_add(c, &deleted_interior, snapshot); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + goto err_create_lock; + + /* + * Fixing children of deleted snapshots can't be done completely + * atomically, if we crash between here and when we delete the interior + * nodes some depth fields will be off: + */ + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, + BTREE_ITER_INTENT, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); + if (ret) + goto err_create_lock; + + darray_for_each(deleted, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + if (ret) { + bch_err_msg(c, ret, "deleting snapshot %u", *i); + goto err_create_lock; + } + } + + darray_for_each(deleted_interior, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + if (ret) { + bch_err_msg(c, ret, "deleting snapshot %u", *i); + goto err_create_lock; + } + } +err_create_lock: + up_write(&c->snapshot_create_lock); +err: + darray_exit(&deleted_interior); + darray_exit(&deleted); + bch2_trans_put(trans); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + + bch2_delete_dead_snapshots(c); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +void bch2_delete_dead_snapshots_async(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && + !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (!bkey_eq(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s = snapshot_t(c, id); + + return s->children[1] ?: s->children[0]; +} + +static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) +{ + u32 child; + + while ((child = bch2_snapshot_smallest_child(c, id))) + id = child; + return id; +} + +static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, + enum btree_id btree, + struct bkey_s_c interior_k, + u32 leaf_id, struct bpos *new_min_pos) +{ + struct btree_iter iter; + struct bpos pos = interior_k.k->p; + struct bkey_s_c k; + struct bkey_i *new; + int ret; + + pos.snapshot = leaf_id; + + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; + + /* key already overwritten in this snapshot? */ + if (k.k->p.snapshot != interior_k.k->p.snapshot) + goto out; + + if (bpos_eq(*new_min_pos, POS_MIN)) { + *new_min_pos = k.k->p; + new_min_pos->snapshot = leaf_id; + } + + new = bch2_bkey_make_mut_noupdate(trans, interior_k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + new->k.p.snapshot = leaf_id; + ret = bch2_trans_update(trans, &iter, new, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, + enum btree_id btree, + struct bkey_s_c k, + struct bpos *new_min_pos) +{ + struct bch_fs *c = trans->c; + struct bkey_buf sk; + u32 restart_count = trans->restart_count; + int ret = 0; + + bch2_bkey_buf_init(&sk); + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + *new_min_pos = POS_MIN; + + for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); + id < k.k->p.snapshot; + id++) { + if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || + !bch2_snapshot_is_leaf(c, id)) + continue; +again: + ret = btree_trans_too_many_iters(trans) ?: + bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_begin(trans); + goto again; + } + + if (ret) + break; + } + + bch2_bkey_buf_exit(&sk, c); + + return ret ?: trans_was_restarted(trans, restart_count); +} + +static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot snap; + int ret = 0; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || + (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { + set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + return 0; + } + + return ret; +} + +int bch2_snapshots_read(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(trans, k) ?: + bch2_check_snapshot_needs_deletion(trans, k)) ?: + for_each_btree_key2(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ + kfree(rcu_dereference_protected(c->snapshots, true)); +} diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h new file mode 100644 index 0000000..f09a22f --- /dev/null +++ b/libbcachefs/snapshot.h @@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_H +#define _BCACHEFS_SNAPSHOT_H + +enum bkey_invalid_flags; + +void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); + +#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_tree_invalid, \ + .val_to_text = bch2_snapshot_tree_to_text, \ + .min_val_size = 8, \ +}) + +struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); + +int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + +#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ + .atomic_trigger = bch2_mark_snapshot, \ + .min_val_size = 24, \ +}) + +static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) +{ + return &t->s[U32_MAX - id]; +} + +static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return __snapshot_t(rcu_dereference(c->snapshots), id); +} + +static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = snapshot_t(c, id)->tree; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + +static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent_early(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + u32 parent = snapshot_t(c, id)->parent; + + if (parent && + snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + panic("id %u depth=%u parent %u depth=%u\n", + id, snapshot_t(c, id)->depth, + parent, snapshot_t(c, parent)->depth); + + return parent; +#else + return snapshot_t(c, id)->parent; +#endif +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) +{ + rcu_read_lock(); + while (n--) + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); + +static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) +{ + u32 parent; + + rcu_read_lock(); + while ((parent = __bch2_snapshot_parent(c, id))) + id = parent; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->equiv; +} + +static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_equiv(c, id); + rcu_read_unlock(); + + return id; +} + +static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) +{ + return id == bch2_snapshot_equiv(c, id); +} + +static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + bool ret; + + rcu_read_lock(); + s = snapshot_t(c, id); + ret = s->children[0]; + rcu_read_unlock(); + + return ret; +} + +static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +{ + return !bch2_snapshot_is_internal_node(c, id); +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + u32 parent = __bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + + s = snapshot_t(c, __bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) + return s->children[0]; + return 0; +} + +static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) +{ + u32 depth; + + rcu_read_lock(); + depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; + rcu_read_unlock(); + + return depth; +} + +bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); + +static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + return id == ancestor + ? true + : __bch2_snapshot_is_ancestor(c, id, ancestor); +} + +static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *t; + bool ret; + + rcu_read_lock(); + t = snapshot_t(c, id); + ret = (t->children[0]|t->children[1]) != 0; + rcu_read_unlock(); + + return ret; +} + +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (*i == id) + return true; + return false; +} + +static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (bch2_snapshot_is_ancestor(c, id, *i)) + return true; + return false; +} + +static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret; + + BUG_ON(snapshot_list_has_id(s, id)); + ret = darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s); +int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); + +/* only exported for tests: */ +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + +int bch2_check_snapshot_trees(struct bch_fs *); +int bch2_check_snapshots(struct bch_fs *); + +int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); +void bch2_delete_dead_snapshots_work(struct work_struct *); + +int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); + +static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + return 0; + + return __bch2_key_has_snapshot_overwrites(trans, id, pos); +} + +int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, + struct bkey_s_c, struct bpos *); + +int bch2_snapshots_read(struct bch_fs *); +void bch2_fs_snapshots_exit(struct bch_fs *); + +#endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/make-release-tarball.sh b/make-release-tarball.sh new file mode 100755 index 0000000..9986eac --- /dev/null +++ b/make-release-tarball.sh @@ -0,0 +1,60 @@ +#!/bin/bash + +set -o errexit + +version=$1 + +git checkout v$version +git clean -xfd + +(cd rust-src; cargo license) > COPYING.rust-dependencies + +git ls-files| + tar --create --file bcachefs-tools-$version.tar -T - \ + --transform="s_^_bcachefs-tools-$version/_" + +tar --append --file bcachefs-tools-vendored-$version.tar \ + --transform="s_^_bcachefs-tools-$version/_" \ + COPYING.rust-dependencies + +zstd -z --ultra bcachefs-tools-$version.tar + +gpg --armor --detach-sign bcachefs-tools-$version.tar +mv bcachefs-tools-$version.tar.asc bcachefs-tools-$version.tar.sign + +gpg --armor --sign bcachefs-tools-$version.tar + +scp bcachefs-tools-$version.tar.zst evilpiepirate.org:/var/www/htdocs/bcachefs-tools/ +scp bcachefs-tools-$version.tar.asc evilpiepirate.org:/var/www/htdocs/bcachefs-tools/ +scp bcachefs-tools-$version.tar.sign evilpiepirate.org:/var/www/htdocs/bcachefs-tools/ + +cargo vendor --manifest-path rust-src/Cargo.toml + +mkdir .cargo +cat > .cargo/config.toml <<-ZZ +[source.crates-io] +replace-with = "vendored-sources" + +[source."git+https://evilpiepirate.org/git/rust-bindgen.git"] +git = "https://evilpiepirate.org/git/rust-bindgen.git" +replace-with = "vendored-sources" + +[source.vendored-sources] +directory = "vendor" +ZZ + +cp bcachefs-tools-$version.tar bcachefs-tools-vendored-$version.tar +tar --append --file bcachefs-tools-vendored-$version.tar \ + --transform="s_^_bcachefs-tools-$version/_" \ + .cargo vendor + +zstd -z --ultra bcachefs-tools-vendored-$version.tar + +gpg --armor --detach-sign bcachefs-tools-vendored-$version.tar +mv bcachefs-tools-vendored-$version.tar.asc bcachefs-tools-vendored-$version.tar.sign + +gpg --armor --sign bcachefs-tools-vendored-$version.tar + +scp bcachefs-tools-vendored-$version.tar.zst evilpiepirate.org:/var/www/htdocs/bcachefs-tools/ +scp bcachefs-tools-vendored-$version.tar.asc evilpiepirate.org:/var/www/htdocs/bcachefs-tools/ +scp bcachefs-tools-vendored-$version.tar.sign evilpiepirate.org:/var/www/htdocs/bcachefs-tools/ diff --git a/rust-src/src/cmd_completions.rs b/rust-src/src/cmd_completions.rs new file mode 100644 index 0000000..5185969 --- /dev/null +++ b/rust-src/src/cmd_completions.rs @@ -0,0 +1,24 @@ +use crate::transform_c_args; +use clap::{Command, CommandFactory, Parser}; +use clap_complete::{generate, Generator, Shell}; +use std::ffi::{c_char, c_int}; +use std::io; + +/// Generate shell completions +#[derive(clap::Parser, Debug)] +pub struct Cli { + shell: Shell, +} + +fn print_completions(gen: G, cmd: &mut Command) { + generate(gen, cmd, cmd.get_name().to_string(), &mut io::stdout()); +} + +#[no_mangle] +#[allow(clippy::not_unsafe_ptr_arg_deref)] +pub extern "C" fn cmd_completions(argc: c_int, argv: *const *const c_char) -> c_int { + transform_c_args!(argv, argc, argv); + let cli = Cli::parse_from(argv); + print_completions(cli.shell, &mut super::Cli::command()); + 0 +} -- 2.39.2