git.sesse.net Git - bcachefs-tools-debian/commitdiff
Upload to experimental
author Jonathan Carter <jcc@debian.org>
Wed, 27 Dec 2023 17:51:21 +0000 (19:51 +0200)
committer Jonathan Carter <jcc@debian.org>
Wed, 27 Dec 2023 17:51:21 +0000 (19:51 +0200)
41 files changed:
Makefile
debian/changelog
debian/patches/do-not-install-mount-symlink [new file with mode: 0644]
debian/patches/series [new file with mode: 0644]
debian/rules
libbcachefs/bbpos_types.h [new file with mode: 0644]
libbcachefs/btree_journal_iter.c [new file with mode: 0644]
libbcachefs/btree_journal_iter.h [new file with mode: 0644]
libbcachefs/btree_key_cache_types.h [new file with mode: 0644]
libbcachefs/btree_trans_commit.c [new file with mode: 0644]
libbcachefs/btree_update.c [new file with mode: 0644]
libbcachefs/darray.c [new file with mode: 0644]
libbcachefs/disk_groups_types.h [new file with mode: 0644]
libbcachefs/fs-io-buffered.c [new file with mode: 0644]
libbcachefs/fs-io-buffered.h [new file with mode: 0644]
libbcachefs/fs-io-direct.c [new file with mode: 0644]
libbcachefs/fs-io-direct.h [new file with mode: 0644]
libbcachefs/fs-io-pagecache.c [new file with mode: 0644]
libbcachefs/fs-io-pagecache.h [new file with mode: 0644]
libbcachefs/io_misc.c [new file with mode: 0644]
libbcachefs/io_misc.h [new file with mode: 0644]
libbcachefs/io_read.c [new file with mode: 0644]
libbcachefs/io_read.h [new file with mode: 0644]
libbcachefs/io_write.c [new file with mode: 0644]
libbcachefs/io_write.h [new file with mode: 0644]
libbcachefs/io_write_types.h [new file with mode: 0644]
libbcachefs/logged_ops.c [new file with mode: 0644]
libbcachefs/logged_ops.h [new file with mode: 0644]
libbcachefs/sb-clean.c [new file with mode: 0644]
libbcachefs/sb-clean.h [new file with mode: 0644]
libbcachefs/sb-errors.c [new file with mode: 0644]
libbcachefs/sb-errors.h [new file with mode: 0644]
libbcachefs/sb-errors_types.h [new file with mode: 0644]
libbcachefs/sb-members.c [new file with mode: 0644]
libbcachefs/sb-members.h [new file with mode: 0644]
libbcachefs/six.c [new file with mode: 0644]
libbcachefs/six.h [new file with mode: 0644]
libbcachefs/snapshot.c [new file with mode: 0644]
libbcachefs/snapshot.h [new file with mode: 0644]
make-release-tarball.sh [new file with mode: 0755]
rust-src/src/cmd_completions.rs [new file with mode: 0644]

index 61a624558e5876c9ab624699403d67474600037a..4e4de54dc9781d0b346ef3c51c0148b8b745c349 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -159,7 +159,8 @@ install: bcachefs
        $(INSTALL) -m0755 -D initramfs/hook   $(DESTDIR)$(INITRAMFS_HOOK)
        $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.bcachefs
        $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.bcachefs
-       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs
+       # See: #1057295
+       # $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs
        $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.fuse.bcachefs
        $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.fuse.bcachefs
        $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.fuse.bcachefs
index f1e4e5e6df80d7a5bc5a446320a990bb5cec44bf..e25d80966dd192c4a2ba1dcf6e0362d3e32f2fca 100644 (file)
@@ -1,3 +1,14 @@
+bcachefs-tools (24+really1.3.4-2~exp1) experimental; urgency=medium
+
+  [ Chris Hofstaedtler ]
+  * Non-maintainer upload.
+  * Install files into /usr instead of /. (Closes: #1059373)
+
+  [ Jonathan Carter ]
+  * Do not install mount.bcachefs symlink (Closes: #1057295)
+
+ -- Jonathan Carter <jcc@debian.org>  Wed, 27 Dec 2023 19:22:06 +0200
+
 bcachefs-tools (24+really1.3.4-1) unstable; urgency=medium
 
   * New upstream release
diff --git a/debian/patches/do-not-install-mount-symlink b/debian/patches/do-not-install-mount-symlink
new file mode 100644 (file)
index 0000000..cbbbf62
--- /dev/null
@@ -0,0 +1,16 @@
+Description: do not install mount.bcachefs symlink 
+Bug-Debian: https://bugs.debian.org/1057295
+Last-Update: 2023-12-27
+
+--- bcachefs-tools-24+really1.3.4.orig/Makefile
++++ bcachefs-tools-24+really1.3.4/Makefile
+@@ -159,7 +159,8 @@ install: bcachefs
+       $(INSTALL) -m0755 -D initramfs/hook   $(DESTDIR)$(INITRAMFS_HOOK)
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.bcachefs
+-      $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs
++      # See: #1057295
++      # $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.fuse.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.fuse.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.fuse.bcachefs
diff --git a/debian/patches/series b/debian/patches/series
new file mode 100644 (file)
index 0000000..adfba3c
--- /dev/null
@@ -0,0 +1 @@
+do-not-install-mount-symlink
index a202318e1731d696b77368b13e704b211253e60b..fd523507ca0a417387107635177daaccc694a8bf 100755 (executable)
@@ -5,6 +5,7 @@ export DEB_BUILD_MAINT_OPTIONS=hardening=+all
 export NO_RUST=-true
 
 PREFIX := /usr
+ROOT_SBINDIR := /usr/sbin
 
 DEB_BUILD_ARCH ?= $(shell dpkg-architecture -qDEB_BUILD_ARCH)
 
@@ -17,6 +18,6 @@ endif
        dh $@
 
 override_dh_auto_install:
-       dh_auto_install -- "PREFIX=$(PREFIX)"
+       dh_auto_install -- "PREFIX=$(PREFIX)" "ROOT_SBINDIR=$(ROOT_SBINDIR)"
 
 override_dh_auto_test:
diff --git a/libbcachefs/bbpos_types.h b/libbcachefs/bbpos_types.h
new file mode 100644 (file)
index 0000000..5198e94
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_TYPES_H
+#define _BCACHEFS_BBPOS_TYPES_H
+
+struct bbpos {
+       enum btree_id           btree;
+       struct bpos             pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+       return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN      BBPOS(0, POS_MIN)
+#define BBPOS_MAX      BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+#endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c
new file mode 100644 (file)
index 0000000..58a981b
--- /dev/null
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bset.h"
+#include "btree_journal_iter.h"
+#include "journal_io.h"
+
+#include <linux/sort.h>
+
+/*
+ * For managing keys we read from the journal: until journal replay has finished, normal
+ * btree lookups need to be able to find and return keys from the journal where
+ * they overwrite what's in the btree, so we have a special iterator and
+ * operations for the regular btree iter code to use:
+ */
+
+static int __journal_key_cmp(enum btree_id     l_btree_id,
+                            unsigned           l_level,
+                            struct bpos        l_pos,
+                            const struct journal_key *r)
+{
+       return (cmp_int(l_btree_id,     r->btree_id) ?:
+               cmp_int(l_level,        r->level) ?:
+               bpos_cmp(l_pos, r->k->k.p));
+}
+
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
+{
+       return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
+}
+
+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
+{
+       size_t gap_size = keys->size - keys->nr;
+
+       if (idx >= keys->gap)
+               idx += gap_size;
+       return idx;
+}
+
+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
+{
+       return keys->d + idx_to_pos(keys, idx);
+}
+
+static size_t __bch2_journal_key_search(struct journal_keys *keys,
+                                       enum btree_id id, unsigned level,
+                                       struct bpos pos)
+{
+       size_t l = 0, r = keys->nr, m;
+
+       while (l < r) {
+               m = l + ((r - l) >> 1);
+               if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
+                       l = m + 1;
+               else
+                       r = m;
+       }
+
+       BUG_ON(l < keys->nr &&
+              __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
+
+       BUG_ON(l &&
+              __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
+
+       return l;
+}
+
+static size_t bch2_journal_key_search(struct journal_keys *keys,
+                                     enum btree_id id, unsigned level,
+                                     struct bpos pos)
+{
+       return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
+}
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
+                                          unsigned level, struct bpos pos,
+                                          struct bpos end_pos, size_t *idx)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       unsigned iters = 0;
+       struct journal_key *k;
+search:
+       if (!*idx)
+               *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+
+       while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
+               if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
+                       return NULL;
+
+               if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
+                   !k->overwritten)
+                       return k->k;
+
+               (*idx)++;
+               iters++;
+               if (iters == 10) {
+                       *idx = 0;
+                       goto search;
+               }
+       }
+
+       return NULL;
+}
+
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
+                                          unsigned level, struct bpos pos)
+{
+       size_t idx = 0;
+
+       return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
+}
+
+static void journal_iters_fix(struct bch_fs *c)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       /* The key we just inserted is immediately before the gap: */
+       size_t gap_end = keys->gap + (keys->size - keys->nr);
+       struct btree_and_journal_iter *iter;
+
+       /*
+        * If an iterator points one after the key we just inserted, decrement
+        * the iterator so it points at the key we just inserted - if the
+        * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
+        * handle that:
+        */
+       list_for_each_entry(iter, &c->journal_iters, journal.list)
+               if (iter->journal.idx == gap_end)
+                       iter->journal.idx = keys->gap - 1;
+}
+
+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       struct journal_iter *iter;
+       size_t gap_size = keys->size - keys->nr;
+
+       list_for_each_entry(iter, &c->journal_iters, list) {
+               if (iter->idx > old_gap)
+                       iter->idx -= gap_size;
+               if (iter->idx >= new_gap)
+                       iter->idx += gap_size;
+       }
+}
+
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+                                unsigned level, struct bkey_i *k)
+{
+       struct journal_key n = {
+               .btree_id       = id,
+               .level          = level,
+               .k              = k,
+               .allocated      = true,
+               /*
+                * Ensure these keys are done last by journal replay, to unblock
+                * journal reclaim:
+                */
+               .journal_seq    = U32_MAX,
+       };
+       struct journal_keys *keys = &c->journal_keys;
+       size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+       BUG_ON(test_bit(BCH_FS_RW, &c->flags));
+
+       if (idx < keys->size &&
+           journal_key_cmp(&n, &keys->d[idx]) == 0) {
+               if (keys->d[idx].allocated)
+                       kfree(keys->d[idx].k);
+               keys->d[idx] = n;
+               return 0;
+       }
+
+       if (idx > keys->gap)
+               idx -= keys->size - keys->nr;
+
+       if (keys->nr == keys->size) {
+               struct journal_keys new_keys = {
+                       .nr                     = keys->nr,
+                       .size                   = max_t(size_t, keys->size, 8) * 2,
+               };
+
+               new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
+               if (!new_keys.d) {
+                       bch_err(c, "%s: error allocating new key array (size %zu)",
+                               __func__, new_keys.size);
+                       return -BCH_ERR_ENOMEM_journal_key_insert;
+               }
+
+               /* Since @keys was full, there was no gap: */
+               memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+               kvfree(keys->d);
+               *keys = new_keys;
+
+               /* And now the gap is at the end: */
+               keys->gap = keys->nr;
+       }
+
+       journal_iters_move_gap(c, keys->gap, idx);
+
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
+       keys->gap = idx;
+
+       keys->nr++;
+       keys->d[keys->gap++] = n;
+
+       journal_iters_fix(c);
+
+       return 0;
+}
+
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bkey_i *k)
+{
+       struct bkey_i *n;
+       int ret;
+
+       n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+       if (!n)
+               return -BCH_ERR_ENOMEM_journal_key_insert;
+
+       bkey_copy(n, k);
+       ret = bch2_journal_key_insert_take(c, id, level, n);
+       if (ret)
+               kfree(n);
+       return ret;
+}
+
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bpos pos)
+{
+       struct bkey_i whiteout;
+
+       bkey_init(&whiteout.k);
+       whiteout.k.p = pos;
+
+       return bch2_journal_key_insert(c, id, level, &whiteout);
+}
+
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+                                 unsigned level, struct bpos pos)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+       if (idx < keys->size &&
+           keys->d[idx].btree_id       == btree &&
+           keys->d[idx].level          == level &&
+           bpos_eq(keys->d[idx].k->k.p, pos))
+               keys->d[idx].overwritten = true;
+}
+
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+       if (iter->idx < iter->keys->size) {
+               iter->idx++;
+               if (iter->idx == iter->keys->gap)
+                       iter->idx += iter->keys->size - iter->keys->nr;
+       }
+}
+
+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+{
+       struct journal_key *k = iter->keys->d + iter->idx;
+
+       while (k < iter->keys->d + iter->keys->size &&
+              k->btree_id      == iter->btree_id &&
+              k->level         == iter->level) {
+               if (!k->overwritten)
+                       return bkey_i_to_s_c(k->k);
+
+               bch2_journal_iter_advance(iter);
+               k = iter->keys->d + iter->idx;
+       }
+
+       return bkey_s_c_null;
+}
+
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+       list_del(&iter->list);
+}
+
+static void bch2_journal_iter_init(struct bch_fs *c,
+                                  struct journal_iter *iter,
+                                  enum btree_id id, unsigned level,
+                                  struct bpos pos)
+{
+       iter->btree_id  = id;
+       iter->level     = level;
+       iter->keys      = &c->journal_keys;
+       iter->idx       = bch2_journal_key_search(&c->journal_keys, id, level, pos);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+       return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+                                               iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+       bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+}
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
+{
+       if (bpos_eq(iter->pos, SPOS_MAX))
+               iter->at_end = true;
+       else
+               iter->pos = bpos_successor(iter->pos);
+}
+
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
+{
+       struct bkey_s_c btree_k, journal_k, ret;
+again:
+       if (iter->at_end)
+               return bkey_s_c_null;
+
+       while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
+              bpos_lt(btree_k.k->p, iter->pos))
+               bch2_journal_iter_advance_btree(iter);
+
+       while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+              bpos_lt(journal_k.k->p, iter->pos))
+               bch2_journal_iter_advance(&iter->journal);
+
+       ret = journal_k.k &&
+               (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
+               ? journal_k
+               : btree_k;
+
+       if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
+               ret = bkey_s_c_null;
+
+       if (ret.k) {
+               iter->pos = ret.k->p;
+               if (bkey_deleted(ret.k)) {
+                       bch2_btree_and_journal_iter_advance(iter);
+                       goto again;
+               }
+       } else {
+               iter->pos = SPOS_MAX;
+               iter->at_end = true;
+       }
+
+       return ret;
+}
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
+{
+       bch2_journal_iter_exit(&iter->journal);
+}
+
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                                 struct bch_fs *c,
+                                                 struct btree *b,
+                                                 struct btree_node_iter node_iter,
+                                                 struct bpos pos)
+{
+       memset(iter, 0, sizeof(*iter));
+
+       iter->b = b;
+       iter->node_iter = node_iter;
+       bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+       INIT_LIST_HEAD(&iter->journal.list);
+       iter->pos = b->data->min_key;
+       iter->at_end = false;
+}
+
+/*
+ * this version is used by btree_gc before filesystem has gone RW and
+ * multithreaded, so uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                               struct bch_fs *c,
+                                               struct btree *b)
+{
+       struct btree_node_iter node_iter;
+
+       bch2_btree_node_iter_init_from_start(&node_iter, b);
+       __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+       list_add(&iter->journal.list, &c->journal_iters);
+}
+
+/* sort and dedup all keys in the journal: */
+
+void bch2_journal_entries_free(struct bch_fs *c)
+{
+       struct journal_replay **i;
+       struct genradix_iter iter;
+
+       genradix_for_each(&c->journal_entries, iter, i)
+               if (*i)
+                       kvpfree(*i, offsetof(struct journal_replay, j) +
+                               vstruct_bytes(&(*i)->j));
+       genradix_free(&c->journal_entries);
+}
+
+/*
+ * When keys compare equal, oldest compares first:
+ */
+static int journal_sort_key_cmp(const void *_l, const void *_r)
+{
+       const struct journal_key *l = _l;
+       const struct journal_key *r = _r;
+
+       return  journal_key_cmp(l, r) ?:
+               cmp_int(l->journal_seq, r->journal_seq) ?:
+               cmp_int(l->journal_offset, r->journal_offset);
+}
+
+void bch2_journal_keys_free(struct journal_keys *keys)
+{
+       struct journal_key *i;
+
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+       keys->gap = keys->nr;
+
+       for (i = keys->d; i < keys->d + keys->nr; i++)
+               if (i->allocated)
+                       kfree(i->k);
+
+       kvfree(keys->d);
+       keys->d = NULL;
+       keys->nr = keys->gap = keys->size = 0;
+}
+
+static void __journal_keys_sort(struct journal_keys *keys)
+{
+       struct journal_key *src, *dst;
+
+       sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+
+       src = dst = keys->d;
+       while (src < keys->d + keys->nr) {
+               while (src + 1 < keys->d + keys->nr &&
+                      src[0].btree_id  == src[1].btree_id &&
+                      src[0].level     == src[1].level &&
+                      bpos_eq(src[0].k->k.p, src[1].k->k.p))
+                       src++;
+
+               *dst++ = *src++;
+       }
+
+       keys->nr = dst - keys->d;
+}
+
+int bch2_journal_keys_sort(struct bch_fs *c)
+{
+       struct genradix_iter iter;
+       struct journal_replay *i, **_i;
+       struct jset_entry *entry;
+       struct bkey_i *k;
+       struct journal_keys *keys = &c->journal_keys;
+       size_t nr_keys = 0, nr_read = 0;
+
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
+
+               for_each_jset_key(k, entry, &i->j)
+                       nr_keys++;
+       }
+
+       if (!nr_keys)
+               return 0;
+
+       keys->size = roundup_pow_of_two(nr_keys);
+
+       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+       if (!keys->d) {
+               bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
+                       nr_keys);
+
+               do {
+                       keys->size >>= 1;
+                       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
+               } while (!keys->d && keys->size > nr_keys / 8);
+
+               if (!keys->d) {
+                       bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
+                               keys->size);
+                       return -BCH_ERR_ENOMEM_journal_keys_sort;
+               }
+       }
+
+       genradix_for_each(&c->journal_entries, iter, _i) {
+               i = *_i;
+
+               if (!i || i->ignore)
+                       continue;
+
+               cond_resched();
+
+               for_each_jset_key(k, entry, &i->j) {
+                       if (keys->nr == keys->size) {
+                               __journal_keys_sort(keys);
+
+                               if (keys->nr > keys->size * 7 / 8) {
+                                       bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
+                                               keys->nr, keys->size, nr_read, nr_keys);
+                                       return -BCH_ERR_ENOMEM_journal_keys_sort;
+                               }
+                       }
+
+                       keys->d[keys->nr++] = (struct journal_key) {
+                               .btree_id       = entry->btree_id,
+                               .level          = entry->level,
+                               .k              = k,
+                               .journal_seq    = le64_to_cpu(i->j.seq),
+                               .journal_offset = k->_data - i->j._data,
+                       };
+
+                       nr_read++;
+               }
+       }
+
+       __journal_keys_sort(keys);
+       keys->gap = keys->nr;
+
+       bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+       return 0;
+}
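
The journal_keys array above keeps its spare capacity as a single movable gap: idx_to_pos() maps a logical key index onto the physical array by skipping the gap, the binary search in __bch2_journal_key_search() runs over logical indices, and bch2_journal_key_insert_take() moves the gap to the insertion point so nearby inserts only shift a few entries. Below is a minimal standalone sketch of that indexing; gap_buf, idx_to_pos and search are illustrative stand-ins with plain ints as payload, not bcachefs code.

/*
 * Standalone sketch of the gap-buffer indexing used by journal_keys above.
 * Names and types here are illustrative only, not part of bcachefs.
 */
#include <assert.h>
#include <stdio.h>
#include <stddef.h>

struct gap_buf {
	int	*d;	/* backing array, @size slots */
	size_t	nr;	/* live elements */
	size_t	size;	/* allocated slots */
	size_t	gap;	/* logical index where the unused slots sit */
};

/* Map a logical index (0..nr-1) to a physical slot, skipping the gap: */
static size_t idx_to_pos(const struct gap_buf *b, size_t idx)
{
	size_t gap_size = b->size - b->nr;

	if (idx >= b->gap)
		idx += gap_size;
	return idx;
}

/* Binary search over logical indices, as __bch2_journal_key_search() does: */
static size_t search(const struct gap_buf *b, int v)
{
	size_t l = 0, r = b->nr, m;

	while (l < r) {
		m = l + ((r - l) >> 1);
		if (b->d[idx_to_pos(b, m)] < v)
			l = m + 1;
		else
			r = m;
	}
	return l;
}

int main(void)
{
	/* 6 slots, 4 live keys, gap (two unused slots) at logical index 2: */
	int d[6] = { 10, 20, /* gap */ 0, 0, 30, 40 };
	struct gap_buf b = { .d = d, .nr = 4, .size = 6, .gap = 2 };

	assert(b.d[idx_to_pos(&b, 2)] == 30);	/* logical index 2 lands after the gap */
	printf("insert position for 25: %zu\n", search(&b, 25));	/* -> 2 */
	return 0;
}
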
diff --git a/libbcachefs/btree_journal_iter.h b/libbcachefs/btree_journal_iter.h
new file mode 100644 (file)
index 0000000..5d64e7e
--- /dev/null
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H
+#define _BCACHEFS_BTREE_JOURNAL_ITER_H
+
+struct journal_iter {
+       struct list_head        list;
+       enum btree_id           btree_id;
+       unsigned                level;
+       size_t                  idx;
+       struct journal_keys     *keys;
+};
+
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
+
+struct btree_and_journal_iter {
+       struct btree            *b;
+       struct btree_node_iter  node_iter;
+       struct bkey             unpacked;
+
+       struct journal_iter     journal;
+       struct bpos             pos;
+       bool                    at_end;
+};
+
+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
+                               unsigned, struct bpos, struct bpos, size_t *);
+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
+                                          unsigned, struct bpos);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+                                unsigned, struct bkey_i *);
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+                           unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+                           unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
+                                 unsigned, struct bpos);
+
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
+
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+                               struct bch_fs *, struct btree *,
+                               struct btree_node_iter, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+                                               struct bch_fs *,
+                                               struct btree *);
+
+void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_entries_free(struct bch_fs *);
+
+int bch2_journal_keys_sort(struct bch_fs *);
+
+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
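
A btree_and_journal_iter walks the keys already present in a btree node together with keys that so far exist only in the journal; as bch2_btree_and_journal_iter_peek() in btree_journal_iter.c shows, when both sides hold a key at the same position the journal key is returned and shadows the btree key. Below is a minimal standalone sketch of that overlay merge; sorted int arrays stand in for bkeys and the names are illustrative, not bcachefs API.

/*
 * Standalone sketch of the overlay merge done by btree_and_journal_iter:
 * two sorted streams, and on equal positions the journal entry shadows the
 * btree entry. Ints stand in for bkeys; names are illustrative only.
 */
#include <stdio.h>
#include <stddef.h>

static void overlay_merge(const int *btree, size_t n_btree,
			  const int *journal, size_t n_journal)
{
	size_t b = 0, j = 0;

	while (b < n_btree || j < n_journal) {
		int take_journal = j < n_journal &&
			(b >= n_btree || journal[j] <= btree[b]);

		if (take_journal) {
			/* Journal key wins; skip a btree key at the same position: */
			if (b < n_btree && btree[b] == journal[j])
				b++;
			printf("journal %d\n", journal[j++]);
		} else {
			printf("btree   %d\n", btree[b++]);
		}
	}
}

int main(void)
{
	const int btree[]   = { 1, 3, 5, 7 };
	const int journal[] = { 3, 4, 7 };	/* 3 and 7 overwrite btree keys */

	overlay_merge(btree, 4, journal, 3);
	return 0;
}
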
diff --git a/libbcachefs/btree_key_cache_types.h b/libbcachefs/btree_key_cache_types.h
new file mode 100644 (file)
index 0000000..cfd09f5
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+#define _BCACHEFS_BTREE_KEY_CACHE_TYPES_H
+
+struct btree_key_cache_freelist {
+       struct bkey_cached      *objs[16];
+       unsigned                nr;
+};
+
+struct btree_key_cache {
+       struct mutex            lock;
+       struct rhashtable       table;
+       bool                    table_init_done;
+
+       struct list_head        freed_pcpu;
+       size_t                  nr_freed_pcpu;
+       struct list_head        freed_nonpcpu;
+       size_t                  nr_freed_nonpcpu;
+
+       struct shrinker         shrink;
+       unsigned                shrink_iter;
+       struct btree_key_cache_freelist __percpu *pcpu_freed;
+
+       atomic_long_t           nr_freed;
+       atomic_long_t           nr_keys;
+       atomic_long_t           nr_dirty;
+};
+
+struct bkey_cached_key {
+       u32                     btree_id;
+       struct bpos             pos;
+} __packed __aligned(4);
+
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_TYPES_H */
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
new file mode 100644 (file)
index 0000000..70077ef
--- /dev/null
@@ -0,0 +1,1172 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_gc.h"
+#include "btree_io.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_key_cache.h"
+#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "journal.h"
+#include "journal_reclaim.h"
+#include "replicas.h"
+#include "snapshot.h"
+
+#include <linux/prefetch.h>
+
+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct bch_fs *c = trans->c;
+       struct bkey u;
+       struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
+
+       if (unlikely(trans->journal_replay_not_finished)) {
+               struct bkey_i *j_k =
+                       bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
+
+               if (j_k)
+                       k = bkey_i_to_s_c(j_k);
+       }
+
+       u = *k.k;
+       u.needs_whiteout = i->old_k.needs_whiteout;
+
+       BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
+       BUG_ON(i->old_v != k.v);
+#endif
+}
+
+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+{
+       return i->path->l + i->level;
+}
+
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
+                                    struct btree_insert_entry *i)
+{
+       return i != trans->updates &&
+               insert_l(&i[0])->b == insert_l(&i[-1])->b;
+}
+
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+                                    struct btree_insert_entry *i)
+{
+       return i + 1 < trans->updates + trans->nr_updates &&
+               insert_l(&i[0])->b == insert_l(&i[1])->b;
+}
+
+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+                                          struct btree_path *path,
+                                          struct btree *b)
+{
+       struct bch_fs *c = trans->c;
+
+       if (unlikely(btree_node_just_written(b)) &&
+           bch2_btree_post_write_cleanup(c, b))
+               bch2_trans_node_reinit_iter(trans, b);
+
+       /*
+        * If the last bset has been written, or if it's gotten too big - start
+        * a new bset to insert into:
+        */
+       if (want_new_bset(c, b))
+               bch2_btree_init_next(trans, b);
+}
+
+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
+{
+       while (--i >= trans->updates) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+       }
+
+       trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
+       return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
+}
+
+static inline int bch2_trans_lock_write(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
+
+       EBUG_ON(trans->write_locked);
+
+       trans_for_each_update(trans, i) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
+
+               if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+                       return trans_lock_write_fail(trans, i);
+
+               if (!i->cached)
+                       bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+       }
+
+       trans->write_locked = true;
+       return 0;
+}
+
+static inline void bch2_trans_unlock_write(struct btree_trans *trans)
+{
+       if (likely(trans->write_locked)) {
+               struct btree_insert_entry *i;
+
+               trans_for_each_update(trans, i)
+                       if (!same_leaf_as_prev(trans, i))
+                               bch2_btree_node_unlock_write_inlined(trans, i->path,
+                                                                    insert_l(i)->b);
+               trans->write_locked = false;
+       }
+}
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+/* Handle overwrites and do insert, for non extents: */
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+                               struct btree_path *path,
+                               struct btree *b,
+                               struct btree_node_iter *node_iter,
+                               struct bkey_i *insert)
+{
+       struct bkey_packed *k;
+       unsigned clobber_u64s = 0, new_u64s = 0;
+
+       EBUG_ON(btree_node_just_written(b));
+       EBUG_ON(bset_written(b, btree_bset_last(b)));
+       EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
+       EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
+       EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
+       EBUG_ON(insert->k.u64s >
+               bch_btree_keys_u64s_remaining(trans->c, b));
+       EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
+
+       k = bch2_btree_node_iter_peek_all(node_iter, b);
+       if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
+               k = NULL;
+
+       /* @k is the key being overwritten/deleted, if any: */
+       EBUG_ON(k && bkey_deleted(k));
+
+       /* Deleting, but not found? nothing to do: */
+       if (bkey_deleted(&insert->k) && !k)
+               return false;
+
+       if (bkey_deleted(&insert->k)) {
+               /* Deleting: */
+               btree_account_key_drop(b, k);
+               k->type = KEY_TYPE_deleted;
+
+               if (k->needs_whiteout)
+                       push_whiteout(trans->c, b, insert->k.p);
+               k->needs_whiteout = false;
+
+               if (k >= btree_bset_last(b)->start) {
+                       clobber_u64s = k->u64s;
+                       bch2_bset_delete(b, k, clobber_u64s);
+                       goto fix_iter;
+               } else {
+                       bch2_btree_path_fix_key_modified(trans, b, k);
+               }
+
+               return true;
+       }
+
+       if (k) {
+               /* Overwriting: */
+               btree_account_key_drop(b, k);
+               k->type = KEY_TYPE_deleted;
+
+               insert->k.needs_whiteout = k->needs_whiteout;
+               k->needs_whiteout = false;
+
+               if (k >= btree_bset_last(b)->start) {
+                       clobber_u64s = k->u64s;
+                       goto overwrite;
+               } else {
+                       bch2_btree_path_fix_key_modified(trans, b, k);
+               }
+       }
+
+       k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
+overwrite:
+       bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
+       new_u64s = k->u64s;
+fix_iter:
+       if (clobber_u64s != new_u64s)
+               bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
+                                        clobber_u64s, new_u64s);
+       return true;
+}
+
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+                              unsigned i, u64 seq)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct btree_write *w = container_of(pin, struct btree_write, journal);
+       struct btree *b = container_of(w, struct btree, writes[i]);
+       struct btree_trans *trans = bch2_trans_get(c);
+       unsigned long old, new, v;
+       unsigned idx = w - b->writes;
+
+       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+       v = READ_ONCE(b->flags);
+
+       do {
+               old = new = v;
+
+               if (!(old & (1 << BTREE_NODE_dirty)) ||
+                   !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
+                   w->journal.seq != seq)
+                       break;
+
+               new &= ~BTREE_WRITE_TYPE_MASK;
+               new |= BTREE_WRITE_journal_reclaim;
+               new |= 1 << BTREE_NODE_need_write;
+       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+       btree_node_write_if_need(c, b, SIX_LOCK_read);
+       six_unlock_read(&b->c.lock);
+
+       bch2_trans_put(trans);
+       return 0;
+}
+
+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+       return __btree_node_flush(j, pin, 0, seq);
+}
+
+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+{
+       return __btree_node_flush(j, pin, 1, seq);
+}
+
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+                                      struct btree *b, u64 seq)
+{
+       struct btree_write *w = btree_current_write(b);
+
+       bch2_journal_pin_add(&c->journal, seq, &w->journal,
+                            btree_node_write_idx(b) == 0
+                            ? bch2_btree_node_flush0
+                            : bch2_btree_node_flush1);
+}
+
+/**
+ * bch2_btree_insert_key_leaf() - insert a key into a leaf node
+ * @trans:             btree transaction object
+ * @path:              path pointing to @insert's pos
+ * @insert:            key to insert
+ * @journal_seq:       sequence number of journal reservation
+ */
+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
+                                      struct btree_path *path,
+                                      struct bkey_i *insert,
+                                      u64 journal_seq)
+{
+       struct bch_fs *c = trans->c;
+       struct btree *b = path_l(path)->b;
+       struct bset_tree *t = bset_tree_last(b);
+       struct bset *i = bset(b, t);
+       int old_u64s = bset_u64s(t);
+       int old_live_u64s = b->nr.live_u64s;
+       int live_u64s_added, u64s_added;
+
+       if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
+                                       &path_l(path)->iter, insert)))
+               return;
+
+       i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
+
+       bch2_btree_add_journal_pin(c, b, journal_seq);
+
+       if (unlikely(!btree_node_dirty(b))) {
+               EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+               set_btree_node_dirty_acct(c, b);
+       }
+
+       live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
+       u64s_added = (int) bset_u64s(t) - old_u64s;
+
+       if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
+               b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
+       if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
+               b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
+
+       if (u64s_added > live_u64s_added &&
+           bch2_maybe_compact_whiteouts(c, b))
+               bch2_trans_node_reinit_iter(trans, b);
+}
+
+/* Cached btree updates: */
+
+/* Normal update interface: */
+
+static inline void btree_insert_entry_checks(struct btree_trans *trans,
+                                            struct btree_insert_entry *i)
+{
+       BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
+       BUG_ON(i->cached        != i->path->cached);
+       BUG_ON(i->level         != i->path->level);
+       BUG_ON(i->btree_id      != i->path->btree_id);
+       EBUG_ON(!i->level &&
+               btree_type_has_snapshots(i->btree_id) &&
+               !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+               test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+               i->k->k.p.snapshot &&
+               bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+}
+
+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
+                                                     unsigned flags)
+{
+       return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
+                                   trans->journal_u64s, flags);
+}
+
+#define JSET_ENTRY_LOG_U64s            4
+
+static noinline void journal_transaction_name(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct journal *j = &c->journal;
+       struct jset_entry *entry =
+               bch2_journal_add_entry(j, &trans->journal_res,
+                                      BCH_JSET_ENTRY_log, 0, 0,
+                                      JSET_ENTRY_LOG_U64s);
+       struct jset_entry_log *l =
+               container_of(entry, struct jset_entry_log, entry);
+
+       strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
+}
+
+static inline int btree_key_can_insert(struct btree_trans *trans,
+                                      struct btree *b, unsigned u64s)
+{
+       struct bch_fs *c = trans->c;
+
+       if (!bch2_btree_node_insert_fits(c, b, u64s))
+               return -BCH_ERR_btree_insert_btree_node_full;
+
+       return 0;
+}
+
+noinline static int
+btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+                                    struct btree_path *path, unsigned new_u64s)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       struct bkey_cached *ck = (void *) path->l[0].b;
+       struct bkey_i *new_k;
+       int ret;
+
+       bch2_trans_unlock_write(trans);
+       bch2_trans_unlock(trans);
+
+       new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+       if (!new_k) {
+               bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+                       bch2_btree_id_str(path->btree_id), new_u64s);
+               return -BCH_ERR_ENOMEM_btree_key_cache_insert;
+       }
+
+       ret =   bch2_trans_relock(trans) ?:
+               bch2_trans_lock_write(trans);
+       if (unlikely(ret)) {
+               kfree(new_k);
+               return ret;
+       }
+
+       memcpy(new_k, ck->k, ck->u64s * sizeof(u64));
+
+       trans_for_each_update(trans, i)
+               if (i->old_v == &ck->k->v)
+                       i->old_v = &new_k->v;
+
+       kfree(ck->k);
+       ck->u64s        = new_u64s;
+       ck->k           = new_k;
+       return 0;
+}
+
+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
+                                      struct btree_path *path, unsigned u64s)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_cached *ck = (void *) path->l[0].b;
+       struct btree_insert_entry *i;
+       unsigned new_u64s;
+       struct bkey_i *new_k;
+
+       EBUG_ON(path->level);
+
+       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+           bch2_btree_key_cache_must_wait(c) &&
+           !(flags & BCH_TRANS_COMMIT_journal_reclaim))
+               return -BCH_ERR_btree_insert_need_journal_reclaim;
+
+       /*
+        * bch2_varint_decode can read past the end of the buffer by at most 7
+        * bytes (it won't be used):
+        */
+       u64s += 1;
+
+       if (u64s <= ck->u64s)
+               return 0;
+
+       new_u64s        = roundup_pow_of_two(u64s);
+       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+       if (unlikely(!new_k))
+               return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
+
+       trans_for_each_update(trans, i)
+               if (i->old_v == &ck->k->v)
+                       i->old_v = &new_k->v;
+
+       ck->u64s        = new_u64s;
+       ck->k           = new_k;
+       return 0;
+}
+
+/* Triggers: */
+
+static int run_one_mem_trigger(struct btree_trans *trans,
+                              struct btree_insert_entry *i,
+                              unsigned flags)
+{
+       struct bkey_s_c old = { &i->old_k, i->old_v };
+       struct bkey_i *new = i->k;
+       const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+       const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+       int ret;
+
+       verify_update_old_key(trans, i);
+
+       if (unlikely(flags & BTREE_TRIGGER_NORUN))
+               return 0;
+
+       if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
+               return 0;
+
+       if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
+               ret   = bch2_mark_key(trans, i->btree_id, i->level,
+                               old, bkey_i_to_s_c(new),
+                               BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+       } else {
+               struct bkey             _deleted = KEY(0, 0, 0);
+               struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
+
+               _deleted.p = i->path->pos;
+
+               ret   = bch2_mark_key(trans, i->btree_id, i->level,
+                               deleted, bkey_i_to_s_c(new),
+                               BTREE_TRIGGER_INSERT|flags) ?:
+                       bch2_mark_key(trans, i->btree_id, i->level,
+                               old, deleted,
+                               BTREE_TRIGGER_OVERWRITE|flags);
+       }
+
+       return ret;
+}
+
+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+                                bool overwrite)
+{
+       /*
+        * Transactional triggers create new btree_insert_entries, so we can't
+        * pass them a pointer to a btree_insert_entry, that memory is going to
+        * move:
+        */
+       struct bkey old_k = i->old_k;
+       struct bkey_s_c old = { &old_k, i->old_v };
+       const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
+       const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+
+       verify_update_old_key(trans, i);
+
+       if ((i->flags & BTREE_TRIGGER_NORUN) ||
+           !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+               return 0;
+
+       if (!i->insert_trigger_run &&
+           !i->overwrite_trigger_run &&
+           old_ops->trans_trigger == new_ops->trans_trigger) {
+               i->overwrite_trigger_run = true;
+               i->insert_trigger_run = true;
+               return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
+                                          BTREE_TRIGGER_INSERT|
+                                          BTREE_TRIGGER_OVERWRITE|
+                                          i->flags) ?: 1;
+       } else if (overwrite && !i->overwrite_trigger_run) {
+               i->overwrite_trigger_run = true;
+               return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+       } else if (!overwrite && !i->insert_trigger_run) {
+               i->insert_trigger_run = true;
+               return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+       } else {
+               return 0;
+       }
+}
+
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+                             struct btree_insert_entry *btree_id_start)
+{
+       struct btree_insert_entry *i;
+       bool trans_trigger_run;
+       int ret, overwrite;
+
+       for (overwrite = 1; overwrite >= 0; --overwrite) {
+
+               /*
+                * Running triggers will append more updates to the list of updates as
+                * we're walking it:
+                */
+               do {
+                       trans_trigger_run = false;
+
+                       for (i = btree_id_start;
+                            i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+                            i++) {
+                               if (i->btree_id != btree_id)
+                                       continue;
+
+                               ret = run_one_trans_trigger(trans, i, overwrite);
+                               if (ret < 0)
+                                       return ret;
+                               if (ret)
+                                       trans_trigger_run = true;
+                       }
+               } while (trans_trigger_run);
+       }
+
+       return 0;
+}
+
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+       unsigned btree_id = 0;
+       int ret = 0;
+
+       /*
+        *
+        * For a given btree, this algorithm runs insert triggers before
+        * overwrite triggers: this is so that when extents are being moved
+        * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
+        * they are re-added.
+        */
+       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+               if (btree_id == BTREE_ID_alloc)
+                       continue;
+
+               while (btree_id_start < trans->updates + trans->nr_updates &&
+                      btree_id_start->btree_id < btree_id)
+                       btree_id_start++;
+
+               ret = run_btree_triggers(trans, btree_id, btree_id_start);
+               if (ret)
+                       return ret;
+       }
+
+       trans_for_each_update(trans, i) {
+               if (i->btree_id > BTREE_ID_alloc)
+                       break;
+               if (i->btree_id == BTREE_ID_alloc) {
+                       ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+                       if (ret)
+                               return ret;
+                       break;
+               }
+       }
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       trans_for_each_update(trans, i)
+               BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+                      (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+                      (!i->insert_trigger_run || !i->overwrite_trigger_run));
+#endif
+       return 0;
+}
+
+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       int ret = 0;
+
+       trans_for_each_update(trans, i) {
+               /*
+                * XXX: synchronization of cached update triggers with gc
+                * XXX: synchronization of interior node updates with gc
+                */
+               BUG_ON(i->cached || i->level);
+
+               if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+                       ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+                       if (ret)
+                               break;
+               }
+       }
+
+       return ret;
+}
+
+static inline int
+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+                              struct btree_insert_entry **stopped_at,
+                              unsigned long trace_ip)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       struct btree_write_buffered_key *wb;
+       struct btree_trans_commit_hook *h;
+       unsigned u64s = 0;
+       int ret;
+
+       if (race_fault()) {
+               trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+               return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
+       }
+
+       /*
+        * Check if the insert will fit in the leaf node with the write lock
+        * held, otherwise another thread could write the node changing the
+        * amount of space available:
+        */
+
+       prefetch(&trans->c->journal.flags);
+
+       trans_for_each_update(trans, i) {
+               /* Multiple inserts might go to same leaf: */
+               if (!same_leaf_as_prev(trans, i))
+                       u64s = 0;
+
+               u64s += i->k->k.u64s;
+               ret = !i->cached
+                       ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
+                       : btree_key_can_insert_cached(trans, flags, i->path, u64s);
+               if (ret) {
+                       *stopped_at = i;
+                       return ret;
+               }
+
+               i->k->k.needs_whiteout = false;
+       }
+
+       if (trans->nr_wb_updates &&
+           trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
+               return -BCH_ERR_btree_insert_need_flush_buffer;
+
+       /*
+        * Don't get journal reservation until after we know insert will
+        * succeed:
+        */
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
+               ret = bch2_trans_journal_res_get(trans,
+                               (flags & BCH_WATERMARK_MASK)|
+                               JOURNAL_RES_GET_NONBLOCK);
+               if (ret)
+                       return ret;
+
+               if (unlikely(trans->journal_transaction_names))
+                       journal_transaction_name(trans);
+       }
+
+       /*
+        * Not allowed to fail after we've gotten our journal reservation - we
+        * have to use it:
+        */
+
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+           !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
+               if (bch2_journal_seq_verify)
+                       trans_for_each_update(trans, i)
+                               i->k->k.version.lo = trans->journal_res.seq;
+               else if (bch2_inject_invalid_keys)
+                       trans_for_each_update(trans, i)
+                               i->k->k.version = MAX_VERSION;
+       }
+
+       if (trans->fs_usage_deltas &&
+           bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
+               return -BCH_ERR_btree_insert_need_mark_replicas;
+
+       if (trans->nr_wb_updates) {
+               EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
+
+               ret = bch2_btree_insert_keys_write_buffer(trans);
+               if (ret)
+                       goto revert_fs_usage;
+       }
+
+       h = trans->hooks;
+       while (h) {
+               ret = h->fn(trans, h);
+               if (ret)
+                       goto revert_fs_usage;
+               h = h->next;
+       }
+
+       trans_for_each_update(trans, i)
+               if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+                       ret = run_one_mem_trigger(trans, i, i->flags);
+                       if (ret)
+                               goto fatal_err;
+               }
+
+       if (unlikely(c->gc_pos.phase)) {
+               ret = bch2_trans_commit_run_gc_triggers(trans);
+               if  (ret)
+                       goto fatal_err;
+       }
+
+       if (unlikely(trans->extra_journal_entries.nr)) {
+               memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+                                 trans->extra_journal_entries.data,
+                                 trans->extra_journal_entries.nr);
+
+               trans->journal_res.offset       += trans->extra_journal_entries.nr;
+               trans->journal_res.u64s         -= trans->extra_journal_entries.nr;
+       }
+
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
+               struct journal *j = &c->journal;
+               struct jset_entry *entry;
+
+               trans_for_each_update(trans, i) {
+                       if (i->key_cache_already_flushed)
+                               continue;
+
+                       if (i->flags & BTREE_UPDATE_NOJOURNAL)
+                               continue;
+
+                       verify_update_old_key(trans, i);
+
+                       if (trans->journal_transaction_names) {
+                               entry = bch2_journal_add_entry(j, &trans->journal_res,
+                                                      BCH_JSET_ENTRY_overwrite,
+                                                      i->btree_id, i->level,
+                                                      i->old_k.u64s);
+                               bkey_reassemble((struct bkey_i *) entry->start,
+                                               (struct bkey_s_c) { &i->old_k, i->old_v });
+                       }
+
+                       entry = bch2_journal_add_entry(j, &trans->journal_res,
+                                              BCH_JSET_ENTRY_btree_keys,
+                                              i->btree_id, i->level,
+                                              i->k->k.u64s);
+                       bkey_copy((struct bkey_i *) entry->start, i->k);
+               }
+
+               trans_for_each_wb_update(trans, wb) {
+                       entry = bch2_journal_add_entry(j, &trans->journal_res,
+                                              BCH_JSET_ENTRY_btree_keys,
+                                              wb->btree, 0,
+                                              wb->k.k.u64s);
+                       bkey_copy((struct bkey_i *) entry->start, &wb->k);
+               }
+
+               if (trans->journal_seq)
+                       *trans->journal_seq = trans->journal_res.seq;
+       }
+
+       trans_for_each_update(trans, i) {
+               if (!i->cached) {
+                       bch2_btree_insert_key_leaf(trans, i->path, i->k, trans->journal_res.seq);
+               } else if (!i->key_cache_already_flushed)
+                       bch2_btree_insert_key_cached(trans, flags, i);
+               else {
+                       bch2_btree_key_cache_drop(trans, i->path);
+                       btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+               }
+       }
+
+       return 0;
+fatal_err:
+       bch2_fatal_error(c);
+revert_fs_usage:
+       if (trans->fs_usage_deltas)
+               bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
+       return ret;
+}
+
+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
+       struct btree_write_buffered_key *wb;
+
+       trans_for_each_update(trans, i)
+               bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
+
+       trans_for_each_wb_update(trans, wb)
+               bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
+}
+
+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
+                                                  enum bkey_invalid_flags flags,
+                                                  struct btree_insert_entry *i,
+                                                  struct printbuf *err)
+{
+       struct bch_fs *c = trans->c;
+
+       printbuf_reset(err);
+       prt_printf(err, "invalid bkey on insert from %s -> %ps",
+                  trans->fn, (void *) i->ip_allocated);
+       prt_newline(err);
+       printbuf_indent_add(err, 2);
+
+       bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
+       prt_newline(err);
+
+       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->bkey_type, flags, err);
+       bch2_print_string_as_lines(KERN_ERR, err->buf);
+
+       bch2_inconsistent_error(c);
+       bch2_dump_trans_updates(trans);
+
+       return -EINVAL;
+}
+
+static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+                               struct journal_entry_pin *_pin, u64 seq)
+{
+       return 0;
+}
+
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
+                                      struct btree_insert_entry **stopped_at,
+                                      unsigned long trace_ip)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       int ret = 0, u64s_delta = 0;
+
+       trans_for_each_update(trans, i) {
+               if (i->cached)
+                       continue;
+
+               u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+               u64s_delta -= i->old_btree_u64s;
+
+               if (!same_leaf_as_next(trans, i)) {
+                       if (u64s_delta <= 0) {
+                               ret = bch2_foreground_maybe_merge(trans, i->path,
+                                                       i->level, flags);
+                               if (unlikely(ret))
+                                       return ret;
+                       }
+
+                       u64s_delta = 0;
+               }
+       }
+
+       ret = bch2_trans_lock_write(trans);
+       if (unlikely(ret))
+               return ret;
+
+       ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
+
+       if (!ret && unlikely(trans->journal_replay_not_finished))
+               bch2_drop_overwrites_from_journal(trans);
+
+       bch2_trans_unlock_write(trans);
+
+       if (!ret && trans->journal_pin)
+               bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
+                                    trans->journal_pin,
+                                    bch2_trans_commit_journal_pin_flush);
+
+       /*
+        * Drop journal reservation after dropping write locks, since dropping
+        * the journal reservation may kick off a journal write:
+        */
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+               bch2_journal_res_put(&c->journal, &trans->journal_res);
+
+       return ret;
+}
+
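+/*
+ * Wait condition for journal reclaim: nonzero once the journal has gone into
+ * an error state or the key cache no longer requires waiting; while we do
+ * still have to wait, kick journal reclaim so it makes progress.
+ */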
+static int journal_reclaim_wait_done(struct bch_fs *c)
+{
+       int ret = bch2_journal_error(&c->journal) ?:
+               !bch2_btree_key_cache_must_wait(c);
+
+       if (!ret)
+               journal_reclaim_kick(&c->journal);
+       return ret;
+}
+
+static noinline
+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
+                           struct btree_insert_entry *i,
+                           int ret, unsigned long trace_ip)
+{
+       struct bch_fs *c = trans->c;
+
+       switch (ret) {
+       case -BCH_ERR_btree_insert_btree_node_full:
+               ret = bch2_btree_split_leaf(trans, i->path, flags);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
+               break;
+       case -BCH_ERR_btree_insert_need_mark_replicas:
+               ret = drop_locks_do(trans,
+                       bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
+               break;
+       case -BCH_ERR_journal_res_get_blocked:
+               /*
+                * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
+                * flag
+                */
+               if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
+                   (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+                       ret = -BCH_ERR_journal_reclaim_would_deadlock;
+                       break;
+               }
+
+               ret = drop_locks_do(trans,
+                       bch2_trans_journal_res_get(trans,
+                                       (flags & BCH_WATERMARK_MASK)|
+                                       JOURNAL_RES_GET_CHECK));
+               break;
+       case -BCH_ERR_btree_insert_need_journal_reclaim:
+               bch2_trans_unlock(trans);
+
+               trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
+
+               wait_event_freezable(c->journal.reclaim_wait,
+                                    (ret = journal_reclaim_wait_done(c)));
+               if (ret < 0)
+                       break;
+
+               ret = bch2_trans_relock(trans);
+               break;
+       case -BCH_ERR_btree_insert_need_flush_buffer: {
+               struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+               ret = 0;
+
+               if (wb->state.nr > wb->size * 3 / 4) {
+                       bch2_trans_unlock(trans);
+                       mutex_lock(&wb->flush_lock);
+
+                       if (wb->state.nr > wb->size * 3 / 4) {
+                               bch2_trans_begin(trans);
+                               ret = __bch2_btree_write_buffer_flush(trans,
+                                               flags|BCH_TRANS_COMMIT_no_check_rw, true);
+                               if (!ret) {
+                                       trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+                                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+                               }
+                       } else {
+                               mutex_unlock(&wb->flush_lock);
+                               ret = bch2_trans_relock(trans);
+                       }
+               }
+               break;
+       }
+       default:
+               BUG_ON(ret >= 0);
+               break;
+       }
+
+       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
+
+       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
+                               (flags & BCH_TRANS_COMMIT_no_enospc), c,
+               "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
+
+       return ret;
+}
+
+static noinline int
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       int ret;
+
+       if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
+           test_bit(BCH_FS_STARTED, &c->flags))
+               return -BCH_ERR_erofs_trans_commit;
+
+       ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
+       if (ret)
+               return ret;
+
+       bch2_write_ref_get(c, BCH_WRITE_REF_trans);
+       return 0;
+}
+
+/*
+ * This is for updates done in the early part of fsck - btree_gc - before we've
+ * gone RW. We only add the new key to the list of keys for journal replay to
+ * do.
+ */
+static noinline int
+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       int ret = 0;
+
+       trans_for_each_update(trans, i) {
+               ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+}
+
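+/*
+ * Main commit path: run triggers and validate keys, account for the journal
+ * space the keys (and, with journal_transaction_names, their overwrites) will
+ * need, then attempt the commit via do_bch2_trans_commit() in a retry loop,
+ * letting bch2_trans_commit_error() handle recoverable errors.
+ */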
+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i = NULL;
+       struct btree_write_buffered_key *wb;
+       int ret = 0;
+
+       if (!trans->nr_updates &&
+           !trans->nr_wb_updates &&
+           !trans->extra_journal_entries.nr)
+               goto out_reset;
+
+       ret = bch2_trans_commit_run_triggers(trans);
+       if (ret)
+               goto out_reset;
+
+       trans_for_each_update(trans, i) {
+               struct printbuf buf = PRINTBUF;
+               enum bkey_invalid_flags invalid_flags = 0;
+
+               if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+                       invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+
+               if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
+                                              i->bkey_type, invalid_flags, &buf)))
+                       ret = bch2_trans_commit_bkey_invalid(trans, invalid_flags, i, &buf);
+               btree_insert_entry_checks(trans, i);
+               printbuf_exit(&buf);
+
+               if (ret)
+                       return ret;
+       }
+
+       if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+               ret = do_bch2_trans_commit_to_journal_replay(trans);
+               goto out_reset;
+       }
+
+       if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
+           unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
+               ret = bch2_trans_commit_get_rw_cold(trans, flags);
+               if (ret)
+                       goto out_reset;
+       }
+
+       if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
+           mutex_trylock(&c->btree_write_buffer.flush_lock)) {
+               bch2_trans_begin(trans);
+               bch2_trans_unlock(trans);
+
+               ret = __bch2_btree_write_buffer_flush(trans,
+                                       flags|BCH_TRANS_COMMIT_no_check_rw, true);
+               if (!ret) {
+                       trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
+                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
+               }
+               goto out;
+       }
+
+       EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+
+       trans->journal_u64s             = trans->extra_journal_entries.nr;
+       trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+       if (trans->journal_transaction_names)
+               trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
+
+       trans_for_each_update(trans, i) {
+               EBUG_ON(!i->path->should_be_locked);
+
+               ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+               if (unlikely(ret))
+                       goto out;
+
+               EBUG_ON(!btree_node_intent_locked(i->path, i->level));
+
+               if (i->key_cache_already_flushed)
+                       continue;
+
+               if (i->flags & BTREE_UPDATE_NOJOURNAL)
+                       continue;
+
+               /* we're going to journal the key being updated: */
+               trans->journal_u64s += jset_u64s(i->k->k.u64s);
+
+               /* and we're also going to log the overwrite: */
+               if (trans->journal_transaction_names)
+                       trans->journal_u64s += jset_u64s(i->old_k.u64s);
+       }
+
+       trans_for_each_wb_update(trans, wb)
+               trans->journal_u64s += jset_u64s(wb->k.k.u64s);
+
+       if (trans->extra_journal_res) {
+               ret = bch2_disk_reservation_add(c, trans->disk_res,
+                               trans->extra_journal_res,
+                               (flags & BCH_TRANS_COMMIT_no_enospc)
+                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
+               if (ret)
+                       goto err;
+       }
+retry:
+       bch2_trans_verify_not_in_restart(trans);
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+               memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+
+       ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
+
+       /* make sure we didn't drop or screw up locks: */
+       bch2_trans_verify_locks(trans);
+
+       if (ret)
+               goto err;
+
+       trace_and_count(c, transaction_commit, trans, _RET_IP_);
+out:
+       if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
+               bch2_write_ref_put(c, BCH_WRITE_REF_trans);
+out_reset:
+       if (!ret)
+               bch2_trans_downgrade(trans);
+       bch2_trans_reset_updates(trans);
+
+       return ret;
+err:
+       ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
+       if (ret)
+               goto out;
+
+       /*
+        * We might have done another transaction commit in the error path -
+        * i.e. btree write buffer flush - which will have made use of
+        * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
+        * how the journal sequence number to pin is passed in - so we must
+        * restart:
+        */
+       if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+               ret = -BCH_ERR_transaction_restart_nested;
+               goto out;
+       }
+
+       goto retry;
+}
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
new file mode 100644 (file)
index 0000000..1837f84
--- /dev/null
@@ -0,0 +1,910 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "btree_iter.h"
+#include "btree_journal_iter.h"
+#include "btree_locking.h"
+#include "buckets.h"
+#include "debug.h"
+#include "errcode.h"
+#include "error.h"
+#include "extents.h"
+#include "keylist.h"
+#include "snapshot.h"
+#include "trace.h"
+
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+                                        const struct btree_insert_entry *r)
+{
+       return   cmp_int(l->btree_id,   r->btree_id) ?:
+                cmp_int(l->cached,     r->cached) ?:
+                -cmp_int(l->level,     r->level) ?:
+                bpos_cmp(l->k->k.p,    r->k->k.p);
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+                         struct bkey_i *, enum btree_update_flags,
+                         unsigned long ip);
+
+static noinline int extent_front_merge(struct btree_trans *trans,
+                                      struct btree_iter *iter,
+                                      struct bkey_s_c k,
+                                      struct bkey_i **insert,
+                                      enum btree_update_flags flags)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i *update;
+       int ret;
+
+       update = bch2_bkey_make_mut_noupdate(trans, k);
+       ret = PTR_ERR_OR_ZERO(update);
+       if (ret)
+               return ret;
+
+       if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
+               return 0;
+
+       ret =   bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?:
+               bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p);
+       if (ret < 0)
+               return ret;
+       if (ret)
+               return 0;
+
+       ret = bch2_btree_delete_at(trans, iter, flags);
+       if (ret)
+               return ret;
+
+       *insert = update;
+       return 0;
+}
+
+static noinline int extent_back_merge(struct btree_trans *trans,
+                                     struct btree_iter *iter,
+                                     struct bkey_i *insert,
+                                     struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       int ret;
+
+       ret =   bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
+               bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
+       if (ret < 0)
+               return ret;
+       if (ret)
+               return 0;
+
+       bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+       return 0;
+}
+
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+                                     enum btree_id btree_id, struct bpos pos)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u32 snapshot = pos.snapshot;
+       int ret;
+
+       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+               return 0;
+
+       pos.snapshot++;
+
+       for_each_btree_key_norestart(trans, iter, btree_id, pos,
+                          BTREE_ITER_ALL_SNAPSHOTS|
+                          BTREE_ITER_NOPRESERVE, k, ret) {
+               if (!bkey_eq(k.k->p, pos))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+                                             k.k->p.snapshot)) {
+                       ret = !bkey_whiteout(k.k);
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+                                  enum btree_id id,
+                                  struct bpos old_pos,
+                                  struct bpos new_pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter old_iter, new_iter = { NULL };
+       struct bkey_s_c old_k, new_k;
+       snapshot_id_list s;
+       struct bkey_i *update;
+       int ret = 0;
+
+       if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+               return 0;
+
+       darray_init(&s);
+
+       bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
+              !(ret = bkey_err(old_k)) &&
+              bkey_eq(old_pos, old_k.k->p)) {
+               struct bpos whiteout_pos =
+                       SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+
+               if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+                   snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+                       continue;
+
+               new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+                                          BTREE_ITER_NOT_EXTENTS|
+                                          BTREE_ITER_INTENT);
+               ret = bkey_err(new_k);
+               if (ret)
+                       break;
+
+               if (new_k.k->type == KEY_TYPE_deleted) {
+                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+                       ret = PTR_ERR_OR_ZERO(update);
+                       if (ret)
+                               break;
+
+                       bkey_init(&update->k);
+                       update->k.p             = whiteout_pos;
+                       update->k.type          = KEY_TYPE_whiteout;
+
+                       ret = bch2_trans_update(trans, &new_iter, update,
+                                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               }
+               bch2_trans_iter_exit(trans, &new_iter);
+
+               ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &new_iter);
+       bch2_trans_iter_exit(trans, &old_iter);
+       darray_exit(&s);
+
+       return ret;
+}
+
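+/*
+ * Overwrite (part of) an existing extent @old with @new: depending on how the
+ * two overlap, this may generate a front split, a middle split when @old and
+ * @new are in different snapshots, a deletion or whiteout at @old's end
+ * position, and/or a back split for the part of @old extending past @new.
+ */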
+int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
+                                      struct btree_iter *iter,
+                                      enum btree_update_flags flags,
+                                      struct bkey_s_c old,
+                                      struct bkey_s_c new)
+{
+       enum btree_id btree_id = iter->btree_id;
+       struct bkey_i *update;
+       struct bpos new_start = bkey_start_pos(new.k);
+       bool front_split = bkey_lt(bkey_start_pos(old.k), new_start);
+       bool back_split  = bkey_gt(old.k->p, new.k->p);
+       int ret = 0, compressed_sectors;
+
+       /*
+        * If we're going to be splitting a compressed extent, note it
+        * so that __bch2_trans_commit() can increase our disk
+        * reservation:
+        */
+       if (((front_split && back_split) ||
+            ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) &&
+           (compressed_sectors = bch2_bkey_sectors_compressed(old)))
+               trans->extra_journal_res += compressed_sectors;
+
+       if (front_split) {
+               update = bch2_bkey_make_mut_noupdate(trans, old);
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       return ret;
+
+               bch2_cut_back(new_start, update);
+
+               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+                                       old.k->p, update->k.p) ?:
+                       bch2_btree_insert_nonextent(trans, btree_id, update,
+                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+               if (ret)
+                       return ret;
+       }
+
+       /* If we're overwriting in a different snapshot - middle split: */
+       if (old.k->p.snapshot != new.k->p.snapshot &&
+           (front_split || back_split)) {
+               update = bch2_bkey_make_mut_noupdate(trans, old);
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       return ret;
+
+               bch2_cut_front(new_start, update);
+               bch2_cut_back(new.k->p, update);
+
+               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
+                                       old.k->p, update->k.p) ?:
+                       bch2_btree_insert_nonextent(trans, btree_id, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+               if (ret)
+                       return ret;
+       }
+
+       if (bkey_le(old.k->p, new.k->p)) {
+               update = bch2_trans_kmalloc(trans, sizeof(*update));
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       return ret;
+
+               bkey_init(&update->k);
+               update->k.p = old.k->p;
+               update->k.p.snapshot = new.k->p.snapshot;
+
+               if (new.k->p.snapshot != old.k->p.snapshot) {
+                       update->k.type = KEY_TYPE_whiteout;
+               } else if (btree_type_has_snapshots(btree_id)) {
+                       ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
+                       if (ret < 0)
+                               return ret;
+                       if (ret)
+                               update->k.type = KEY_TYPE_whiteout;
+               }
+
+               ret = bch2_btree_insert_nonextent(trans, btree_id, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
+               if (ret)
+                       return ret;
+       }
+
+       if (back_split) {
+               update = bch2_bkey_make_mut_noupdate(trans, old);
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       return ret;
+
+               bch2_cut_front(new.k->p, update);
+
+               ret = bch2_trans_update_by_path(trans, iter->path, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                         flags, _RET_IP_);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static int bch2_trans_update_extent(struct btree_trans *trans,
+                                   struct btree_iter *orig_iter,
+                                   struct bkey_i *insert,
+                                   enum btree_update_flags flags)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       enum btree_id btree_id = orig_iter->btree_id;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES|
+                            BTREE_ITER_NOT_EXTENTS);
+       k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+       if ((ret = bkey_err(k)))
+               goto err;
+       if (!k.k)
+               goto out;
+
+       if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
+               if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+                       ret = extent_front_merge(trans, &iter, k, &insert, flags);
+                       if (ret)
+                               goto err;
+               }
+
+               goto next;
+       }
+
+       while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
+               bool done = bkey_lt(insert->k.p, k.k->p);
+
+               ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+               if (ret)
+                       goto err;
+
+               if (done)
+                       goto out;
+next:
+               bch2_btree_iter_advance(&iter);
+               k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
+               if ((ret = bkey_err(k)))
+                       goto err;
+               if (!k.k)
+                       goto out;
+       }
+
+       if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+               ret = extent_back_merge(trans, &iter, insert, k);
+               if (ret)
+                       goto err;
+       }
+out:
+       if (!bkey_deleted(&insert->k))
+               ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+static noinline int flush_new_cached_update(struct btree_trans *trans,
+                                           struct btree_path *path,
+                                           struct btree_insert_entry *i,
+                                           enum btree_update_flags flags,
+                                           unsigned long ip)
+{
+       struct btree_path *btree_path;
+       struct bkey k;
+       int ret;
+
+       btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                  BTREE_ITER_INTENT, _THIS_IP_);
+       ret = bch2_btree_path_traverse(trans, btree_path, 0);
+       if (ret)
+               goto out;
+
+       /*
+        * The old key in the insert entry might actually refer to an existing
+        * key in the btree that has been deleted from cache and not yet
+        * flushed. Check for this and skip the flush so we don't run triggers
+        * against a stale key.
+        */
+       bch2_btree_path_peek_slot_exact(btree_path, &k);
+       if (!bkey_deleted(&k))
+               goto out;
+
+       i->key_cache_already_flushed = true;
+       i->flags |= BTREE_TRIGGER_NORUN;
+
+       btree_path_set_should_be_locked(btree_path);
+       ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
+out:
+       bch2_path_put(trans, btree_path, true);
+       return ret;
+}
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+                         struct bkey_i *k, enum btree_update_flags flags,
+                         unsigned long ip)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i, n;
+       int cmp;
+
+       EBUG_ON(!path->should_be_locked);
+       EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+       EBUG_ON(!bpos_eq(k->k.p, path->pos));
+
+       n = (struct btree_insert_entry) {
+               .flags          = flags,
+               .bkey_type      = __btree_node_type(path->level, path->btree_id),
+               .btree_id       = path->btree_id,
+               .level          = path->level,
+               .cached         = path->cached,
+               .path           = path,
+               .k              = k,
+               .ip_allocated   = ip,
+       };
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       trans_for_each_update(trans, i)
+               BUG_ON(i != trans->updates &&
+                      btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
+
+       /*
+        * Pending updates are kept sorted: first, find position of new update,
+        * then delete/trim any updates the new update overwrites:
+        */
+       trans_for_each_update(trans, i) {
+               cmp = btree_insert_entry_cmp(&n, i);
+               if (cmp <= 0)
+                       break;
+       }
+
+       if (!cmp && i < trans->updates + trans->nr_updates) {
+               EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
+
+               bch2_path_put(trans, i->path, true);
+               i->flags        = n.flags;
+               i->cached       = n.cached;
+               i->k            = n.k;
+               i->path         = n.path;
+               i->ip_allocated = n.ip_allocated;
+       } else {
+               array_insert_item(trans->updates, trans->nr_updates,
+                                 i - trans->updates, n);
+
+               i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
+               i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
+
+               if (unlikely(trans->journal_replay_not_finished)) {
+                       struct bkey_i *j_k =
+                               bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
+
+                       if (j_k) {
+                               i->old_k = j_k->k;
+                               i->old_v = &j_k->v;
+                       }
+               }
+       }
+
+       __btree_path_get(i->path, true);
+
+       /*
+        * If a key is present in the key cache, it must also exist in the
+        * btree - this is necessary for cache coherency. When iterating over
+        * a btree that's cached in the key cache, the btree iter code checks
+        * the key cache - but the key has to exist in the btree for that to
+        * work:
+        */
+       if (path->cached && bkey_deleted(&i->old_k))
+               return flush_new_cached_update(trans, path, i, flags, ip);
+
+       return 0;
+}
+
+static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+                                                   struct btree_iter *iter,
+                                                   struct btree_path *path)
+{
+       if (!iter->key_cache_path ||
+           !iter->key_cache_path->should_be_locked ||
+           !bpos_eq(iter->key_cache_path->pos, iter->pos)) {
+               struct bkey_cached *ck;
+               int ret;
+
+               if (!iter->key_cache_path)
+                       iter->key_cache_path =
+                               bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                             BTREE_ITER_INTENT|
+                                             BTREE_ITER_CACHED, _THIS_IP_);
+
+               iter->key_cache_path =
+                       bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+                                               iter->flags & BTREE_ITER_INTENT,
+                                               _THIS_IP_);
+
+               ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+                                              BTREE_ITER_CACHED);
+               if (unlikely(ret))
+                       return ret;
+
+               ck = (void *) iter->key_cache_path->l[0].b;
+
+               if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                       trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
+                       return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
+               }
+
+               btree_path_set_should_be_locked(iter->key_cache_path);
+       }
+
+       return 0;
+}
+
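+/*
+ * Queue an update in the current transaction; the btree is only modified when
+ * the transaction commits. Extent updates are routed through
+ * bch2_trans_update_extent(), and updates to cached btrees go via the key
+ * cache.
+ *
+ * Minimal sketch of a caller (hypothetical; @k built beforehand):
+ *
+ *     ret =   bch2_btree_iter_traverse(&iter) ?:
+ *             bch2_trans_update(trans, &iter, k, 0);
+ */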
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+                                  struct bkey_i *k, enum btree_update_flags flags)
+{
+       struct btree_path *path = iter->update_path ?: iter->path;
+       int ret;
+
+       if (iter->flags & BTREE_ITER_IS_EXTENTS)
+               return bch2_trans_update_extent(trans, iter, k, flags);
+
+       if (bkey_deleted(&k->k) &&
+           !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+           (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+               ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+               if (unlikely(ret < 0))
+                       return ret;
+
+               if (ret)
+                       k->k.type = KEY_TYPE_whiteout;
+       }
+
+       /*
+        * Ensure that updates to cached btrees go to the key cache:
+        */
+       if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+           !path->cached &&
+           !path->level &&
+           btree_id_cached(trans->c, path->btree_id)) {
+               ret = bch2_trans_update_get_key_cache(trans, iter, path);
+               if (ret)
+                       return ret;
+
+               path = iter->key_cache_path;
+       }
+
+       return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
+}
+
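+/*
+ * Queue an update through the btree write buffer rather than as a regular
+ * transaction update: an existing buffered update to the same position is
+ * overwritten in place, and the wb_updates array is doubled when it fills up.
+ */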
+int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+                                           enum btree_id btree,
+                                           struct bkey_i *k)
+{
+       struct btree_write_buffered_key *i;
+       int ret;
+
+       EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
+       EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+
+       trans_for_each_wb_update(trans, i) {
+               if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
+                       bkey_copy(&i->k, k);
+                       return 0;
+               }
+       }
+
+       if (!trans->wb_updates ||
+           trans->nr_wb_updates == trans->wb_updates_size) {
+               struct btree_write_buffered_key *u;
+
+               if (trans->nr_wb_updates == trans->wb_updates_size) {
+                       struct btree_transaction_stats *s = btree_trans_stats(trans);
+
+                       BUG_ON(trans->wb_updates_size > U8_MAX / 2);
+                       trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
+                       if (s)
+                               s->wb_updates_size = trans->wb_updates_size;
+               }
+
+               u = bch2_trans_kmalloc_nomemzero(trans,
+                                       trans->wb_updates_size *
+                                       sizeof(struct btree_write_buffered_key));
+               ret = PTR_ERR_OR_ZERO(u);
+               if (ret)
+                       return ret;
+
+               if (trans->nr_wb_updates)
+                       memcpy(u, trans->wb_updates, trans->nr_wb_updates *
+                              sizeof(struct btree_write_buffered_key));
+               trans->wb_updates = u;
+       }
+
+       trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
+               .btree  = btree,
+       };
+
+       bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
+       trans->nr_wb_updates++;
+
+       return 0;
+}
+
+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+                            enum btree_id btree, struct bpos end)
+{
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_prev(iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       bch2_btree_iter_advance(iter);
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       BUG_ON(k.k->type != KEY_TYPE_deleted);
+
+       if (bkey_gt(k.k->p, end)) {
+               ret = -BCH_ERR_ENOSPC_btree_slot;
+               goto err;
+       }
+
+       return 0;
+err:
+       bch2_trans_iter_exit(trans, iter);
+       return ret;
+}
+
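+/*
+ * Register a commit hook: hooks run, most recently added first, while write
+ * locks are held during commit; a nonzero return from a hook fails that
+ * commit attempt with the returned error.
+ *
+ * Minimal sketch of a caller (my_hook_fn is hypothetical):
+ *
+ *     h->fn = my_hook_fn;
+ *     bch2_trans_commit_hook(trans, h);
+ */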
+void bch2_trans_commit_hook(struct btree_trans *trans,
+                           struct btree_trans_commit_hook *h)
+{
+       h->next = trans->hooks;
+       trans->hooks = h;
+}
+
+int bch2_btree_insert_nonextent(struct btree_trans *trans,
+                               enum btree_id btree, struct bkey_i *k,
+                               enum btree_update_flags flags)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, flags);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
+                           struct bkey_i *k, enum btree_update_flags flags)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, flags);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/**
+ * bch2_btree_insert - insert a key into the btree specified by @id
+ * @c:                 pointer to struct bch_fs
+ * @id:                        btree to insert into
+ * @k:                 key to insert
+ * @disk_res:          must be non-NULL whenever inserting or potentially
+ *                     splitting data extents
+ * @flags:             transaction commit flags
+ *
+ * Returns:            0 on success, error code on failure
+ */
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
+                     struct disk_reservation *disk_res, int flags)
+{
+       return bch2_trans_do(c, disk_res, NULL, flags,
+                            bch2_btree_insert_trans(trans, id, k, 0));
+}
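+/*
+ * Illustrative bch2_btree_insert() call (hypothetical caller, @k built
+ * beforehand; @disk_res may be NULL when no data extents are involved):
+ *
+ *     ret = bch2_btree_insert(c, id, k, NULL, 0);
+ */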
+
+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
+                               unsigned len, unsigned update_flags)
+{
+       struct bkey_i *k;
+
+       k = bch2_trans_kmalloc(trans, sizeof(*k));
+       if (IS_ERR(k))
+               return PTR_ERR(k);
+
+       bkey_init(&k->k);
+       k->k.p = iter->pos;
+       bch2_key_resize(&k->k, len);
+       return bch2_trans_update(trans, iter, k, update_flags);
+}
+
+int bch2_btree_delete_at(struct btree_trans *trans,
+                        struct btree_iter *iter, unsigned update_flags)
+{
+       return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
+}
+
+int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+                                 enum btree_id btree, struct bpos pos)
+{
+       struct bkey_i *k;
+
+       k = bch2_trans_kmalloc(trans, sizeof(*k));
+       if (IS_ERR(k))
+               return PTR_ERR(k);
+
+       bkey_init(&k->k);
+       k->k.p = pos;
+       return bch2_trans_update_buffered(trans, btree, k);
+}
+
+int bch2_btree_delete(struct btree_trans *trans,
+                     enum btree_id btree, struct bpos pos,
+                     unsigned update_flags)
+{
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, btree, pos,
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_btree_delete_at(trans, &iter, update_flags);
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
+                                 struct bpos start, struct bpos end,
+                                 unsigned update_flags,
+                                 u64 *journal_seq)
+{
+       u32 restart_count = trans->restart_count;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
+       while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(trans->c, 0);
+               struct bkey_i delete;
+
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               bkey_init(&delete.k);
+
+               /*
+                * This could probably be more efficient for extents:
+                */
+
+               /*
+                * For extents, iter.pos won't necessarily be the same as
+                * bkey_start_pos(k.k) (for non extents they always will be the
+                * same). It's important that we delete starting from iter.pos
+                * because the range we want to delete could start in the middle
+                * of k.
+                *
+                * (bch2_btree_iter_peek() does guarantee that iter.pos >=
+                * bkey_start_pos(k.k)).
+                */
+               delete.k.p = iter.pos;
+
+               if (iter.flags & BTREE_ITER_IS_EXTENTS)
+                       bch2_key_resize(&delete.k,
+                                       bpos_min(end, k.k->p).offset -
+                                       iter.pos.offset);
+
+               ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
+                       bch2_trans_commit(trans, &disk_res, journal_seq,
+                                         BCH_TRANS_COMMIT_no_enospc);
+               bch2_disk_reservation_put(trans->c, &disk_res);
+err:
+               /*
+                * the bch2_trans_begin() call is in a weird place because we
+                * need to call it after every transaction commit, to avoid path
+                * overflow, but don't want to call it if the delete operation
+                * is a no-op and we have no work to do:
+                */
+               bch2_trans_begin(trans);
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       ret = 0;
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret ?: trans_was_restarted(trans, restart_count);
+}
+
+/*
+ * bch2_btree_delete_range - delete everything within a given range
+ *
+ * Range is a half open interval - [start, end)
+ */
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
+                           struct bpos start, struct bpos end,
+                           unsigned update_flags,
+                           u64 *journal_seq)
+{
+       int ret = bch2_trans_run(c,
+                       bch2_btree_delete_range_trans(trans, id, start, end,
+                                                     update_flags, journal_seq));
+       if (ret == -BCH_ERR_transaction_restart_nested)
+               ret = 0;
+       return ret;
+}
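+/*
+ * Illustrative bch2_btree_delete_range() call (hypothetical; snapshot details
+ * omitted): delete every extent belonging to inode @inum:
+ *
+ *     bch2_btree_delete_range(c, BTREE_ID_extents,
+ *                             POS(inum, 0), POS(inum + 1, 0),
+ *                             0, NULL);
+ */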
+
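+/*
+ * Set or clear a "bit" at @pos: writes a KEY_TYPE_set key when @set is true,
+ * or a KEY_TYPE_deleted key to clear it, routed through the btree write
+ * buffer.
+ */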
+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
+                      struct bpos pos, bool set)
+{
+       struct bkey_i *k;
+       int ret = 0;
+
+       k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
+       ret = PTR_ERR_OR_ZERO(k);
+       if (unlikely(ret))
+               return ret;
+
+       bkey_init(&k->k);
+       k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+       k->k.p = pos;
+
+       return bch2_trans_update_buffered(trans, btree, k);
+}
+
+__printf(2, 0)
+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+{
+       struct printbuf buf = PRINTBUF;
+       struct jset_entry_log *l;
+       unsigned u64s;
+       int ret;
+
+       prt_vprintf(&buf, fmt, args);
+       ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+       if (ret)
+               goto err;
+
+       u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+
+       ret = darray_make_room(entries, jset_u64s(u64s));
+       if (ret)
+               goto err;
+
+       l = (void *) &darray_top(*entries);
+       l->entry.u64s           = cpu_to_le16(u64s);
+       l->entry.btree_id       = 0;
+       l->entry.level          = 1;
+       l->entry.type           = BCH_JSET_ENTRY_log;
+       l->entry.pad[0]         = 0;
+       l->entry.pad[1]         = 0;
+       l->entry.pad[2]         = 0;
+       memcpy(l->d, buf.buf, buf.pos);
+       while (buf.pos & 7)
+               l->d[buf.pos++] = '\0';
+
+       entries->nr += jset_u64s(u64s);
+err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+__printf(3, 0)
+static int
+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
+                 va_list args)
+{
+       int ret;
+
+       if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
+               ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+       } else {
+               ret = bch2_trans_do(c, NULL, NULL,
+                       BCH_TRANS_COMMIT_lazy_rw|commit_flags,
+                       __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
+       }
+
+       return ret;
+}
+
+__printf(2, 3)
+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+       va_list args;
+       int ret;
+
+       va_start(args, fmt);
+       ret = __bch2_fs_log_msg(c, 0, fmt, args);
+       va_end(args);
+       return ret;
+}
+
+/*
+ * Use for logging messages during recovery to enable reserved space and avoid
+ * blocking.
+ */
+__printf(2, 3)
+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
+{
+       va_list args;
+       int ret;
+
+       va_start(args, fmt);
+       ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
+       va_end(args);
+       return ret;
+}
diff --git a/libbcachefs/darray.c b/libbcachefs/darray.c
new file mode 100644 (file)
index 0000000..aae07be
--- /dev/null
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include "darray.h"
+
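+/*
+ * Grow the darray to hold at least @new_size elements of @element_size bytes;
+ * the allocation is rounded up to the next power of two, so repeated pushes
+ * only cause a logarithmic number of reallocations.
+ */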
+int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, gfp_t gfp)
+{
+       if (new_size > d->size) {
+               new_size = roundup_pow_of_two(new_size);
+
+               void *data = krealloc_array(d->data, new_size, element_size, gfp);
+               if (!data)
+                       return -ENOMEM;
+
+               d->data = data;
+               d->size = new_size;
+       }
+
+       return 0;
+}
diff --git a/libbcachefs/disk_groups_types.h b/libbcachefs/disk_groups_types.h
new file mode 100644 (file)
index 0000000..a54ef08
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
+#define _BCACHEFS_DISK_GROUPS_TYPES_H
+
+struct bch_disk_group_cpu {
+       bool                            deleted;
+       u16                             parent;
+       u8                              label[BCH_SB_LABEL_SIZE];
+       struct bch_devs_mask            devs;
+};
+
+struct bch_disk_groups_cpu {
+       struct rcu_head                 rcu;
+       unsigned                        nr;
+       struct bch_disk_group_cpu       entries[] __counted_by(nr);
+};
+
+#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
new file mode 100644 (file)
index 0000000..52f0e7a
--- /dev/null
@@ -0,0 +1,1106 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/backing-dev.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+
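+/*
+ * True if @bio can't accept another @len bytes: either every bvec is in use
+ * or adding @len would overflow bi_size.
+ */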
+static inline bool bio_full(struct bio *bio, unsigned len)
+{
+       if (bio->bi_vcnt >= bio->bi_max_vecs)
+               return true;
+       if (bio->bi_iter.bi_size > UINT_MAX - len)
+               return true;
+       return false;
+}
+
+/* readpage(s): */
+
+static void bch2_readpages_end_io(struct bio *bio)
+{
+       struct folio_iter fi;
+
+       bio_for_each_folio_all(fi, bio) {
+               if (!bio->bi_status) {
+                       folio_mark_uptodate(fi.folio);
+               } else {
+                       folio_clear_uptodate(fi.folio);
+                       folio_set_error(fi.folio);
+               }
+               folio_unlock(fi.folio);
+       }
+
+       bio_put(bio);
+}
+
+struct readpages_iter {
+       struct address_space    *mapping;
+       unsigned                idx;
+       folios                  folios;
+};
+
+static int readpages_iter_init(struct readpages_iter *iter,
+                              struct readahead_control *ractl)
+{
+       struct folio **fi;
+       int ret;
+
+       memset(iter, 0, sizeof(*iter));
+
+       iter->mapping = ractl->mapping;
+
+       ret = bch2_filemap_get_contig_folios_d(iter->mapping,
+                               ractl->_index << PAGE_SHIFT,
+                               (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
+                               0, mapping_gfp_mask(iter->mapping),
+                               &iter->folios);
+       if (ret)
+               return ret;
+
+       darray_for_each(iter->folios, fi) {
+               ractl->_nr_pages -= 1U << folio_order(*fi);
+               __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
+               folio_put(*fi);
+               folio_put(*fi);
+       }
+
+       return 0;
+}
+
+static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
+{
+       if (iter->idx >= iter->folios.nr)
+               return NULL;
+       return iter->folios.data[iter->idx];
+}
+
+static inline void readpage_iter_advance(struct readpages_iter *iter)
+{
+       iter->idx++;
+}
+
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
+
+       bkey_for_each_crc(k.k, ptrs, crc, i)
+               if (crc.csum_type || crc.compression_type)
+                       return true;
+       return false;
+}
+
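+/*
+ * Extend @bio to cover up to @sectors_this_extent: first consume folios
+ * already queued by readahead, then, if @get_more, allocate new folios and
+ * add them to the page cache past the readahead window, stopping at any folio
+ * that's already present.
+ */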
+static int readpage_bio_extend(struct btree_trans *trans,
+                              struct readpages_iter *iter,
+                              struct bio *bio,
+                              unsigned sectors_this_extent,
+                              bool get_more)
+{
+       /* Don't hold btree locks while allocating memory: */
+       bch2_trans_unlock(trans);
+
+       while (bio_sectors(bio) < sectors_this_extent &&
+              bio->bi_vcnt < bio->bi_max_vecs) {
+               struct folio *folio = readpage_iter_peek(iter);
+               int ret;
+
+               if (folio) {
+                       readpage_iter_advance(iter);
+               } else {
+                       pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
+
+                       if (!get_more)
+                               break;
+
+                       folio = xa_load(&iter->mapping->i_pages, folio_offset);
+                       if (folio && !xa_is_value(folio))
+                               break;
+
+                       folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
+                       if (!folio)
+                               break;
+
+                       if (!__bch2_folio_create(folio, GFP_KERNEL)) {
+                               folio_put(folio);
+                               break;
+                       }
+
+                       ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
+                       if (ret) {
+                               __bch2_folio_release(folio);
+                               folio_put(folio);
+                               break;
+                       }
+
+                       folio_put(folio);
+               }
+
+               BUG_ON(folio_sector(folio) != bio_end_sector(bio));
+
+               BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
+       }
+
+       return bch2_trans_relock(trans);
+}
+
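+/*
+ * Walk the extents btree from the bio's current sector, resolving indirect
+ * extents and (for readahead) extending the bio over additional folios when
+ * that's cheap, issuing a read for each extent until the bio's last fragment
+ * has been submitted.
+ */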
+static void bchfs_read(struct btree_trans *trans,
+                      struct bch_read_bio *rbio,
+                      subvol_inum inum,
+                      struct readpages_iter *readpages_iter)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_buf sk;
+       int flags = BCH_READ_RETRY_IF_STALE|
+               BCH_READ_MAY_PROMOTE;
+       u32 snapshot;
+       int ret = 0;
+
+       rbio->c = c;
+       rbio->start_time = local_clock();
+       rbio->subvol = inum.subvol;
+
+       bch2_bkey_buf_init(&sk);
+retry:
+       bch2_trans_begin(trans);
+       iter = (struct btree_iter) { NULL };
+
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+                            BTREE_ITER_SLOTS);
+       while (1) {
+               struct bkey_s_c k;
+               unsigned bytes, sectors, offset_into_extent;
+               enum btree_id data_btree = BTREE_ID_extents;
+
+               /*
+                * read_extent -> io_time_reset may cause a transaction restart
+                * without returning an error, we need to check for that here:
+                */
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       break;
+
+               bch2_btree_iter_set_pos(&iter,
+                               POS(inum.inum, rbio->bio.bi_iter.bi_sector));
+
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               offset_into_extent = iter.pos.offset -
+                       bkey_start_offset(k.k);
+               sectors = k.k->size - offset_into_extent;
+
+               bch2_bkey_buf_reassemble(&sk, c, k);
+
+               ret = bch2_read_indirect_extent(trans, &data_btree,
+                                       &offset_into_extent, &sk);
+               if (ret)
+                       break;
+
+               k = bkey_i_to_s_c(sk.k);
+
+               sectors = min(sectors, k.k->size - offset_into_extent);
+
+               if (readpages_iter) {
+                       ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
+                                                 extent_partial_reads_expensive(k));
+                       if (ret)
+                               break;
+               }
+
+               bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+               swap(rbio->bio.bi_iter.bi_size, bytes);
+
+               if (rbio->bio.bi_iter.bi_size == bytes)
+                       flags |= BCH_READ_LAST_FRAGMENT;
+
+               bch2_bio_page_state_set(&rbio->bio, k);
+
+               bch2_read_extent(trans, rbio, iter.pos,
+                                data_btree, k, offset_into_extent, flags);
+
+               if (flags & BCH_READ_LAST_FRAGMENT)
+                       break;
+
+               swap(rbio->bio.bi_iter.bi_size, bytes);
+               bio_advance(&rbio->bio, bytes);
+
+               ret = btree_trans_too_many_iters(trans);
+               if (ret)
+                       break;
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               goto retry;
+
+       if (ret) {
+               bch_err_inum_offset_ratelimited(c,
+                               iter.pos.inode,
+                               iter.pos.offset << 9,
+                               "read error %i from btree lookup", ret);
+               rbio->bio.bi_status = BLK_STS_IOERR;
+               bio_endio(&rbio->bio);
+       }
+
+       bch2_bkey_buf_exit(&sk, c);
+}
+
+void bch2_readahead(struct readahead_control *ractl)
+{
+       struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_io_opts opts;
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct folio *folio;
+       struct readpages_iter readpages_iter;
+       int ret;
+
+       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+       ret = readpages_iter_init(&readpages_iter, ractl);
+       BUG_ON(ret);
+
+       bch2_pagecache_add_get(inode);
+
+       while ((folio = readpage_iter_peek(&readpages_iter))) {
+               unsigned n = min_t(unsigned,
+                                  readpages_iter.folios.nr -
+                                  readpages_iter.idx,
+                                  BIO_MAX_VECS);
+               struct bch_read_bio *rbio =
+                       rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
+                                                  GFP_KERNEL, &c->bio_read),
+                                 opts);
+
+               readpage_iter_advance(&readpages_iter);
+
+               rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+               rbio->bio.bi_end_io = bch2_readpages_end_io;
+               BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+               bchfs_read(trans, rbio, inode_inum(inode),
+                          &readpages_iter);
+               bch2_trans_unlock(trans);
+       }
+
+       bch2_pagecache_add_put(inode);
+
+       bch2_trans_put(trans);
+       darray_exit(&readpages_iter.folios);
+}
+
+static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
+                            subvol_inum inum, struct folio *folio)
+{
+       bch2_folio_create(folio, __GFP_NOFAIL);
+
+       rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+       rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+       BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+       bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
+}
+
+static void bch2_read_single_folio_end_io(struct bio *bio)
+{
+       complete(bio->bi_private);
+}
+
+int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_read_bio *rbio;
+       struct bch_io_opts opts;
+       int ret;
+       DECLARE_COMPLETION_ONSTACK(done);
+
+       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+       rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
+                        opts);
+       rbio->bio.bi_private = &done;
+       rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
+
+       __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+       wait_for_completion(&done);
+
+       ret = blk_status_to_errno(rbio->bio.bi_status);
+       bio_put(&rbio->bio);
+
+       if (ret < 0)
+               return ret;
+
+       folio_mark_uptodate(folio);
+       return 0;
+}
+
+int bch2_read_folio(struct file *file, struct folio *folio)
+{
+       int ret;
+
+       ret = bch2_read_single_folio(folio, folio->mapping);
+       folio_unlock(folio);
+       return bch2_err_class(ret);
+}
+
+/* writepages: */
+
+struct bch_writepage_io {
+       struct bch_inode_info           *inode;
+
+       /* must be last: */
+       struct bch_write_op             op;
+};
+
+struct bch_writepage_state {
+       struct bch_writepage_io *io;
+       struct bch_io_opts      opts;
+       struct bch_folio_sector *tmp;
+       unsigned                tmp_sectors;
+};
+
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+                                                                 struct bch_inode_info *inode)
+{
+       struct bch_writepage_state ret = { 0 };
+
+       bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
+       return ret;
+}
+
+/*
+ * Determine when a writepage io is full. We have to limit writepage bios to a
+ * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to
+ * what the bounce path in bch2_write_extent() can handle. In theory we could
+ * loosen this restriction for non-bounce I/O, but we don't have that context
+ * here. Ideally, we can up this limit and make it configurable in the future
+ * when the bounce path can be enhanced to accommodate larger source bios.
+ */
+static inline bool bch_io_full(struct bch_writepage_io *io, unsigned len)
+{
+       struct bio *bio = &io->op.wbio.bio;
+       return bio_full(bio, len) ||
+               (bio->bi_iter.bi_size + len > BIO_MAX_VECS * PAGE_SIZE);
+}
+
+static void bch2_writepage_io_done(struct bch_write_op *op)
+{
+       struct bch_writepage_io *io =
+               container_of(op, struct bch_writepage_io, op);
+       struct bch_fs *c = io->op.c;
+       struct bio *bio = &io->op.wbio.bio;
+       struct folio_iter fi;
+       unsigned i;
+
+       if (io->op.error) {
+               set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
+               bio_for_each_folio_all(fi, bio) {
+                       struct bch_folio *s;
+
+                       folio_set_error(fi.folio);
+                       mapping_set_error(fi.folio->mapping, -EIO);
+
+                       s = __bch2_folio(fi.folio);
+                       spin_lock(&s->lock);
+                       for (i = 0; i < folio_sectors(fi.folio); i++)
+                               s->s[i].nr_replicas = 0;
+                       spin_unlock(&s->lock);
+               }
+       }
+
+       if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+               bio_for_each_folio_all(fi, bio) {
+                       struct bch_folio *s;
+
+                       s = __bch2_folio(fi.folio);
+                       spin_lock(&s->lock);
+                       for (i = 0; i < folio_sectors(fi.folio); i++)
+                               s->s[i].nr_replicas = 0;
+                       spin_unlock(&s->lock);
+               }
+       }
+
+       /*
+        * racing with fallocate can cause us to add fewer sectors than
+        * expected - but we shouldn't add more sectors than expected:
+        */
+       WARN_ON_ONCE(io->op.i_sectors_delta > 0);
+
+       /*
+        * An error partway through a folio (e.g. due to going read-only)
+        * can leave i_sectors_delta slightly off, so the stricter assertion
+        * below stays disabled:
+        * BUG_ON(io->op.i_sectors_delta >= PAGE_SECTORS);
+        */
+
+       /*
+        * PageWriteback is effectively our ref on the inode - fixup i_blocks
+        * before calling end_page_writeback:
+        */
+       bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
+
+       bio_for_each_folio_all(fi, bio) {
+               struct bch_folio *s = __bch2_folio(fi.folio);
+
+               if (atomic_dec_and_test(&s->write_count))
+                       folio_end_writeback(fi.folio);
+       }
+
+       bio_put(&io->op.wbio.bio);
+}
+
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+       struct bch_writepage_io *io = w->io;
+
+       w->io = NULL;
+       closure_call(&io->op.cl, bch2_write, NULL, NULL);
+}
+
+/*
+ * Allocate a new bch_writepage_io for @w and initialize its write op; the
+ * caller decides when an existing io can be appended to and adds the folios:
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+                                   struct writeback_control *wbc,
+                                   struct bch_writepage_state *w,
+                                   struct bch_inode_info *inode,
+                                   u64 sector,
+                                   unsigned nr_replicas)
+{
+       struct bch_write_op *op;
+
+       w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
+                                             REQ_OP_WRITE,
+                                             GFP_KERNEL,
+                                             &c->writepage_bioset),
+                            struct bch_writepage_io, op.wbio.bio);
+
+       w->io->inode            = inode;
+       op                      = &w->io->op;
+       bch2_write_op_init(op, c, w->opts);
+       op->target              = w->opts.foreground_target;
+       op->nr_replicas         = nr_replicas;
+       op->res.nr_replicas     = nr_replicas;
+       op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
+       op->subvol              = inode->ei_subvol;
+       op->pos                 = POS(inode->v.i_ino, sector);
+       op->end_io              = bch2_writepage_io_done;
+       op->devs_need_flush     = &inode->ei_devs_need_flush;
+       op->wbio.bio.bi_iter.bi_sector = sector;
+       op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
+}
+
+static int __bch2_writepage(struct folio *folio,
+                           struct writeback_control *wbc,
+                           void *data)
+{
+       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_writepage_state *w = data;
+       struct bch_folio *s;
+       unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
+       loff_t i_size = i_size_read(&inode->v);
+       int ret;
+
+       EBUG_ON(!folio_test_uptodate(folio));
+
+       /* Is the folio fully inside i_size? */
+       if (folio_end_pos(folio) <= i_size)
+               goto do_io;
+
+       /* Is the folio fully outside i_size? (truncate in progress) */
+       if (folio_pos(folio) >= i_size) {
+               folio_unlock(folio);
+               return 0;
+       }
+
+       /*
+        * The folio straddles i_size.  It must be zeroed out on each and every
+        * writepage invocation because it may be mmapped.  "A file is mapped
+        * in multiples of the folio size.  For a file that is not a multiple of
+        * the  folio size, the remaining memory is zeroed when mapped, and
+        * writes to that region are not written out to the file."
+        */
+       folio_zero_segment(folio,
+                          i_size - folio_pos(folio),
+                          folio_size(folio));
+do_io:
+       f_sectors = folio_sectors(folio);
+       s = bch2_folio(folio);
+
+       if (f_sectors > w->tmp_sectors) {
+               kfree(w->tmp);
+               w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL);
+               w->tmp_sectors = f_sectors;
+       }
+
+       /*
+        * Errors during writeback are hard to recover from cleanly, so the
+        * reservation below is expected to succeed:
+        */
+       ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
+       BUG_ON(ret);
+
+       /* Before unlocking the folio, get a copy of the reservations: */
+       spin_lock(&s->lock);
+       memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
+
+       for (i = 0; i < f_sectors; i++) {
+               if (s->s[i].state < SECTOR_dirty)
+                       continue;
+
+               nr_replicas_this_write =
+                       min_t(unsigned, nr_replicas_this_write,
+                             s->s[i].nr_replicas +
+                             s->s[i].replicas_reserved);
+       }
+
+       for (i = 0; i < f_sectors; i++) {
+               if (s->s[i].state < SECTOR_dirty)
+                       continue;
+
+               s->s[i].nr_replicas = w->opts.compression
+                       ? 0 : nr_replicas_this_write;
+
+               s->s[i].replicas_reserved = 0;
+               bch2_folio_sector_set(folio, s, i, SECTOR_allocated);
+       }
+       spin_unlock(&s->lock);
+
+       BUG_ON(atomic_read(&s->write_count));
+       atomic_set(&s->write_count, 1);
+
+       BUG_ON(folio_test_writeback(folio));
+       folio_start_writeback(folio);
+
+       folio_unlock(folio);
+
+       offset = 0;
+       while (1) {
+               unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
+               u64 sector;
+
+               while (offset < f_sectors &&
+                      w->tmp[offset].state < SECTOR_dirty)
+                       offset++;
+
+               if (offset == f_sectors)
+                       break;
+
+               while (offset + sectors < f_sectors &&
+                      w->tmp[offset + sectors].state >= SECTOR_dirty) {
+                       reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
+                       dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
+                       sectors++;
+               }
+               BUG_ON(!sectors);
+
+               sector = folio_sector(folio) + offset;
+
+               if (w->io &&
+                   (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+                    bch_io_full(w->io, sectors << 9) ||
+                    bio_end_sector(&w->io->op.wbio.bio) != sector))
+                       bch2_writepage_do_io(w);
+
+               if (!w->io)
+                       bch2_writepage_io_alloc(c, wbc, w, inode, sector,
+                                               nr_replicas_this_write);
+
+               atomic_inc(&s->write_count);
+
+               BUG_ON(inode != w->io->inode);
+               BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
+                                    sectors << 9, offset << 9));
+
+               /* Check for writing past i_size: */
+               WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+                         round_up(i_size, block_bytes(c)) &&
+                         !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+                         "writing past i_size: %llu > %llu (unrounded %llu)\n",
+                         bio_end_sector(&w->io->op.wbio.bio) << 9,
+                         round_up(i_size, block_bytes(c)),
+                         i_size);
+
+               w->io->op.res.sectors += reserved_sectors;
+               w->io->op.i_sectors_delta -= dirty_sectors;
+               w->io->op.new_i_size = i_size;
+
+               offset += sectors;
+       }
+
+       if (atomic_dec_and_test(&s->write_count))
+               folio_end_writeback(folio);
+
+       return 0;
+}
+
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+       struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+       struct bch_writepage_state w =
+               bch_writepage_state_init(c, to_bch_ei(mapping->host));
+       struct blk_plug plug;
+       int ret;
+
+       blk_start_plug(&plug);
+       ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
+       if (w.io)
+               bch2_writepage_do_io(&w);
+       blk_finish_plug(&plug);
+       kfree(w.tmp);
+       return bch2_err_class(ret);
+}
+
+/* buffered writes: */
+
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+                    loff_t pos, unsigned len,
+                    struct page **pagep, void **fsdata)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation *res;
+       struct folio *folio;
+       unsigned offset;
+       int ret = -ENOMEM;
+
+       res = kmalloc(sizeof(*res), GFP_KERNEL);
+       if (!res)
+               return -ENOMEM;
+
+       bch2_folio_reservation_init(c, inode, res);
+       *fsdata = res;
+
+       bch2_pagecache_add_get(inode);
+
+       folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
+                               FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
+                               mapping_gfp_mask(mapping));
+       if (IS_ERR_OR_NULL(folio))
+               goto err_unlock;
+
+       offset = pos - folio_pos(folio);
+       len = min_t(size_t, len, folio_end_pos(folio) - pos);
+
+       if (folio_test_uptodate(folio))
+               goto out;
+
+       /* If we're writing the entire folio, we don't need to read it in first: */
+       if (!offset && len == folio_size(folio))
+               goto out;
+
+       if (!offset && pos + len >= inode->v.i_size) {
+               folio_zero_segment(folio, len, folio_size(folio));
+               flush_dcache_folio(folio);
+               goto out;
+       }
+
+       if (folio_pos(folio) >= inode->v.i_size) {
+               folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
+               flush_dcache_folio(folio);
+               goto out;
+       }
+readpage:
+       ret = bch2_read_single_folio(folio, mapping);
+       if (ret)
+               goto err;
+out:
+       ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
+       if (ret)
+               goto err;
+
+       ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
+       if (ret) {
+               if (!folio_test_uptodate(folio)) {
+                       /*
+                        * If the folio hasn't been read in, we won't know if we
+                        * actually need a reservation; we don't strictly need
+                        * the read itself, just to check whether the folio is
+                        * fully backed by uncompressed data:
+                        */
+                       goto readpage;
+               }
+
+               goto err;
+       }
+
+       *pagep = &folio->page;
+       return 0;
+err:
+       folio_unlock(folio);
+       folio_put(folio);
+       *pagep = NULL;
+err_unlock:
+       bch2_pagecache_add_put(inode);
+       kfree(res);
+       *fsdata = NULL;
+       return bch2_err_class(ret);
+}
+
+int bch2_write_end(struct file *file, struct address_space *mapping,
+                  loff_t pos, unsigned len, unsigned copied,
+                  struct page *page, void *fsdata)
+{
+       struct bch_inode_info *inode = to_bch_ei(mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation *res = fsdata;
+       struct folio *folio = page_folio(page);
+       unsigned offset = pos - folio_pos(folio);
+
+       lockdep_assert_held(&inode->v.i_rwsem);
+       BUG_ON(offset + copied > folio_size(folio));
+
+       if (unlikely(copied < len && !folio_test_uptodate(folio))) {
+               /*
+                * The folio needs to be read in, but that would destroy
+                * our partial write - simplest thing is to just force
+                * userspace to redo the write:
+                */
+               folio_zero_range(folio, 0, folio_size(folio));
+               flush_dcache_folio(folio);
+               copied = 0;
+       }
+
+       spin_lock(&inode->v.i_lock);
+       if (pos + copied > inode->v.i_size)
+               i_size_write(&inode->v, pos + copied);
+       spin_unlock(&inode->v.i_lock);
+
+       if (copied) {
+               if (!folio_test_uptodate(folio))
+                       folio_mark_uptodate(folio);
+
+               bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
+
+               inode->ei_last_dirtied = (unsigned long) current;
+       }
+
+       folio_unlock(folio);
+       folio_put(folio);
+       bch2_pagecache_add_put(inode);
+
+       bch2_folio_reservation_put(c, inode, res);
+       kfree(res);
+
+       return copied;
+}
+
+static noinline void folios_trunc(folios *fs, struct folio **fi)
+{
+       while (fs->data + fs->nr > fi) {
+               struct folio *f = darray_pop(fs);
+
+               folio_unlock(f);
+               folio_put(f);
+       }
+}
+
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+                                struct address_space *mapping,
+                                struct iov_iter *iter,
+                                loff_t pos, unsigned len)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation res;
+       folios fs;
+       struct folio **fi, *f;
+       unsigned copied = 0, f_offset, f_copied;
+       u64 end = pos + len, f_pos, f_len;
+       loff_t last_folio_pos = inode->v.i_size;
+       int ret = 0;
+
+       BUG_ON(!len);
+
+       bch2_folio_reservation_init(c, inode, &res);
+       darray_init(&fs);
+
+       ret = bch2_filemap_get_contig_folios_d(mapping, pos, end,
+                                  FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
+                                  mapping_gfp_mask(mapping),
+                                  &fs);
+       if (ret)
+               goto out;
+
+       BUG_ON(!fs.nr);
+
+       f = darray_first(fs);
+       if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
+               ret = bch2_read_single_folio(f, mapping);
+               if (ret)
+                       goto out;
+       }
+
+       f = darray_last(fs);
+       end = min(end, folio_end_pos(f));
+       last_folio_pos = folio_pos(f);
+       if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
+               if (end >= inode->v.i_size) {
+                       folio_zero_range(f, 0, folio_size(f));
+               } else {
+                       ret = bch2_read_single_folio(f, mapping);
+                       if (ret)
+                               goto out;
+               }
+       }
+
+       ret = bch2_folio_set(c, inode_inum(inode), fs.data, fs.nr);
+       if (ret)
+               goto out;
+
+       f_pos = pos;
+       f_offset = pos - folio_pos(darray_first(fs));
+       darray_for_each(fs, fi) {
+               f = *fi;
+               f_len = min(end, folio_end_pos(f)) - f_pos;
+
+               /*
+                * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
+                * supposed to write as much as we have disk space for.
+                *
+                * On failure here we should still write out a partial page if
+                * we aren't completely out of disk space - we don't do that
+                * yet:
+                */
+               ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
+               if (unlikely(ret)) {
+                       folios_trunc(&fs, fi);
+                       if (!fs.nr)
+                               goto out;
+
+                       end = min(end, folio_end_pos(darray_last(fs)));
+                       break;
+               }
+
+               f_pos = folio_end_pos(f);
+               f_offset = 0;
+       }
+
+       if (mapping_writably_mapped(mapping))
+               darray_for_each(fs, fi)
+                       flush_dcache_folio(*fi);
+
+       f_pos = pos;
+       f_offset = pos - folio_pos(darray_first(fs));
+       darray_for_each(fs, fi) {
+               f = *fi;
+               f_len = min(end, folio_end_pos(f)) - f_pos;
+               f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
+               if (!f_copied) {
+                       folios_trunc(&fs, fi);
+                       break;
+               }
+
+               if (!folio_test_uptodate(f) &&
+                   f_copied != folio_size(f) &&
+                   pos + copied + f_copied < inode->v.i_size) {
+                       iov_iter_revert(iter, f_copied);
+                       folio_zero_range(f, 0, folio_size(f));
+                       folios_trunc(&fs, fi);
+                       break;
+               }
+
+               flush_dcache_folio(f);
+               copied += f_copied;
+
+               if (f_copied != f_len) {
+                       folios_trunc(&fs, fi + 1);
+                       break;
+               }
+
+               f_pos = folio_end_pos(f);
+               f_offset = 0;
+       }
+
+       if (!copied)
+               goto out;
+
+       end = pos + copied;
+
+       spin_lock(&inode->v.i_lock);
+       if (end > inode->v.i_size)
+               i_size_write(&inode->v, end);
+       spin_unlock(&inode->v.i_lock);
+
+       f_pos = pos;
+       f_offset = pos - folio_pos(darray_first(fs));
+       darray_for_each(fs, fi) {
+               f = *fi;
+               f_len = min(end, folio_end_pos(f)) - f_pos;
+
+               if (!folio_test_uptodate(f))
+                       folio_mark_uptodate(f);
+
+               bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
+
+               f_pos = folio_end_pos(f);
+               f_offset = 0;
+       }
+
+       inode->ei_last_dirtied = (unsigned long) current;
+out:
+       darray_for_each(fs, fi) {
+               folio_unlock(*fi);
+               folio_put(*fi);
+       }
+
+       /*
+        * If the last folio added to the mapping starts beyond current EOF, we
+        * performed a short write but left around at least one post-EOF folio.
+        * Clean up the mapping before we return.
+        */
+       if (last_folio_pos >= inode->v.i_size)
+               truncate_pagecache(&inode->v, inode->v.i_size);
+
+       darray_exit(&fs);
+       bch2_folio_reservation_put(c, inode, &res);
+
+       return copied ?: ret;
+}
+
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+       struct address_space *mapping = file->f_mapping;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       loff_t pos = iocb->ki_pos;
+       ssize_t written = 0;
+       int ret = 0;
+
+       bch2_pagecache_add_get(inode);
+
+       do {
+               unsigned offset = pos & (PAGE_SIZE - 1);
+               unsigned bytes = iov_iter_count(iter);
+again:
+               /*
+                * Bring in the user page that we will copy from _first_.
+                * Otherwise there's a nasty deadlock on copying from the
+                * same page as we're writing to, without it being marked
+                * up-to-date.
+                *
+                * Not only is this an optimisation, but it is also required
+                * to check that the address is actually valid, when atomic
+                * usercopies are used, below.
+                */
+               if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+                       bytes = min_t(unsigned long, iov_iter_count(iter),
+                                     PAGE_SIZE - offset);
+
+                       if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
+                               ret = -EFAULT;
+                               break;
+                       }
+               }
+
+               if (unlikely(fatal_signal_pending(current))) {
+                       ret = -EINTR;
+                       break;
+               }
+
+               ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+               if (unlikely(ret < 0))
+                       break;
+
+               cond_resched();
+
+               if (unlikely(ret == 0)) {
+                       /*
+                        * If we were unable to copy any data at all, we must
+                        * fall back to a single segment length write.
+                        *
+                        * If we didn't fallback here, we could livelock
+                        * because not all segments in the iov can be copied at
+                        * once without a pagefault.
+                        */
+                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
+                                     iov_iter_single_seg_count(iter));
+                       goto again;
+               }
+               pos += ret;
+               written += ret;
+               ret = 0;
+
+               balance_dirty_pages_ratelimited(mapping);
+       } while (iov_iter_count(iter));
+
+       bch2_pagecache_add_put(inode);
+
+       return written ? written : ret;
+}
+
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct file *file = iocb->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       ssize_t ret;
+
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               ret = bch2_direct_write(iocb, from);
+               goto out;
+       }
+
+       inode_lock(&inode->v);
+
+       ret = generic_write_checks(iocb, from);
+       if (ret <= 0)
+               goto unlock;
+
+       ret = file_remove_privs(file);
+       if (ret)
+               goto unlock;
+
+       ret = file_update_time(file);
+       if (ret)
+               goto unlock;
+
+       ret = bch2_buffered_write(iocb, from);
+       if (likely(ret > 0))
+               iocb->ki_pos += ret;
+unlock:
+       inode_unlock(&inode->v);
+
+       if (ret > 0)
+               ret = generic_write_sync(iocb, ret);
+out:
+       return bch2_err_class(ret);
+}
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *c)
+{
+       bioset_exit(&c->writepage_bioset);
+}
+
+int bch2_fs_fs_io_buffered_init(struct bch_fs *c)
+{
+       if (bioset_init(&c->writepage_bioset,
+                       4, offsetof(struct bch_writepage_io, op.wbio.bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_writepage_bioset_init;
+
+       return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
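A note on the allocation pattern above (it recurs in the O_DIRECT code further
down): the biosets are initialized with offsetof(struct bch_writepage_io,
op.wbio.bio), i.e. the bio is embedded inside a larger per-IO structure, and
the completion paths recover that structure with container_of(). The following
stand-alone sketch illustrates only that pattern; the demo_* names are invented
for illustration and are not part of this patch:

#include <stddef.h>
#include <stdio.h>

/* Same idea as container_of() in the kernel: step back from a member
 * pointer to the enclosing structure. */
#define demo_container_of(ptr, type, member) \
        ((type *) ((char *) (ptr) - offsetof(type, member)))

struct demo_bio { int sector; };

struct demo_io {
        int error;
        struct demo_bio bio;    /* embedded, like op.wbio.bio above */
};

/* Completion handler: only the embedded bio is passed in; the per-IO
 * state is recovered from it. */
static void demo_endio(struct demo_bio *bio)
{
        struct demo_io *io = demo_container_of(bio, struct demo_io, bio);

        printf("io done: error=%d sector=%d\n", io->error, bio->sector);
}

int main(void)
{
        struct demo_io io = { .error = 0, .bio = { .sector = 42 } };

        demo_endio(&io.bio);
        return 0;
}

This is how bch2_writepage_io_done() and the dio completion paths below get
from a bare bio or write op back to their per-request state without any extra
allocation or lookup.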
diff --git a/libbcachefs/fs-io-buffered.h b/libbcachefs/fs-io-buffered.h
new file mode 100644 (file)
index 0000000..a6126ff
--- /dev/null
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_BUFFERED_H
+#define _BCACHEFS_FS_IO_BUFFERED_H
+
+#ifndef NO_BCACHEFS_FS
+
+int bch2_read_single_folio(struct folio *, struct address_space *);
+int bch2_read_folio(struct file *, struct folio *);
+
+int bch2_writepages(struct address_space *, struct writeback_control *);
+void bch2_readahead(struct readahead_control *);
+
+int bch2_write_begin(struct file *, struct address_space *, loff_t,
+                    unsigned, struct page **, void **);
+int bch2_write_end(struct file *, struct address_space *, loff_t,
+                  unsigned, unsigned, struct page *, void *);
+
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_buffered_exit(struct bch_fs *);
+int bch2_fs_fs_io_buffered_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_BUFFERED_H */
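The declarations above are the buffered-I/O entry points the filesystem front
end exposes to the VFS; the actual registration happens elsewhere in the fs
code, not in this hunk. Purely for orientation, and assuming a recent
folio-based kernel with these address_space_operations member names, callbacks
with the signatures above would be wired up roughly like this (example_aops is
an invented name):

#include <linux/fs.h>
#include "fs-io-buffered.h"

/* Sketch only - not part of this patch; shows where each entry point
 * declared above would typically sit in the VFS callback table. */
static const struct address_space_operations example_aops = {
        .read_folio     = bch2_read_folio,      /* single-folio reads */
        .readahead      = bch2_readahead,       /* async readahead */
        .writepages     = bch2_writepages,      /* writeback */
        .write_begin    = bch2_write_begin,     /* buffered write setup */
        .write_end      = bch2_write_end,       /* buffered write completion */
};

bch2_write_iter() is a file_operations-level hook rather than an aops entry,
and the init/exit functions are called from filesystem setup and teardown.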
diff --git a/libbcachefs/fs-io-direct.c b/libbcachefs/fs-io-direct.c
new file mode 100644 (file)
index 0000000..5b42a76
--- /dev/null
@@ -0,0 +1,680 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "fs.h"
+#include "fs-io.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
+#include "io_read.h"
+#include "io_write.h"
+
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/prefetch.h>
+#include <linux/task_io_accounting_ops.h>
+
+/* O_DIRECT reads */
+
+struct dio_read {
+       struct closure                  cl;
+       struct kiocb                    *req;
+       long                            ret;
+       bool                            should_dirty;
+       struct bch_read_bio             rbio;
+};
+
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+       if (check_dirty) {
+               bio_check_pages_dirty(bio);
+       } else {
+               bio_release_pages(bio, false);
+               bio_put(bio);
+       }
+}
+
+static void bch2_dio_read_complete(struct closure *cl)
+{
+       struct dio_read *dio = container_of(cl, struct dio_read, cl);
+
+       dio->req->ki_complete(dio->req, dio->ret);
+       bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+}
+
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+       struct dio_read *dio = bio->bi_private;
+
+       if (bio->bi_status)
+               dio->ret = blk_status_to_errno(bio->bi_status);
+
+       closure_put(&dio->cl);
+}
+
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+       struct dio_read *dio = bio->bi_private;
+       bool should_dirty = dio->should_dirty;
+
+       bch2_direct_IO_read_endio(bio);
+       bio_check_or_release(bio, should_dirty);
+}
+
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+       struct file *file = req->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_io_opts opts;
+       struct dio_read *dio;
+       struct bio *bio;
+       loff_t offset = req->ki_pos;
+       bool sync = is_sync_kiocb(req);
+       size_t shorten;
+       ssize_t ret;
+
+       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+       if ((offset|iter->count) & (block_bytes(c) - 1))
+               return -EINVAL;
+
+       ret = min_t(loff_t, iter->count,
+                   max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+       if (!ret)
+               return ret;
+
+       shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+       iter->count -= shorten;
+
+       bio = bio_alloc_bioset(NULL,
+                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                              REQ_OP_READ,
+                              GFP_KERNEL,
+                              &c->dio_read_bioset);
+
+       bio->bi_end_io = bch2_direct_IO_read_endio;
+
+       dio = container_of(bio, struct dio_read, rbio.bio);
+       closure_init(&dio->cl, NULL);
+
+       /*
+        * this is a _really_ horrible hack just to avoid an atomic sub at the
+        * end:
+        */
+       if (!sync) {
+               set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+               atomic_set(&dio->cl.remaining,
+                          CLOSURE_REMAINING_INITIALIZER -
+                          CLOSURE_RUNNING +
+                          CLOSURE_DESTRUCTOR);
+       } else {
+               atomic_set(&dio->cl.remaining,
+                          CLOSURE_REMAINING_INITIALIZER + 1);
+               dio->cl.closure_get_happened = true;
+       }
+
+       dio->req        = req;
+       dio->ret        = ret;
+       /*
+        * This is one of the sketchier things I've encountered: we have to skip
+        * dirtying the pages of requests that originate inside the kernel (i.e.
+        * from the loopback driver), because we'd deadlock on the page lock.
+        */
+       dio->should_dirty = iter_is_iovec(iter);
+
+       goto start;
+       while (iter->count) {
+               bio = bio_alloc_bioset(NULL,
+                                      bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                                      REQ_OP_READ,
+                                      GFP_KERNEL,
+                                      &c->bio_read);
+               bio->bi_end_io          = bch2_direct_IO_read_split_endio;
+start:
+               bio->bi_opf             = REQ_OP_READ|REQ_SYNC;
+               bio->bi_iter.bi_sector  = offset >> 9;
+               bio->bi_private         = dio;
+
+               ret = bio_iov_iter_get_pages(bio, iter);
+               if (ret < 0) {
+                       /* XXX: fault inject this path */
+                       bio->bi_status = BLK_STS_RESOURCE;
+                       bio_endio(bio);
+                       break;
+               }
+
+               offset += bio->bi_iter.bi_size;
+
+               if (dio->should_dirty)
+                       bio_set_pages_dirty(bio);
+
+               if (iter->count)
+                       closure_get(&dio->cl);
+
+               bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
+       }
+
+       iter->count += shorten;
+
+       if (sync) {
+               closure_sync(&dio->cl);
+               closure_debug_destroy(&dio->cl);
+               ret = dio->ret;
+               bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+               return ret;
+       } else {
+               return -EIOCBQUEUED;
+       }
+}
+
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       struct file *file = iocb->ki_filp;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct address_space *mapping = file->f_mapping;
+       size_t count = iov_iter_count(iter);
+       ssize_t ret;
+
+       if (!count)
+               return 0; /* skip atime */
+
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               struct blk_plug plug;
+
+               if (unlikely(mapping->nrpages)) {
+                       ret = filemap_write_and_wait_range(mapping,
+                                               iocb->ki_pos,
+                                               iocb->ki_pos + count - 1);
+                       if (ret < 0)
+                               goto out;
+               }
+
+               file_accessed(file);
+
+               blk_start_plug(&plug);
+               ret = bch2_direct_IO_read(iocb, iter);
+               blk_finish_plug(&plug);
+
+               if (ret >= 0)
+                       iocb->ki_pos += ret;
+       } else {
+               bch2_pagecache_add_get(inode);
+               ret = generic_file_read_iter(iocb, iter);
+               bch2_pagecache_add_put(inode);
+       }
+out:
+       return bch2_err_class(ret);
+}
+
+/* O_DIRECT writes */
+
+struct dio_write {
+       struct kiocb                    *req;
+       struct address_space            *mapping;
+       struct bch_inode_info           *inode;
+       struct mm_struct                *mm;
+       unsigned                        loop:1,
+                                       extending:1,
+                                       sync:1,
+                                       flush:1,
+                                       free_iov:1;
+       struct quota_res                quota_res;
+       u64                             written;
+
+       struct iov_iter                 iter;
+       struct iovec                    inline_vecs[2];
+
+       /* must be last: */
+       struct bch_write_op             op;
+};
+
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+                                      u64 offset, u64 size,
+                                      unsigned nr_replicas, bool compressed)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 end = offset + size;
+       u32 snapshot;
+       bool ret = true;
+       int err;
+retry:
+       bch2_trans_begin(trans);
+
+       err = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (err)
+               goto err;
+
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+                          SPOS(inum.inum, offset, snapshot),
+                          BTREE_ITER_SLOTS, k, err) {
+               if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
+                       break;
+
+               if (k.k->p.snapshot != snapshot ||
+                   nr_replicas > bch2_bkey_replicas(c, k) ||
+                   (!compressed && bch2_bkey_sectors_compressed(k))) {
+                       ret = false;
+                       break;
+               }
+       }
+
+       offset = iter.pos.offset;
+       bch2_trans_iter_exit(trans, &iter);
+err:
+       if (bch2_err_matches(err, BCH_ERR_transaction_restart))
+               goto retry;
+       bch2_trans_put(trans);
+
+       return err ? false : ret;
+}
+
+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
+{
+       struct bch_fs *c = dio->op.c;
+       struct bch_inode_info *inode = dio->inode;
+       struct bio *bio = &dio->op.wbio.bio;
+
+       return bch2_check_range_allocated(c, inode_inum(inode),
+                               dio->op.pos.offset, bio_sectors(bio),
+                               dio->op.opts.data_replicas,
+                               dio->op.opts.compression != 0);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *);
+static __always_inline long bch2_dio_write_done(struct dio_write *dio);
+
+/*
+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
+ * caller's stack, and we're not guaranteed that it will live for the duration
+ * of the IO:
+ */
+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
+{
+       struct iovec *iov = dio->inline_vecs;
+
+       /*
+        * A ubuf iov_iter embeds its user buffer directly, with no separate
+        * iovec array - nothing to copy:
+        */
+       if (iter_is_ubuf(&dio->iter))
+               return 0;
+
+       /*
+        * We don't currently handle non-iovec iov_iters here - return an error,
+        * and we'll fall back to doing the IO synchronously:
+        */
+       if (!iter_is_iovec(&dio->iter))
+               return -1;
+
+       if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
+               iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
+                                   GFP_KERNEL);
+               if (unlikely(!iov))
+                       return -ENOMEM;
+
+               dio->free_iov = true;
+       }
+
+       memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
+       dio->iter.__iov = iov;
+       return 0;
+}
+
+static void bch2_dio_write_flush_done(struct closure *cl)
+{
+       struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+       struct bch_fs *c = dio->op.c;
+
+       closure_debug_destroy(cl);
+
+       dio->op.error = bch2_journal_error(&c->journal);
+
+       bch2_dio_write_done(dio);
+}
+
+static noinline void bch2_dio_write_flush(struct dio_write *dio)
+{
+       struct bch_fs *c = dio->op.c;
+       struct bch_inode_unpacked inode;
+       int ret;
+
+       dio->flush = 0;
+
+       closure_init(&dio->op.cl, NULL);
+
+       if (!dio->op.error) {
+               ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
+               if (ret) {
+                       dio->op.error = ret;
+               } else {
+                       bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq,
+                                                    &dio->op.cl);
+                       bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
+               }
+       }
+
+       if (dio->sync) {
+               closure_sync(&dio->op.cl);
+               closure_debug_destroy(&dio->op.cl);
+       } else {
+               continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
+       }
+}
+
+static __always_inline long bch2_dio_write_done(struct dio_write *dio)
+{
+       struct kiocb *req = dio->req;
+       struct bch_inode_info *inode = dio->inode;
+       bool sync = dio->sync;
+       long ret;
+
+       if (unlikely(dio->flush)) {
+               bch2_dio_write_flush(dio);
+               if (!sync)
+                       return -EIOCBQUEUED;
+       }
+
+       bch2_pagecache_block_put(inode);
+
+       if (dio->free_iov)
+               kfree(dio->iter.__iov);
+
+       ret = dio->op.error ?: ((long) dio->written << 9);
+       bio_put(&dio->op.wbio.bio);
+
+       /* inode->i_dio_count is our ref on inode and thus bch_fs */
+       inode_dio_end(&inode->v);
+
+       if (ret < 0)
+               ret = bch2_err_class(ret);
+
+       if (!sync) {
+               req->ki_complete(req, ret);
+               ret = -EIOCBQUEUED;
+       }
+       return ret;
+}
+
+static __always_inline void bch2_dio_write_end(struct dio_write *dio)
+{
+       struct bch_fs *c = dio->op.c;
+       struct kiocb *req = dio->req;
+       struct bch_inode_info *inode = dio->inode;
+       struct bio *bio = &dio->op.wbio.bio;
+
+       req->ki_pos     += (u64) dio->op.written << 9;
+       dio->written    += dio->op.written;
+
+       if (dio->extending) {
+               spin_lock(&inode->v.i_lock);
+               if (req->ki_pos > inode->v.i_size)
+                       i_size_write(&inode->v, req->ki_pos);
+               spin_unlock(&inode->v.i_lock);
+       }
+
+       if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
+               mutex_lock(&inode->ei_quota_lock);
+               __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
+               __bch2_quota_reservation_put(c, inode, &dio->quota_res);
+               mutex_unlock(&inode->ei_quota_lock);
+       }
+
+       bio_release_pages(bio, false);
+
+       if (unlikely(dio->op.error))
+               set_bit(EI_INODE_ERROR, &inode->ei_flags);
+}
+
+static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
+{
+       struct bch_fs *c = dio->op.c;
+       struct kiocb *req = dio->req;
+       struct address_space *mapping = dio->mapping;
+       struct bch_inode_info *inode = dio->inode;
+       struct bch_io_opts opts;
+       struct bio *bio = &dio->op.wbio.bio;
+       unsigned unaligned, iter_count;
+       bool sync = dio->sync, dropped_locks;
+       long ret;
+
+       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
+
+       while (1) {
+               iter_count = dio->iter.count;
+
+               EBUG_ON(current->faults_disabled_mapping);
+               current->faults_disabled_mapping = mapping;
+
+               ret = bio_iov_iter_get_pages(bio, &dio->iter);
+
+               dropped_locks = fdm_dropped_locks();
+
+               current->faults_disabled_mapping = NULL;
+
+               /*
+                * If the fault handler returned an error but also signalled
+                * that it dropped & retook ei_pagecache_lock, we just need to
+                * re-shoot down the page cache and retry:
+                */
+               if (dropped_locks && ret)
+                       ret = 0;
+
+               if (unlikely(ret < 0))
+                       goto err;
+
+               if (unlikely(dropped_locks)) {
+                       ret = bch2_write_invalidate_inode_pages_range(mapping,
+                                       req->ki_pos,
+                                       req->ki_pos + iter_count - 1);
+                       if (unlikely(ret))
+                               goto err;
+
+                       if (!bio->bi_iter.bi_size)
+                               continue;
+               }
+
+               unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
+               bio->bi_iter.bi_size -= unaligned;
+               iov_iter_revert(&dio->iter, unaligned);
+
+               if (!bio->bi_iter.bi_size) {
+                       /*
+                        * bio_iov_iter_get_pages was only able to get <
+                        * blocksize worth of pages:
+                        */
+                       ret = -EFAULT;
+                       goto err;
+               }
+
+               bch2_write_op_init(&dio->op, c, opts);
+               dio->op.end_io          = sync
+                       ? NULL
+                       : bch2_dio_write_loop_async;
+               dio->op.target          = dio->op.opts.foreground_target;
+               dio->op.write_point     = writepoint_hashed((unsigned long) current);
+               dio->op.nr_replicas     = dio->op.opts.data_replicas;
+               dio->op.subvol          = inode->ei_subvol;
+               dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
+               dio->op.devs_need_flush = &inode->ei_devs_need_flush;
+
+               if (sync)
+                       dio->op.flags |= BCH_WRITE_SYNC;
+               dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
+
+               ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
+                                                bio_sectors(bio), true);
+               if (unlikely(ret))
+                       goto err;
+
+               ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
+                                               dio->op.opts.data_replicas, 0);
+               if (unlikely(ret) &&
+                   !bch2_dio_write_check_allocated(dio))
+                       goto err;
+
+               task_io_account_write(bio->bi_iter.bi_size);
+
+               if (unlikely(dio->iter.count) &&
+                   !dio->sync &&
+                   !dio->loop &&
+                   bch2_dio_write_copy_iov(dio))
+                       dio->sync = sync = true;
+
+               dio->loop = true;
+               closure_call(&dio->op.cl, bch2_write, NULL, NULL);
+
+               if (!sync)
+                       return -EIOCBQUEUED;
+
+               bch2_dio_write_end(dio);
+
+               if (likely(!dio->iter.count) || dio->op.error)
+                       break;
+
+               bio_reset(bio, NULL, REQ_OP_WRITE);
+       }
+out:
+       return bch2_dio_write_done(dio);
+err:
+       dio->op.error = ret;
+
+       bio_release_pages(bio, false);
+
+       bch2_quota_reservation_put(c, inode, &dio->quota_res);
+       goto out;
+}
+
+static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+{
+       struct mm_struct *mm = dio->mm;
+
+       bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+
+       if (mm)
+               kthread_use_mm(mm);
+       bch2_dio_write_loop(dio);
+       if (mm)
+               kthread_unuse_mm(mm);
+}
+
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
+{
+       struct dio_write *dio = container_of(op, struct dio_write, op);
+
+       bch2_dio_write_end(dio);
+
+       if (likely(!dio->iter.count) || dio->op.error)
+               bch2_dio_write_done(dio);
+       else
+               bch2_dio_write_continue(dio);
+}
+
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+{
+       struct file *file = req->ki_filp;
+       struct address_space *mapping = file->f_mapping;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct dio_write *dio;
+       struct bio *bio;
+       bool locked = true, extending;
+       ssize_t ret;
+
+       prefetch(&c->opts);
+       prefetch((void *) &c->opts + 64);
+       prefetch(&inode->ei_inode);
+       prefetch((void *) &inode->ei_inode + 64);
+
+       inode_lock(&inode->v);
+
+       ret = generic_write_checks(req, iter);
+       if (unlikely(ret <= 0))
+               goto err;
+
+       ret = file_remove_privs(file);
+       if (unlikely(ret))
+               goto err;
+
+       ret = file_update_time(file);
+       if (unlikely(ret))
+               goto err;
+
+       if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
+               goto err;
+
+       inode_dio_begin(&inode->v);
+       bch2_pagecache_block_get(inode);
+
+       extending = req->ki_pos + iter->count > inode->v.i_size;
+       if (!extending) {
+               inode_unlock(&inode->v);
+               locked = false;
+       }
+
+       bio = bio_alloc_bioset(NULL,
+                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
+                              REQ_OP_WRITE,
+                              GFP_KERNEL,
+                              &c->dio_write_bioset);
+       dio = container_of(bio, struct dio_write, op.wbio.bio);
+       dio->req                = req;
+       dio->mapping            = mapping;
+       dio->inode              = inode;
+       dio->mm                 = current->mm;
+       dio->loop               = false;
+       dio->extending          = extending;
+       dio->sync               = is_sync_kiocb(req) || extending;
+       dio->flush              = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
+       dio->free_iov           = false;
+       dio->quota_res.sectors  = 0;
+       dio->written            = 0;
+       dio->iter               = *iter;
+       dio->op.c               = c;
+
+       if (unlikely(mapping->nrpages)) {
+               ret = bch2_write_invalidate_inode_pages_range(mapping,
+                                               req->ki_pos,
+                                               req->ki_pos + iter->count - 1);
+               if (unlikely(ret))
+                       goto err_put_bio;
+       }
+
+       ret = bch2_dio_write_loop(dio);
+err:
+       if (locked)
+               inode_unlock(&inode->v);
+       return ret;
+err_put_bio:
+       bch2_pagecache_block_put(inode);
+       bio_put(bio);
+       inode_dio_end(&inode->v);
+       goto err;
+}
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
+{
+       bioset_exit(&c->dio_write_bioset);
+       bioset_exit(&c->dio_read_bioset);
+}
+
+int bch2_fs_fs_io_direct_init(struct bch_fs *c)
+{
+       if (bioset_init(&c->dio_read_bioset,
+                       4, offsetof(struct dio_read, rbio.bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_dio_read_bioset_init;
+
+       if (bioset_init(&c->dio_write_bioset,
+                       4, offsetof(struct dio_write, op.wbio.bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_dio_write_bioset_init;
+
+       return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/fs-io-direct.h b/libbcachefs/fs-io-direct.h
new file mode 100644 (file)
index 0000000..814621e
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_DIRECT_H
+#define _BCACHEFS_FS_IO_DIRECT_H
+
+#ifndef NO_BCACHEFS_FS
+ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *);
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
+
+void bch2_fs_fs_io_direct_exit(struct bch_fs *);
+int bch2_fs_fs_io_direct_init(struct bch_fs *);
+#else
+static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {}
+static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; }
+#endif
+
+#endif /* _BCACHEFS_FS_IO_DIRECT_H */
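As with the buffered header above, these are VFS-facing entry points. A hedged
sketch of how ->read_iter/->write_iter would typically be hooked up follows;
example_file_ops and the use of generic_file_llseek are assumptions for
illustration, not taken from this patch:

#include <linux/fs.h>
#include "fs-io-buffered.h"
#include "fs-io-direct.h"

/* Sketch only - not part of this patch. A single read_iter/write_iter
 * pair covers both buffered and O_DIRECT I/O: bch2_read_iter() checks
 * IOCB_DIRECT itself, and bch2_write_iter() calls bch2_direct_write()
 * for direct writes (see the .c files above). */
static const struct file_operations example_file_ops = {
        .llseek         = generic_file_llseek,
        .read_iter      = bch2_read_iter,
        .write_iter     = bch2_write_iter,
};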
diff --git a/libbcachefs/fs-io-pagecache.c b/libbcachefs/fs-io-pagecache.c
new file mode 100644 (file)
index 0000000..ff664fd
--- /dev/null
@@ -0,0 +1,791 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "btree_iter.h"
+#include "extents.h"
+#include "fs-io.h"
+#include "fs-io-pagecache.h"
+#include "subvolume.h"
+
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+int bch2_filemap_get_contig_folios_d(struct address_space *mapping,
+                                    loff_t start, u64 end,
+                                    fgf_t fgp_flags, gfp_t gfp,
+                                    folios *fs)
+{
+       struct folio *f;
+       u64 pos = start;
+       int ret = 0;
+
+       while (pos < end) {
+               if ((u64) pos >= (u64) start + (1ULL << 20))
+                       fgp_flags &= ~FGP_CREAT;
+
+               ret = darray_make_room_gfp(fs, 1, gfp & GFP_KERNEL);
+               if (ret)
+                       break;
+
+               f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
+               if (IS_ERR_OR_NULL(f))
+                       break;
+
+               BUG_ON(fs->nr && folio_pos(f) != pos);
+
+               pos = folio_end_pos(f);
+               darray_push(fs, f);
+       }
+
+       if (!fs->nr && !ret && (fgp_flags & FGP_CREAT))
+               ret = -ENOMEM;
+
+       return fs->nr ? 0 : ret;
+}
+
+/* pagecache_block must be held */
+int bch2_write_invalidate_inode_pages_range(struct address_space *mapping,
+                                           loff_t start, loff_t end)
+{
+       int ret;
+
+       /*
+        * XXX: the way this is currently implemented, we can spin if a process
+        * is continually redirtying a specific page
+        */
+       do {
+               if (!mapping->nrpages)
+                       return 0;
+
+               ret = filemap_write_and_wait_range(mapping, start, end);
+               if (ret)
+                       break;
+
+               if (!mapping->nrpages)
+                       return 0;
+
+               ret = invalidate_inode_pages2_range(mapping,
+                               start >> PAGE_SHIFT,
+                               end >> PAGE_SHIFT);
+       } while (ret == -EBUSY);
+
+       return ret;
+}
+
+#if 0
+/* Useful for debug tracing: */
+static const char * const bch2_folio_sector_states[] = {
+#define x(n)   #n,
+       BCH_FOLIO_SECTOR_STATE()
+#undef x
+       NULL
+};
+#endif
+
+static inline enum bch_folio_sector_state
+folio_sector_dirty(enum bch_folio_sector_state state)
+{
+       switch (state) {
+       case SECTOR_unallocated:
+               return SECTOR_dirty;
+       case SECTOR_reserved:
+               return SECTOR_dirty_reserved;
+       default:
+               return state;
+       }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_undirty(enum bch_folio_sector_state state)
+{
+       switch (state) {
+       case SECTOR_dirty:
+               return SECTOR_unallocated;
+       case SECTOR_dirty_reserved:
+               return SECTOR_reserved;
+       default:
+               return state;
+       }
+}
+
+static inline enum bch_folio_sector_state
+folio_sector_reserve(enum bch_folio_sector_state state)
+{
+       switch (state) {
+       case SECTOR_unallocated:
+               return SECTOR_reserved;
+       case SECTOR_dirty:
+               return SECTOR_dirty_reserved;
+       default:
+               return state;
+       }
+}
+
+/* for newly allocated folios: */
+struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+       struct bch_folio *s;
+
+       s = kzalloc(sizeof(*s) +
+                   sizeof(struct bch_folio_sector) *
+                   folio_sectors(folio), gfp);
+       if (!s)
+               return NULL;
+
+       spin_lock_init(&s->lock);
+       folio_attach_private(folio, s);
+       return s;
+}
+
+struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
+{
+       return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
+}
+
+static unsigned bkey_to_sector_state(struct bkey_s_c k)
+{
+       if (bkey_extent_is_reservation(k))
+               return SECTOR_reserved;
+       if (bkey_extent_is_allocation(k.k))
+               return SECTOR_allocated;
+       return SECTOR_unallocated;
+}
+
+static void __bch2_folio_set(struct folio *folio,
+                            unsigned pg_offset, unsigned pg_len,
+                            unsigned nr_ptrs, unsigned state)
+{
+       struct bch_folio *s = bch2_folio(folio);
+       unsigned i, sectors = folio_sectors(folio);
+
+       BUG_ON(pg_offset >= sectors);
+       BUG_ON(pg_offset + pg_len > sectors);
+
+       spin_lock(&s->lock);
+
+       for (i = pg_offset; i < pg_offset + pg_len; i++) {
+               s->s[i].nr_replicas     = nr_ptrs;
+               bch2_folio_sector_set(folio, s, i, state);
+       }
+
+       if (i == sectors)
+               s->uptodate = true;
+
+       spin_unlock(&s->lock);
+}
+
+/*
+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
+ * extents btree:
+ */
+int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
+                  struct folio **fs, unsigned nr_folios)
+{
+       struct btree_trans *trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_folio *s;
+       u64 offset = folio_sector(fs[0]);
+       unsigned folio_idx;
+       u32 snapshot;
+       bool need_set = false;
+       int ret;
+
+       for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
+               s = bch2_folio_create(fs[folio_idx], GFP_KERNEL);
+               if (!s)
+                       return -ENOMEM;
+
+               need_set |= !s->uptodate;
+       }
+
+       if (!need_set)
+               return 0;
+
+       folio_idx = 0;
+       trans = bch2_trans_get(c);
+retry:
+       bch2_trans_begin(trans);
+
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
+                          SPOS(inum.inum, offset, snapshot),
+                          BTREE_ITER_SLOTS, k, ret) {
+               unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+               unsigned state = bkey_to_sector_state(k);
+
+               while (folio_idx < nr_folios) {
+                       struct folio *folio = fs[folio_idx];
+                       u64 folio_start = folio_sector(folio);
+                       u64 folio_end   = folio_end_sector(folio);
+                       unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) -
+                               folio_start;
+                       unsigned folio_len = min(k.k->p.offset, folio_end) -
+                               folio_offset - folio_start;
+
+                       BUG_ON(k.k->p.offset < folio_start);
+                       BUG_ON(bkey_start_offset(k.k) > folio_end);
+
+                       if (!bch2_folio(folio)->uptodate)
+                               __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
+
+                       if (k.k->p.offset < folio_end)
+                               break;
+                       folio_idx++;
+               }
+
+               if (folio_idx == nr_folios)
+                       break;
+       }
+
+       offset = iter.pos.offset;
+       bch2_trans_iter_exit(trans, &iter);
+err:
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               goto retry;
+       bch2_trans_put(trans);
+
+       return ret;
+}
+
+void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+       struct bvec_iter iter;
+       struct folio_vec fv;
+       unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+               ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+       unsigned state = bkey_to_sector_state(k);
+
+       bio_for_each_folio(fv, bio, iter)
+               __bch2_folio_set(fv.fv_folio,
+                                fv.fv_offset >> 9,
+                                fv.fv_len >> 9,
+                                nr_ptrs, state);
+}
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
+                                    u64 start, u64 end)
+{
+       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+       struct folio_batch fbatch;
+       unsigned i, j;
+
+       if (end <= start)
+               return;
+
+       folio_batch_init(&fbatch);
+
+       while (filemap_get_folios(inode->v.i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+                       u64 folio_start = folio_sector(folio);
+                       u64 folio_end = folio_end_sector(folio);
+                       unsigned folio_offset = max(start, folio_start) - folio_start;
+                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+                       struct bch_folio *s;
+
+                       BUG_ON(end <= folio_start);
+
+                       folio_lock(folio);
+                       s = bch2_folio(folio);
+
+                       if (s) {
+                               spin_lock(&s->lock);
+                               for (j = folio_offset; j < folio_offset + folio_len; j++)
+                                       s->s[j].nr_replicas = 0;
+                               spin_unlock(&s->lock);
+                       }
+
+                       folio_unlock(folio);
+               }
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
+}
+
+void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+                                 u64 start, u64 end)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+       struct folio_batch fbatch;
+       s64 i_sectors_delta = 0;
+       unsigned i, j;
+
+       if (end <= start)
+               return;
+
+       folio_batch_init(&fbatch);
+
+       while (filemap_get_folios(inode->v.i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+                       u64 folio_start = folio_sector(folio);
+                       u64 folio_end = folio_end_sector(folio);
+                       unsigned folio_offset = max(start, folio_start) - folio_start;
+                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+                       struct bch_folio *s;
+
+                       BUG_ON(end <= folio_start);
+
+                       folio_lock(folio);
+                       s = bch2_folio(folio);
+
+                       if (s) {
+                               spin_lock(&s->lock);
+                               for (j = folio_offset; j < folio_offset + folio_len; j++) {
+                                       i_sectors_delta -= s->s[j].state == SECTOR_dirty;
+                                       bch2_folio_sector_set(folio, s, j,
+                                               folio_sector_reserve(s->s[j].state));
+                               }
+                               spin_unlock(&s->lock);
+                       }
+
+                       folio_unlock(folio);
+               }
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
+
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
+                                         unsigned nr_replicas)
+{
+       return max(0, (int) nr_replicas -
+                  s->nr_replicas -
+                  s->replicas_reserved);
+}
+
+int bch2_get_folio_disk_reservation(struct bch_fs *c,
+                               struct bch_inode_info *inode,
+                               struct folio *folio, bool check_enospc)
+{
+       struct bch_folio *s = bch2_folio_create(folio, 0);
+       unsigned nr_replicas = inode_nr_replicas(c, inode);
+       struct disk_reservation disk_res = { 0 };
+       unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
+       int ret;
+
+       if (!s)
+               return -ENOMEM;
+
+       for (i = 0; i < sectors; i++)
+               disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+       if (!disk_res_sectors)
+               return 0;
+
+       ret = bch2_disk_reservation_get(c, &disk_res,
+                                       disk_res_sectors, 1,
+                                       !check_enospc
+                                       ? BCH_DISK_RESERVATION_NOFAIL
+                                       : 0);
+       if (unlikely(ret))
+               return ret;
+
+       for (i = 0; i < sectors; i++)
+               s->s[i].replicas_reserved +=
+                       sectors_to_reserve(&s->s[i], nr_replicas);
+
+       return 0;
+}
+
+void bch2_folio_reservation_put(struct bch_fs *c,
+                       struct bch_inode_info *inode,
+                       struct bch2_folio_reservation *res)
+{
+       bch2_disk_reservation_put(c, &res->disk);
+       bch2_quota_reservation_put(c, inode, &res->quota);
+}
+
+int bch2_folio_reservation_get(struct bch_fs *c,
+                       struct bch_inode_info *inode,
+                       struct folio *folio,
+                       struct bch2_folio_reservation *res,
+                       unsigned offset, unsigned len)
+{
+       struct bch_folio *s = bch2_folio_create(folio, 0);
+       unsigned i, disk_sectors = 0, quota_sectors = 0;
+       int ret;
+
+       if (!s)
+               return -ENOMEM;
+
+       BUG_ON(!s->uptodate);
+
+       for (i = round_down(offset, block_bytes(c)) >> 9;
+            i < round_up(offset + len, block_bytes(c)) >> 9;
+            i++) {
+               disk_sectors += sectors_to_reserve(&s->s[i],
+                                               res->disk.nr_replicas);
+               quota_sectors += s->s[i].state == SECTOR_unallocated;
+       }
+
+       if (disk_sectors) {
+               ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+               if (unlikely(ret))
+                       return ret;
+       }
+
+       if (quota_sectors) {
+               ret = bch2_quota_reservation_add(c, inode, &res->quota,
+                                                quota_sectors, true);
+               if (unlikely(ret)) {
+                       struct disk_reservation tmp = {
+                               .sectors = disk_sectors
+                       };
+
+                       bch2_disk_reservation_put(c, &tmp);
+                       res->disk.sectors -= disk_sectors;
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
+static void bch2_clear_folio_bits(struct folio *folio)
+{
+       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch_folio *s = bch2_folio(folio);
+       struct disk_reservation disk_res = { 0 };
+       int i, sectors = folio_sectors(folio), dirty_sectors = 0;
+
+       if (!s)
+               return;
+
+       EBUG_ON(!folio_test_locked(folio));
+       EBUG_ON(folio_test_writeback(folio));
+
+       for (i = 0; i < sectors; i++) {
+               disk_res.sectors += s->s[i].replicas_reserved;
+               s->s[i].replicas_reserved = 0;
+
+               dirty_sectors -= s->s[i].state == SECTOR_dirty;
+               bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
+       }
+
+       bch2_disk_reservation_put(c, &disk_res);
+
+       bch2_i_sectors_acct(c, inode, NULL, dirty_sectors);
+
+       bch2_folio_release(folio);
+}
+
+void bch2_set_folio_dirty(struct bch_fs *c,
+                         struct bch_inode_info *inode,
+                         struct folio *folio,
+                         struct bch2_folio_reservation *res,
+                         unsigned offset, unsigned len)
+{
+       struct bch_folio *s = bch2_folio(folio);
+       unsigned i, dirty_sectors = 0;
+
+       WARN_ON((u64) folio_pos(folio) + offset + len >
+               round_up((u64) i_size_read(&inode->v), block_bytes(c)));
+
+       BUG_ON(!s->uptodate);
+
+       spin_lock(&s->lock);
+
+       for (i = round_down(offset, block_bytes(c)) >> 9;
+            i < round_up(offset + len, block_bytes(c)) >> 9;
+            i++) {
+               unsigned sectors = sectors_to_reserve(&s->s[i],
+                                               res->disk.nr_replicas);
+
+               /*
+                * This can happen if we race with the error path in
+                * bch2_writepage_io_done():
+                */
+               sectors = min_t(unsigned, sectors, res->disk.sectors);
+
+               s->s[i].replicas_reserved += sectors;
+               res->disk.sectors -= sectors;
+
+               dirty_sectors += s->s[i].state == SECTOR_unallocated;
+
+               bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
+       }
+
+       spin_unlock(&s->lock);
+
+       bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+
+       if (!folio_test_dirty(folio))
+               filemap_dirty_folio(inode->v.i_mapping, folio);
+}
+
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
+{
+       struct file *file = vmf->vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       struct address_space *fdm = faults_disabled_mapping();
+       struct bch_inode_info *inode = file_bch_inode(file);
+       vm_fault_t ret;
+
+       if (fdm == mapping)
+               return VM_FAULT_SIGBUS;
+
+       /* Lock ordering: */
+       if (fdm > mapping) {
+               struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
+
+               if (bch2_pagecache_add_tryget(inode))
+                       goto got_lock;
+
+               bch2_pagecache_block_put(fdm_host);
+
+               bch2_pagecache_add_get(inode);
+               bch2_pagecache_add_put(inode);
+
+               bch2_pagecache_block_get(fdm_host);
+
+               /* Signal that lock has been dropped: */
+               set_fdm_dropped_locks();
+               return VM_FAULT_SIGBUS;
+       }
+
+       bch2_pagecache_add_get(inode);
+got_lock:
+       ret = filemap_fault(vmf);
+       bch2_pagecache_add_put(inode);
+
+       return ret;
+}
+
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
+{
+       struct folio *folio = page_folio(vmf->page);
+       struct file *file = vmf->vma->vm_file;
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct address_space *mapping = file->f_mapping;
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct bch2_folio_reservation res;
+       unsigned len;
+       loff_t isize;
+       vm_fault_t ret;
+
+       bch2_folio_reservation_init(c, inode, &res);
+
+       sb_start_pagefault(inode->v.i_sb);
+       file_update_time(file);
+
+       /*
+        * Not strictly necessary, but helps avoid dio writes livelocking in
+        * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get
+        * a bch2_write_invalidate_inode_pages_range() that works without dropping
+        * page lock before invalidating page
+        */
+       bch2_pagecache_add_get(inode);
+
+       folio_lock(folio);
+       isize = i_size_read(&inode->v);
+
+       if (folio->mapping != mapping || folio_pos(folio) >= isize) {
+               folio_unlock(folio);
+               ret = VM_FAULT_NOPAGE;
+               goto out;
+       }
+
+       len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
+
+       if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
+           bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
+               folio_unlock(folio);
+               ret = VM_FAULT_SIGBUS;
+               goto out;
+       }
+
+       bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
+       bch2_folio_reservation_put(c, inode, &res);
+
+       folio_wait_stable(folio);
+       ret = VM_FAULT_LOCKED;
+out:
+       bch2_pagecache_add_put(inode);
+       sb_end_pagefault(inode->v.i_sb);
+
+       return ret;
+}
+
+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+       if (offset || length < folio_size(folio))
+               return;
+
+       bch2_clear_folio_bits(folio);
+}
+
+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
+{
+       if (folio_test_dirty(folio) || folio_test_writeback(folio))
+               return false;
+
+       bch2_clear_folio_bits(folio);
+       return true;
+}
+
+/* fseek: */
+
+static int folio_data_offset(struct folio *folio, loff_t pos,
+                            unsigned min_replicas)
+{
+       struct bch_folio *s = bch2_folio(folio);
+       unsigned i, sectors = folio_sectors(folio);
+
+       if (s)
+               for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
+                       if (s->s[i].state >= SECTOR_dirty &&
+                           s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
+                               return i << SECTOR_SHIFT;
+
+       return -1;
+}
+
+loff_t bch2_seek_pagecache_data(struct inode *vinode,
+                               loff_t start_offset,
+                               loff_t end_offset,
+                               unsigned min_replicas,
+                               bool nonblock)
+{
+       struct folio_batch fbatch;
+       pgoff_t start_index     = start_offset >> PAGE_SHIFT;
+       pgoff_t end_index       = end_offset >> PAGE_SHIFT;
+       pgoff_t index           = start_index;
+       unsigned i;
+       loff_t ret;
+       int offset;
+
+       folio_batch_init(&fbatch);
+
+       while (filemap_get_folios(vinode->i_mapping,
+                                 &index, end_index, &fbatch)) {
+               for (i = 0; i < folio_batch_count(&fbatch); i++) {
+                       struct folio *folio = fbatch.folios[i];
+
+                       if (!nonblock) {
+                               folio_lock(folio);
+                       } else if (!folio_trylock(folio)) {
+                               folio_batch_release(&fbatch);
+                               return -EAGAIN;
+                       }
+
+                       offset = folio_data_offset(folio,
+                                       max(folio_pos(folio), start_offset),
+                                       min_replicas);
+                       if (offset >= 0) {
+                               ret = clamp(folio_pos(folio) + offset,
+                                           start_offset, end_offset);
+                               folio_unlock(folio);
+                               folio_batch_release(&fbatch);
+                               return ret;
+                       }
+                       folio_unlock(folio);
+               }
+               folio_batch_release(&fbatch);
+               cond_resched();
+       }
+
+       return end_offset;
+}
+
+/*
+ * Search for a hole in a folio.
+ *
+ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error
+ * code to indicate a pagecache hole exists at the returned offset. Otherwise
+ * return 0 if the folio is filled with data, or an error code. This function
+ * can return -EAGAIN if nonblock is specified.
+ */
+static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
+                             unsigned min_replicas, bool nonblock)
+{
+       struct folio *folio;
+       struct bch_folio *s;
+       unsigned i, sectors;
+       int ret = -ENOENT;
+
+       folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
+                                   FGP_LOCK|(nonblock ? FGP_NOWAIT : 0), 0);
+       if (IS_ERR(folio))
+               return PTR_ERR(folio);
+
+       s = bch2_folio(folio);
+       if (!s)
+               goto unlock;
+
+       sectors = folio_sectors(folio);
+       for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
+               if (s->s[i].state < SECTOR_dirty ||
+                   s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
+                       *offset = max(*offset,
+                                     folio_pos(folio) + (i << SECTOR_SHIFT));
+                       goto unlock;
+               }
+
+       *offset = folio_end_pos(folio);
+       ret = 0;
+unlock:
+       folio_unlock(folio);
+       folio_put(folio);
+       return ret;
+}
+
+loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+                               loff_t start_offset,
+                               loff_t end_offset,
+                               unsigned min_replicas,
+                               bool nonblock)
+{
+       struct address_space *mapping = vinode->i_mapping;
+       loff_t offset = start_offset;
+       loff_t ret = 0;
+
+       while (!ret && offset < end_offset)
+               ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock);
+
+       if (ret && ret != -ENOENT)
+               return ret;
+       return min(offset, end_offset);
+}
+
+int bch2_clamp_data_hole(struct inode *inode,
+                        u64 *hole_start,
+                        u64 *hole_end,
+                        unsigned min_replicas,
+                        bool nonblock)
+{
+       loff_t ret;
+
+       ret = bch2_seek_pagecache_hole(inode,
+               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+       if (ret < 0)
+               return ret;
+
+       *hole_start = ret;
+
+       if (*hole_start == *hole_end)
+               return 0;
+
+       ret = bch2_seek_pagecache_data(inode,
+               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
+       if (ret < 0)
+               return ret;
+
+       *hole_end = ret;
+       return 0;
+}
+
+#endif /* NO_BCACHEFS_FS */
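
The disk-reservation sizing above (bch2_get_folio_disk_reservation(), bch2_folio_reservation_get()) comes down to sectors_to_reserve(): each sector is topped up to the requested replica count, skipping replicas that are already on disk or already covered by a reservation. A small self-contained worked example, with made-up per-sector values:

#include <stdio.h>

struct folio_sector {
	unsigned nr_replicas;		/* replicas already allocated on disk */
	unsigned replicas_reserved;	/* replicas covered by an existing reservation */
};

static unsigned sectors_to_reserve(const struct folio_sector *s, unsigned nr_replicas)
{
	int want = (int) nr_replicas - (int) s->nr_replicas - (int) s->replicas_reserved;

	return want > 0 ? (unsigned) want : 0;
}

int main(void)
{
	struct folio_sector s[] = {
		{ .nr_replicas = 0, .replicas_reserved = 0 },	/* unallocated: needs 2 */
		{ .nr_replicas = 1, .replicas_reserved = 0 },	/* one copy on disk: needs 1 */
		{ .nr_replicas = 2, .replicas_reserved = 0 },	/* fully allocated: needs 0 */
		{ .nr_replicas = 0, .replicas_reserved = 2 },	/* already reserved: needs 0 */
	};
	unsigned want_replicas = 2, disk_res_sectors = 0;

	for (unsigned i = 0; i < sizeof(s) / sizeof(s[0]); i++)
		disk_res_sectors += sectors_to_reserve(&s[i], want_replicas);

	printf("reserve %u sectors\n", disk_res_sectors);	/* prints: reserve 3 sectors */
	return 0;
}
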
diff --git a/libbcachefs/fs-io-pagecache.h b/libbcachefs/fs-io-pagecache.h
new file mode 100644 (file)
index 0000000..27f712a
--- /dev/null
@@ -0,0 +1,176 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IO_PAGECACHE_H
+#define _BCACHEFS_FS_IO_PAGECACHE_H
+
+#include <linux/pagemap.h>
+
+typedef DARRAY(struct folio *) folios;
+
+int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t,
+                                    u64, fgf_t, gfp_t, folios *);
+int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t);
+
+/*
+ * Use u64 for the end pos and sector helpers because if the folio covers the
+ * max supported range of the mapping, the start offset of the next folio
+ * overflows loff_t. This breaks much of the range based processing in the
+ * buffered write path.
+ */
+static inline u64 folio_end_pos(struct folio *folio)
+{
+       return folio_pos(folio) + folio_size(folio);
+}
+
+static inline size_t folio_sectors(struct folio *folio)
+{
+       return PAGE_SECTORS << folio_order(folio);
+}
+
+static inline loff_t folio_sector(struct folio *folio)
+{
+       return folio_pos(folio) >> 9;
+}
+
+static inline u64 folio_end_sector(struct folio *folio)
+{
+       return folio_end_pos(folio) >> 9;
+}
+
+#define BCH_FOLIO_SECTOR_STATE()       \
+       x(unallocated)                  \
+       x(reserved)                     \
+       x(dirty)                        \
+       x(dirty_reserved)               \
+       x(allocated)
+
+enum bch_folio_sector_state {
+#define x(n)   SECTOR_##n,
+       BCH_FOLIO_SECTOR_STATE()
+#undef x
+};
+
+struct bch_folio_sector {
+       /* Uncompressed, fully allocated replicas (or on disk reservation): */
+       unsigned                nr_replicas:4;
+
+       /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
+       unsigned                replicas_reserved:4;
+
+       /* i_sectors: */
+       enum bch_folio_sector_state state:8;
+};
+
+struct bch_folio {
+       spinlock_t              lock;
+       atomic_t                write_count;
+       /*
+        * Is the sector state up to date with the btree?
+        * (Not the data itself)
+        */
+       bool                    uptodate;
+       struct bch_folio_sector s[];
+};
+
+/* Helper for when we need to add debug instrumentation: */
+static inline void bch2_folio_sector_set(struct folio *folio,
+                            struct bch_folio *s,
+                            unsigned i, unsigned n)
+{
+       s->s[i].state = n;
+}
+
+/* file offset (to folio offset) to bch_folio_sector index */
+static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
+{
+       u64 f_offset = pos - folio_pos(folio);
+
+       BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
+       return f_offset >> SECTOR_SHIFT;
+}
+
+/* for newly allocated folios: */
+static inline void __bch2_folio_release(struct folio *folio)
+{
+       kfree(folio_detach_private(folio));
+}
+
+static inline void bch2_folio_release(struct folio *folio)
+{
+       EBUG_ON(!folio_test_locked(folio));
+       __bch2_folio_release(folio);
+}
+
+static inline struct bch_folio *__bch2_folio(struct folio *folio)
+{
+       return folio_has_private(folio)
+               ? (struct bch_folio *) folio_get_private(folio)
+               : NULL;
+}
+
+static inline struct bch_folio *bch2_folio(struct folio *folio)
+{
+       EBUG_ON(!folio_test_locked(folio));
+
+       return __bch2_folio(folio);
+}
+
+struct bch_folio *__bch2_folio_create(struct folio *, gfp_t);
+struct bch_folio *bch2_folio_create(struct folio *, gfp_t);
+
+struct bch2_folio_reservation {
+       struct disk_reservation disk;
+       struct quota_res        quota;
+};
+
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
+{
+       /* XXX: this should not be open coded */
+       return inode->ei_inode.bi_data_replicas
+               ? inode->ei_inode.bi_data_replicas - 1
+               : c->opts.data_replicas;
+}
+
+static inline void bch2_folio_reservation_init(struct bch_fs *c,
+                       struct bch_inode_info *inode,
+                       struct bch2_folio_reservation *res)
+{
+       memset(res, 0, sizeof(*res));
+
+       res->disk.nr_replicas = inode_nr_replicas(c, inode);
+}
+
+int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
+void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
+
+void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
+void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+
+int bch2_get_folio_disk_reservation(struct bch_fs *,
+                               struct bch_inode_info *,
+                               struct folio *, bool);
+
+void bch2_folio_reservation_put(struct bch_fs *,
+                       struct bch_inode_info *,
+                       struct bch2_folio_reservation *);
+int bch2_folio_reservation_get(struct bch_fs *,
+                       struct bch_inode_info *,
+                       struct folio *,
+                       struct bch2_folio_reservation *,
+                       unsigned, unsigned);
+
+void bch2_set_folio_dirty(struct bch_fs *,
+                         struct bch_inode_info *,
+                         struct folio *,
+                         struct bch2_folio_reservation *,
+                         unsigned, unsigned);
+
+vm_fault_t bch2_page_fault(struct vm_fault *);
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
+void bch2_invalidate_folio(struct folio *, size_t, size_t);
+bool bch2_release_folio(struct folio *, gfp_t);
+
+loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool);
+loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool);
+int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
+
+#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */
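
The comment above folio_end_pos() explains why the end-position helpers return u64, and folio_pos_to_s() maps a file offset to an index into the folio's per-sector state array. A standalone sketch of the same arithmetic, with plain integers standing in for struct folio; the 16K folio at 1MiB is an arbitrary example, not taken from the code:

#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

struct demo_folio {
	uint64_t pos;	/* byte offset of the folio within the file */
	uint64_t size;	/* folio size in bytes */
};

static uint64_t demo_folio_end_pos(const struct demo_folio *f)    { return f->pos + f->size; }
static uint64_t demo_folio_sector(const struct demo_folio *f)     { return f->pos >> SECTOR_SHIFT; }
static uint64_t demo_folio_end_sector(const struct demo_folio *f) { return demo_folio_end_pos(f) >> SECTOR_SHIFT; }

/* file offset -> index into the folio's per-sector state array */
static unsigned demo_pos_to_s(const struct demo_folio *f, uint64_t pos)
{
	return (unsigned) ((pos - f->pos) >> SECTOR_SHIFT);
}

int main(void)
{
	struct demo_folio f = { .pos = 1 << 20, .size = 16 << 10 };

	printf("sectors %llu..%llu, byte 0x101200 -> s[%u]\n",
	       (unsigned long long) demo_folio_sector(&f),
	       (unsigned long long) demo_folio_end_sector(&f),
	       demo_pos_to_s(&f, 0x101200));
	return 0;
}
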
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
new file mode 100644 (file)
index 0000000..eab0c8c
--- /dev/null
@@ -0,0 +1,524 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * io_misc.c - fallocate, fpunch, truncate:
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "clock.h"
+#include "error.h"
+#include "extents.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "subvolume.h"
+
+/* Overwrites whatever was present with zeroes: */
+int bch2_extent_fallocate(struct btree_trans *trans,
+                         subvol_inum inum,
+                         struct btree_iter *iter,
+                         u64 sectors,
+                         struct bch_io_opts opts,
+                         s64 *i_sectors_delta,
+                         struct write_point_specifier write_point)
+{
+       struct bch_fs *c = trans->c;
+       struct disk_reservation disk_res = { 0 };
+       struct closure cl;
+       struct open_buckets open_buckets = { 0 };
+       struct bkey_s_c k;
+       struct bkey_buf old, new;
+       unsigned sectors_allocated = 0;
+       bool have_reservation = false;
+       bool unwritten = opts.nocow &&
+           c->sb.version >= bcachefs_metadata_version_unwritten_extents;
+       int ret;
+
+       bch2_bkey_buf_init(&old);
+       bch2_bkey_buf_init(&new);
+       closure_init_stack(&cl);
+
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
+
+       if (!have_reservation) {
+               unsigned new_replicas =
+                       max(0, (int) opts.data_replicas -
+                           (int) bch2_bkey_nr_ptrs_fully_allocated(k));
+               /*
+                * Get a disk reservation before (in the nocow case) calling
+                * into the allocator:
+                */
+               ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+               if (unlikely(ret))
+                       goto err;
+
+               bch2_bkey_buf_reassemble(&old, c, k);
+       }
+
+       if (have_reservation) {
+               if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
+                       goto err;
+
+               bch2_key_resize(&new.k->k, sectors);
+       } else if (!unwritten) {
+               struct bkey_i_reservation *reservation;
+
+               bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
+               reservation = bkey_reservation_init(new.k);
+               reservation->k.p = iter->pos;
+               bch2_key_resize(&reservation->k, sectors);
+               reservation->v.nr_replicas = opts.data_replicas;
+       } else {
+               struct bkey_i_extent *e;
+               struct bch_devs_list devs_have;
+               struct write_point *wp;
+               struct bch_extent_ptr *ptr;
+
+               devs_have.nr = 0;
+
+               bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
+
+               e = bkey_extent_init(new.k);
+               e->k.p = iter->pos;
+
+               ret = bch2_alloc_sectors_start_trans(trans,
+                               opts.foreground_target,
+                               false,
+                               write_point,
+                               &devs_have,
+                               opts.data_replicas,
+                               opts.data_replicas,
+                               BCH_WATERMARK_normal, 0, &cl, &wp);
+               if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+                       ret = -BCH_ERR_transaction_restart_nested;
+               if (ret)
+                       goto err;
+
+               sectors = min_t(u64, sectors, wp->sectors_free);
+               sectors_allocated = sectors;
+
+               bch2_key_resize(&e->k, sectors);
+
+               bch2_open_bucket_get(c, wp, &open_buckets);
+               bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
+               bch2_alloc_sectors_done(c, wp);
+
+               extent_for_each_ptr(extent_i_to_s(e), ptr)
+                       ptr->unwritten = true;
+       }
+
+       have_reservation = true;
+
+       ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
+                                0, i_sectors_delta, true);
+err:
+       if (!ret && sectors_allocated)
+               bch2_increment_clock(c, sectors_allocated, WRITE);
+
+       bch2_open_buckets_put(c, &open_buckets);
+       bch2_disk_reservation_put(c, &disk_res);
+       bch2_bkey_buf_exit(&new, c);
+       bch2_bkey_buf_exit(&old, c);
+
+       if (closure_nr_remaining(&cl) != 1) {
+               bch2_trans_unlock(trans);
+               closure_sync(&cl);
+       }
+
+       return ret;
+}
+
+/*
+ * Returns -BCH_ERR_transaction_restart if we had to drop locks:
+ */
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+                  subvol_inum inum, u64 end,
+                  s64 *i_sectors_delta)
+{
+       struct bch_fs *c        = trans->c;
+       unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
+       struct bpos end_pos = POS(inum.inum, end);
+       struct bkey_s_c k;
+       int ret = 0, ret2 = 0;
+       u32 snapshot;
+
+       while (!ret ||
+              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(c, 0);
+               struct bkey_i delete;
+
+               if (ret)
+                       ret2 = ret;
+
+               bch2_trans_begin(trans);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+               if (ret)
+                       continue;
+
+               bch2_btree_iter_set_snapshot(iter, snapshot);
+
+               /*
+                * peek_upto() doesn't have ideal semantics for extents:
+                */
+               k = bch2_btree_iter_peek_upto(iter, end_pos);
+               if (!k.k)
+                       break;
+
+               ret = bkey_err(k);
+               if (ret)
+                       continue;
+
+               bkey_init(&delete.k);
+               delete.k.p = iter->pos;
+
+               /* create the biggest key we can */
+               bch2_key_resize(&delete.k, max_sectors);
+               bch2_cut_back(end_pos, &delete);
+
+               ret = bch2_extent_update(trans, inum, iter, &delete,
+                               &disk_res, 0, i_sectors_delta, false);
+               bch2_disk_reservation_put(c, &disk_res);
+       }
+
+       return ret ?: ret2;
+}
+
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+               s64 *i_sectors_delta)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            POS(inum.inum, start),
+                            BTREE_ITER_INTENT);
+
+       ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta);
+
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
+
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               ret = 0;
+
+       return ret;
+}
+
+/* truncate: */
+
+void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_logged_op_truncate op = bkey_s_c_to_logged_op_truncate(k);
+
+       prt_printf(out, "subvol=%u", le32_to_cpu(op.v->subvol));
+       prt_printf(out, " inum=%llu", le64_to_cpu(op.v->inum));
+       prt_printf(out, " new_i_size=%llu", le64_to_cpu(op.v->new_i_size));
+}
+
+static int truncate_set_isize(struct btree_trans *trans,
+                             subvol_inum inum,
+                             u64 new_i_size)
+{
+       struct btree_iter iter = { NULL };
+       struct bch_inode_unpacked inode_u;
+       int ret;
+
+       ret   = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+               (inode_u.bi_size = new_i_size, 0) ?:
+               bch2_inode_write(trans, &iter, &inode_u);
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
+                                           struct bkey_i *op_k,
+                                           u64 *i_sectors_delta)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter fpunch_iter;
+       struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
+       subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+       u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+       int ret;
+
+       ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                       truncate_set_isize(trans, inum, new_i_size));
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents,
+                            POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9),
+                            BTREE_ITER_INTENT);
+       ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta);
+       bch2_trans_iter_exit(trans, &fpunch_iter);
+
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               ret = 0;
+err:
+       bch2_logged_op_finish(trans, op_k);
+       return ret;
+}
+
+int bch2_resume_logged_op_truncate(struct btree_trans *trans, struct bkey_i *op_k)
+{
+       return __bch2_resume_logged_op_truncate(trans, op_k, NULL);
+}
+
+int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sectors_delta)
+{
+       struct bkey_i_logged_op_truncate op;
+
+       bkey_logged_op_truncate_init(&op.k_i);
+       op.v.subvol     = cpu_to_le32(inum.subvol);
+       op.v.inum       = cpu_to_le64(inum.inum);
+       op.v.new_i_size = cpu_to_le64(new_i_size);
+
+       /*
+        * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+        * snapshot while they're in progress, then crashing, will result in the
+        * resume only proceeding in one of the snapshots
+        */
+       down_read(&c->snapshot_create_lock);
+       int ret = bch2_trans_run(c,
+               bch2_logged_op_start(trans, &op.k_i) ?:
+               __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+       up_read(&c->snapshot_create_lock);
+
+       return ret;
+}
+
+/* finsert/fcollapse: */
+
+void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_logged_op_finsert op = bkey_s_c_to_logged_op_finsert(k);
+
+       prt_printf(out, "subvol=%u",            le32_to_cpu(op.v->subvol));
+       prt_printf(out, " inum=%llu",           le64_to_cpu(op.v->inum));
+       prt_printf(out, " dst_offset=%lli",     le64_to_cpu(op.v->dst_offset));
+       prt_printf(out, " src_offset=%llu",     le64_to_cpu(op.v->src_offset));
+}
+
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
+{
+       struct btree_iter iter;
+       struct bch_inode_unpacked inode_u;
+       int ret;
+
+       offset  <<= 9;
+       len     <<= 9;
+
+       ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+       if (ret)
+               return ret;
+
+       if (len > 0) {
+               if (MAX_LFS_FILESIZE - inode_u.bi_size < len) {
+                       ret = -EFBIG;
+                       goto err;
+               }
+
+               if (offset >= inode_u.bi_size) {
+                       ret = -EINVAL;
+                       goto err;
+               }
+       }
+
+       inode_u.bi_size += len;
+       inode_u.bi_mtime = inode_u.bi_ctime = bch2_current_time(trans->c);
+
+       ret = bch2_inode_write(trans, &iter, &inode_u);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
+                                          struct bkey_i *op_k,
+                                          u64 *i_sectors_delta)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
+       subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+       struct bch_io_opts opts;
+       u64 dst_offset = le64_to_cpu(op->v.dst_offset);
+       u64 src_offset = le64_to_cpu(op->v.src_offset);
+       s64 shift = dst_offset - src_offset;
+       u64 len = abs(shift);
+       u64 pos = le64_to_cpu(op->v.pos);
+       bool insert = shift > 0;
+       int ret = 0;
+
+       ret = bch2_inum_opts_get(trans, inum, &opts);
+       if (ret)
+               return ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            POS(inum.inum, 0),
+                            BTREE_ITER_INTENT);
+
+       switch (op->v.state) {
+case LOGGED_OP_FINSERT_start:
+       op->v.state = LOGGED_OP_FINSERT_shift_extents;
+
+       if (insert) {
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                               adjust_i_size(trans, inum, src_offset, len) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
+               if (ret)
+                       goto err;
+       } else {
+               bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset));
+
+               ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta);
+               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       goto err;
+
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                               bch2_logged_op_update(trans, &op->k_i));
+       }
+
+       fallthrough;
+case LOGGED_OP_FINSERT_shift_extents:
+       while (1) {
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(c, 0);
+               struct bkey_i delete, *copy;
+               struct bkey_s_c k;
+               struct bpos src_pos = POS(inum.inum, src_offset);
+               u32 snapshot;
+
+               bch2_trans_begin(trans);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+               if (ret)
+                       goto btree_err;
+
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
+               bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot));
+
+               k = insert
+                       ? bch2_btree_iter_peek_prev(&iter)
+                       : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+               if ((ret = bkey_err(k)))
+                       goto btree_err;
+
+               if (!k.k ||
+                   k.k->p.inode != inum.inum ||
+                   bkey_le(k.k->p, POS(inum.inum, src_offset)))
+                       break;
+
+               copy = bch2_bkey_make_mut_noupdate(trans, k);
+               if ((ret = PTR_ERR_OR_ZERO(copy)))
+                       goto btree_err;
+
+               if (insert &&
+                   bkey_lt(bkey_start_pos(k.k), src_pos)) {
+                       bch2_cut_front(src_pos, copy);
+
+                       /* Splitting compressed extent? */
+                       bch2_disk_reservation_add(c, &disk_res,
+                                       copy->k.size *
+                                       bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy)),
+                                       BCH_DISK_RESERVATION_NOFAIL);
+               }
+
+               bkey_init(&delete.k);
+               delete.k.p = copy->k.p;
+               delete.k.p.snapshot = snapshot;
+               delete.k.size = copy->k.size;
+
+               copy->k.p.offset += shift;
+               copy->k.p.snapshot = snapshot;
+
+               op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
+
+               ret =   bch2_bkey_set_needs_rebalance(c, copy,
+                                       opts.background_target,
+                                       opts.background_compression) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
+                       bch2_logged_op_update(trans, &op->k_i) ?:
+                       bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
+btree_err:
+               bch2_disk_reservation_put(c, &disk_res);
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       continue;
+               if (ret)
+                       goto err;
+
+               pos = le64_to_cpu(op->v.pos);
+       }
+
+       op->v.state = LOGGED_OP_FINSERT_finish;
+
+       if (!insert) {
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                               adjust_i_size(trans, inum, src_offset, shift) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
+       } else {
+               /* We need an inode update to update bi_journal_seq for fsync: */
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                               adjust_i_size(trans, inum, 0, 0) ?:
+                               bch2_logged_op_update(trans, &op->k_i));
+       }
+
+       break;
+case LOGGED_OP_FINSERT_finish:
+       break;
+       }
+err:
+       bch2_logged_op_finish(trans, op_k);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_resume_logged_op_finsert(struct btree_trans *trans, struct bkey_i *op_k)
+{
+       return __bch2_resume_logged_op_finsert(trans, op_k, NULL);
+}
+
+int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
+                          u64 offset, u64 len, bool insert,
+                          s64 *i_sectors_delta)
+{
+       struct bkey_i_logged_op_finsert op;
+       s64 shift = insert ? len : -len;
+
+       bkey_logged_op_finsert_init(&op.k_i);
+       op.v.subvol     = cpu_to_le32(inum.subvol);
+       op.v.inum       = cpu_to_le64(inum.inum);
+       op.v.dst_offset = cpu_to_le64(offset + shift);
+       op.v.src_offset = cpu_to_le64(offset);
+       op.v.pos        = cpu_to_le64(insert ? U64_MAX : offset);
+
+       /*
+        * Logged ops aren't atomic w.r.t. snapshot creation: creating a
+        * snapshot while they're in progress, then crashing, will result in the
+        * resume only proceeding in one of the snapshots
+        */
+       down_read(&c->snapshot_create_lock);
+       int ret = bch2_trans_run(c,
+               bch2_logged_op_start(trans, &op.k_i) ?:
+               __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+       up_read(&c->snapshot_create_lock);
+
+       return ret;
+}
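
To make the finsert/fcollapse bookkeeping above concrete: an insert of len sectors at offset moves every extent at or after offset up by len (the resume loop walks extents from the end of the file for an insert, forwards for a collapse), while a collapse first punches [offset, offset+len) and then shifts later extents down. A tiny worked example for the insert direction, using made-up sector offsets:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t offset = 1024, len = 256;		/* finsert 256 sectors at sector 1024 */
	uint64_t extents[] = { 512, 1024, 1536, 2048 };	/* start offsets of some extents */

	for (unsigned i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		uint64_t o = extents[i];
		/* extents at/after the insert point shift up by len: */
		uint64_t moved = o >= offset ? o + len : o;

		printf("extent @%" PRIu64 " -> %" PRIu64 "\n", o, moved);
	}
	return 0;
}
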
diff --git a/libbcachefs/io_misc.h b/libbcachefs/io_misc.h
new file mode 100644 (file)
index 0000000..9cb44a7
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_MISC_H
+#define _BCACHEFS_IO_MISC_H
+
+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
+                         u64, struct bch_io_opts, s64 *,
+                         struct write_point_specifier);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+                  subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
+
+void bch2_logged_op_truncate_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_truncate ((struct bkey_ops) {  \
+       .val_to_text    = bch2_logged_op_truncate_to_text,      \
+       .min_val_size   = 24,                                   \
+})
+
+int bch2_resume_logged_op_truncate(struct btree_trans *, struct bkey_i *);
+
+int bch2_truncate(struct bch_fs *, subvol_inum, u64, u64 *);
+
+void bch2_logged_op_finsert_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_logged_op_finsert ((struct bkey_ops) {   \
+       .val_to_text    = bch2_logged_op_finsert_to_text,       \
+       .min_val_size   = 24,                                   \
+})
+
+int bch2_resume_logged_op_finsert(struct btree_trans *, struct bkey_i *);
+
+int bch2_fcollapse_finsert(struct bch_fs *, subvol_inum, u64, u64, bool, s64 *);
+
+#endif /* _BCACHEFS_IO_MISC_H */
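
The resume hooks declared above exist because truncate and finsert are multi-step operations: the intent is recorded first (bch2_logged_op_start), the work is driven, and the log entry is removed only when the operation finishes, so recovery can re-drive anything still logged. A toy userspace model of that lifecycle; the array-backed log and all names here are invented purely for illustration:

#include <stdbool.h>
#include <stdio.h>

struct logged_truncate {
	bool			live;		/* still logged, i.e. not finished */
	unsigned		inum;
	unsigned long long	new_i_size;
};

static struct logged_truncate oplog[4];		/* stands in for the durable log */

static struct logged_truncate *op_start(unsigned inum, unsigned long long size)
{
	for (unsigned i = 0; i < 4; i++)
		if (!oplog[i].live) {
			oplog[i] = (struct logged_truncate) { true, inum, size };
			return &oplog[i];
		}
	return NULL;
}

static void op_apply(struct logged_truncate *op)
{
	printf("truncate inode %u to %llu\n", op->inum, op->new_i_size);
}

static void op_finish(struct logged_truncate *op)
{
	op->live = false;
}

int main(void)
{
	struct logged_truncate *op = op_start(42, 4096);

	if (!op)
		return 1;
	op_apply(op);
	/* A crash before op_finish() leaves the entry live... */

	/* ...so recovery re-drives every live entry; the work must be idempotent. */
	for (unsigned i = 0; i < 4; i++)
		if (oplog[i].live) {
			op_apply(&oplog[i]);
			op_finish(&oplog[i]);
		}
	return 0;
}
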
diff --git a/libbcachefs/io_read.c b/libbcachefs/io_read.c
new file mode 100644 (file)
index 0000000..b833409
--- /dev/null
@@ -0,0 +1,1210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Some low level IO code, and hacks for various block layer limitations
+ *
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "alloc_foreground.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "data_update.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io_read.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "subvolume.h"
+#include "trace.h"
+
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+       const struct bch_devs_mask *devs;
+       unsigned d, nr = 0, total = 0;
+       u64 now = local_clock(), last;
+       s64 congested;
+       struct bch_dev *ca;
+
+       if (!target)
+               return false;
+
+       rcu_read_lock();
+       devs = bch2_target_to_mask(c, target) ?:
+               &c->rw_devs[BCH_DATA_user];
+
+       for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
+               ca = rcu_dereference(c->devs[d]);
+               if (!ca)
+                       continue;
+
+               congested = atomic_read(&ca->congested);
+               last = READ_ONCE(ca->congested_last);
+               if (time_after64(now, last))
+                       congested -= (now - last) >> 12;
+
+               total += max(congested, 0LL);
+               nr++;
+       }
+       rcu_read_unlock();
+
+       return bch2_rand_range(nr * CONGESTED_MAX) < total;
+}
+
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+       return false;
+}
+
+#endif
+
+/* Cache promotion on read */
+
+struct promote_op {
+       struct rcu_head         rcu;
+       u64                     start_time;
+
+       struct rhash_head       hash;
+       struct bpos             pos;
+
+       struct data_update      write;
+       struct bio_vec          bi_inline_vecs[0]; /* must be last */
+};
+
+static const struct rhashtable_params bch_promote_params = {
+       .head_offset    = offsetof(struct promote_op, hash),
+       .key_offset     = offsetof(struct promote_op, pos),
+       .key_len        = sizeof(struct bpos),
+};
+
+static inline int should_promote(struct bch_fs *c, struct bkey_s_c k,
+                                 struct bpos pos,
+                                 struct bch_io_opts opts,
+                                 unsigned flags)
+{
+       BUG_ON(!opts.promote_target);
+
+       if (!(flags & BCH_READ_MAY_PROMOTE))
+               return -BCH_ERR_nopromote_may_not;
+
+       if (bch2_bkey_has_target(c, k, opts.promote_target))
+               return -BCH_ERR_nopromote_already_promoted;
+
+       if (bkey_extent_is_unwritten(k))
+               return -BCH_ERR_nopromote_unwritten;
+
+       if (bch2_target_congested(c, opts.promote_target))
+               return -BCH_ERR_nopromote_congested;
+
+       if (rhashtable_lookup_fast(&c->promote_table, &pos,
+                                  bch_promote_params))
+               return -BCH_ERR_nopromote_in_flight;
+
+       return 0;
+}
+
+static void promote_free(struct bch_fs *c, struct promote_op *op)
+{
+       int ret;
+
+       bch2_data_update_exit(&op->write);
+
+       ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+                                    bch_promote_params);
+       BUG_ON(ret);
+       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+       kfree_rcu(op, rcu);
+}
+
+static void promote_done(struct bch_write_op *wop)
+{
+       struct promote_op *op =
+               container_of(wop, struct promote_op, write.op);
+       struct bch_fs *c = op->write.op.c;
+
+       bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
+                              op->start_time);
+       promote_free(c, op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+       struct bio *bio = &op->write.op.wbio.bio;
+
+       trace_and_count(op->write.op.c, read_promote, &rbio->bio);
+
+       /* we now own pages: */
+       BUG_ON(!rbio->bounce);
+       BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
+
+       memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+              sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+       swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+
+       bch2_data_update_read_done(&op->write, rbio->pick.crc);
+}
+
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
+                                         enum btree_id btree_id,
+                                         struct bkey_s_c k,
+                                         struct bpos pos,
+                                         struct extent_ptr_decoded *pick,
+                                         struct bch_io_opts opts,
+                                         unsigned sectors,
+                                         struct bch_read_bio **rbio)
+{
+       struct bch_fs *c = trans->c;
+       struct promote_op *op = NULL;
+       struct bio *bio;
+       unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+       int ret;
+
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
+               return NULL;
+
+       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
+       if (!op)
+               goto err;
+
+       op->start_time = local_clock();
+       op->pos = pos;
+
+       /*
+        * We don't use the mempool here because extents that aren't
+        * checksummed or compressed can be too big for the mempool:
+        */
+       *rbio = kzalloc(sizeof(struct bch_read_bio) +
+                       sizeof(struct bio_vec) * pages,
+                       GFP_NOFS);
+       if (!*rbio)
+               goto err;
+
+       rbio_init(&(*rbio)->bio, opts);
+       bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
+
+       if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
+                                GFP_NOFS))
+               goto err;
+
+       (*rbio)->bounce         = true;
+       (*rbio)->split          = true;
+       (*rbio)->kmalloc        = true;
+
+       if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
+                                         bch_promote_params))
+               goto err;
+
+       bio = &op->write.op.wbio.bio;
+       bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
+
+       ret = bch2_data_update_init(trans, NULL, &op->write,
+                       writepoint_hashed((unsigned long) current),
+                       opts,
+                       (struct data_update_opts) {
+                               .target         = opts.promote_target,
+                               .extra_replicas = 1,
+                               .write_flags    = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
+                       },
+                       btree_id, k);
+       /*
+        * possible errors: -BCH_ERR_nocow_lock_blocked,
+        * -BCH_ERR_ENOSPC_disk_reservation:
+        */
+       if (ret) {
+               ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+                                       bch_promote_params);
+               BUG_ON(ret);
+               goto err;
+       }
+
+       op->write.op.end_io = promote_done;
+
+       return op;
+err:
+       if (*rbio)
+               bio_free_pages(&(*rbio)->bio);
+       kfree(*rbio);
+       *rbio = NULL;
+       kfree(op);
+       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
+       return NULL;
+}
+
+noinline
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+                                       struct bvec_iter iter,
+                                       struct bkey_s_c k,
+                                       struct extent_ptr_decoded *pick,
+                                       struct bch_io_opts opts,
+                                       unsigned flags,
+                                       struct bch_read_bio **rbio,
+                                       bool *bounce,
+                                       bool *read_full)
+{
+       struct bch_fs *c = trans->c;
+       bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
+       /* data might have to be decompressed in the write path: */
+       unsigned sectors = promote_full
+               ? max(pick->crc.compressed_size, pick->crc.live_size)
+               : bvec_iter_sectors(iter);
+       struct bpos pos = promote_full
+               ? bkey_start_pos(k.k)
+               : POS(k.k->p.inode, iter.bi_sector);
+       struct promote_op *promote;
+       int ret;
+
+       ret = should_promote(c, k, pos, opts, flags);
+       if (ret)
+               goto nopromote;
+
+       promote = __promote_alloc(trans,
+                                 k.k->type == KEY_TYPE_reflink_v
+                                 ? BTREE_ID_reflink
+                                 : BTREE_ID_extents,
+                                 k, pos, pick, opts, sectors, rbio);
+       if (!promote) {
+               ret = -BCH_ERR_nopromote_enomem;
+               goto nopromote;
+       }
+
+       *bounce         = true;
+       *read_full      = promote_full;
+       return promote;
+nopromote:
+       trace_read_nopromote(c, ret);
+       return NULL;
+}
+
+/* Read */
+
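+/*
+ * Read retry disposition, stored in rbio->retry and also returned by
+ * __bch2_read_extent() when called with BCH_READ_IN_RETRY:
+ */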
+#define READ_RETRY_AVOID       1
+#define READ_RETRY             2
+#define READ_ERR               3
+
+enum rbio_context {
+       RBIO_CONTEXT_NULL,
+       RBIO_CONTEXT_HIGHPRI,
+       RBIO_CONTEXT_UNBOUND,
+};
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+       return rbio->split ? rbio->parent : rbio;
+}
+
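+/*
+ * Run @fn immediately if the rbio's current context already satisfies
+ * @context; otherwise record the new context and punt @fn to @wq:
+ */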
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+                          enum rbio_context context,
+                          struct workqueue_struct *wq)
+{
+       if (context <= rbio->context) {
+               fn(&rbio->work);
+       } else {
+               rbio->work.func         = fn;
+               rbio->context           = context;
+               queue_work(wq, &rbio->work);
+       }
+}
+
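+/*
+ * Release an rbio's resources (promote op, bounce pages, and the clone itself
+ * if it was split) and return the rbio that completion continues on:
+ */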
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
+{
+       BUG_ON(rbio->bounce && !rbio->split);
+
+       if (rbio->promote)
+               promote_free(rbio->c, rbio->promote);
+       rbio->promote = NULL;
+
+       if (rbio->bounce)
+               bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+
+       if (rbio->split) {
+               struct bch_read_bio *parent = rbio->parent;
+
+               if (rbio->kmalloc)
+                       kfree(rbio);
+               else
+                       bio_put(&rbio->bio);
+
+               rbio = parent;
+       }
+
+       return rbio;
+}
+
+/*
+ * Only called on a top level bch_read_bio to complete an entire read request,
+ * not a split:
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+       if (rbio->start_time)
+               bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
+                                      rbio->start_time);
+       bio_endio(&rbio->bio);
+}
+
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
+                                    struct bvec_iter bvec_iter,
+                                    struct bch_io_failures *failed,
+                                    unsigned flags)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       struct bkey_buf sk;
+       struct bkey_s_c k;
+       int ret;
+
+       flags &= ~BCH_READ_LAST_FRAGMENT;
+       flags |= BCH_READ_MUST_CLONE;
+
+       bch2_bkey_buf_init(&sk);
+
+       bch2_trans_iter_init(trans, &iter, rbio->data_btree,
+                            rbio->read_pos, BTREE_ITER_SLOTS);
+retry:
+       rbio->bio.bi_status = 0;
+
+       k = bch2_btree_iter_peek_slot(&iter);
+       if (bkey_err(k))
+               goto err;
+
+       bch2_bkey_buf_reassemble(&sk, c, k);
+       k = bkey_i_to_s_c(sk.k);
+       bch2_trans_unlock(trans);
+
+       if (!bch2_bkey_matches_ptr(c, k,
+                                  rbio->pick.ptr,
+                                  rbio->data_pos.offset -
+                                  rbio->pick.crc.offset)) {
+               /* extent we wanted to read no longer exists: */
+               rbio->hole = true;
+               goto out;
+       }
+
+       ret = __bch2_read_extent(trans, rbio, bvec_iter,
+                                rbio->read_pos,
+                                rbio->data_btree,
+                                k, 0, failed, flags);
+       if (ret == READ_RETRY)
+               goto retry;
+       if (ret)
+               goto err;
+out:
+       bch2_rbio_done(rbio);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
+       bch2_bkey_buf_exit(&sk, c);
+       return;
+err:
+       rbio->bio.bi_status = BLK_STS_IOERR;
+       goto out;
+}
+
+static void bch2_rbio_retry(struct work_struct *work)
+{
+       struct bch_read_bio *rbio =
+               container_of(work, struct bch_read_bio, work);
+       struct bch_fs *c        = rbio->c;
+       struct bvec_iter iter   = rbio->bvec_iter;
+       unsigned flags          = rbio->flags;
+       subvol_inum inum = {
+               .subvol = rbio->subvol,
+               .inum   = rbio->read_pos.inode,
+       };
+       struct bch_io_failures failed = { .nr = 0 };
+
+       trace_and_count(c, read_retry, &rbio->bio);
+
+       if (rbio->retry == READ_RETRY_AVOID)
+               bch2_mark_io_failure(&failed, &rbio->pick);
+
+       rbio->bio.bi_status = 0;
+
+       rbio = bch2_rbio_free(rbio);
+
+       flags |= BCH_READ_IN_RETRY;
+       flags &= ~BCH_READ_MAY_PROMOTE;
+
+       if (flags & BCH_READ_NODECODE) {
+               bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
+       } else {
+               flags &= ~BCH_READ_LAST_FRAGMENT;
+               flags |= BCH_READ_MUST_CLONE;
+
+               __bch2_read(c, rbio, iter, inum, &failed, flags);
+       }
+}
+
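+/*
+ * Record how a failed read should be handled: READ_ERR completes the request
+ * with an error, otherwise it's punted to the retry worker - unless we're
+ * already in the retry path, where the caller handles it:
+ */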
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
+                           blk_status_t error)
+{
+       rbio->retry = retry;
+
+       if (rbio->flags & BCH_READ_IN_RETRY)
+               return;
+
+       if (retry == READ_ERR) {
+               rbio = bch2_rbio_free(rbio);
+
+               rbio->bio.bi_status = error;
+               bch2_rbio_done(rbio);
+       } else {
+               bch2_rbio_punt(rbio, bch2_rbio_retry,
+                              RBIO_CONTEXT_UNBOUND, system_unbound_wq);
+       }
+}
+
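+/*
+ * After verifying the checksum over the full extent, narrow the key's
+ * checksum to cover just its live range, so future reads don't have to read
+ * the entire original extent to verify it:
+ */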
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
+                                  struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
+       u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
+       struct bch_extent_crc_unpacked new_crc;
+       struct btree_iter iter;
+       struct bkey_i *new;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       if (crc_is_compressed(rbio->pick.crc))
+               return 0;
+
+       k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       if ((ret = bkey_err(k)))
+               goto out;
+
+       if (bversion_cmp(k.k->version, rbio->version) ||
+           !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
+               goto out;
+
+       /* Extent was merged? */
+       if (bkey_start_offset(k.k) < data_offset ||
+           k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
+               goto out;
+
+       if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
+                       rbio->pick.crc, NULL, &new_crc,
+                       bkey_start_offset(k.k) - data_offset, k.k->size,
+                       rbio->pick.crc.csum_type)) {
+               bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
+               ret = 0;
+               goto out;
+       }
+
+       /*
+        * We're going to temporarily append another checksum entry:
+        */
+       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+                                sizeof(struct bch_extent_crc128));
+       if ((ret = PTR_ERR_OR_ZERO(new)))
+               goto out;
+
+       bkey_reassemble(new, k);
+
+       if (!bch2_bkey_narrow_crcs(new, new_crc))
+               goto out;
+
+       ret = bch2_trans_update(trans, &iter, new,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+out:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
+{
+       bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                     __bch2_rbio_narrow_crcs(trans, rbio));
+}
+
+/* Inner part that may run in process context */
+static void __bch2_read_endio(struct work_struct *work)
+{
+       struct bch_read_bio *rbio =
+               container_of(work, struct bch_read_bio, work);
+       struct bch_fs *c        = rbio->c;
+       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+       struct bio *src         = &rbio->bio;
+       struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
+       struct bvec_iter dst_iter = rbio->bvec_iter;
+       struct bch_extent_crc_unpacked crc = rbio->pick.crc;
+       struct nonce nonce = extent_nonce(rbio->version, crc);
+       unsigned nofs_flags;
+       struct bch_csum csum;
+       int ret;
+
+       nofs_flags = memalloc_nofs_save();
+
+       /* Reset iterator for checksumming and copying bounced data: */
+       if (rbio->bounce) {
+               src->bi_iter.bi_size            = crc.compressed_size << 9;
+               src->bi_iter.bi_idx             = 0;
+               src->bi_iter.bi_bvec_done       = 0;
+       } else {
+               src->bi_iter                    = rbio->bvec_iter;
+       }
+
+       csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
+       if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
+               goto csum_err;
+
+       /*
+        * XXX
+        * We need to rework the narrow_crcs path to deliver the read completion
+        * first and then punt to a different workqueue; otherwise we're
+        * holding up reads while doing btree updates, which is bad for memory
+        * reclaim.
+        */
+       if (unlikely(rbio->narrow_crcs))
+               bch2_rbio_narrow_crcs(rbio);
+
+       if (rbio->flags & BCH_READ_NODECODE)
+               goto nodecode;
+
+       /* Adjust crc to point to subset of data we want: */
+       crc.offset     += rbio->offset_into_extent;
+       crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
+
+       if (crc_is_compressed(crc)) {
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
+
+               if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+                   !c->opts.no_data_io)
+                       goto decompression_err;
+       } else {
+               /* don't need to decrypt the entire bio: */
+               nonce = nonce_add(nonce, crc.offset << 9);
+               bio_advance(src, crc.offset << 9);
+
+               BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
+               src->bi_iter.bi_size = dst_iter.bi_size;
+
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
+
+               if (rbio->bounce) {
+                       struct bvec_iter src_iter = src->bi_iter;
+
+                       bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+               }
+       }
+
+       if (rbio->promote) {
+               /*
+                * Re-encrypt the data we decrypted, so it's consistent with
+                * rbio->crc:
+                */
+               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+               if (ret)
+                       goto decrypt_err;
+
+               promote_start(rbio->promote, rbio);
+               rbio->promote = NULL;
+       }
+nodecode:
+       if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
+               rbio = bch2_rbio_free(rbio);
+               bch2_rbio_done(rbio);
+       }
+out:
+       memalloc_nofs_restore(nofs_flags);
+       return;
+csum_err:
+       /*
+        * Checksum error: if the bio wasn't bounced, we may have been
+        * reading into buffers owned by userspace (that userspace can
+        * scribble over) - retry the read, bouncing it this time:
+        */
+       if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
+               rbio->flags |= BCH_READ_MUST_BOUNCE;
+               bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
+               goto out;
+       }
+
+       bch_err_inum_offset_ratelimited(ca,
+               rbio->read_pos.inode,
+               rbio->read_pos.offset << 9,
+               "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
+               rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+               csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+       bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+       goto out;
+decompression_err:
+       bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+                                       rbio->read_pos.offset << 9,
+                                       "decompression error");
+       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+       goto out;
+decrypt_err:
+       bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+                                       rbio->read_pos.offset << 9,
+                                       "decrypt error");
+       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+       goto out;
+}
+
+static void bch2_read_endio(struct bio *bio)
+{
+       struct bch_read_bio *rbio =
+               container_of(bio, struct bch_read_bio, bio);
+       struct bch_fs *c        = rbio->c;
+       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
+       struct workqueue_struct *wq = NULL;
+       enum rbio_context context = RBIO_CONTEXT_NULL;
+
+       if (rbio->have_ioref) {
+               bch2_latency_acct(ca, rbio->submit_time, READ);
+               percpu_ref_put(&ca->io_ref);
+       }
+
+       if (!rbio->split)
+               rbio->bio.bi_end_io = rbio->end_io;
+
+       if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+                                   rbio->read_pos.inode,
+                                   rbio->read_pos.offset,
+                                   "data read error: %s",
+                              bch2_blk_status_to_str(bio->bi_status))) {
+               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
+               return;
+       }
+
+       if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+           ptr_stale(ca, &rbio->pick.ptr)) {
+               trace_and_count(c, read_reuse_race, &rbio->bio);
+
+               if (rbio->flags & BCH_READ_RETRY_IF_STALE)
+                       bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
+               else
+                       bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
+               return;
+       }
+
+       if (rbio->narrow_crcs ||
+           rbio->promote ||
+           crc_is_compressed(rbio->pick.crc) ||
+           bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+               context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
+       else if (rbio->pick.crc.csum_type)
+               context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
+
+       bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
+}
+
+int __bch2_read_indirect_extent(struct btree_trans *trans,
+                               unsigned *offset_into_extent,
+                               struct bkey_buf *orig_k)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 reflink_offset;
+       int ret;
+
+       reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
+               *offset_into_extent;
+
+       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+                              POS(0, reflink_offset), 0);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_reflink_v &&
+           k.k->type != KEY_TYPE_indirect_inline_data) {
+               bch_err_inum_offset_ratelimited(trans->c,
+                       orig_k->k->k.p.inode,
+                       orig_k->k->k.p.offset << 9,
+                       "%llu len %u points to nonexistent indirect extent %llu",
+                       orig_k->k->k.p.offset,
+                       orig_k->k->k.size,
+                       reflink_offset);
+               bch2_inconsistent_error(trans->c);
+               ret = -EIO;
+               goto err;
+       }
+
+       *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+       bch2_bkey_buf_reassemble(orig_k, trans->c, k);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+                                                  struct bkey_s_c k,
+                                                  struct bch_extent_ptr ptr)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+       struct btree_iter iter;
+       struct printbuf buf = PRINTBUF;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            PTR_BUCKET_POS(c, &ptr),
+                            BTREE_ITER_CACHED);
+
+       prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+       printbuf_indent_add(&buf, 2);
+       prt_newline(&buf);
+
+       bch2_bkey_val_to_text(&buf, c, k);
+       prt_newline(&buf);
+
+       prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
+       ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+       if (!ret) {
+               prt_newline(&buf);
+               bch2_bkey_val_to_text(&buf, c, k);
+       }
+
+       bch2_fs_inconsistent(c, "%s", buf.buf);
+
+       bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
+}
+
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
+                      struct bvec_iter iter, struct bpos read_pos,
+                      enum btree_id data_btree, struct bkey_s_c k,
+                      unsigned offset_into_extent,
+                      struct bch_io_failures *failed, unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct extent_ptr_decoded pick;
+       struct bch_read_bio *rbio = NULL;
+       struct bch_dev *ca = NULL;
+       struct promote_op *promote = NULL;
+       bool bounce = false, read_full = false, narrow_crcs = false;
+       struct bpos data_pos = bkey_start_pos(k.k);
+       int pick_ret;
+
+       if (bkey_extent_is_inline_data(k.k)) {
+               unsigned bytes = min_t(unsigned, iter.bi_size,
+                                      bkey_inline_data_bytes(k.k));
+
+               swap(iter.bi_size, bytes);
+               memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
+               swap(iter.bi_size, bytes);
+               bio_advance_iter(&orig->bio, &iter, bytes);
+               zero_fill_bio_iter(&orig->bio, iter);
+               goto out_read_done;
+       }
+retry_pick:
+       pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
+
+       /* hole or reservation - just zero fill: */
+       if (!pick_ret)
+               goto hole;
+
+       if (pick_ret < 0) {
+               bch_err_inum_offset_ratelimited(c,
+                               read_pos.inode, read_pos.offset << 9,
+                               "no device to read from");
+               goto err;
+       }
+
+       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+       /*
+        * Stale dirty pointers are treated as IO errors, but @failed isn't
+        * allocated unless we're in the retry path - so if we're not in the
+        * retry path, don't check here, it'll be caught in bch2_read_endio()
+        * and we'll end up in the retry path:
+        */
+       if ((flags & BCH_READ_IN_RETRY) &&
+           !pick.ptr.cached &&
+           unlikely(ptr_stale(ca, &pick.ptr))) {
+               read_from_stale_dirty_pointer(trans, k, pick.ptr);
+               bch2_mark_io_failure(failed, &pick);
+               goto retry_pick;
+       }
+
+       /*
+        * Unlock the iterator while the btree node's lock is still in
+        * cache, before doing the IO:
+        */
+       bch2_trans_unlock(trans);
+
+       if (flags & BCH_READ_NODECODE) {
+               /*
+                * This can happen if we retry and the extent we were going to
+                * read has been merged in the meantime:
+                */
+               if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
+                       goto hole;
+
+               iter.bi_size    = pick.crc.compressed_size << 9;
+               goto get_bio;
+       }
+
+       if (!(flags & BCH_READ_LAST_FRAGMENT) ||
+           bio_flagged(&orig->bio, BIO_CHAIN))
+               flags |= BCH_READ_MUST_CLONE;
+
+       narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
+               bch2_can_narrow_extent_crcs(k, pick.crc);
+
+       if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
+               flags |= BCH_READ_MUST_BOUNCE;
+
+       EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
+
+       if (crc_is_compressed(pick.crc) ||
+           (pick.crc.csum_type != BCH_CSUM_none &&
+            (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+             (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
+              (flags & BCH_READ_USER_MAPPED)) ||
+             (flags & BCH_READ_MUST_BOUNCE)))) {
+               read_full = true;
+               bounce = true;
+       }
+
+       if (orig->opts.promote_target)
+               promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
+                                       &rbio, &bounce, &read_full);
+
+       if (!read_full) {
+               EBUG_ON(crc_is_compressed(pick.crc));
+               EBUG_ON(pick.crc.csum_type &&
+                       (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
+                        bvec_iter_sectors(iter) != pick.crc.live_size ||
+                        pick.crc.offset ||
+                        offset_into_extent));
+
+               data_pos.offset += offset_into_extent;
+               pick.ptr.offset += pick.crc.offset +
+                       offset_into_extent;
+               offset_into_extent              = 0;
+               pick.crc.compressed_size        = bvec_iter_sectors(iter);
+               pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
+               pick.crc.offset                 = 0;
+               pick.crc.live_size              = bvec_iter_sectors(iter);
+       }
+get_bio:
+       if (rbio) {
+               /*
+                * The promote path already allocated a bounce rbio for us:
+                * it needs a bio big enough to hold the uncompressed data
+                * for the write path, but we're not going to use all of it
+                * here:
+                */
+               EBUG_ON(rbio->bio.bi_iter.bi_size <
+                      pick.crc.compressed_size << 9);
+               rbio->bio.bi_iter.bi_size =
+                       pick.crc.compressed_size << 9;
+       } else if (bounce) {
+               unsigned sectors = pick.crc.compressed_size;
+
+               rbio = rbio_init(bio_alloc_bioset(NULL,
+                                                 DIV_ROUND_UP(sectors, PAGE_SECTORS),
+                                                 0,
+                                                 GFP_NOFS,
+                                                 &c->bio_read_split),
+                                orig->opts);
+
+               bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
+               rbio->bounce    = true;
+               rbio->split     = true;
+       } else if (flags & BCH_READ_MUST_CLONE) {
+               /*
+                * We have to clone if there were any splits, because of
+                * error reporting: if a split errored and retrying didn't
+                * work, then when it reports the error to its parent (us) we
+                * can't tell whether the error came from our part of the bio
+                * (in which case we should retry) or from the whole bio (in
+                * which case retrying would lose the error).
+                */
+               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
+                                                &c->bio_read_split),
+                                orig->opts);
+               rbio->bio.bi_iter = iter;
+               rbio->split     = true;
+       } else {
+               rbio = orig;
+               rbio->bio.bi_iter = iter;
+               EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
+       }
+
+       EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
+
+       rbio->c                 = c;
+       rbio->submit_time       = local_clock();
+       if (rbio->split)
+               rbio->parent    = orig;
+       else
+               rbio->end_io    = orig->bio.bi_end_io;
+       rbio->bvec_iter         = iter;
+       rbio->offset_into_extent = offset_into_extent;
+       rbio->flags             = flags;
+       rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
+       rbio->narrow_crcs       = narrow_crcs;
+       rbio->hole              = 0;
+       rbio->retry             = 0;
+       rbio->context           = 0;
+       /* XXX: only initialize this if needed */
+       rbio->devs_have         = bch2_bkey_devs(k);
+       rbio->pick              = pick;
+       rbio->subvol            = orig->subvol;
+       rbio->read_pos          = read_pos;
+       rbio->data_btree        = data_btree;
+       rbio->data_pos          = data_pos;
+       rbio->version           = k.k->version;
+       rbio->promote           = promote;
+       INIT_WORK(&rbio->work, NULL);
+
+       rbio->bio.bi_opf        = orig->bio.bi_opf;
+       rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
+       rbio->bio.bi_end_io     = bch2_read_endio;
+
+       if (rbio->bounce)
+               trace_and_count(c, read_bounce, &rbio->bio);
+
+       this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
+       bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
+
+       /*
+        * If it's being moved internally, we don't want to flag it as a cache
+        * hit:
+        */
+       if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+               bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+                       PTR_BUCKET_NR(ca, &pick.ptr), READ);
+
+       if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
+               bio_inc_remaining(&orig->bio);
+               trace_and_count(c, read_split, &orig->bio);
+       }
+
+       if (!rbio->pick.idx) {
+               if (!rbio->have_ioref) {
+                       bch_err_inum_offset_ratelimited(c,
+                                       read_pos.inode,
+                                       read_pos.offset << 9,
+                                       "no device to read from");
+                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+                       goto out;
+               }
+
+               this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
+                            bio_sectors(&rbio->bio));
+               bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+
+               if (unlikely(c->opts.no_data_io)) {
+                       if (likely(!(flags & BCH_READ_IN_RETRY)))
+                               bio_endio(&rbio->bio);
+               } else {
+                       if (likely(!(flags & BCH_READ_IN_RETRY)))
+                               submit_bio(&rbio->bio);
+                       else
+                               submit_bio_wait(&rbio->bio);
+               }
+
+               /*
+                * We just submitted IO which may block; we expect relock fail
+                * events and shouldn't count them:
+                */
+               trans->notrace_relock_fail = true;
+       } else {
+               /* Attempting reconstruct read: */
+               if (bch2_ec_read_extent(trans, rbio)) {
+                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+                       goto out;
+               }
+
+               if (likely(!(flags & BCH_READ_IN_RETRY)))
+                       bio_endio(&rbio->bio);
+       }
+out:
+       if (likely(!(flags & BCH_READ_IN_RETRY))) {
+               return 0;
+       } else {
+               int ret;
+
+               rbio->context = RBIO_CONTEXT_UNBOUND;
+               bch2_read_endio(&rbio->bio);
+
+               ret = rbio->retry;
+               rbio = bch2_rbio_free(rbio);
+
+               if (ret == READ_RETRY_AVOID) {
+                       bch2_mark_io_failure(failed, &pick);
+                       ret = READ_RETRY;
+               }
+
+               if (!ret)
+                       goto out_read_done;
+
+               return ret;
+       }
+
+err:
+       if (flags & BCH_READ_IN_RETRY)
+               return READ_ERR;
+
+       orig->bio.bi_status = BLK_STS_IOERR;
+       goto out_read_done;
+
+hole:
+       /*
+        * This won't normally happen in the BCH_READ_NODECODE
+        * (bch2_move_extent()) path, but if we retry and the extent we wanted
+        * to read no longer exists, we have to signal that:
+        */
+       if (flags & BCH_READ_NODECODE)
+               orig->hole = true;
+
+       zero_fill_bio_iter(&orig->bio, iter);
+out_read_done:
+       if (flags & BCH_READ_LAST_FRAGMENT)
+               bch2_rbio_done(orig);
+       return 0;
+}
+
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+                struct bvec_iter bvec_iter, subvol_inum inum,
+                struct bch_io_failures *failed, unsigned flags)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       struct bkey_buf sk;
+       struct bkey_s_c k;
+       u32 snapshot;
+       int ret;
+
+       BUG_ON(flags & BCH_READ_NODECODE);
+
+       bch2_bkey_buf_init(&sk);
+retry:
+       bch2_trans_begin(trans);
+       iter = (struct btree_iter) { NULL };
+
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+                            BTREE_ITER_SLOTS);
+       while (1) {
+               unsigned bytes, sectors, offset_into_extent;
+               enum btree_id data_btree = BTREE_ID_extents;
+
+               /*
+                * read_extent -> io_time_reset may cause a transaction restart
+                * without returning an error; we need to check for that here:
+                */
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       break;
+
+               bch2_btree_iter_set_pos(&iter,
+                               POS(inum.inum, bvec_iter.bi_sector));
+
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               offset_into_extent = iter.pos.offset -
+                       bkey_start_offset(k.k);
+               sectors = k.k->size - offset_into_extent;
+
+               bch2_bkey_buf_reassemble(&sk, c, k);
+
+               ret = bch2_read_indirect_extent(trans, &data_btree,
+                                       &offset_into_extent, &sk);
+               if (ret)
+                       break;
+
+               k = bkey_i_to_s_c(sk.k);
+
+               /*
+                * With indirect extents, the amount of data to read is the min
+                * of the original extent and the indirect extent:
+                */
+               sectors = min(sectors, k.k->size - offset_into_extent);
+
+               bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+               swap(bvec_iter.bi_size, bytes);
+
+               if (bvec_iter.bi_size == bytes)
+                       flags |= BCH_READ_LAST_FRAGMENT;
+
+               ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos,
+                                        data_btree, k,
+                                        offset_into_extent, failed, flags);
+               if (ret)
+                       break;
+
+               if (flags & BCH_READ_LAST_FRAGMENT)
+                       break;
+
+               swap(bvec_iter.bi_size, bytes);
+               bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+               ret = btree_trans_too_many_iters(trans);
+               if (ret)
+                       break;
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+           ret == READ_RETRY ||
+           ret == READ_RETRY_AVOID)
+               goto retry;
+
+       bch2_trans_put(trans);
+       bch2_bkey_buf_exit(&sk, c);
+
+       if (ret) {
+               bch_err_inum_offset_ratelimited(c, inum.inum,
+                                               bvec_iter.bi_sector << 9,
+                                               "read error %i from btree lookup", ret);
+               rbio->bio.bi_status = BLK_STS_IOERR;
+               bch2_rbio_done(rbio);
+       }
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *c)
+{
+       if (c->promote_table.tbl)
+               rhashtable_destroy(&c->promote_table);
+       bioset_exit(&c->bio_read_split);
+       bioset_exit(&c->bio_read);
+}
+
+int bch2_fs_io_read_init(struct bch_fs *c)
+{
+       if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_read_init;
+
+       if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+       if (rhashtable_init(&c->promote_table, &bch_promote_params))
+               return -BCH_ERR_ENOMEM_promote_table_init;
+
+       return 0;
+}
diff --git a/libbcachefs/io_read.h b/libbcachefs/io_read.h
new file mode 100644 (file)
index 0000000..d9c18bb
--- /dev/null
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_READ_H
+#define _BCACHEFS_IO_READ_H
+
+#include "bkey_buf.h"
+
+struct bch_read_bio {
+       struct bch_fs           *c;
+       u64                     start_time;
+       u64                     submit_time;
+
+       /*
+        * Reads will often have to be split, and if the extent being read from
+        * was checksummed or compressed we'll also have to allocate bounce
+        * buffers and copy the data back into the original bio.
+        *
+        * If we didn't have to split, we have to save and restore the original
+        * bi_end_io - @split below indicates which:
+        */
+       union {
+       struct bch_read_bio     *parent;
+       bio_end_io_t            *end_io;
+       };
+
+       /*
+        * Saved copy of bio->bi_iter, from submission time - allows us to
+        * resubmit on IO error, and also to copy data back to the original bio
+        * when we're bouncing:
+        */
+       struct bvec_iter        bvec_iter;
+
+       unsigned                offset_into_extent;
+
+       u16                     flags;
+       union {
+       struct {
+       u16                     bounce:1,
+                               split:1,
+                               kmalloc:1,
+                               have_ioref:1,
+                               narrow_crcs:1,
+                               hole:1,
+                               retry:2,
+                               context:2;
+       };
+       u16                     _state;
+       };
+
+       struct bch_devs_list    devs_have;
+
+       struct extent_ptr_decoded pick;
+
+       /*
+        * pos we read from - different from data_pos for indirect extents:
+        */
+       u32                     subvol;
+       struct bpos             read_pos;
+
+       /*
+        * start pos of data we read (may not be pos of data we want) - for
+        * promote, narrow extents paths:
+        */
+       enum btree_id           data_btree;
+       struct bpos             data_pos;
+       struct bversion         version;
+
+       struct promote_op       *promote;
+
+       struct bch_io_opts      opts;
+
+       struct work_struct      work;
+
+       struct bio              bio;
+};
+
+#define to_rbio(_bio)          container_of((_bio), struct bch_read_bio, bio)
+
+struct bch_devs_mask;
+struct cache_promote_op;
+struct extent_ptr_decoded;
+
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
+                               struct bkey_buf *);
+
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
+                                           enum btree_id *data_btree,
+                                           unsigned *offset_into_extent,
+                                           struct bkey_buf *k)
+{
+       if (k->k->k.type != KEY_TYPE_reflink_p)
+               return 0;
+
+       *data_btree = BTREE_ID_reflink;
+       return __bch2_read_indirect_extent(trans, offset_into_extent, k);
+}
+
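+/* Flags accepted by __bch2_read() and __bch2_read_extent(): */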
+enum bch_read_flags {
+       BCH_READ_RETRY_IF_STALE         = 1 << 0,
+       BCH_READ_MAY_PROMOTE            = 1 << 1,
+       BCH_READ_USER_MAPPED            = 1 << 2,
+       BCH_READ_NODECODE               = 1 << 3,
+       BCH_READ_LAST_FRAGMENT          = 1 << 4,
+
+       /* internal: */
+       BCH_READ_MUST_BOUNCE            = 1 << 5,
+       BCH_READ_MUST_CLONE             = 1 << 6,
+       BCH_READ_IN_RETRY               = 1 << 7,
+};
+
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
+                      struct bvec_iter, struct bpos, enum btree_id,
+                      struct bkey_s_c, unsigned,
+                      struct bch_io_failures *, unsigned);
+
+static inline void bch2_read_extent(struct btree_trans *trans,
+                       struct bch_read_bio *rbio, struct bpos read_pos,
+                       enum btree_id data_btree, struct bkey_s_c k,
+                       unsigned offset_into_extent, unsigned flags)
+{
+       __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
+                          data_btree, k, offset_into_extent, NULL, flags);
+}
+
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
+                subvol_inum, struct bch_io_failures *, unsigned flags);
+
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+                            subvol_inum inum)
+{
+       struct bch_io_failures failed = { .nr = 0 };
+
+       BUG_ON(rbio->_state);
+
+       rbio->c = c;
+       rbio->start_time = local_clock();
+       rbio->subvol = inum.subvol;
+
+       __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
+                   BCH_READ_RETRY_IF_STALE|
+                   BCH_READ_MAY_PROMOTE|
+                   BCH_READ_USER_MAPPED);
+}
+
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
+                                            struct bch_io_opts opts)
+{
+       struct bch_read_bio *rbio = to_rbio(bio);
+
+       rbio->_state    = 0;
+       rbio->promote   = NULL;
+       rbio->opts      = opts;
+       return rbio;
+}
+
+void bch2_fs_io_read_exit(struct bch_fs *);
+int bch2_fs_io_read_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_READ_H */
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
new file mode 100644 (file)
index 0000000..75376f0
--- /dev/null
@@ -0,0 +1,1675 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bkey_buf.h"
+#include "bset.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "checksum.h"
+#include "clock.h"
+#include "compress.h"
+#include "debug.h"
+#include "ec.h"
+#include "error.h"
+#include "extent_update.h"
+#include "inode.h"
+#include "io_write.h"
+#include "journal.h"
+#include "keylist.h"
+#include "move.h"
+#include "nocow_locking.h"
+#include "rebalance.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+#include "trace.h"
+
+#include <linux/blkdev.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/sched/mm.h>
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
+                                      u64 now, int rw)
+{
+       u64 latency_capable =
+               ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
+       /* ideally we'd be taking into account the device's variance here: */
+       u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
+       s64 latency_over = io_latency - latency_threshold;
+
+       if (latency_threshold && latency_over > 0) {
+               /*
+                * bump up congested by approximately latency_over * 4 /
+                * latency_threshold - we don't need much accuracy here so don't
+                * bother with the divide:
+                */
+               if (atomic_read(&ca->congested) < CONGESTED_MAX)
+                       atomic_add(latency_over >>
+                                  max_t(int, ilog2(latency_threshold) - 2, 0),
+                                  &ca->congested);
+
+               ca->congested_last = now;
+       } else if (atomic_read(&ca->congested) > 0) {
+               atomic_dec(&ca->congested);
+       }
+}
+
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
+{
+       atomic64_t *latency = &ca->cur_latency[rw];
+       u64 now = local_clock();
+       u64 io_latency = time_after64(now, submit_time)
+               ? now - submit_time
+               : 0;
+       u64 old, new, v = atomic64_read(latency);
+
+       do {
+               old = v;
+
+               /*
+                * If the io latency was reasonably close to the current
+                * latency, skip doing the update and atomic operation - most of
+                * the time:
+                */
+               if (abs((int) (old - io_latency)) < (old >> 1) &&
+                   now & ~(~0U << 5))
+                       break;
+
+               new = ewma_add(old, io_latency, 5);
+       } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
+
+       bch2_congested_acct(ca, io_latency, now, rw);
+
+       __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+}
+
+#endif
+
+/* Allocate, free from mempool: */
+
+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
+{
+       struct bvec_iter_all iter;
+       struct bio_vec *bv;
+
+       bio_for_each_segment_all(bv, bio, iter)
+               if (bv->bv_page != ZERO_PAGE(0))
+                       mempool_free(bv->bv_page, &c->bio_bounce_pages);
+       bio->bi_vcnt = 0;
+}
+
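+/*
+ * Try a plain page allocation first; on failure, fall back to the bounce page
+ * mempool, taking bio_bounce_pages_lock (released by the caller when it's
+ * done allocating):
+ */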
+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
+{
+       struct page *page;
+
+       if (likely(!*using_mempool)) {
+               page = alloc_page(GFP_NOFS);
+               if (unlikely(!page)) {
+                       mutex_lock(&c->bio_bounce_pages_lock);
+                       *using_mempool = true;
+                       goto pool_alloc;
+
+               }
+       } else {
+pool_alloc:
+               page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
+       }
+
+       return page;
+}
+
+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
+                              size_t size)
+{
+       bool using_mempool = false;
+
+       while (size) {
+               struct page *page = __bio_alloc_page_pool(c, &using_mempool);
+               unsigned len = min_t(size_t, PAGE_SIZE, size);
+
+               BUG_ON(!bio_add_page(bio, page, len, 0));
+               size -= len;
+       }
+
+       if (using_mempool)
+               mutex_unlock(&c->bio_bounce_pages_lock);
+}
+
+/* Extent update path: */
+
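+/*
+ * Walk the existing extents that @new overlaps and compute how i_sectors and
+ * on-disk sector usage will change when it's inserted:
+ */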
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+                              struct btree_iter *extent_iter,
+                              struct bkey_i *new,
+                              bool *usage_increasing,
+                              s64 *i_sectors_delta,
+                              s64 *disk_sectors_delta)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c old;
+       unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+       bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
+       int ret = 0;
+
+       *usage_increasing       = false;
+       *i_sectors_delta        = 0;
+       *disk_sectors_delta     = 0;
+
+       bch2_trans_copy_iter(&iter, extent_iter);
+
+       for_each_btree_key_upto_continue_norestart(iter,
+                               new->k.p, BTREE_ITER_SLOTS, old, ret) {
+               s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+                       max(bkey_start_offset(&new->k),
+                           bkey_start_offset(old.k));
+
+               *i_sectors_delta += sectors *
+                       (bkey_extent_is_allocation(&new->k) -
+                        bkey_extent_is_allocation(old.k));
+
+               *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
+               *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
+                       ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
+                       : 0;
+
+               if (!*usage_increasing &&
+                   (new->k.p.snapshot != old.k->p.snapshot ||
+                    new_replicas > bch2_bkey_replicas(c, old) ||
+                    (!new_compressed && bch2_bkey_sectors_compressed(old))))
+                       *usage_increasing = true;
+
+               if (bkey_ge(old.k->p, new->k.p))
+                       break;
+       }
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
+                                                   struct btree_iter *extent_iter,
+                                                   u64 new_i_size,
+                                                   s64 i_sectors_delta)
+{
+       struct btree_iter iter;
+       struct bkey_i *k;
+       struct bkey_i_inode_v3 *inode;
+       /*
+        * Crazy performance optimization:
+        * Every extent update needs to also update the inode: the inode trigger
+        * will set bi->journal_seq to the journal sequence number of this
+        * transaction - for fsync.
+        *
+        * But if that's the only reason we're updating the inode (we're not
+        * updating bi_size or bi_sectors), then we don't need the inode update
+        * to be journalled - if we crash, the bi_journal_seq update will be
+        * lost, but that's fine.
+        */
+       unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
+       int ret;
+
+       k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+                             SPOS(0,
+                                  extent_iter->pos.inode,
+                                  extent_iter->snapshot),
+                             BTREE_ITER_CACHED);
+       ret = PTR_ERR_OR_ZERO(k);
+       if (unlikely(ret))
+               return ret;
+
+       if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
+               k = bch2_inode_to_v3(trans, k);
+               ret = PTR_ERR_OR_ZERO(k);
+               if (unlikely(ret))
+                       goto err;
+       }
+
+       inode = bkey_i_to_inode_v3(k);
+
+       if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_i_size_dirty) &&
+           new_i_size > le64_to_cpu(inode->v.bi_size)) {
+               inode->v.bi_size = cpu_to_le64(new_i_size);
+               inode_update_flags = 0;
+       }
+
+       if (i_sectors_delta) {
+               le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+               inode_update_flags = 0;
+       }
+
+       if (inode->k.p.snapshot != iter.snapshot) {
+               inode->k.p.snapshot = iter.snapshot;
+               inode_update_flags = 0;
+       }
+
+       ret = bch2_trans_update(trans, &iter, &inode->k_i,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                               inode_update_flags);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_extent_update(struct btree_trans *trans,
+                      subvol_inum inum,
+                      struct btree_iter *iter,
+                      struct bkey_i *k,
+                      struct disk_reservation *disk_res,
+                      u64 new_i_size,
+                      s64 *i_sectors_delta_total,
+                      bool check_enospc)
+{
+       struct bpos next_pos;
+       bool usage_increasing;
+       s64 i_sectors_delta = 0, disk_sectors_delta = 0;
+       int ret;
+
+       /*
+        * This traverses the iterator without changing iter->path->pos to
+        * search_key() (which is pos + 1 for extents): we want there to be a
+        * path already traversed at iter->pos, because
+        * bch2_trans_extent_update() will use it to attempt extent merging.
+        */
+       ret = __bch2_btree_iter_traverse(iter);
+       if (ret)
+               return ret;
+
+       ret = bch2_extent_trim_atomic(trans, iter, k);
+       if (ret)
+               return ret;
+
+       next_pos = k->k.p;
+
+       ret = bch2_sum_sector_overwrites(trans, iter, k,
+                       &usage_increasing,
+                       &i_sectors_delta,
+                       &disk_sectors_delta);
+       if (ret)
+               return ret;
+
+       if (disk_res &&
+           disk_sectors_delta > (s64) disk_res->sectors) {
+               ret = bch2_disk_reservation_add(trans->c, disk_res,
+                                       disk_sectors_delta - disk_res->sectors,
+                                       !check_enospc || !usage_increasing
+                                       ? BCH_DISK_RESERVATION_NOFAIL : 0);
+               if (ret)
+                       return ret;
+       }
+
+       /*
+        * Note:
+        * We always have to do an inode update - even when i_size/i_sectors
+        * aren't changing - for fsync to work properly; fsync relies on
+        * inode->bi_journal_seq which is updated by the trigger code:
+        */
+       ret =   bch2_extent_update_i_size_sectors(trans, iter,
+                                                 min(k->k.p.offset << 9, new_i_size),
+                                                 i_sectors_delta) ?:
+               bch2_trans_update(trans, iter, k, 0) ?:
+               bch2_trans_commit(trans, disk_res, NULL,
+                               BCH_TRANS_COMMIT_no_check_rw|
+                               BCH_TRANS_COMMIT_no_enospc);
+       if (unlikely(ret))
+               return ret;
+
+       if (i_sectors_delta_total)
+               *i_sectors_delta_total += i_sectors_delta;
+       bch2_btree_iter_set_pos(iter, next_pos);
+       return 0;
+}
+
+static int bch2_write_index_default(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct bkey_buf sk;
+       struct keylist *keys = &op->insert_keys;
+       struct bkey_i *k = bch2_keylist_front(keys);
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       subvol_inum inum = {
+               .subvol = op->subvol,
+               .inum   = k->k.p.inode,
+       };
+       int ret;
+
+       BUG_ON(!inum.subvol);
+
+       bch2_bkey_buf_init(&sk);
+
+       do {
+               bch2_trans_begin(trans);
+
+               k = bch2_keylist_front(keys);
+               bch2_bkey_buf_copy(&sk, c, k);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol,
+                                                 &sk.k->k.p.snapshot);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       continue;
+               if (ret)
+                       break;
+
+               bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                                    bkey_start_pos(&sk.k->k),
+                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+               ret =   bch2_bkey_set_needs_rebalance(c, sk.k,
+                                       op->opts.background_target,
+                                       op->opts.background_compression) ?:
+                       bch2_extent_update(trans, inum, &iter, sk.k,
+                                       &op->res,
+                                       op->new_i_size, &op->i_sectors_delta,
+                                       op->flags & BCH_WRITE_CHECK_ENOSPC);
+               bch2_trans_iter_exit(trans, &iter);
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       continue;
+               if (ret)
+                       break;
+
+               if (bkey_ge(iter.pos, k->k.p))
+                       bch2_keylist_pop_front(&op->insert_keys);
+               else
+                       bch2_cut_front(iter.pos, k);
+       } while (!bch2_keylist_empty(keys));
+
+       bch2_trans_put(trans);
+       bch2_bkey_buf_exit(&sk, c);
+
+       return ret;
+}
+
+/* Writes */
+
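+/*
+ * Submit one bio per pointer in @k: the last pointer reuses @wbio, earlier
+ * pointers get clones chained to it:
+ */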
+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+                              enum bch_data_type type,
+                              const struct bkey_i *k,
+                              bool nocow)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+       const struct bch_extent_ptr *ptr;
+       struct bch_write_bio *n;
+       struct bch_dev *ca;
+
+       BUG_ON(c->opts.nochanges);
+
+       bkey_for_each_ptr(ptrs, ptr) {
+               BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+                      !c->devs[ptr->dev]);
+
+               ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (to_entry(ptr + 1) < ptrs.end) {
+                       n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
+                                               GFP_NOFS, &ca->replica_set));
+
+                       n->bio.bi_end_io        = wbio->bio.bi_end_io;
+                       n->bio.bi_private       = wbio->bio.bi_private;
+                       n->parent               = wbio;
+                       n->split                = true;
+                       n->bounce               = false;
+                       n->put_bio              = true;
+                       n->bio.bi_opf           = wbio->bio.bi_opf;
+                       bio_inc_remaining(&wbio->bio);
+               } else {
+                       n = wbio;
+                       n->split                = false;
+               }
+
+               n->c                    = c;
+               n->dev                  = ptr->dev;
+               n->have_ioref           = nocow || bch2_dev_get_ioref(ca,
+                                       type == BCH_DATA_btree ? READ : WRITE);
+               n->nocow                = nocow;
+               n->submit_time          = local_clock();
+               n->inode_offset         = bkey_start_offset(&k->k);
+               n->bio.bi_iter.bi_sector = ptr->offset;
+
+               if (likely(n->have_ioref)) {
+                       this_cpu_add(ca->io_done->sectors[WRITE][type],
+                                    bio_sectors(&n->bio));
+
+                       bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+                       if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
+                               bio_endio(&n->bio);
+                               continue;
+                       }
+
+                       submit_bio(&n->bio);
+               } else {
+                       n->bio.bi_status        = BLK_STS_REMOVED;
+                       bio_endio(&n->bio);
+               }
+       }
+}
+
+static void __bch2_write(struct bch_write_op *);
+
+static void bch2_write_done(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct bch_fs *c = op->c;
+
+       EBUG_ON(op->open_buckets.nr);
+
+       bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
+       bch2_disk_reservation_put(c, &op->res);
+
+       if (!(op->flags & BCH_WRITE_MOVE))
+               bch2_write_ref_put(c, BCH_WRITE_REF_write);
+       bch2_keylist_free(&op->insert_keys, op->inline_keys);
+
+       EBUG_ON(cl->parent);
+       closure_debug_destroy(cl);
+       if (op->end_io)
+               op->end_io(op);
+}
+
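+/*
+ * Drop pointers to devices that saw write errors (op->failed); if that leaves
+ * an extent with no pointers, the write has failed and we return -EIO.
+ */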
+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
+{
+       struct keylist *keys = &op->insert_keys;
+       struct bch_extent_ptr *ptr;
+       struct bkey_i *src, *dst = keys->keys, *n;
+
+       for (src = keys->keys; src != keys->top; src = n) {
+               n = bkey_next(src);
+
+               if (bkey_extent_is_direct_data(&src->k)) {
+                       bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+                                           test_bit(ptr->dev, op->failed.d));
+
+                       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
+                               return -EIO;
+               }
+
+               if (dst != src)
+                       memmove_u64s_down(dst, src, src->k.u64s);
+               dst = bkey_next(dst);
+       }
+
+       keys->top = dst;
+       return 0;
+}
+
+/**
+ * __bch2_write_index - after a write, update index to point to new data
+ * @op:                bch_write_op to process
+ */
+static void __bch2_write_index(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct keylist *keys = &op->insert_keys;
+       unsigned dev;
+       int ret = 0;
+
+       if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+               ret = bch2_write_drop_io_error_ptrs(op);
+               if (ret)
+                       goto err;
+       }
+
+       if (!bch2_keylist_empty(keys)) {
+               u64 sectors_start = keylist_sectors(keys);
+
+               ret = !(op->flags & BCH_WRITE_MOVE)
+                       ? bch2_write_index_default(op)
+                       : bch2_data_update_index_update(op);
+
+               BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
+               BUG_ON(keylist_sectors(keys) && !ret);
+
+               op->written += sectors_start - keylist_sectors(keys);
+
+               if (ret && !bch2_err_matches(ret, EROFS)) {
+                       struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+                       bch_err_inum_offset_ratelimited(c,
+                               insert->k.p.inode, insert->k.p.offset << 9,
+                               "write error while doing btree update: %s",
+                               bch2_err_str(ret));
+               }
+
+               if (ret)
+                       goto err;
+       }
+out:
+       /* If a bucket wasn't written, we can't erasure code it: */
+       for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
+               bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+
+       bch2_open_buckets_put(c, &op->open_buckets);
+       return;
+err:
+       keys->top = keys->keys;
+       op->error = ret;
+       op->flags |= BCH_WRITE_DONE;
+       goto out;
+}
+
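+/*
+ * Write point state tracking: accumulate, in wp->time[], how long the write
+ * point has spent in each state (running, waiting on IO, waiting for the
+ * index update worker, stopped).
+ */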
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+       if (state != wp->state) {
+               u64 now = ktime_get_ns();
+
+               if (wp->last_state_change &&
+                   time_after64(now, wp->last_state_change))
+                       wp->time[wp->state] += now - wp->last_state_change;
+               wp->state = state;
+               wp->last_state_change = now;
+       }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+       enum write_point_state state;
+
+       state = running                  ? WRITE_POINT_running :
+               !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+                                        : WRITE_POINT_stopped;
+
+       __wp_update_state(wp, state);
+}
+
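+/*
+ * Closure callback for asynchronous writes: queue the op on its write point's
+ * list and punt the index update to a workqueue, which runs
+ * bch2_write_point_do_index_updates().
+ */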
+static void bch2_write_index(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct write_point *wp = op->wp;
+       struct workqueue_struct *wq = index_update_wq(op);
+       unsigned long flags;
+
+       if ((op->flags & BCH_WRITE_DONE) &&
+           (op->flags & BCH_WRITE_MOVE))
+               bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
+
+       spin_lock_irqsave(&wp->writes_lock, flags);
+       if (wp->state == WRITE_POINT_waiting_io)
+               __wp_update_state(wp, WRITE_POINT_waiting_work);
+       list_add_tail(&op->wp_list, &wp->writes);
+       spin_unlock_irqrestore(&wp->writes_lock, flags);
+
+       queue_work(wq, &wp->index_update_work);
+}
+
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+       op->wp = wp;
+
+       if (wp->state == WRITE_POINT_stopped) {
+               spin_lock_irq(&wp->writes_lock);
+               __wp_update_state(wp, WRITE_POINT_waiting_io);
+               spin_unlock_irq(&wp->writes_lock);
+       }
+}
+
+void bch2_write_point_do_index_updates(struct work_struct *work)
+{
+       struct write_point *wp =
+               container_of(work, struct write_point, index_update_work);
+       struct bch_write_op *op;
+
+       while (1) {
+               spin_lock_irq(&wp->writes_lock);
+               op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
+               if (op)
+                       list_del(&op->wp_list);
+               wp_update_state(wp, op != NULL);
+               spin_unlock_irq(&wp->writes_lock);
+
+               if (!op)
+                       break;
+
+               op->flags |= BCH_WRITE_IN_WORKER;
+
+               __bch2_write_index(op);
+
+               if (!(op->flags & BCH_WRITE_DONE))
+                       __bch2_write(op);
+               else
+                       bch2_write_done(&op->cl);
+       }
+}
+
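+/*
+ * Per-replica write completion: record IO errors against the device and the
+ * op, release the device ioref and any bounce pages, then complete the parent
+ * bio (for split bios) or drop the op's closure ref.
+ */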
+static void bch2_write_endio(struct bio *bio)
+{
+       struct closure *cl              = bio->bi_private;
+       struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
+       struct bch_write_bio *wbio      = to_wbio(bio);
+       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
+       struct bch_fs *c                = wbio->c;
+       struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
+
+       if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+                                   op->pos.inode,
+                                   wbio->inode_offset << 9,
+                                   "data write error: %s",
+                                   bch2_blk_status_to_str(bio->bi_status))) {
+               set_bit(wbio->dev, op->failed.d);
+               op->flags |= BCH_WRITE_IO_ERROR;
+       }
+
+       if (wbio->nocow)
+               set_bit(wbio->dev, op->devs_need_flush->d);
+
+       if (wbio->have_ioref) {
+               bch2_latency_acct(ca, wbio->submit_time, WRITE);
+               percpu_ref_put(&ca->io_ref);
+       }
+
+       if (wbio->bounce)
+               bch2_bio_free_pages_pool(c, bio);
+
+       if (wbio->put_bio)
+               bio_put(bio);
+
+       if (parent)
+               bio_endio(&parent->bio);
+       else
+               closure_put(cl);
+}
+
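+/*
+ * Add a new extent key to op->insert_keys for the sectors just written,
+ * carrying the checksum/compression info in @crc and pointers to the space
+ * allocated at @wp.
+ */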
+static void init_append_extent(struct bch_write_op *op,
+                              struct write_point *wp,
+                              struct bversion version,
+                              struct bch_extent_crc_unpacked crc)
+{
+       struct bkey_i_extent *e;
+
+       op->pos.offset += crc.uncompressed_size;
+
+       e = bkey_extent_init(op->insert_keys.top);
+       e->k.p          = op->pos;
+       e->k.size       = crc.uncompressed_size;
+       e->k.version    = version;
+
+       if (crc.csum_type ||
+           crc.compression_type ||
+           crc.nonce)
+               bch2_extent_crc_append(&e->k_i, crc);
+
+       bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
+                                      op->flags & BCH_WRITE_CACHED);
+
+       bch2_keylist_push(&op->insert_keys);
+}
+
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
+                                       struct write_point *wp,
+                                       struct bio *src,
+                                       bool *page_alloc_failed,
+                                       void *buf)
+{
+       struct bch_write_bio *wbio;
+       struct bio *bio;
+       unsigned output_available =
+               min(wp->sectors_free << 9, src->bi_iter.bi_size);
+       unsigned pages = DIV_ROUND_UP(output_available +
+                                     (buf
+                                      ? ((unsigned long) buf & (PAGE_SIZE - 1))
+                                      : 0), PAGE_SIZE);
+
+       pages = min(pages, BIO_MAX_VECS);
+
+       bio = bio_alloc_bioset(NULL, pages, 0,
+                              GFP_NOFS, &c->bio_write);
+       wbio                    = wbio_init(bio);
+       wbio->put_bio           = true;
+       /* copy WRITE_SYNC flag */
+       wbio->bio.bi_opf        = src->bi_opf;
+
+       if (buf) {
+               bch2_bio_map(bio, buf, output_available);
+               return bio;
+       }
+
+       wbio->bounce            = true;
+
+       /*
+        * We can't use the mempool for more than c->opts.encoded_extent_max
+        * worth of pages, but we'd like to allocate more if we can:
+        */
+       bch2_bio_alloc_pages_pool(c, bio,
+                                 min_t(unsigned, output_available,
+                                       c->opts.encoded_extent_max));
+
+       if (bio->bi_iter.bi_size < output_available)
+               *page_alloc_failed =
+                       bch2_bio_alloc_pages(bio,
+                                            output_available -
+                                            bio->bi_iter.bi_size,
+                                            GFP_NOFS) != 0;
+
+       return bio;
+}
+
+static int bch2_write_rechecksum(struct bch_fs *c,
+                                struct bch_write_op *op,
+                                unsigned new_csum_type)
+{
+       struct bio *bio = &op->wbio.bio;
+       struct bch_extent_crc_unpacked new_crc;
+       int ret;
+
+       /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
+
+       if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
+           bch2_csum_type_is_encryption(new_csum_type))
+               new_csum_type = op->crc.csum_type;
+
+       ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
+                                 NULL, &new_crc,
+                                 op->crc.offset, op->crc.live_size,
+                                 new_csum_type);
+       if (ret)
+               return ret;
+
+       bio_advance(bio, op->crc.offset << 9);
+       bio->bi_iter.bi_size = op->crc.live_size << 9;
+       op->crc = new_crc;
+       return 0;
+}
+
+static int bch2_write_decrypt(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct nonce nonce = extent_nonce(op->version, op->crc);
+       struct bch_csum csum;
+       int ret;
+
+       if (!bch2_csum_type_is_encryption(op->crc.csum_type))
+               return 0;
+
+       /*
+        * If we need to decrypt data in the write path, we'll no longer be able
+        * to verify the existing checksum (poly1305 MAC, in this case) after
+        * it's decrypted - this is the last point we'll be able to reverify the
+        * checksum:
+        */
+       csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+       if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+               return -EIO;
+
+       ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
+       op->crc.csum_type = 0;
+       op->crc.csum = (struct bch_csum) { 0, 0 };
+       return ret;
+}
+
+static enum prep_encoded_ret {
+       PREP_ENCODED_OK,
+       PREP_ENCODED_ERR,
+       PREP_ENCODED_CHECKSUM_ERR,
+       PREP_ENCODED_DO_WRITE,
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
+{
+       struct bch_fs *c = op->c;
+       struct bio *bio = &op->wbio.bio;
+
+       if (!(op->flags & BCH_WRITE_DATA_ENCODED))
+               return PREP_ENCODED_OK;
+
+       BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
+
+       /* Can we just write the entire extent as is? */
+       if (op->crc.uncompressed_size == op->crc.live_size &&
+           op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
+           op->crc.compressed_size <= wp->sectors_free &&
+           (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
+            op->incompressible)) {
+               if (!crc_is_compressed(op->crc) &&
+                   op->csum_type != op->crc.csum_type &&
+                   bch2_write_rechecksum(c, op, op->csum_type) &&
+                   !c->opts.no_data_io)
+                       return PREP_ENCODED_CHECKSUM_ERR;
+
+               return PREP_ENCODED_DO_WRITE;
+       }
+
+       /*
+        * If the data is compressed and we couldn't write the entire extent as
+        * is, we have to decompress it:
+        */
+       if (crc_is_compressed(op->crc)) {
+               struct bch_csum csum;
+
+               if (bch2_write_decrypt(op))
+                       return PREP_ENCODED_CHECKSUM_ERR;
+
+               /* Last point we can still verify checksum: */
+               csum = bch2_checksum_bio(c, op->crc.csum_type,
+                                        extent_nonce(op->version, op->crc),
+                                        bio);
+               if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
+                       return PREP_ENCODED_CHECKSUM_ERR;
+
+               if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
+                       return PREP_ENCODED_ERR;
+       }
+
+       /*
+        * No longer have compressed data after this point - data might be
+        * encrypted:
+        */
+
+       /*
+        * If the data is checksummed and we're only writing a subset,
+        * rechecksum and adjust bio to point to currently live data:
+        */
+       if ((op->crc.live_size != op->crc.uncompressed_size ||
+            op->crc.csum_type != op->csum_type) &&
+           bch2_write_rechecksum(c, op, op->csum_type) &&
+           !c->opts.no_data_io)
+               return PREP_ENCODED_CHECKSUM_ERR;
+
+       /*
+        * If we want to compress the data, it has to be decrypted:
+        */
+       if ((op->compression_opt ||
+            bch2_csum_type_is_encryption(op->crc.csum_type) !=
+            bch2_csum_type_is_encryption(op->csum_type)) &&
+           bch2_write_decrypt(op))
+               return PREP_ENCODED_CHECKSUM_ERR;
+
+       return PREP_ENCODED_OK;
+}
+
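+/*
+ * Prepare one batch of data from op->wbio.bio - bouncing, compressing,
+ * checksumming and encrypting as needed - and generate extent keys sized to
+ * the space available at @wp. Returns a positive value if there's more input
+ * left to write, 0 if the write is done, or a negative error code.
+ */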
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
+                            struct bio **_dst)
+{
+       struct bch_fs *c = op->c;
+       struct bio *src = &op->wbio.bio, *dst = src;
+       struct bvec_iter saved_iter;
+       void *ec_buf;
+       unsigned total_output = 0, total_input = 0;
+       bool bounce = false;
+       bool page_alloc_failed = false;
+       int ret, more = 0;
+
+       BUG_ON(!bio_sectors(src));
+
+       ec_buf = bch2_writepoint_ec_buf(c, wp);
+
+       switch (bch2_write_prep_encoded_data(op, wp)) {
+       case PREP_ENCODED_OK:
+               break;
+       case PREP_ENCODED_ERR:
+               ret = -EIO;
+               goto err;
+       case PREP_ENCODED_CHECKSUM_ERR:
+               goto csum_err;
+       case PREP_ENCODED_DO_WRITE:
+               /* XXX look for bug here */
+               if (ec_buf) {
+                       dst = bch2_write_bio_alloc(c, wp, src,
+                                                  &page_alloc_failed,
+                                                  ec_buf);
+                       bio_copy_data(dst, src);
+                       bounce = true;
+               }
+               init_append_extent(op, wp, op->version, op->crc);
+               goto do_write;
+       }
+
+       if (ec_buf ||
+           op->compression_opt ||
+           (op->csum_type &&
+            !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
+           (bch2_csum_type_is_encryption(op->csum_type) &&
+            !(op->flags & BCH_WRITE_PAGES_OWNED))) {
+               dst = bch2_write_bio_alloc(c, wp, src,
+                                          &page_alloc_failed,
+                                          ec_buf);
+               bounce = true;
+       }
+
+       saved_iter = dst->bi_iter;
+
+       do {
+               struct bch_extent_crc_unpacked crc = { 0 };
+               struct bversion version = op->version;
+               size_t dst_len = 0, src_len = 0;
+
+               if (page_alloc_failed &&
+                   dst->bi_iter.bi_size  < (wp->sectors_free << 9) &&
+                   dst->bi_iter.bi_size < c->opts.encoded_extent_max)
+                       break;
+
+               BUG_ON(op->compression_opt &&
+                      (op->flags & BCH_WRITE_DATA_ENCODED) &&
+                      bch2_csum_type_is_encryption(op->crc.csum_type));
+               BUG_ON(op->compression_opt && !bounce);
+
+               crc.compression_type = op->incompressible
+                       ? BCH_COMPRESSION_TYPE_incompressible
+                       : op->compression_opt
+                       ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
+                                           op->compression_opt)
+                       : 0;
+               if (!crc_is_compressed(crc)) {
+                       dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+                       dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
+
+                       if (op->csum_type)
+                               dst_len = min_t(unsigned, dst_len,
+                                               c->opts.encoded_extent_max);
+
+                       if (bounce) {
+                               swap(dst->bi_iter.bi_size, dst_len);
+                               bio_copy_data(dst, src);
+                               swap(dst->bi_iter.bi_size, dst_len);
+                       }
+
+                       src_len = dst_len;
+               }
+
+               BUG_ON(!src_len || !dst_len);
+
+               if (bch2_csum_type_is_encryption(op->csum_type)) {
+                       if (bversion_zero(version)) {
+                               version.lo = atomic64_inc_return(&c->key_version);
+                       } else {
+                               crc.nonce = op->nonce;
+                               op->nonce += src_len >> 9;
+                       }
+               }
+
+               if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+                   !crc_is_compressed(crc) &&
+                   bch2_csum_type_is_encryption(op->crc.csum_type) ==
+                   bch2_csum_type_is_encryption(op->csum_type)) {
+                       u8 compression_type = crc.compression_type;
+                       u16 nonce = crc.nonce;
+                       /*
+                        * Note: when we're using rechecksum(), we need to be
+                        * checksumming @src because it has all the data our
+                        * existing checksum covers - if we bounced (because we
+                        * were trying to compress), @dst will only have the
+                        * part of the data the new checksum will cover.
+                        *
+                        * But normally we want to be checksumming post bounce,
+                        * because part of the reason for bouncing is so the
+                        * data can't be modified (by userspace) while it's in
+                        * flight.
+                        */
+                       if (bch2_rechecksum_bio(c, src, version, op->crc,
+                                       &crc, &op->crc,
+                                       src_len >> 9,
+                                       bio_sectors(src) - (src_len >> 9),
+                                       op->csum_type))
+                               goto csum_err;
+                       /*
+                        * bch2_rechecksum_bio() sets compression_type on crc from
+                        * op->crc; this isn't always correct, as sometimes we're
+                        * changing an extent from uncompressed to incompressible.
+                        */
+                       crc.compression_type = compression_type;
+                       crc.nonce = nonce;
+               } else {
+                       if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
+                           bch2_rechecksum_bio(c, src, version, op->crc,
+                                       NULL, &op->crc,
+                                       src_len >> 9,
+                                       bio_sectors(src) - (src_len >> 9),
+                                       op->crc.csum_type))
+                               goto csum_err;
+
+                       crc.compressed_size     = dst_len >> 9;
+                       crc.uncompressed_size   = src_len >> 9;
+                       crc.live_size           = src_len >> 9;
+
+                       swap(dst->bi_iter.bi_size, dst_len);
+                       ret = bch2_encrypt_bio(c, op->csum_type,
+                                              extent_nonce(version, crc), dst);
+                       if (ret)
+                               goto err;
+
+                       crc.csum = bch2_checksum_bio(c, op->csum_type,
+                                        extent_nonce(version, crc), dst);
+                       crc.csum_type = op->csum_type;
+                       swap(dst->bi_iter.bi_size, dst_len);
+               }
+
+               init_append_extent(op, wp, version, crc);
+
+               if (dst != src)
+                       bio_advance(dst, dst_len);
+               bio_advance(src, src_len);
+               total_output    += dst_len;
+               total_input     += src_len;
+       } while (dst->bi_iter.bi_size &&
+                src->bi_iter.bi_size &&
+                wp->sectors_free &&
+                !bch2_keylist_realloc(&op->insert_keys,
+                                     op->inline_keys,
+                                     ARRAY_SIZE(op->inline_keys),
+                                     BKEY_EXTENT_U64s_MAX));
+
+       more = src->bi_iter.bi_size != 0;
+
+       dst->bi_iter = saved_iter;
+
+       if (dst == src && more) {
+               BUG_ON(total_output != total_input);
+
+               dst = bio_split(src, total_input >> 9,
+                               GFP_NOFS, &c->bio_write);
+               wbio_init(dst)->put_bio = true;
+               /* copy WRITE_SYNC flag */
+               dst->bi_opf             = src->bi_opf;
+       }
+
+       dst->bi_iter.bi_size = total_output;
+do_write:
+       *_dst = dst;
+       return more;
+csum_err:
+       bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+       ret = -EIO;
+err:
+       if (to_wbio(dst)->bounce)
+               bch2_bio_free_pages_pool(c, dst);
+       if (to_wbio(dst)->put_bio)
+               bio_put(dst);
+
+       return ret;
+}
+
+static bool bch2_extent_is_writeable(struct bch_write_op *op,
+                                    struct bkey_s_c k)
+{
+       struct bch_fs *c = op->c;
+       struct bkey_s_c_extent e;
+       struct extent_ptr_decoded p;
+       const union bch_extent_entry *entry;
+       unsigned replicas = 0;
+
+       if (k.k->type != KEY_TYPE_extent)
+               return false;
+
+       e = bkey_s_c_to_extent(k);
+       extent_for_each_ptr_decode(e, p, entry) {
+               if (crc_is_encoded(p.crc) || p.has_ec)
+                       return false;
+
+               replicas += bch2_extent_ptr_durability(c, &p);
+       }
+
+       return replicas >= op->opts.data_replicas;
+}
+
+static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       const struct bch_extent_ptr *ptr;
+       struct bkey_i *k;
+
+       for_each_keylist_key(&op->insert_keys, k) {
+               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
+
+               bkey_for_each_ptr(ptrs, ptr)
+                       bch2_bucket_nocow_unlock(&c->nocow_locks,
+                                              PTR_BUCKET_POS(c, ptr),
+                                              BUCKET_NOCOW_LOCK_UPDATE);
+       }
+}
+
+static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
+                                                 struct btree_iter *iter,
+                                                 struct bkey_i *orig,
+                                                 struct bkey_s_c k,
+                                                 u64 new_i_size)
+{
+       struct bkey_i *new;
+       struct bkey_ptrs ptrs;
+       struct bch_extent_ptr *ptr;
+       int ret;
+
+       if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
+               /* trace this */
+               return 0;
+       }
+
+       new = bch2_bkey_make_mut_noupdate(trans, k);
+       ret = PTR_ERR_OR_ZERO(new);
+       if (ret)
+               return ret;
+
+       bch2_cut_front(bkey_start_pos(&orig->k), new);
+       bch2_cut_back(orig->k.p, new);
+
+       ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+       bkey_for_each_ptr(ptrs, ptr)
+               ptr->unwritten = 0;
+
+       /*
+        * Note that we're not calling bch2_subvol_get_snapshot() in this path -
+        * that was done when we kicked off the write, and here it's important
+        * that we update the extent that we wrote to - even if a snapshot has
+        * since been created. The write is still outstanding, so we're ok
+        * w.r.t. snapshot atomicity:
+        */
+       return  bch2_extent_update_i_size_sectors(trans, iter,
+                                       min(new->k.p.offset << 9, new_i_size), 0) ?:
+               bch2_trans_update(trans, iter, new,
+                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       struct bkey_i *orig;
+       struct bkey_s_c k;
+       int ret;
+
+       for_each_keylist_key(&op->insert_keys, orig) {
+               ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+                                    bkey_start_pos(&orig->k), orig->k.p,
+                                    BTREE_ITER_INTENT, k,
+                                    NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+                       bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
+               }));
+
+               if (ret && !bch2_err_matches(ret, EROFS)) {
+                       struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
+
+                       bch_err_inum_offset_ratelimited(c,
+                               insert->k.p.inode, insert->k.p.offset << 9,
+                               "write error while doing btree update: %s",
+                               bch2_err_str(ret));
+               }
+
+               if (ret) {
+                       op->error = ret;
+                       break;
+               }
+       }
+
+       bch2_trans_put(trans);
+}
+
+static void __bch2_nocow_write_done(struct bch_write_op *op)
+{
+       bch2_nocow_write_unlock(op);
+
+       if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
+               op->error = -EIO;
+       } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
+               bch2_nocow_write_convert_unwritten(op);
+}
+
+static void bch2_nocow_write_done(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+
+       __bch2_nocow_write_done(op);
+       bch2_write_done(cl);
+}
+
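+/*
+ * Nocow write path: instead of allocating new space, write in place over the
+ * extent's existing pointers. Nocow bucket locks and device iorefs keep the
+ * buckets from going away under us; unwritten extents are converted once the
+ * IO completes. If the extent isn't writeable in place (or we can't get an
+ * ioref), we fall back to the normal COW write path.
+ */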
+static void bch2_nocow_write(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct btree_trans *trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_ptrs_c ptrs;
+       const struct bch_extent_ptr *ptr;
+       struct {
+               struct bpos     b;
+               unsigned        gen;
+               struct nocow_lock_bucket *l;
+       } buckets[BCH_REPLICAS_MAX];
+       unsigned nr_buckets = 0;
+       u32 snapshot;
+       int ret, i;
+
+       if (op->flags & BCH_WRITE_MOVE)
+               return;
+
+       trans = bch2_trans_get(c);
+retry:
+       bch2_trans_begin(trans);
+
+       ret = bch2_subvolume_get_snapshot(trans, op->subvol, &snapshot);
+       if (unlikely(ret))
+               goto err;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            SPOS(op->pos.inode, op->pos.offset, snapshot),
+                            BTREE_ITER_SLOTS);
+       while (1) {
+               struct bio *bio = &op->wbio.bio;
+
+               nr_buckets = 0;
+
+               k = bch2_btree_iter_peek_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               /* fall back to normal cow write path? */
+               if (unlikely(k.k->p.snapshot != snapshot ||
+                            !bch2_extent_is_writeable(op, k)))
+                       break;
+
+               if (bch2_keylist_realloc(&op->insert_keys,
+                                       op->inline_keys,
+                                       ARRAY_SIZE(op->inline_keys),
+                                       k.k->u64s))
+                       break;
+
+               /* Get iorefs before dropping btree locks: */
+               ptrs = bch2_bkey_ptrs_c(k);
+               bkey_for_each_ptr(ptrs, ptr) {
+                       buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
+                       buckets[nr_buckets].gen = ptr->gen;
+                       buckets[nr_buckets].l =
+                               bucket_nocow_lock(&c->nocow_locks,
+                                                 bucket_to_u64(buckets[nr_buckets].b));
+
+                       prefetch(buckets[nr_buckets].l);
+
+                       if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
+                               goto err_get_ioref;
+
+                       nr_buckets++;
+
+                       if (ptr->unwritten)
+                               op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+               }
+
+               /* Unlock before taking nocow locks, doing IO: */
+               bkey_reassemble(op->insert_keys.top, k);
+               bch2_trans_unlock(trans);
+
+               bch2_cut_front(op->pos, op->insert_keys.top);
+               if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+                       bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+
+               for (i = 0; i < nr_buckets; i++) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
+                       struct nocow_lock_bucket *l = buckets[i].l;
+                       bool stale;
+
+                       __bch2_bucket_nocow_lock(&c->nocow_locks, l,
+                                                bucket_to_u64(buckets[i].b),
+                                                BUCKET_NOCOW_LOCK_UPDATE);
+
+                       rcu_read_lock();
+                       stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
+                       rcu_read_unlock();
+
+                       if (unlikely(stale))
+                               goto err_bucket_stale;
+               }
+
+               bio = &op->wbio.bio;
+               if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
+                       bio = bio_split(bio, k.k->p.offset - op->pos.offset,
+                                       GFP_KERNEL, &c->bio_write);
+                       wbio_init(bio)->put_bio = true;
+                       bio->bi_opf = op->wbio.bio.bi_opf;
+               } else {
+                       op->flags |= BCH_WRITE_DONE;
+               }
+
+               op->pos.offset += bio_sectors(bio);
+               op->written += bio_sectors(bio);
+
+               bio->bi_end_io  = bch2_write_endio;
+               bio->bi_private = &op->cl;
+               bio->bi_opf |= REQ_OP_WRITE;
+               closure_get(&op->cl);
+               bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+                                         op->insert_keys.top, true);
+
+               bch2_keylist_push(&op->insert_keys);
+               if (op->flags & BCH_WRITE_DONE)
+                       break;
+               bch2_btree_iter_advance(&iter);
+       }
+out:
+       bch2_trans_iter_exit(trans, &iter);
+err:
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               goto retry;
+
+       if (ret) {
+               bch_err_inum_offset_ratelimited(c,
+                               op->pos.inode,
+                               op->pos.offset << 9,
+                               "%s: btree lookup error %s",
+                               __func__, bch2_err_str(ret));
+               op->error = ret;
+               op->flags |= BCH_WRITE_DONE;
+       }
+
+       bch2_trans_put(trans);
+
+       /* fall back to the COW write path? */
+       if (!(op->flags & BCH_WRITE_DONE)) {
+               closure_sync(&op->cl);
+               __bch2_nocow_write_done(op);
+               op->insert_keys.top = op->insert_keys.keys;
+       } else if (op->flags & BCH_WRITE_SYNC) {
+               closure_sync(&op->cl);
+               bch2_nocow_write_done(&op->cl);
+       } else {
+               /*
+                * XXX
+                * needs to run out of process context because ei_quota_lock is
+                * a mutex
+                */
+               continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
+       }
+       return;
+err_get_ioref:
+       for (i = 0; i < nr_buckets; i++)
+               percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
+
+       /* Fall back to COW path: */
+       goto out;
+err_bucket_stale:
+       while (i >= 0) {
+               bch2_bucket_nocow_unlock(&c->nocow_locks,
+                                        buckets[i].b,
+                                        BUCKET_NOCOW_LOCK_UPDATE);
+               --i;
+       }
+       for (i = 0; i < nr_buckets; i++)
+               percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
+
+       /* We can retry this: */
+       ret = -BCH_ERR_transaction_restart;
+       goto out;
+}
+
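+/*
+ * Core write path: try the nocow path if enabled, then repeatedly allocate
+ * space at a write point, carve the data into extents with
+ * bch2_write_extent(), and submit the replicas, until the whole bio has been
+ * written or we have to block on the allocator or on index updates.
+ */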
+static void __bch2_write(struct bch_write_op *op)
+{
+       struct bch_fs *c = op->c;
+       struct write_point *wp = NULL;
+       struct bio *bio = NULL;
+       unsigned nofs_flags;
+       int ret;
+
+       nofs_flags = memalloc_nofs_save();
+
+       if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
+               bch2_nocow_write(op);
+               if (op->flags & BCH_WRITE_DONE)
+                       goto out_nofs_restore;
+       }
+again:
+       memset(&op->failed, 0, sizeof(op->failed));
+
+       do {
+               struct bkey_i *key_to_write;
+               unsigned key_to_write_offset = op->insert_keys.top_p -
+                       op->insert_keys.keys_p;
+
+               /* +1 for possible cache device: */
+               if (op->open_buckets.nr + op->nr_replicas + 1 >
+                   ARRAY_SIZE(op->open_buckets.v))
+                       break;
+
+               if (bch2_keylist_realloc(&op->insert_keys,
+                                       op->inline_keys,
+                                       ARRAY_SIZE(op->inline_keys),
+                                       BKEY_EXTENT_U64s_MAX))
+                       break;
+
+               /*
+                * The copygc thread is now global: it's no longer freeing up
+                * space on specific disks, so allocations for specific disks
+                * may hang arbitrarily long:
+                */
+               ret = bch2_trans_do(c, NULL, NULL, 0,
+                       bch2_alloc_sectors_start_trans(trans,
+                               op->target,
+                               op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
+                               op->write_point,
+                               &op->devs_have,
+                               op->nr_replicas,
+                               op->nr_replicas_required,
+                               op->watermark,
+                               op->flags,
+                               (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+                                             BCH_WRITE_ONLY_SPECIFIED_DEVS))
+                               ? NULL : &op->cl, &wp));
+               if (unlikely(ret)) {
+                       if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+                               break;
+
+                       goto err;
+               }
+
+               EBUG_ON(!wp);
+
+               bch2_open_bucket_get(c, wp, &op->open_buckets);
+               ret = bch2_write_extent(op, wp, &bio);
+
+               bch2_alloc_sectors_done_inlined(c, wp);
+err:
+               if (ret <= 0) {
+                       op->flags |= BCH_WRITE_DONE;
+
+                       if (ret < 0) {
+                               op->error = ret;
+                               break;
+                       }
+               }
+
+               bio->bi_end_io  = bch2_write_endio;
+               bio->bi_private = &op->cl;
+               bio->bi_opf |= REQ_OP_WRITE;
+
+               closure_get(bio->bi_private);
+
+               key_to_write = (void *) (op->insert_keys.keys_p +
+                                        key_to_write_offset);
+
+               bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
+                                         key_to_write, false);
+       } while (ret);
+
+       /*
+        * Sync or no?
+        *
+        * If we're running asynchronously, we may still want to block
+        * synchronously here if we weren't able to submit all of the IO at
+        * once, as that signals backpressure to the caller.
+        */
+       if ((op->flags & BCH_WRITE_SYNC) ||
+           (!(op->flags & BCH_WRITE_DONE) &&
+            !(op->flags & BCH_WRITE_IN_WORKER))) {
+               closure_sync(&op->cl);
+               __bch2_write_index(op);
+
+               if (!(op->flags & BCH_WRITE_DONE))
+                       goto again;
+               bch2_write_done(&op->cl);
+       } else {
+               bch2_write_queue(op, wp);
+               continue_at(&op->cl, bch2_write_index, NULL);
+       }
+out_nofs_restore:
+       memalloc_nofs_restore(nofs_flags);
+}
+
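+/*
+ * Small writes are stored directly in the btree as inline_data keys rather
+ * than being written to disk; the value is padded with zeroes to a multiple
+ * of 8 bytes.
+ */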
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+       struct bio *bio = &op->wbio.bio;
+       struct bvec_iter iter;
+       struct bkey_i_inline_data *id;
+       unsigned sectors;
+       int ret;
+
+       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+       op->flags |= BCH_WRITE_DONE;
+
+       bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
+
+       ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+                                  ARRAY_SIZE(op->inline_keys),
+                                  BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+       if (ret) {
+               op->error = ret;
+               goto err;
+       }
+
+       sectors = bio_sectors(bio);
+       op->pos.offset += sectors;
+
+       id = bkey_inline_data_init(op->insert_keys.top);
+       id->k.p         = op->pos;
+       id->k.version   = op->version;
+       id->k.size      = sectors;
+
+       iter = bio->bi_iter;
+       iter.bi_size = data_len;
+       memcpy_from_bio(id->v.data, bio, iter);
+
+       while (data_len & 7)
+               id->v.data[data_len++] = '\0';
+       set_bkey_val_bytes(&id->k, data_len);
+       bch2_keylist_push(&op->insert_keys);
+
+       __bch2_write_index(op);
+err:
+       bch2_write_done(&op->cl);
+}
+
+/**
+ * bch2_write() - start a data write
+ * @cl:                &bch_write_op->cl
+ *
+ * This is the starting point for any data to end up on disk; it is called from
+ * the normal write paths and, with BCH_WRITE_MOVE set, from the data move
+ * paths (rebalance, copygc).
+ *
+ * It first writes the data, creating a list of keys to be inserted (if the
+ * data won't fit in a single open bucket, there will be multiple keys); once
+ * the data writes complete, the keys are inserted into the btree, with the
+ * index update journalled as part of the btree update.
+ *
+ * Sufficiently small writes are instead stored directly in the btree as
+ * KEY_TYPE_inline_data, if the inline_data option is enabled.
+ */
+void bch2_write(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct bio *bio = &op->wbio.bio;
+       struct bch_fs *c = op->c;
+       unsigned data_len;
+
+       EBUG_ON(op->cl.parent);
+       BUG_ON(!op->nr_replicas);
+       BUG_ON(!op->write_point.v);
+       BUG_ON(bkey_eq(op->pos, POS_MAX));
+
+       op->start_time = local_clock();
+       bch2_keylist_init(&op->insert_keys, op->inline_keys);
+       wbio_init(bio)->put_bio = false;
+
+       if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
+               bch_err_inum_offset_ratelimited(c,
+                       op->pos.inode,
+                       op->pos.offset << 9,
+                       "misaligned write");
+               op->error = -EIO;
+               goto err;
+       }
+
+       if (c->opts.nochanges) {
+               op->error = -BCH_ERR_erofs_no_writes;
+               goto err;
+       }
+
+       if (!(op->flags & BCH_WRITE_MOVE) &&
+           !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
+               op->error = -BCH_ERR_erofs_no_writes;
+               goto err;
+       }
+
+       this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
+       bch2_increment_clock(c, bio_sectors(bio), WRITE);
+
+       data_len = min_t(u64, bio->bi_iter.bi_size,
+                        op->new_i_size - (op->pos.offset << 9));
+
+       if (c->opts.inline_data &&
+           data_len <= min(block_bytes(c) / 2, 1024U)) {
+               bch2_write_data_inline(op, data_len);
+               return;
+       }
+
+       __bch2_write(op);
+       return;
+err:
+       bch2_disk_reservation_put(c, &op->res);
+
+       closure_debug_destroy(&op->cl);
+       if (op->end_io)
+               op->end_io(op);
+}
+
+static const char * const bch2_write_flags[] = {
+#define x(f)   #f,
+       BCH_WRITE_FLAGS()
+#undef x
+       NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+       prt_str(out, "pos: ");
+       bch2_bpos_to_text(out, op->pos);
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       prt_str(out, "started: ");
+       bch2_pr_time_units(out, local_clock() - op->start_time);
+       prt_newline(out);
+
+       prt_str(out, "flags: ");
+       prt_bitflags(out, bch2_write_flags, op->flags);
+       prt_newline(out);
+
+       prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+       prt_newline(out);
+
+       printbuf_indent_sub(out, 2);
+}
+
+void bch2_fs_io_write_exit(struct bch_fs *c)
+{
+       mempool_exit(&c->bio_bounce_pages);
+       bioset_exit(&c->bio_write);
+}
+
+int bch2_fs_io_write_init(struct bch_fs *c)
+{
+       if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_write_init;
+
+       if (mempool_init_page_pool(&c->bio_bounce_pages,
+                                  max_t(unsigned,
+                                        c->opts.btree_node_size,
+                                        c->opts.encoded_extent_max) /
+                                  PAGE_SIZE, 0))
+               return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+       return 0;
+}
diff --git a/libbcachefs/io_write.h b/libbcachefs/io_write.h
new file mode 100644 (file)
index 0000000..9323167
--- /dev/null
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_H
+#define _BCACHEFS_IO_WRITE_H
+
+#include "checksum.h"
+#include "io_write_types.h"
+
+#define to_wbio(_bio)                  \
+       container_of((_bio), struct bch_write_bio, bio)
+
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
+
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+void bch2_latency_acct(struct bch_dev *, u64, int);
+#else
+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
+#endif
+
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
+                              enum bch_data_type, const struct bkey_i *, bool);
+
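+/*
+ * Write flags, defined with an x-macro so the same list expands to the
+ * __BCH_WRITE_* bit numbers, the BCH_WRITE_* masks below, and the flag name
+ * strings used by bch2_write_op_to_text().
+ */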
+#define BCH_WRITE_FLAGS()              \
+       x(ALLOC_NOWAIT)                 \
+       x(CACHED)                       \
+       x(DATA_ENCODED)                 \
+       x(PAGES_STABLE)                 \
+       x(PAGES_OWNED)                  \
+       x(ONLY_SPECIFIED_DEVS)          \
+       x(WROTE_DATA_INLINE)            \
+       x(FROM_INTERNAL)                \
+       x(CHECK_ENOSPC)                 \
+       x(SYNC)                         \
+       x(MOVE)                         \
+       x(IN_WORKER)                    \
+       x(DONE)                         \
+       x(IO_ERROR)                     \
+       x(CONVERT_UNWRITTEN)
+
+enum __bch_write_flags {
+#define x(f)   __BCH_WRITE_##f,
+       BCH_WRITE_FLAGS()
+#undef x
+};
+
+enum bch_write_flags {
+#define x(f)   BCH_WRITE_##f = BIT(__BCH_WRITE_##f),
+       BCH_WRITE_FLAGS()
+#undef x
+};
+
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
+{
+       return op->watermark == BCH_WATERMARK_copygc
+               ? op->c->copygc_wq
+               : op->c->btree_update_wq;
+}
+
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+                              struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+                      struct btree_iter *, struct bkey_i *,
+                      struct disk_reservation *, u64, s64 *, bool);
+
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
+                                     struct bch_io_opts opts)
+{
+       op->c                   = c;
+       op->end_io              = NULL;
+       op->flags               = 0;
+       op->written             = 0;
+       op->error               = 0;
+       op->csum_type           = bch2_data_checksum_type(c, opts);
+       op->compression_opt     = opts.compression;
+       op->nr_replicas         = 0;
+       op->nr_replicas_required = c->opts.data_replicas_required;
+       op->watermark           = BCH_WATERMARK_normal;
+       op->incompressible      = 0;
+       op->open_buckets.nr     = 0;
+       op->devs_have.nr        = 0;
+       op->target              = 0;
+       op->opts                = opts;
+       op->subvol              = 0;
+       op->pos                 = POS_MAX;
+       op->version             = ZERO_VERSION;
+       op->write_point         = (struct write_point_specifier) { 0 };
+       op->res                 = (struct disk_reservation) { 0 };
+       op->new_i_size          = U64_MAX;
+       op->i_sectors_delta     = 0;
+       op->devs_need_flush     = NULL;
+}
+
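+/*
+ * Illustrative sketch of a typical caller (the field values here are
+ * placeholders and my_write_done stands in for a real completion hook): the
+ * data to write lives in the bio embedded at op->wbio.bio, and the op is
+ * driven through the closure API, roughly:
+ *
+ *     struct bch_write_op op;
+ *
+ *     bch2_write_op_init(&op, c, io_opts);
+ *     op.pos          = POS(inum.inum, sector);
+ *     op.subvol       = inum.subvol;
+ *     op.nr_replicas  = io_opts.data_replicas;
+ *     op.write_point  = writepoint_hashed((unsigned long) current);
+ *     op.end_io       = my_write_done;
+ *     closure_call(&op.cl, bch2_write, NULL, NULL);
+ */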
+void bch2_write(struct closure *);
+
+void bch2_write_point_do_index_updates(struct work_struct *);
+
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
+{
+       struct bch_write_bio *wbio = to_wbio(bio);
+
+       memset(&wbio->wbio, 0, sizeof(wbio->wbio));
+       return wbio;
+}
+
+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
+
+void bch2_fs_io_write_exit(struct bch_fs *);
+int bch2_fs_io_write_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_IO_WRITE_H */
diff --git a/libbcachefs/io_write_types.h b/libbcachefs/io_write_types.h
new file mode 100644 (file)
index 0000000..c7f97c2
--- /dev/null
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IO_WRITE_TYPES_H
+#define _BCACHEFS_IO_WRITE_TYPES_H
+
+#include "alloc_types.h"
+#include "btree_types.h"
+#include "buckets_types.h"
+#include "extents_types.h"
+#include "keylist_types.h"
+#include "opts.h"
+#include "super_types.h"
+
+#include <linux/llist.h>
+#include <linux/workqueue.h>
+
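+/*
+ * Per-replica write bio; allocated from c->bio_write with front padding of
+ * offsetof(struct bch_write_bio, bio), so the embedded bio must come last.
+ * The wbio struct_group covers the fields that wbio_init() zeroes as a unit.
+ */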
+struct bch_write_bio {
+       struct_group(wbio,
+       struct bch_fs           *c;
+       struct bch_write_bio    *parent;
+
+       u64                     submit_time;
+       u64                     inode_offset;
+
+       struct bch_devs_list    failed;
+       u8                      dev;
+
+       unsigned                split:1,
+                               bounce:1,
+                               put_bio:1,
+                               have_ioref:1,
+                               nocow:1,
+                               used_mempool:1,
+                               first_btree_write:1;
+       );
+
+       struct bio              bio;
+};
+
+struct bch_write_op {
+       struct closure          cl;
+       struct bch_fs           *c;
+       void                    (*end_io)(struct bch_write_op *);
+       u64                     start_time;
+
+       unsigned                written; /* sectors */
+       u16                     flags;
+       s16                     error; /* dio write path expects it to hold -ERESTARTSYS... */
+
+       unsigned                compression_opt:8;
+       unsigned                csum_type:4;
+       unsigned                nr_replicas:4;
+       unsigned                nr_replicas_required:4;
+       unsigned                watermark:3;
+       unsigned                incompressible:1;
+       unsigned                stripe_waited:1;
+
+       struct bch_devs_list    devs_have;
+       u16                     target;
+       u16                     nonce;
+       struct bch_io_opts      opts;
+
+       u32                     subvol;
+       struct bpos             pos;
+       struct bversion         version;
+
+       /* For BCH_WRITE_DATA_ENCODED: */
+       struct bch_extent_crc_unpacked crc;
+
+       struct write_point_specifier write_point;
+
+       struct write_point      *wp;
+       struct list_head        wp_list;
+
+       struct disk_reservation res;
+
+       struct open_buckets     open_buckets;
+
+       u64                     new_i_size;
+       s64                     i_sectors_delta;
+
+       struct bch_devs_mask    failed;
+
+       struct keylist          insert_keys;
+       u64                     inline_keys[BKEY_EXTENT_U64s_MAX * 2];
+
+       /*
+        * Bitmask of devices that have had nocow writes issued to them since
+        * last flush:
+        */
+       struct bch_devs_mask    *devs_need_flush;
+
+       /* Must be last: */
+       struct bch_write_bio    wbio;
+};
+
+#endif /* _BCACHEFS_IO_WRITE_TYPES_H */
diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c
new file mode 100644 (file)
index 0000000..9a76a9a
--- /dev/null
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_update.h"
+#include "error.h"
+#include "io_misc.h"
+#include "logged_ops.h"
+#include "super.h"
+
+struct bch_logged_op_fn {
+       u8              type;
+       int             (*resume)(struct btree_trans *, struct bkey_i *);
+};
+
+static const struct bch_logged_op_fn logged_op_fns[] = {
+#define x(n)           {                                       \
+       .type           = KEY_TYPE_logged_op_##n,               \
+       .resume         = bch2_resume_logged_op_##n,            \
+},
+       BCH_LOGGED_OPS()
+#undef x
+};
+
+static const struct bch_logged_op_fn *logged_op_fn(enum bch_bkey_type type)
+{
+       for (unsigned i = 0; i < ARRAY_SIZE(logged_op_fns); i++)
+               if (logged_op_fns[i].type == type)
+                       return logged_op_fns + i;
+       return NULL;
+}
+
+static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
+                           struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
+       struct bkey_buf sk;
+       u32 restart_count = trans->restart_count;
+       int ret;
+
+       if (!fn)
+               return 0;
+
+       bch2_bkey_buf_init(&sk);
+       bch2_bkey_buf_reassemble(&sk, c, k);
+
+       ret =   drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
+               fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+
+       bch2_bkey_buf_exit(&sk, c);
+       return ret;
+}
+
+int bch2_resume_logged_ops(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       ret = bch2_trans_run(c,
+               for_each_btree_key2(trans, iter,
+                               BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+                       resume_logged_op(trans, &iter, k)));
+       if (ret)
+               bch_err_fn(c, ret);
+       return ret;
+}
+
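+/*
+ * Record the start of a logged operation: find an empty slot in the
+ * logged_ops btree and insert @k there. If we crash before
+ * bch2_logged_op_finish() deletes it, bch2_resume_logged_ops() will find it
+ * and resume the operation.
+ */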
+static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+       struct btree_iter iter;
+       int ret;
+
+       ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX);
+       if (ret)
+               return ret;
+
+       k->k.p = iter.pos;
+
+       ret = bch2_trans_update(trans, &iter, k, 0);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
+{
+       return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                        __bch2_logged_op_start(trans, k));
+}
+
+void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+{
+       int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                           bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
+       /*
+        * This needs to be a fatal error because we've left an unfinished
+        * operation in the logged ops btree.
+        *
+        * We should only ever see an error here if the filesystem has already
+        * been shut down, but make sure of that here:
+        */
+       if (ret) {
+               struct bch_fs *c = trans->c;
+               struct printbuf buf = PRINTBUF;
+
+               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+               bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
+                                    __func__, buf.buf, bch2_err_str(ret));
+               printbuf_exit(&buf);
+       }
+}
diff --git a/libbcachefs/logged_ops.h b/libbcachefs/logged_ops.h
new file mode 100644 (file)
index 0000000..4d1e786
--- /dev/null
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_H
+#define _BCACHEFS_LOGGED_OPS_H
+
+#include "bkey.h"
+
+#define BCH_LOGGED_OPS()                       \
+       x(truncate)                             \
+       x(finsert)
+
+static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op)
+{
+       return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0);
+}
+
+int bch2_resume_logged_ops(struct bch_fs *);
+int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
+void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+
+#endif /* _BCACHEFS_LOGGED_OPS_H */
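Taken together, this header describes a crash-safe pattern for operations that span multiple transactions: persist the operation, do the work (checkpointing progress with bch2_logged_op_update() as needed), then delete the log entry; anything left behind is replayed by bch2_resume_logged_ops() at the next mount. A rough caller-side sketch, with the op-specific key construction elided and do_the_actual_work() purely hypothetical:

static int run_logged_operation(struct btree_trans *trans, struct bkey_i *op)
{
	/* persist the operation so recovery can finish it after a crash */
	int ret = bch2_logged_op_start(trans, op);
	if (ret)
		return ret;

	/* may span many commits; progress can be saved with bch2_logged_op_update() */
	ret = do_the_actual_work(trans, op);

	/* remove the log entry now that the operation is complete */
	bch2_logged_op_finish(trans, op);
	return ret;
}

The real users of this interface (the truncate and finsert paths listed in BCH_LOGGED_OPS()) carry more state than this sketch shows.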
diff --git a/libbcachefs/sb-clean.c b/libbcachefs/sb-clean.c
new file mode 100644 (file)
index 0000000..e151ada
--- /dev/null
@@ -0,0 +1,398 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "replicas.h"
+#include "sb-clean.h"
+#include "super-io.h"
+
+/*
+ * BCH_SB_FIELD_clean:
+ *
+ * Btree roots, and a few other things, are recovered from the journal after an
+ * unclean shutdown - but after a clean shutdown, to avoid having to read the
+ * journal, we can store them in the superblock.
+ *
+ * bch_sb_field_clean simply contains a list of journal entries, stored exactly
+ * as they would be in the journal:
+ */
+
+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean,
+                               int write)
+{
+       struct jset_entry *entry;
+       int ret;
+
+       for (entry = clean->start;
+            entry < (struct jset_entry *) vstruct_end(&clean->field);
+            entry = vstruct_next(entry)) {
+               ret = bch2_journal_entry_validate(c, NULL, entry,
+                                                 le16_to_cpu(c->disk_sb.sb->version),
+                                                 BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+                                                 write);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static struct bkey_i *btree_root_find(struct bch_fs *c,
+                                     struct bch_sb_field_clean *clean,
+                                     struct jset *j,
+                                     enum btree_id id, unsigned *level)
+{
+       struct bkey_i *k;
+       struct jset_entry *entry, *start, *end;
+
+       if (clean) {
+               start = clean->start;
+               end = vstruct_end(&clean->field);
+       } else {
+               start = j->start;
+               end = vstruct_last(j);
+       }
+
+       for (entry = start; entry < end; entry = vstruct_next(entry))
+               if (entry->type == BCH_JSET_ENTRY_btree_root &&
+                   entry->btree_id == id)
+                       goto found;
+
+       return NULL;
+found:
+       if (!entry->u64s)
+               return ERR_PTR(-EINVAL);
+
+       k = entry->start;
+       *level = entry->level;
+       return k;
+}
+
+int bch2_verify_superblock_clean(struct bch_fs *c,
+                                struct bch_sb_field_clean **cleanp,
+                                struct jset *j)
+{
+       unsigned i;
+       struct bch_sb_field_clean *clean = *cleanp;
+       struct printbuf buf1 = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
+       int ret = 0;
+
+       if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+                       sb_clean_journal_seq_mismatch,
+                       "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+                       le64_to_cpu(clean->journal_seq),
+                       le64_to_cpu(j->seq))) {
+               kfree(clean);
+               *cleanp = NULL;
+               return 0;
+       }
+
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               struct bkey_i *k1, *k2;
+               unsigned l1 = 0, l2 = 0;
+
+               k1 = btree_root_find(c, clean, NULL, i, &l1);
+               k2 = btree_root_find(c, NULL, j, i, &l2);
+
+               if (!k1 && !k2)
+                       continue;
+
+               printbuf_reset(&buf1);
+               printbuf_reset(&buf2);
+
+               if (k1)
+                       bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
+               else
+                       prt_printf(&buf1, "(none)");
+
+               if (k2)
+                       bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
+               else
+                       prt_printf(&buf2, "(none)");
+
+               mustfix_fsck_err_on(!k1 || !k2 ||
+                                   IS_ERR(k1) ||
+                                   IS_ERR(k2) ||
+                                   k1->k.u64s != k2->k.u64s ||
+                                   memcmp(k1, k2, bkey_bytes(&k1->k)) ||
+                                   l1 != l2, c,
+                       sb_clean_btree_root_mismatch,
+                       "superblock btree root %u doesn't match journal after clean shutdown\n"
+                       "sb:      l=%u %s\n"
+                       "journal: l=%u %s\n", i,
+                       l1, buf1.buf,
+                       l2, buf2.buf);
+       }
+fsck_err:
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
+       return ret;
+}
+
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c)
+{
+       struct bch_sb_field_clean *clean, *sb_clean;
+       int ret;
+
+       mutex_lock(&c->sb_lock);
+       sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean);
+
+       if (fsck_err_on(!sb_clean, c,
+                       sb_clean_missing,
+                       "superblock marked clean but clean section not present")) {
+               SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+               c->sb.clean = false;
+               mutex_unlock(&c->sb_lock);
+               return NULL;
+       }
+
+       clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+                       GFP_KERNEL);
+       if (!clean) {
+               mutex_unlock(&c->sb_lock);
+               return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
+       }
+
+       ret = bch2_sb_clean_validate_late(c, clean, READ);
+       if (ret) {
+               mutex_unlock(&c->sb_lock);
+               return ERR_PTR(ret);
+       }
+
+       mutex_unlock(&c->sb_lock);
+
+       return clean;
+fsck_err:
+       mutex_unlock(&c->sb_lock);
+       return ERR_PTR(ret);
+}
+
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+       struct jset_entry *entry = *end;
+       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+       memset(entry, 0, u64s * sizeof(u64));
+       /*
+        * The u64s field counts from the start of data, ignoring the shared
+        * fields.
+        */
+       entry->u64s = cpu_to_le16(u64s - 1);
+
+       *end = vstruct_next(*end);
+       return entry;
+}
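A brief worked example of the size arithmetic above, assuming struct jset_entry_usage is the 8-byte jset_entry header plus a single __le64 value (which is how it is used later in this file):

/*
 * size                 = 16 bytes
 * u64s                 = DIV_ROUND_UP(16, sizeof(u64)) = 2   (header + payload)
 * entry->u64s (stored) = 2 - 1                         = 1   (payload only)
 */

so iteration with vstruct_next() advances past the implicit one-u64 header plus the stored number of payload u64s.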
+
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+                                          struct jset_entry **end,
+                                          u64 journal_seq)
+{
+       struct bch_dev *ca;
+       unsigned i, dev;
+
+       percpu_down_read(&c->mark_lock);
+
+       if (!journal_seq) {
+               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+                       bch2_fs_usage_acc_to_base(c, i);
+       } else {
+               bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
+       }
+
+       {
+               struct jset_entry_usage *u =
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
+
+               u->entry.type   = BCH_JSET_ENTRY_usage;
+               u->entry.btree_id = BCH_FS_USAGE_inodes;
+               u->v            = cpu_to_le64(c->usage_base->nr_inodes);
+       }
+
+       {
+               struct jset_entry_usage *u =
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
+
+               u->entry.type   = BCH_JSET_ENTRY_usage;
+               u->entry.btree_id = BCH_FS_USAGE_key_version;
+               u->v            = cpu_to_le64(atomic64_read(&c->key_version));
+       }
+
+       for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+               struct jset_entry_usage *u =
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
+
+               u->entry.type   = BCH_JSET_ENTRY_usage;
+               u->entry.btree_id = BCH_FS_USAGE_reserved;
+               u->entry.level  = i;
+               u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
+       }
+
+       for (i = 0; i < c->replicas.nr; i++) {
+               struct bch_replicas_entry *e =
+                       cpu_replicas_entry(&c->replicas, i);
+               struct jset_entry_data_usage *u =
+                       container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+                                    struct jset_entry_data_usage, entry);
+
+               u->entry.type   = BCH_JSET_ENTRY_data_usage;
+               u->v            = cpu_to_le64(c->usage_base->replicas[i]);
+               unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
+                             "embedded variable length struct");
+       }
+
+       for_each_member_device(ca, c, dev) {
+               unsigned b = sizeof(struct jset_entry_dev_usage) +
+                       sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+               struct jset_entry_dev_usage *u =
+                       container_of(jset_entry_init(end, b),
+                                    struct jset_entry_dev_usage, entry);
+
+               u->entry.type = BCH_JSET_ENTRY_dev_usage;
+               u->dev = cpu_to_le32(dev);
+               u->buckets_ec           = cpu_to_le64(ca->usage_base->buckets_ec);
+
+               for (i = 0; i < BCH_DATA_NR; i++) {
+                       u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+                       u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+                       u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+               }
+       }
+
+       percpu_up_read(&c->mark_lock);
+
+       for (i = 0; i < 2; i++) {
+               struct jset_entry_clock *clock =
+                       container_of(jset_entry_init(end, sizeof(*clock)),
+                                    struct jset_entry_clock, entry);
+
+               clock->entry.type = BCH_JSET_ENTRY_clock;
+               clock->rw       = i;
+               clock->time     = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+       }
+}
+
+static int bch2_sb_clean_validate(struct bch_sb *sb,
+                                 struct bch_sb_field *f,
+                                 struct printbuf *err)
+{
+       struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+       if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
+               prt_printf(err, "wrong size (got %zu should be %zu)",
+                      vstruct_bytes(&clean->field), sizeof(*clean));
+               return -BCH_ERR_invalid_sb_clean;
+       }
+
+       return 0;
+}
+
+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
+                                 struct bch_sb_field *f)
+{
+       struct bch_sb_field_clean *clean = field_to_type(f, clean);
+       struct jset_entry *entry;
+
+       prt_printf(out, "flags:          %x",   le32_to_cpu(clean->flags));
+       prt_newline(out);
+       prt_printf(out, "journal_seq:    %llu", le64_to_cpu(clean->journal_seq));
+       prt_newline(out);
+
+       for (entry = clean->start;
+            entry != vstruct_end(&clean->field);
+            entry = vstruct_next(entry)) {
+               if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+                   !entry->u64s)
+                       continue;
+
+               bch2_journal_entry_to_text(out, NULL, entry);
+               prt_newline(out);
+       }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+       .validate       = bch2_sb_clean_validate,
+       .to_text        = bch2_sb_clean_to_text,
+};
+
+int bch2_fs_mark_dirty(struct bch_fs *c)
+{
+       int ret;
+
+       /*
+        * Unconditionally write superblock, to verify it hasn't changed before
+        * we go rw:
+        */
+
+       mutex_lock(&c->sb_lock);
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+       bch2_sb_maybe_downgrade(c);
+       c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+
+       ret = bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return ret;
+}
+
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+       struct bch_sb_field_clean *sb_clean;
+       struct jset_entry *entry;
+       unsigned u64s;
+       int ret;
+
+       mutex_lock(&c->sb_lock);
+       if (BCH_SB_CLEAN(c->disk_sb.sb))
+               goto out;
+
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+
+       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
+
+       u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+       sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s);
+       if (!sb_clean) {
+               bch_err(c, "error resizing superblock while setting filesystem clean");
+               goto out;
+       }
+
+       sb_clean->flags         = 0;
+       sb_clean->journal_seq   = cpu_to_le64(atomic64_read(&c->journal.seq));
+
+       /* Trying to catch outstanding bug: */
+       BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+
+       entry = sb_clean->start;
+       bch2_journal_super_entries_add_common(c, &entry, 0);
+       entry = bch2_btree_roots_to_journal_entries(c, entry, 0);
+       BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+
+       memset(entry, 0,
+              vstruct_end(&sb_clean->field) - (void *) entry);
+
+       /*
+        * this should be in the write path, and we should be validating every
+        * superblock section:
+        */
+       ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
+       if (ret) {
+               bch_err(c, "error writing marking filesystem clean: validate error");
+               goto out;
+       }
+
+       bch2_write_super(c);
+out:
+       mutex_unlock(&c->sb_lock);
+}
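To tie the two entry points together: bch2_fs_mark_dirty() runs before the filesystem goes read-write, clearing BCH_SB_CLEAN and rewriting the superblock, while bch2_fs_mark_clean() runs on a clean shutdown or transition to read-only, capturing btree roots and usage into the clean section so the next mount can skip journal replay. A condensed sketch of that lifecycle; the call sites live outside this file and are only paraphrased here:

/* going read-write: */
ret = bch2_fs_mark_dirty(c);	/* BCH_SB_CLEAN := false, superblock rewritten */
if (ret)
	goto err;

/* ... normal operation: the journal is authoritative ... */

/* clean shutdown / going read-only: */
bch2_fs_mark_clean(c);		/* BCH_SB_CLEAN := true, roots and usage
				 * captured in the BCH_SB_FIELD_clean section */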
diff --git a/libbcachefs/sb-clean.h b/libbcachefs/sb-clean.h
new file mode 100644 (file)
index 0000000..71caef2
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_CLEAN_H
+#define _BCACHEFS_SB_CLEAN_H
+
+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
+int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **,
+                                struct jset *);
+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *);
+void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
+
+int bch2_fs_mark_dirty(struct bch_fs *);
+void bch2_fs_mark_clean(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_CLEAN_H */
diff --git a/libbcachefs/sb-errors.c b/libbcachefs/sb-errors.c
new file mode 100644 (file)
index 0000000..f0930ab
--- /dev/null
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "sb-errors.h"
+#include "super-io.h"
+
+static const char * const bch2_sb_error_strs[] = {
+#define x(t, n, ...) [n] = #t,
+       BCH_SB_ERRS()
+       NULL
+};
+
+static void bch2_sb_error_id_to_text(struct printbuf *out, enum bch_sb_error_id id)
+{
+       if (id < BCH_SB_ERR_MAX)
+               prt_str(out, bch2_sb_error_strs[id]);
+       else
+               prt_printf(out, "(unknown error %u)", id);
+}
+
+static inline unsigned bch2_sb_field_errors_nr_entries(struct bch_sb_field_errors *e)
+{
+       return e
+               ? (bch2_sb_field_bytes(&e->field) - sizeof(*e)) / sizeof(e->entries[0])
+               : 0;
+}
+
+static inline unsigned bch2_sb_field_errors_u64s(unsigned nr)
+{
+       return (sizeof(struct bch_sb_field_errors) +
+               sizeof(struct bch_sb_field_error_entry) * nr) / sizeof(u64);
+}
+
+static int bch2_sb_errors_validate(struct bch_sb *sb, struct bch_sb_field *f,
+                                  struct printbuf *err)
+{
+       struct bch_sb_field_errors *e = field_to_type(f, errors);
+       unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+       for (i = 0; i < nr; i++) {
+               if (!BCH_SB_ERROR_ENTRY_NR(&e->entries[i])) {
+                       prt_printf(err, "entry with count 0 (id ");
+                       bch2_sb_error_id_to_text(err, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+                       prt_printf(err, ")");
+                       return -BCH_ERR_invalid_sb_errors;
+               }
+
+               if (i + 1 < nr &&
+                   BCH_SB_ERROR_ENTRY_ID(&e->entries[i]) >=
+                   BCH_SB_ERROR_ENTRY_ID(&e->entries[i + 1])) {
+                       prt_printf(err, "entries out of order");
+                       return -BCH_ERR_invalid_sb_errors;
+               }
+       }
+
+       return 0;
+}
+
+static void bch2_sb_errors_to_text(struct printbuf *out, struct bch_sb *sb,
+                                  struct bch_sb_field *f)
+{
+       struct bch_sb_field_errors *e = field_to_type(f, errors);
+       unsigned i, nr = bch2_sb_field_errors_nr_entries(e);
+
+       if (out->nr_tabstops <= 1)
+               printbuf_tabstop_push(out, 16);
+
+       for (i = 0; i < nr; i++) {
+               bch2_sb_error_id_to_text(out, BCH_SB_ERROR_ENTRY_ID(&e->entries[i]));
+               prt_tab(out);
+               prt_u64(out, BCH_SB_ERROR_ENTRY_NR(&e->entries[i]));
+               prt_tab(out);
+               bch2_prt_datetime(out, le64_to_cpu(e->entries[i].last_error_time));
+               prt_newline(out);
+       }
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_errors = {
+       .validate       = bch2_sb_errors_validate,
+       .to_text        = bch2_sb_errors_to_text,
+};
+
+void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err)
+{
+       bch_sb_errors_cpu *e = &c->fsck_error_counts;
+       struct bch_sb_error_entry_cpu n = {
+               .id = err,
+               .nr = 1,
+               .last_error_time = ktime_get_real_seconds()
+       };
+       unsigned i;
+
+       mutex_lock(&c->fsck_error_counts_lock);
+       for (i = 0; i < e->nr; i++) {
+               if (err == e->data[i].id) {
+                       e->data[i].nr++;
+                       e->data[i].last_error_time = n.last_error_time;
+                       goto out;
+               }
+               if (err < e->data[i].id)
+                       break;
+       }
+
+       if (darray_make_room(e, 1))
+               goto out;
+
+       darray_insert_item(e, i, n);
+out:
+       mutex_unlock(&c->fsck_error_counts_lock);
+}
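bch2_sb_error_count() keeps the in-memory counters sorted by error id, matching the ordering that bch2_sb_errors_validate() enforces on the on-disk section. A minimal usage sketch (in the tree this is presumably driven by the fsck error machinery rather than called by hand):

/* record one more occurrence of the "sb_clean_missing" fsck error: */
bch2_sb_error_count(c, BCH_FSCK_ERR_sb_clean_missing);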
+
+void bch2_sb_errors_from_cpu(struct bch_fs *c)
+{
+       bch_sb_errors_cpu *src = &c->fsck_error_counts;
+       struct bch_sb_field_errors *dst =
+               bch2_sb_field_resize(&c->disk_sb, errors,
+                                    bch2_sb_field_errors_u64s(src->nr));
+       unsigned i;
+
+       if (!dst)
+               return;
+
+       for (i = 0; i < src->nr; i++) {
+               SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id);
+               SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr);
+               dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time);
+       }
+}
+
+static int bch2_sb_errors_to_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors);
+       bch_sb_errors_cpu *dst = &c->fsck_error_counts;
+       unsigned i, nr = bch2_sb_field_errors_nr_entries(src);
+       int ret;
+
+       if (!nr)
+               return 0;
+
+       mutex_lock(&c->fsck_error_counts_lock);
+       ret = darray_make_room(dst, nr);
+       if (ret)
+               goto err;
+
+       dst->nr = nr;
+
+       for (i = 0; i < nr; i++) {
+               dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]);
+               dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]);
+               dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time);
+       }
+err:
+       mutex_unlock(&c->fsck_error_counts_lock);
+
+       return ret;
+}
+
+void bch2_fs_sb_errors_exit(struct bch_fs *c)
+{
+       darray_exit(&c->fsck_error_counts);
+}
+
+void bch2_fs_sb_errors_init_early(struct bch_fs *c)
+{
+       mutex_init(&c->fsck_error_counts_lock);
+       darray_init(&c->fsck_error_counts);
+}
+
+int bch2_fs_sb_errors_init(struct bch_fs *c)
+{
+       return bch2_sb_errors_to_cpu(c);
+}
diff --git a/libbcachefs/sb-errors.h b/libbcachefs/sb-errors.h
new file mode 100644 (file)
index 0000000..5a09a53
--- /dev/null
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_H
+#define _BCACHEFS_SB_ERRORS_H
+
+#include "sb-errors_types.h"
+
+#define BCH_SB_ERRS()                                                  \
+       x(clean_but_journal_not_empty,                          0)      \
+       x(dirty_but_no_journal_entries,                         1)      \
+       x(dirty_but_no_journal_entries_post_drop_nonflushes,    2)      \
+       x(sb_clean_journal_seq_mismatch,                        3)      \
+       x(sb_clean_btree_root_mismatch,                         4)      \
+       x(sb_clean_missing,                                     5)      \
+       x(jset_unsupported_version,                             6)      \
+       x(jset_unknown_csum,                                    7)      \
+       x(jset_last_seq_newer_than_seq,                         8)      \
+       x(jset_past_bucket_end,                                 9)      \
+       x(jset_seq_blacklisted,                                 10)     \
+       x(journal_entries_missing,                              11)     \
+       x(journal_entry_replicas_not_marked,                    12)     \
+       x(journal_entry_past_jset_end,                          13)     \
+       x(journal_entry_replicas_data_mismatch,                 14)     \
+       x(journal_entry_bkey_u64s_0,                            15)     \
+       x(journal_entry_bkey_past_end,                          16)     \
+       x(journal_entry_bkey_bad_format,                        17)     \
+       x(journal_entry_bkey_invalid,                           18)     \
+       x(journal_entry_btree_root_bad_size,                    19)     \
+       x(journal_entry_blacklist_bad_size,                     20)     \
+       x(journal_entry_blacklist_v2_bad_size,                  21)     \
+       x(journal_entry_blacklist_v2_start_past_end,            22)     \
+       x(journal_entry_usage_bad_size,                         23)     \
+       x(journal_entry_data_usage_bad_size,                    24)     \
+       x(journal_entry_clock_bad_size,                         25)     \
+       x(journal_entry_clock_bad_rw,                           26)     \
+       x(journal_entry_dev_usage_bad_size,                     27)     \
+       x(journal_entry_dev_usage_bad_dev,                      28)     \
+       x(journal_entry_dev_usage_bad_pad,                      29)     \
+       x(btree_node_unreadable,                                30)     \
+       x(btree_node_fault_injected,                            31)     \
+       x(btree_node_bad_magic,                                 32)     \
+       x(btree_node_bad_seq,                                   33)     \
+       x(btree_node_unsupported_version,                       34)     \
+       x(btree_node_bset_older_than_sb_min,                    35)     \
+       x(btree_node_bset_newer_than_sb,                        36)     \
+       x(btree_node_data_missing,                              37)     \
+       x(btree_node_bset_after_end,                            38)     \
+       x(btree_node_replicas_sectors_written_mismatch,         39)     \
+       x(btree_node_replicas_data_mismatch,                    40)     \
+       x(bset_unknown_csum,                                    41)     \
+       x(bset_bad_csum,                                        42)     \
+       x(bset_past_end_of_btree_node,                          43)     \
+       x(bset_wrong_sector_offset,                             44)     \
+       x(bset_empty,                                           45)     \
+       x(bset_bad_seq,                                         46)     \
+       x(bset_blacklisted_journal_seq,                         47)     \
+       x(first_bset_blacklisted_journal_seq,                   48)     \
+       x(btree_node_bad_btree,                                 49)     \
+       x(btree_node_bad_level,                                 50)     \
+       x(btree_node_bad_min_key,                               51)     \
+       x(btree_node_bad_max_key,                               52)     \
+       x(btree_node_bad_format,                                53)     \
+       x(btree_node_bkey_past_bset_end,                        54)     \
+       x(btree_node_bkey_bad_format,                           55)     \
+       x(btree_node_bad_bkey,                                  56)     \
+       x(btree_node_bkey_out_of_order,                         57)     \
+       x(btree_root_bkey_invalid,                              58)     \
+       x(btree_root_read_error,                                59)     \
+       x(btree_root_bad_min_key,                               60)     \
+       x(btree_root_bad_max_key,                               61)     \
+       x(btree_node_read_error,                                62)     \
+       x(btree_node_topology_bad_min_key,                      63)     \
+       x(btree_node_topology_bad_max_key,                      64)     \
+       x(btree_node_topology_overwritten_by_prev_node,         65)     \
+       x(btree_node_topology_overwritten_by_next_node,         66)     \
+       x(btree_node_topology_interior_node_empty,              67)     \
+       x(fs_usage_hidden_wrong,                                68)     \
+       x(fs_usage_btree_wrong,                                 69)     \
+       x(fs_usage_data_wrong,                                  70)     \
+       x(fs_usage_cached_wrong,                                71)     \
+       x(fs_usage_reserved_wrong,                              72)     \
+       x(fs_usage_persistent_reserved_wrong,                   73)     \
+       x(fs_usage_nr_inodes_wrong,                             74)     \
+       x(fs_usage_replicas_wrong,                              75)     \
+       x(dev_usage_buckets_wrong,                              76)     \
+       x(dev_usage_sectors_wrong,                              77)     \
+       x(dev_usage_fragmented_wrong,                           78)     \
+       x(dev_usage_buckets_ec_wrong,                           79)     \
+       x(bkey_version_in_future,                               80)     \
+       x(bkey_u64s_too_small,                                  81)     \
+       x(bkey_invalid_type_for_btree,                          82)     \
+       x(bkey_extent_size_zero,                                83)     \
+       x(bkey_extent_size_greater_than_offset,                 84)     \
+       x(bkey_size_nonzero,                                    85)     \
+       x(bkey_snapshot_nonzero,                                86)     \
+       x(bkey_snapshot_zero,                                   87)     \
+       x(bkey_at_pos_max,                                      88)     \
+       x(bkey_before_start_of_btree_node,                      89)     \
+       x(bkey_after_end_of_btree_node,                         90)     \
+       x(bkey_val_size_nonzero,                                91)     \
+       x(bkey_val_size_too_small,                              92)     \
+       x(alloc_v1_val_size_bad,                                93)     \
+       x(alloc_v2_unpack_error,                                94)     \
+       x(alloc_v3_unpack_error,                                95)     \
+       x(alloc_v4_val_size_bad,                                96)     \
+       x(alloc_v4_backpointers_start_bad,                      97)     \
+       x(alloc_key_data_type_bad,                              98)     \
+       x(alloc_key_empty_but_have_data,                        99)     \
+       x(alloc_key_dirty_sectors_0,                            100)    \
+       x(alloc_key_data_type_inconsistency,                    101)    \
+       x(alloc_key_to_missing_dev_bucket,                      102)    \
+       x(alloc_key_cached_inconsistency,                       103)    \
+       x(alloc_key_cached_but_read_time_zero,                  104)    \
+       x(alloc_key_to_missing_lru_entry,                       105)    \
+       x(alloc_key_data_type_wrong,                            106)    \
+       x(alloc_key_gen_wrong,                                  107)    \
+       x(alloc_key_dirty_sectors_wrong,                        108)    \
+       x(alloc_key_cached_sectors_wrong,                       109)    \
+       x(alloc_key_stripe_wrong,                               110)    \
+       x(alloc_key_stripe_redundancy_wrong,                    111)    \
+       x(bucket_sector_count_overflow,                         112)    \
+       x(bucket_metadata_type_mismatch,                        113)    \
+       x(need_discard_key_wrong,                               114)    \
+       x(freespace_key_wrong,                                  115)    \
+       x(freespace_hole_missing,                               116)    \
+       x(bucket_gens_val_size_bad,                             117)    \
+       x(bucket_gens_key_wrong,                                118)    \
+       x(bucket_gens_hole_wrong,                               119)    \
+       x(bucket_gens_to_invalid_dev,                           120)    \
+       x(bucket_gens_to_invalid_buckets,                       121)    \
+       x(bucket_gens_nonzero_for_invalid_buckets,              122)    \
+       x(need_discard_freespace_key_to_invalid_dev_bucket,     123)    \
+       x(need_discard_freespace_key_bad,                       124)    \
+       x(backpointer_pos_wrong,                                125)    \
+       x(backpointer_to_missing_device,                        126)    \
+       x(backpointer_to_missing_alloc,                         127)    \
+       x(backpointer_to_missing_ptr,                           128)    \
+       x(lru_entry_at_time_0,                                  129)    \
+       x(lru_entry_to_invalid_bucket,                          130)    \
+       x(lru_entry_bad,                                        131)    \
+       x(btree_ptr_val_too_big,                                132)    \
+       x(btree_ptr_v2_val_too_big,                             133)    \
+       x(btree_ptr_has_non_ptr,                                134)    \
+       x(extent_ptrs_invalid_entry,                            135)    \
+       x(extent_ptrs_no_ptrs,                                  136)    \
+       x(extent_ptrs_too_many_ptrs,                            137)    \
+       x(extent_ptrs_redundant_crc,                            138)    \
+       x(extent_ptrs_redundant_stripe,                         139)    \
+       x(extent_ptrs_unwritten,                                140)    \
+       x(extent_ptrs_written_and_unwritten,                    141)    \
+       x(ptr_to_invalid_device,                                142)    \
+       x(ptr_to_duplicate_device,                              143)    \
+       x(ptr_after_last_bucket,                                144)    \
+       x(ptr_before_first_bucket,                              145)    \
+       x(ptr_spans_multiple_buckets,                           146)    \
+       x(ptr_to_missing_backpointer,                           147)    \
+       x(ptr_to_missing_alloc_key,                             148)    \
+       x(ptr_to_missing_replicas_entry,                        149)    \
+       x(ptr_to_missing_stripe,                                150)    \
+       x(ptr_to_incorrect_stripe,                              151)    \
+       x(ptr_gen_newer_than_bucket_gen,                        152)    \
+       x(ptr_too_stale,                                        153)    \
+       x(stale_dirty_ptr,                                      154)    \
+       x(ptr_bucket_data_type_mismatch,                        155)    \
+       x(ptr_cached_and_erasure_coded,                         156)    \
+       x(ptr_crc_uncompressed_size_too_small,                  157)    \
+       x(ptr_crc_csum_type_unknown,                            158)    \
+       x(ptr_crc_compression_type_unknown,                     159)    \
+       x(ptr_crc_redundant,                                    160)    \
+       x(ptr_crc_uncompressed_size_too_big,                    161)    \
+       x(ptr_crc_nonce_mismatch,                               162)    \
+       x(ptr_stripe_redundant,                                 163)    \
+       x(reservation_key_nr_replicas_invalid,                  164)    \
+       x(reflink_v_refcount_wrong,                             165)    \
+       x(reflink_p_to_missing_reflink_v,                       166)    \
+       x(stripe_pos_bad,                                       167)    \
+       x(stripe_val_size_bad,                                  168)    \
+       x(stripe_sector_count_wrong,                            169)    \
+       x(snapshot_tree_pos_bad,                                170)    \
+       x(snapshot_tree_to_missing_snapshot,                    171)    \
+       x(snapshot_tree_to_missing_subvol,                      172)    \
+       x(snapshot_tree_to_wrong_subvol,                        173)    \
+       x(snapshot_tree_to_snapshot_subvol,                     174)    \
+       x(snapshot_pos_bad,                                     175)    \
+       x(snapshot_parent_bad,                                  176)    \
+       x(snapshot_children_not_normalized,                     177)    \
+       x(snapshot_child_duplicate,                             178)    \
+       x(snapshot_child_bad,                                   179)    \
+       x(snapshot_skiplist_not_normalized,                     180)    \
+       x(snapshot_skiplist_bad,                                181)    \
+       x(snapshot_should_not_have_subvol,                      182)    \
+       x(snapshot_to_bad_snapshot_tree,                        183)    \
+       x(snapshot_bad_depth,                                   184)    \
+       x(snapshot_bad_skiplist,                                185)    \
+       x(subvol_pos_bad,                                       186)    \
+       x(subvol_not_master_and_not_snapshot,                   187)    \
+       x(subvol_to_missing_root,                               188)    \
+       x(subvol_root_wrong_bi_subvol,                          189)    \
+       x(bkey_in_missing_snapshot,                             190)    \
+       x(inode_pos_inode_nonzero,                              191)    \
+       x(inode_pos_blockdev_range,                             192)    \
+       x(inode_unpack_error,                                   193)    \
+       x(inode_str_hash_invalid,                               194)    \
+       x(inode_v3_fields_start_bad,                            195)    \
+       x(inode_snapshot_mismatch,                              196)    \
+       x(inode_unlinked_but_clean,                             197)    \
+       x(inode_unlinked_but_nlink_nonzero,                     198)    \
+       x(inode_checksum_type_invalid,                          199)    \
+       x(inode_compression_type_invalid,                       200)    \
+       x(inode_subvol_root_but_not_dir,                        201)    \
+       x(inode_i_size_dirty_but_clean,                         202)    \
+       x(inode_i_sectors_dirty_but_clean,                      203)    \
+       x(inode_i_sectors_wrong,                                204)    \
+       x(inode_dir_wrong_nlink,                                205)    \
+       x(inode_dir_multiple_links,                             206)    \
+       x(inode_multiple_links_but_nlink_0,                     207)    \
+       x(inode_wrong_backpointer,                              208)    \
+       x(inode_wrong_nlink,                                    209)    \
+       x(inode_unreachable,                                    210)    \
+       x(deleted_inode_but_clean,                              211)    \
+       x(deleted_inode_missing,                                212)    \
+       x(deleted_inode_is_dir,                                 213)    \
+       x(deleted_inode_not_unlinked,                           214)    \
+       x(extent_overlapping,                                   215)    \
+       x(extent_in_missing_inode,                              216)    \
+       x(extent_in_non_reg_inode,                              217)    \
+       x(extent_past_end_of_inode,                             218)    \
+       x(dirent_empty_name,                                    219)    \
+       x(dirent_val_too_big,                                   220)    \
+       x(dirent_name_too_long,                                 221)    \
+       x(dirent_name_embedded_nul,                             222)    \
+       x(dirent_name_dot_or_dotdot,                            223)    \
+       x(dirent_name_has_slash,                                224)    \
+       x(dirent_d_type_wrong,                                  225)    \
+       x(dirent_d_parent_subvol_wrong,                         226)    \
+       x(dirent_in_missing_dir_inode,                          227)    \
+       x(dirent_in_non_dir_inode,                              228)    \
+       x(dirent_to_missing_inode,                              229)    \
+       x(dirent_to_missing_subvol,                             230)    \
+       x(dirent_to_itself,                                     231)    \
+       x(quota_type_invalid,                                   232)    \
+       x(xattr_val_size_too_small,                             233)    \
+       x(xattr_val_size_too_big,                               234)    \
+       x(xattr_invalid_type,                                   235)    \
+       x(xattr_name_invalid_chars,                             236)    \
+       x(xattr_in_missing_inode,                               237)    \
+       x(root_subvol_missing,                                  238)    \
+       x(root_dir_missing,                                     239)    \
+       x(root_inode_not_dir,                                   240)    \
+       x(dir_loop,                                             241)    \
+       x(hash_table_key_duplicate,                             242)    \
+       x(hash_table_key_wrong_offset,                          243)
+
+enum bch_sb_error_id {
+#define x(t, n) BCH_FSCK_ERR_##t = n,
+       BCH_SB_ERRS()
+#undef x
+       BCH_SB_ERR_MAX
+};
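Note that each numeric id serves double duty: it is the enum value here and the index into bch2_sb_error_strs[] in sb-errors.c. Expanded with the first and last entries of BCH_SB_ERRS(), the enum looks approximately like:

enum bch_sb_error_id {
	BCH_FSCK_ERR_clean_but_journal_not_empty	= 0,
	BCH_FSCK_ERR_dirty_but_no_journal_entries	= 1,
	/* ... */
	BCH_FSCK_ERR_hash_table_key_wrong_offset	= 243,
	BCH_SB_ERR_MAX
};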
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_errors;
+
+void bch2_sb_error_count(struct bch_fs *, enum bch_sb_error_id);
+
+void bch2_sb_errors_from_cpu(struct bch_fs *);
+
+void bch2_fs_sb_errors_exit(struct bch_fs *);
+void bch2_fs_sb_errors_init_early(struct bch_fs *);
+int bch2_fs_sb_errors_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SB_ERRORS_H */
diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h
new file mode 100644 (file)
index 0000000..b1c0998
--- /dev/null
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_ERRORS_TYPES_H
+#define _BCACHEFS_SB_ERRORS_TYPES_H
+
+#include "darray.h"
+
+struct bch_sb_error_entry_cpu {
+       u64                     id:16,
+                               nr:48;
+       u64                     last_error_time;
+};
+
+typedef DARRAY(struct bch_sb_error_entry_cpu) bch_sb_errors_cpu;
+
+#endif /* _BCACHEFS_SB_ERRORS_TYPES_H */
+
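bch_sb_errors_cpu is a bcachefs darray, a small resizable vector. As an approximation of the definition in darray.h (not part of this hunk), the typedef yields a struct of roughly this shape, which is what darray_make_room() and darray_insert_item() in sb-errors.c operate on:

/* approximate shape only */
typedef struct {
	size_t				nr;	/* elements in use */
	size_t				size;	/* allocated capacity */
	struct bch_sb_error_entry_cpu	*data;	/* heap-allocated array */
} bch_sb_errors_cpu_sketch;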
diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c
new file mode 100644 (file)
index 0000000..bed0f85
--- /dev/null
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "opts.h"
+#include "replicas.h"
+#include "sb-members.h"
+#include "super-io.h"
+
+#define x(t, n, ...) [n] = #t,
+static const char * const bch2_iops_measurements[] = {
+       BCH_IOPS_MEASUREMENTS()
+       NULL
+};
+
+char * const bch2_member_error_strs[] = {
+       BCH_MEMBER_ERROR_TYPES()
+       NULL
+};
+#undef x
+
+/* Code for bch_sb_field_members_v1: */
+
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i)
+{
+       return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i);
+}
+
+static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i)
+{
+       struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i);
+       memset(&ret, 0, sizeof(ret));
+       memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret)));
+       return ret;
+}
+
+static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i)
+{
+       return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES);
+}
+
+static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i)
+{
+       struct bch_member ret, *p = members_v1_get_mut(mi, i);
+       memset(&ret, 0, sizeof(ret));
+       memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret)));
+       return ret;
+}
+
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i)
+{
+       struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2);
+       if (mi2)
+               return members_v2_get(mi2, i);
+       struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1);
+       return members_v1_get(mi1, i);
+}
+
+static int sb_members_v2_resize_entries(struct bch_fs *c)
+{
+       struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+
+       if (le16_to_cpu(mi->member_bytes) < sizeof(struct bch_member)) {
+               unsigned u64s = DIV_ROUND_UP((sizeof(*mi) + sizeof(mi->_members[0]) *
+                                             c->disk_sb.sb->nr_devices), 8);
+
+               mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
+               if (!mi)
+                       return -BCH_ERR_ENOSPC_sb_members_v2;
+
+               for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) {
+                       void *dst = (void *) mi->_members + (i * sizeof(struct bch_member));
+                       memmove(dst, __bch2_members_v2_get_mut(mi, i), le16_to_cpu(mi->member_bytes));
+                       memset(dst + le16_to_cpu(mi->member_bytes),
+                              0, (sizeof(struct bch_member) - le16_to_cpu(mi->member_bytes)));
+               }
+               mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
+       }
+       return 0;
+}
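A note on why the loop in sb_members_v2_resize_entries() walks devices from last to first: entries are widened in place from member_bytes to sizeof(struct bch_member), so every entry's destination offset is at or beyond its source offset, and starting from the end avoids zero-padding an earlier entry over a later entry's not-yet-moved bytes. Schematically, for two devices with old entry size o and new size n > o:

/*
 * before:  | m0 (o bytes) | m1 (o bytes) |    ...unused...    |
 * after:   | m0 (n bytes, tail zeroed) | m1 (n bytes, tail zeroed) |
 *
 * m1 must be moved to its new offset (1 * n) before m0's zero padding
 * is written over [o, n), which still holds m1's old bytes.
 */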
+
+int bch2_sb_members_v2_init(struct bch_fs *c)
+{
+       struct bch_sb_field_members_v1 *mi1;
+       struct bch_sb_field_members_v2 *mi2;
+
+       if (!bch2_sb_field_get(c->disk_sb.sb, members_v2)) {
+               mi2 = bch2_sb_field_resize(&c->disk_sb, members_v2,
+                               DIV_ROUND_UP(sizeof(*mi2) +
+                                            sizeof(struct bch_member) * c->sb.nr_devices,
+                                            sizeof(u64)));
+               mi1 = bch2_sb_field_get(c->disk_sb.sb, members_v1);
+               memcpy(&mi2->_members[0], &mi1->_members[0],
+                      BCH_MEMBER_V1_BYTES * c->sb.nr_devices);
+               memset(&mi2->pad[0], 0, sizeof(mi2->pad));
+               mi2->member_bytes = cpu_to_le16(BCH_MEMBER_V1_BYTES);
+       }
+
+       return sb_members_v2_resize_entries(c);
+}
+
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb)
+{
+       struct bch_sb_field_members_v1 *mi1;
+       struct bch_sb_field_members_v2 *mi2;
+
+       mi1 = bch2_sb_field_resize(disk_sb, members_v1,
+                       DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES *
+                                    disk_sb->sb->nr_devices, sizeof(u64)));
+       if (!mi1)
+               return -BCH_ERR_ENOSPC_sb_members;
+
+       mi2 = bch2_sb_field_get(disk_sb->sb, members_v2);
+
+       for (unsigned i = 0; i < disk_sb->sb->nr_devices; i++)
+               memcpy(members_v1_get_mut(mi1, i), __bch2_members_v2_get_mut(mi2, i), BCH_MEMBER_V1_BYTES);
+
+       return 0;
+}
+
+static int validate_member(struct printbuf *err,
+                          struct bch_member m,
+                          struct bch_sb *sb,
+                          int i)
+{
+       if (le64_to_cpu(m.nbuckets) > LONG_MAX) {
+               prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
+                          i, le64_to_cpu(m.nbuckets), LONG_MAX);
+               return -BCH_ERR_invalid_sb_members;
+       }
+
+       if (le64_to_cpu(m.nbuckets) -
+           le16_to_cpu(m.first_bucket) < BCH_MIN_NR_NBUCKETS) {
+               prt_printf(err, "device %u: not enough buckets (got %llu, min %u)",
+                          i, le64_to_cpu(m.nbuckets), BCH_MIN_NR_NBUCKETS);
+               return -BCH_ERR_invalid_sb_members;
+       }
+
+       if (le16_to_cpu(m.bucket_size) <
+           le16_to_cpu(sb->block_size)) {
+               prt_printf(err, "device %u: bucket size %u smaller than block size %u",
+                          i, le16_to_cpu(m.bucket_size), le16_to_cpu(sb->block_size));
+               return -BCH_ERR_invalid_sb_members;
+       }
+
+       if (le16_to_cpu(m.bucket_size) <
+           BCH_SB_BTREE_NODE_SIZE(sb)) {
+               prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
+                          i, le16_to_cpu(m.bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+               return -BCH_ERR_invalid_sb_members;
+       }
+
+       return 0;
+}
+
+static void member_to_text(struct printbuf *out,
+                          struct bch_member m,
+                          struct bch_sb_field_disk_groups *gi,
+                          struct bch_sb *sb,
+                          int i)
+{
+       unsigned data_have = bch2_sb_dev_has_data(sb, i);
+       u64 bucket_size = le16_to_cpu(m.bucket_size);
+       u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size;
+
+       if (!bch2_member_exists(&m))
+               return;
+
+       prt_printf(out, "Device:");
+       prt_tab(out);
+       prt_printf(out, "%u", i);
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+
+       prt_printf(out, "Label:");
+       prt_tab(out);
+       if (BCH_MEMBER_GROUP(&m)) {
+               unsigned idx = BCH_MEMBER_GROUP(&m) - 1;
+
+               if (idx < disk_groups_nr(gi))
+                       prt_printf(out, "%s (%u)",
+                                  gi->entries[idx].label, idx);
+               else
+                       prt_printf(out, "(bad disk labels section)");
+       } else {
+               prt_printf(out, "(none)");
+       }
+       prt_newline(out);
+
+       prt_printf(out, "UUID:");
+       prt_tab(out);
+       pr_uuid(out, m.uuid.b);
+       prt_newline(out);
+
+       prt_printf(out, "Size:");
+       prt_tab(out);
+       prt_units_u64(out, device_size << 9);
+       prt_newline(out);
+
+       for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+               prt_printf(out, "%s errors:", bch2_member_error_strs[i]);
+               prt_tab(out);
+               prt_u64(out, le64_to_cpu(m.errors[i]));
+               prt_newline(out);
+       }
+
+       for (unsigned i = 0; i < BCH_IOPS_NR; i++) {
+               prt_printf(out, "%s iops:", bch2_iops_measurements[i]);
+               prt_tab(out);
+               prt_printf(out, "%u", le32_to_cpu(m.iops[i]));
+               prt_newline(out);
+       }
+
+       prt_printf(out, "Bucket size:");
+       prt_tab(out);
+       prt_units_u64(out, bucket_size << 9);
+       prt_newline(out);
+
+       prt_printf(out, "First bucket:");
+       prt_tab(out);
+       prt_printf(out, "%u", le16_to_cpu(m.first_bucket));
+       prt_newline(out);
+
+       prt_printf(out, "Buckets:");
+       prt_tab(out);
+       prt_printf(out, "%llu", le64_to_cpu(m.nbuckets));
+       prt_newline(out);
+
+       prt_printf(out, "Last mount:");
+       prt_tab(out);
+       if (m.last_mount)
+               bch2_prt_datetime(out, le64_to_cpu(m.last_mount));
+       else
+               prt_printf(out, "(never)");
+       prt_newline(out);
+
+       prt_printf(out, "State:");
+       prt_tab(out);
+       prt_printf(out, "%s",
+                  BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR
+                  ? bch2_member_states[BCH_MEMBER_STATE(&m)]
+                  : "unknown");
+       prt_newline(out);
+
+       prt_printf(out, "Data allowed:");
+       prt_tab(out);
+       if (BCH_MEMBER_DATA_ALLOWED(&m))
+               prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+       else
+               prt_printf(out, "(none)");
+       prt_newline(out);
+
+       prt_printf(out, "Has data:");
+       prt_tab(out);
+       if (data_have)
+               prt_bitflags(out, bch2_data_types, data_have);
+       else
+               prt_printf(out, "(none)");
+       prt_newline(out);
+
+       prt_printf(out, "Discard:");
+       prt_tab(out);
+       prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
+       prt_newline(out);
+
+       prt_printf(out, "Freespace initialized:");
+       prt_tab(out);
+       prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(&m));
+       prt_newline(out);
+
+       printbuf_indent_sub(out, 2);
+}
+
+static int bch2_sb_members_v1_validate(struct bch_sb *sb,
+                                   struct bch_sb_field *f,
+                                   struct printbuf *err)
+{
+       struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+       unsigned i;
+
+       if ((void *) members_v1_get_mut(mi, sb->nr_devices) > vstruct_end(&mi->field)) {
+               prt_printf(err, "too many devices for section size");
+               return -BCH_ERR_invalid_sb_members;
+       }
+
+       for (i = 0; i < sb->nr_devices; i++) {
+               struct bch_member m = members_v1_get(mi, i);
+
+               int ret = validate_member(err, m, sb, i);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb,
+                                      struct bch_sb_field *f)
+{
+       struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1);
+       struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
+       unsigned i;
+
+       for (i = 0; i < sb->nr_devices; i++)
+               member_to_text(out, members_v1_get(mi, i), gi, sb, i);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = {
+       .validate       = bch2_sb_members_v1_validate,
+       .to_text        = bch2_sb_members_v1_to_text,
+};
+
+static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+                                      struct bch_sb_field *f)
+{
+       struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+       struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups);
+       unsigned i;
+
+       for (i = 0; i < sb->nr_devices; i++)
+               member_to_text(out, members_v2_get(mi, i), gi, sb, i);
+}
+
+static int bch2_sb_members_v2_validate(struct bch_sb *sb,
+                                      struct bch_sb_field *f,
+                                      struct printbuf *err)
+{
+       struct bch_sb_field_members_v2 *mi = field_to_type(f, members_v2);
+       size_t mi_bytes = (void *) __bch2_members_v2_get_mut(mi, sb->nr_devices) -
+               (void *) mi;
+
+       if (mi_bytes > vstruct_bytes(&mi->field)) {
+               prt_printf(err, "section too small (%zu > %zu)",
+                          mi_bytes, vstruct_bytes(&mi->field));
+               return -BCH_ERR_invalid_sb_members;
+       }
+
+       for (unsigned i = 0; i < sb->nr_devices; i++) {
+               int ret = validate_member(err, members_v2_get(mi, i), sb, i);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
+       .validate       = bch2_sb_members_v2_validate,
+       .to_text        = bch2_sb_members_v2_to_text,
+};
+
+void bch2_sb_members_from_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+       struct bch_dev *ca;
+       unsigned i, e;
+
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i, NULL) {
+               struct bch_member *m = __bch2_members_v2_get_mut(mi, i);
+
+               for (e = 0; e < BCH_MEMBER_ERROR_NR; e++)
+                       m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
+       }
+       rcu_read_unlock();
+}
+
+void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca)
+{
+       struct bch_fs *c = ca->fs;
+       struct bch_member m;
+
+       mutex_lock(&ca->fs->sb_lock);
+       m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);
+       mutex_unlock(&ca->fs->sb_lock);
+
+       printbuf_tabstop_push(out, 12);
+
+       prt_str(out, "IO errors since filesystem creation");
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+       for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+               prt_printf(out, "%s:", bch2_member_error_strs[i]);
+               prt_tab(out);
+               prt_u64(out, atomic64_read(&ca->errors[i]));
+               prt_newline(out);
+       }
+       printbuf_indent_sub(out, 2);
+
+       prt_str(out, "IO errors since ");
+       bch2_pr_time_units(out, (ktime_get_real_seconds() - le64_to_cpu(m.errors_reset_time)) * NSEC_PER_SEC);
+       prt_str(out, " ago");
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+       for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) {
+               prt_printf(out, "%s:", bch2_member_error_strs[i]);
+               prt_tab(out);
+               prt_u64(out, atomic64_read(&ca->errors[i]) - le64_to_cpu(m.errors_at_reset[i]));
+               prt_newline(out);
+       }
+       printbuf_indent_sub(out, 2);
+}
+
+void bch2_dev_errors_reset(struct bch_dev *ca)
+{
+       struct bch_fs *c = ca->fs;
+       struct bch_member *m;
+
+       mutex_lock(&c->sb_lock);
+       m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+       for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
+               m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
+       m->errors_reset_time = ktime_get_real_seconds();
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+}
diff --git a/libbcachefs/sb-members.h b/libbcachefs/sb-members.h
new file mode 100644 (file)
index 0000000..03613e3
--- /dev/null
@@ -0,0 +1,227 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_MEMBERS_H
+#define _BCACHEFS_SB_MEMBERS_H
+
+extern char * const bch2_member_error_strs[];
+
+static inline struct bch_member *
+__bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i)
+{
+       return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes));
+}
+
+int bch2_sb_members_v2_init(struct bch_fs *c);
+int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb);
+struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i);
+struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i);
+
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
+{
+       return !percpu_ref_is_zero(&ca->io_ref);
+}
+
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
+{
+       return bch2_dev_is_online(ca) &&
+               ca->mi.state != BCH_MEMBER_STATE_failed;
+}
+
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
+{
+       if (!percpu_ref_tryget(&ca->io_ref))
+               return false;
+
+       if (ca->mi.state == BCH_MEMBER_STATE_rw ||
+           (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
+               return true;
+
+       percpu_ref_put(&ca->io_ref);
+       return false;
+}
+
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
+{
+       return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
+}
+
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
+                                        unsigned dev)
+{
+       unsigned i;
+
+       for (i = 0; i < devs.nr; i++)
+               if (devs.devs[i] == dev)
+                       return true;
+
+       return false;
+}
+
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
+                                         unsigned dev)
+{
+       unsigned i;
+
+       for (i = 0; i < devs->nr; i++)
+               if (devs->devs[i] == dev) {
+                       array_remove_item(devs->devs, devs->nr, i);
+                       return;
+               }
+}
+
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
+                                        unsigned dev)
+{
+       if (!bch2_dev_list_has_dev(*devs, dev)) {
+               BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
+               devs->devs[devs->nr++] = dev;
+       }
+}
+
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+       return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
+                                             const struct bch_devs_mask *mask)
+{
+       struct bch_dev *ca = NULL;
+
+       while ((*iter = mask
+               ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
+               : *iter) < c->sb.nr_devices &&
+              !(ca = rcu_dereference_check(c->devs[*iter],
+                                           lockdep_is_held(&c->state_lock))))
+               (*iter)++;
+
+       return ca;
+}
+
+#define for_each_member_device_rcu(ca, c, iter, mask)                  \
+       for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+{
+       struct bch_dev *ca;
+
+       rcu_read_lock();
+       if ((ca = __bch2_next_dev(c, iter, NULL)))
+               percpu_ref_get(&ca->ref);
+       rcu_read_unlock();
+
+       return ca;
+}
+
+/*
+ * If you break early, you must drop your ref on the current device
+ */
+#define for_each_member_device(ca, c, iter)                            \
+       for ((iter) = 0;                                                \
+            (ca = bch2_get_next_dev(c, &(iter)));                      \
+            percpu_ref_put(&ca->ref), (iter)++)
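As the comment above warns, breaking out of for_each_member_device() early leaves the ref taken by bch2_get_next_dev() held. A minimal sketch of handling that correctly, assuming a hypothetical predicate bch2_dev_has_foo() used only for illustration:

static bool any_dev_has_foo(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;

	for_each_member_device(ca, c, i)
		if (bch2_dev_has_foo(ca)) {
			/* drop the ref the iterator took before breaking out early */
			percpu_ref_put(&ca->ref);
			return true;
		}

	return false;
}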
+
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
+                                                     unsigned *iter,
+                                                     int state_mask)
+{
+       struct bch_dev *ca;
+
+       rcu_read_lock();
+       while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+              (!((1 << ca->mi.state) & state_mask) ||
+               !percpu_ref_tryget(&ca->io_ref)))
+               (*iter)++;
+       rcu_read_unlock();
+
+       return ca;
+}
+
+#define __for_each_online_member(ca, c, iter, state_mask)              \
+       for ((iter) = 0;                                                \
+            (ca = bch2_get_next_online_dev(c, &(iter), state_mask));   \
+            percpu_ref_put(&ca->io_ref), (iter)++)
+
+#define for_each_online_member(ca, c, iter)                            \
+       __for_each_online_member(ca, c, iter, ~0)
+
+#define for_each_rw_member(ca, c, iter)                                        \
+       __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
+
+#define for_each_readable_member(ca, c, iter)                          \
+       __for_each_online_member(ca, c, iter,                           \
+               (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
+
+/*
+ * If a key exists that references a device, the device won't be going away and
+ * we can omit rcu_read_lock():
+ */
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
+{
+       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+       return rcu_dereference_check(c->devs[idx], 1);
+}
+
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
+{
+       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
+
+       return rcu_dereference_protected(c->devs[idx],
+                                        lockdep_is_held(&c->sb_lock) ||
+                                        lockdep_is_held(&c->state_lock));
+}
+
+/* XXX kill, move to struct bch_fs */
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+       struct bch_devs_mask devs;
+       struct bch_dev *ca;
+       unsigned i;
+
+       memset(&devs, 0, sizeof(devs));
+       for_each_online_member(ca, c, i)
+               __set_bit(ca->dev_idx, devs.d);
+       return devs;
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1;
+extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
+
+static inline bool bch2_member_exists(struct bch_member *m)
+{
+       return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
+}
+
+static inline bool bch2_dev_exists(struct bch_sb *sb, unsigned dev)
+{
+       if (dev < sb->nr_devices) {
+               struct bch_member m = bch2_sb_member_get(sb, dev);
+               return bch2_member_exists(&m);
+       }
+       return false;
+}
+
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
+{
+       return (struct bch_member_cpu) {
+               .nbuckets       = le64_to_cpu(mi->nbuckets),
+               .first_bucket   = le16_to_cpu(mi->first_bucket),
+               .bucket_size    = le16_to_cpu(mi->bucket_size),
+               .group          = BCH_MEMBER_GROUP(mi),
+               .state          = BCH_MEMBER_STATE(mi),
+               .discard        = BCH_MEMBER_DISCARD(mi),
+               .data_allowed   = BCH_MEMBER_DATA_ALLOWED(mi),
+               .durability     = BCH_MEMBER_DURABILITY(mi)
+                       ? BCH_MEMBER_DURABILITY(mi) - 1
+                       : 1,
+               .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
+               .valid          = bch2_member_exists(mi),
+       };
+}
+
+void bch2_sb_members_from_cpu(struct bch_fs *);
+
+void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
+void bch2_dev_errors_reset(struct bch_dev *);
+
+#endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
new file mode 100644 (file)
index 0000000..f37d5ad
--- /dev/null
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/log2.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/rt.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+
+#include <trace/events/lock.h>
+
+#include "six.h"
+
+#ifdef DEBUG
+#define EBUG_ON(cond)                  BUG_ON(cond)
+#else
+#define EBUG_ON(cond)                  do {} while (0)
+#endif
+
+#define six_acquire(l, t, r, ip)       lock_acquire(l, 0, t, r, 1, NULL, ip)
+#define six_release(l, ip)             lock_release(l, ip)
+
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
+
+#define SIX_LOCK_HELD_read_OFFSET      0
+#define SIX_LOCK_HELD_read             ~(~0U << 26)
+#define SIX_LOCK_HELD_intent           (1U << 26)
+#define SIX_LOCK_HELD_write            (1U << 27)
+#define SIX_LOCK_WAITING_read          (1U << (28 + SIX_LOCK_read))
+#define SIX_LOCK_WAITING_write         (1U << (28 + SIX_LOCK_write))
+#define SIX_LOCK_NOSPIN                        (1U << 31)
+
+struct six_lock_vals {
+       /* Value we add to the lock in order to take the lock: */
+       u32                     lock_val;
+
+       /* If the lock has this value (used as a mask), taking the lock fails: */
+       u32                     lock_fail;
+
+       /* Mask that indicates lock is held for this type: */
+       u32                     held_mask;
+
+       /* Waitlist we wakeup when releasing the lock: */
+       enum six_lock_type      unlock_wakeup;
+};
+
+static const struct six_lock_vals l[] = {
+       [SIX_LOCK_read] = {
+               .lock_val       = 1U << SIX_LOCK_HELD_read_OFFSET,
+               .lock_fail      = SIX_LOCK_HELD_write,
+               .held_mask      = SIX_LOCK_HELD_read,
+               .unlock_wakeup  = SIX_LOCK_write,
+       },
+       [SIX_LOCK_intent] = {
+               .lock_val       = SIX_LOCK_HELD_intent,
+               .lock_fail      = SIX_LOCK_HELD_intent,
+               .held_mask      = SIX_LOCK_HELD_intent,
+               .unlock_wakeup  = SIX_LOCK_intent,
+       },
+       [SIX_LOCK_write] = {
+               .lock_val       = SIX_LOCK_HELD_write,
+               .lock_fail      = SIX_LOCK_HELD_read,
+               .held_mask      = SIX_LOCK_HELD_write,
+               .unlock_wakeup  = SIX_LOCK_read,
+       },
+};
+
+static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
+{
+       if ((atomic_read(&lock->state) & mask) != mask)
+               atomic_or(mask, &lock->state);
+}
+
+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
+{
+       if (atomic_read(&lock->state) & mask)
+               atomic_and(~mask, &lock->state);
+}
+
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
+                                u32 old, struct task_struct *owner)
+{
+       if (type != SIX_LOCK_intent)
+               return;
+
+       if (!(old & SIX_LOCK_HELD_intent)) {
+               EBUG_ON(lock->owner);
+               lock->owner = owner;
+       } else {
+               EBUG_ON(lock->owner != current);
+       }
+}
+
+static inline unsigned pcpu_read_count(struct six_lock *lock)
+{
+       unsigned read_count = 0;
+       int cpu;
+
+       for_each_possible_cpu(cpu)
+               read_count += *per_cpu_ptr(lock->readers, cpu);
+       return read_count;
+}
+
+/*
+ * __do_six_trylock() - main trylock routine
+ *
+ * Returns 1 on success, 0 on failure
+ *
+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure
+ * for another thread taking the competing lock type, and we may have to do a
+ * wakeup: when a wakeup is required, we return -1 - wakeup_type.
+ */
+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
+                           struct task_struct *task, bool try)
+{
+       int ret;
+       u32 old;
+
+       EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
+       EBUG_ON(type == SIX_LOCK_write &&
+               (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
+
+       /*
+        * Percpu reader mode:
+        *
+        * The basic idea behind this algorithm is that you can implement a lock
+        * between two threads without any atomics, just memory barriers:
+        *
+        * For two threads you'll need two variables, one variable for "thread a
+        * has the lock" and another for "thread b has the lock".
+        *
+        * To take the lock, a thread sets its variable indicating that it holds
+        * the lock, then issues a full memory barrier, then reads from the
+        * other thread's variable to check if the other thread thinks it has
+        * the lock. If we raced, we backoff and retry/sleep.
+        *
+        * Failure to take the lock may cause a spurious trylock failure in
+        * another thread, because we temporarily set the lock to indicate that
+        * we held it. This would be a problem for a thread in six_lock(), when
+        * it is calling trylock after adding itself to the waitlist and
+        * prior to sleeping.
+        *
+        * Therefore, if we fail to get the lock, and there were waiters of the
+        * type we conflict with, we will have to issue a wakeup.
+        *
+        * Since we may be called under wait_lock (and by the wakeup code
+        * itself), we return that the wakeup has to be done instead of doing it
+        * here.
+        */
+       if (type == SIX_LOCK_read && lock->readers) {
+               preempt_disable();
+               this_cpu_inc(*lock->readers); /* signal that we own lock */
+
+               smp_mb();
+
+               old = atomic_read(&lock->state);
+               ret = !(old & l[type].lock_fail);
+
+               this_cpu_sub(*lock->readers, !ret);
+               preempt_enable();
+
+               if (!ret) {
+                       smp_mb();
+                       if (atomic_read(&lock->state) & SIX_LOCK_WAITING_write)
+                               ret = -1 - SIX_LOCK_write;
+               }
+       } else if (type == SIX_LOCK_write && lock->readers) {
+               if (try) {
+                       atomic_add(SIX_LOCK_HELD_write, &lock->state);
+                       smp_mb__after_atomic();
+               }
+
+               ret = !pcpu_read_count(lock);
+
+               if (try && !ret) {
+                       old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
+                       if (old & SIX_LOCK_WAITING_read)
+                               ret = -1 - SIX_LOCK_read;
+               }
+       } else {
+               old = atomic_read(&lock->state);
+               do {
+                       ret = !(old & l[type].lock_fail);
+                       if (!ret || (type == SIX_LOCK_write && !try)) {
+                               smp_mb();
+                               break;
+                       }
+               } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
+
+               EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
+       }
+
+       if (ret > 0)
+               six_set_owner(lock, type, old, task);
+
+       EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
+               (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
+
+       return ret;
+}
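The percpu reader path above is the many-reader generalization of the two-thread, barrier-only handshake described in the comment. A standalone sketch of that handshake using C11 atomics, with illustrative names; the kernel code uses this_cpu_inc()/smp_mb() and the lock state word instead:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic bool a_holds, b_holds;

/* Thread A's trylock; thread B runs the mirror image with the flags swapped. */
static bool a_trylock(void)
{
	atomic_store_explicit(&a_holds, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the "full memory barrier" */

	if (atomic_load_explicit(&b_holds, memory_order_relaxed)) {
		/* raced with B: back off, like the this_cpu_sub() above */
		atomic_store_explicit(&a_holds, false, memory_order_relaxed);
		return false;
	}
	return true;
}

static void a_unlock(void)
{
	atomic_store_explicit(&a_holds, false, memory_order_release);
}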
+
+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
+{
+       struct six_lock_waiter *w, *next;
+       struct task_struct *task;
+       bool saw_one;
+       int ret;
+again:
+       ret = 0;
+       saw_one = false;
+       raw_spin_lock(&lock->wait_lock);
+
+       list_for_each_entry_safe(w, next, &lock->wait_list, list) {
+               if (w->lock_want != lock_type)
+                       continue;
+
+               if (saw_one && lock_type != SIX_LOCK_read)
+                       goto unlock;
+               saw_one = true;
+
+               ret = __do_six_trylock(lock, lock_type, w->task, false);
+               if (ret <= 0)
+                       goto unlock;
+
+               /*
+                * Similar to percpu_rwsem_wake_function(), we need to guard
+                * against the wakee noticing w->lock_acquired, returning, and
+                * then exiting before we do the wakeup:
+                */
+               task = get_task_struct(w->task);
+               __list_del(w->list.prev, w->list.next);
+               /*
+                * The release barrier here ensures the ordering of the
+                * __list_del before setting w->lock_acquired; @w is on the
+                * stack of the thread doing the waiting and will be reused
+                * after it sees w->lock_acquired with no other locking:
+                * pairs with smp_load_acquire() in six_lock_slowpath()
+                */
+               smp_store_release(&w->lock_acquired, true);
+               wake_up_process(task);
+               put_task_struct(task);
+       }
+
+       six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
+unlock:
+       raw_spin_unlock(&lock->wait_lock);
+
+       if (ret < 0) {
+               lock_type = -ret - 1;
+               goto again;
+       }
+}
+
+__always_inline
+static void six_lock_wakeup(struct six_lock *lock, u32 state,
+                           enum six_lock_type lock_type)
+{
+       if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
+               return;
+
+       if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
+               return;
+
+       __six_lock_wakeup(lock, lock_type);
+}
+
+__always_inline
+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
+{
+       int ret;
+
+       ret = __do_six_trylock(lock, type, current, try);
+       if (ret < 0)
+               __six_lock_wakeup(lock, -ret - 1);
+
+       return ret > 0;
+}
+
+/**
+ * six_trylock_ip - attempt to take a six lock without blocking
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+       if (!do_six_trylock(lock, type, true))
+               return false;
+
+       if (type != SIX_LOCK_write)
+               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
+       return true;
+}
+EXPORT_SYMBOL_GPL(six_trylock_ip);
+
+/**
+ * six_relock_ip - attempt to re-take a lock that was held previously
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq:       lock sequence number obtained from six_lock_seq() while lock was
+ *             held previously
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: true on success, false on failure.
+ */
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+                  unsigned seq, unsigned long ip)
+{
+       if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
+               return false;
+
+       if (six_lock_seq(lock) != seq) {
+               six_unlock_ip(lock, type, ip);
+               return false;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(six_relock_ip);
+
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+
+static inline bool six_owner_running(struct six_lock *lock)
+{
+       /*
+        * When there's no owner, we might have preempted between the owner
+        * acquiring the lock and setting the owner field. If we're an RT task
+        * that will live-lock because we won't let the owner complete.
+        */
+       rcu_read_lock();
+       struct task_struct *owner = READ_ONCE(lock->owner);
+       bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static inline bool six_optimistic_spin(struct six_lock *lock,
+                                      struct six_lock_waiter *wait,
+                                      enum six_lock_type type)
+{
+       unsigned loop = 0;
+       u64 end_time;
+
+       if (type == SIX_LOCK_write)
+               return false;
+
+       if (lock->wait_list.next != &wait->list)
+               return false;
+
+       if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
+               return false;
+
+       preempt_disable();
+       end_time = sched_clock() + 10 * NSEC_PER_USEC;
+
+       while (!need_resched() && six_owner_running(lock)) {
+               /*
+                * Ensures that writes to the waitlist entry happen after we see
+                * wait->lock_acquired: pairs with the smp_store_release in
+                * __six_lock_wakeup
+                */
+               if (smp_load_acquire(&wait->lock_acquired)) {
+                       preempt_enable();
+                       return true;
+               }
+
+               if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+                       six_set_bitmask(lock, SIX_LOCK_NOSPIN);
+                       break;
+               }
+
+               /*
+                * The cpu_relax() call is a compiler barrier which forces
+                * everything in this loop to be re-loaded. We don't need
+                * memory barriers as we'll eventually observe the right
+                * values at the cost of a few extra spins.
+                */
+               cpu_relax();
+       }
+
+       preempt_enable();
+       return false;
+}
+
+#else /* CONFIG_LOCK_SPIN_ON_OWNER */
+
+static inline bool six_optimistic_spin(struct six_lock *lock,
+                                      struct six_lock_waiter *wait,
+                                      enum six_lock_type type)
+{
+       return false;
+}
+
+#endif
+
+noinline
+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
+                            struct six_lock_waiter *wait,
+                            six_lock_should_sleep_fn should_sleep_fn, void *p,
+                            unsigned long ip)
+{
+       int ret = 0;
+
+       if (type == SIX_LOCK_write) {
+               EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+               atomic_add(SIX_LOCK_HELD_write, &lock->state);
+               smp_mb__after_atomic();
+       }
+
+       trace_contention_begin(lock, 0);
+       lock_contended(&lock->dep_map, ip);
+
+       wait->task              = current;
+       wait->lock_want         = type;
+       wait->lock_acquired     = false;
+
+       raw_spin_lock(&lock->wait_lock);
+       six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
+       /*
+        * Retry taking the lock after taking waitlist lock, in case we raced
+        * with an unlock:
+        */
+       ret = __do_six_trylock(lock, type, current, false);
+       if (ret <= 0) {
+               wait->start_time = local_clock();
+
+               if (!list_empty(&lock->wait_list)) {
+                       struct six_lock_waiter *last =
+                               list_last_entry(&lock->wait_list,
+                                       struct six_lock_waiter, list);
+
+                       if (time_before_eq64(wait->start_time, last->start_time))
+                               wait->start_time = last->start_time + 1;
+               }
+
+               list_add_tail(&wait->list, &lock->wait_list);
+       }
+       raw_spin_unlock(&lock->wait_lock);
+
+       if (unlikely(ret > 0)) {
+               ret = 0;
+               goto out;
+       }
+
+       if (unlikely(ret < 0)) {
+               __six_lock_wakeup(lock, -ret - 1);
+               ret = 0;
+       }
+
+       if (six_optimistic_spin(lock, wait, type))
+               goto out;
+
+       while (1) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+
+               /*
+                * Ensures that writes to the waitlist entry happen after we see
+                * wait->lock_acquired: pairs with the smp_store_release in
+                * __six_lock_wakeup
+                */
+               if (smp_load_acquire(&wait->lock_acquired))
+                       break;
+
+               ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
+               if (unlikely(ret)) {
+                       bool acquired;
+
+                       /*
+                        * If should_sleep_fn() returns an error, we are
+                        * required to return that error even if we already
+                        * acquired the lock - should_sleep_fn() might have
+                        * modified external state (e.g. when the deadlock cycle
+                        * detector in bcachefs issued a transaction restart)
+                        */
+                       raw_spin_lock(&lock->wait_lock);
+                       acquired = wait->lock_acquired;
+                       if (!acquired)
+                               list_del(&wait->list);
+                       raw_spin_unlock(&lock->wait_lock);
+
+                       if (unlikely(acquired))
+                               do_six_unlock_type(lock, type);
+                       break;
+               }
+
+               schedule();
+       }
+
+       __set_current_state(TASK_RUNNING);
+out:
+       if (ret && type == SIX_LOCK_write) {
+               six_clear_bitmask(lock, SIX_LOCK_HELD_write);
+               six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
+       }
+       trace_contention_end(lock, 0);
+
+       return ret;
+}
+
+/**
+ * six_lock_ip_waiter - take a lock, with full waitlist interface
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait:      pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *             to scheduling
+ * @p:         passed through to @should_sleep_fn
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * This is the most general six_lock() variant, with parameters to support full
+ * cycle detection for deadlock avoidance.
+ *
+ * The code calling this function must implement tracking of held locks, and the
+ * @wait object should be embedded into the struct that tracks held locks -
+ * which must also be accessible in a thread-safe way.
+ *
+ * @should_sleep_fn should invoke the cycle detector; it should walk each
+ * lock's waiters, and for each waiter recursively walk their held locks.
+ *
+ * When this function must block, @wait will be added to @lock's waitlist before
+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be
+ * removed from the lock waitlist until the lock has been successfully acquired,
+ * or we abort.
+ *
+ * @wait.start_time will be monotonically increasing for any given waitlist, and
+ * thus may be used as a loop cursor.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+                      struct six_lock_waiter *wait,
+                      six_lock_should_sleep_fn should_sleep_fn, void *p,
+                      unsigned long ip)
+{
+       int ret;
+
+       wait->start_time = 0;
+
+       if (type != SIX_LOCK_write)
+               six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
+
+       ret = do_six_trylock(lock, type, true) ? 0
+               : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
+
+       if (ret && type != SIX_LOCK_write)
+               six_release(&lock->dep_map, ip);
+       if (!ret)
+               lock_acquired(&lock->dep_map, ip);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
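A minimal sketch of the @should_sleep_fn contract: the callback runs each time the waiter is about to schedule, and a nonzero return aborts the lock attempt and is passed back to the caller. struct my_lock_ctx and the abort condition are hypothetical, for illustration only:

#include <linux/errno.h>

struct my_lock_ctx {
	bool must_abort;	/* set externally, e.g. by a deadlock/cycle detector */
};

static int my_should_sleep(struct six_lock *lock, void *p)
{
	struct my_lock_ctx *ctx = p;

	return ctx->must_abort ? -EINTR : 0;
}

static int lock_node_intent(struct six_lock *lock, struct my_lock_ctx *ctx)
{
	struct six_lock_waiter wait;

	/* returns 0 on success, or -EINTR if my_should_sleep() aborted */
	return six_lock_ip_waiter(lock, SIX_LOCK_intent, &wait,
				  my_should_sleep, ctx, _THIS_IP_);
}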
+
+__always_inline
+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       u32 state;
+
+       if (type == SIX_LOCK_intent)
+               lock->owner = NULL;
+
+       if (type == SIX_LOCK_read &&
+           lock->readers) {
+               smp_mb(); /* unlock barrier */
+               this_cpu_dec(*lock->readers);
+               smp_mb(); /* between unlocking and checking for waiters */
+               state = atomic_read(&lock->state);
+       } else {
+               u32 v = l[type].lock_val;
+
+               if (type != SIX_LOCK_read)
+                       v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
+
+               EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
+               state = atomic_sub_return_release(v, &lock->state);
+       }
+
+       six_lock_wakeup(lock, state, l[type].unlock_wakeup);
+}
+
+/**
+ * six_unlock_ip - drop a six lock
+ * @lock:      lock to unlock
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock);                          read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);      read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 0
+ */
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
+{
+       EBUG_ON(type == SIX_LOCK_write &&
+               !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+       EBUG_ON((type == SIX_LOCK_write ||
+                type == SIX_LOCK_intent) &&
+               lock->owner != current);
+
+       if (type != SIX_LOCK_write)
+               six_release(&lock->dep_map, ip);
+       else
+               lock->seq++;
+
+       if (type == SIX_LOCK_intent &&
+           lock->intent_lock_recurse) {
+               --lock->intent_lock_recurse;
+               return;
+       }
+
+       do_six_unlock_type(lock, type);
+}
+EXPORT_SYMBOL_GPL(six_unlock_ip);
+
+/**
+ * six_lock_downgrade - convert an intent lock to a read lock
+ * @lock:      lock to downgrade
+ *
+ * @lock will have read count incremented and intent count decremented
+ */
+void six_lock_downgrade(struct six_lock *lock)
+{
+       six_lock_increment(lock, SIX_LOCK_read);
+       six_unlock_intent(lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
+
+/**
+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock
+ * @lock:      lock to upgrade
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_lock_tryupgrade(struct six_lock *lock)
+{
+       u32 old = atomic_read(&lock->state), new;
+
+       do {
+               new = old;
+
+               if (new & SIX_LOCK_HELD_intent)
+                       return false;
+
+               if (!lock->readers) {
+                       EBUG_ON(!(new & SIX_LOCK_HELD_read));
+                       new -= l[SIX_LOCK_read].lock_val;
+               }
+
+               new |= SIX_LOCK_HELD_intent;
+       } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
+
+       if (lock->readers)
+               this_cpu_dec(*lock->readers);
+
+       six_set_owner(lock, SIX_LOCK_intent, old, current);
+
+       return true;
+}
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
+
+/**
+ * six_trylock_convert - attempt to convert a held lock from one type to another
+ * @lock:      lock to upgrade
+ * @from:      SIX_LOCK_read or SIX_LOCK_intent
+ * @to:                SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * On success, @lock will have intent count incremented and read count
+ * decremented
+ *
+ * Return: true on success, false on failure
+ */
+bool six_trylock_convert(struct six_lock *lock,
+                        enum six_lock_type from,
+                        enum six_lock_type to)
+{
+       EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
+
+       if (to == from)
+               return true;
+
+       if (to == SIX_LOCK_read) {
+               six_lock_downgrade(lock);
+               return true;
+       } else {
+               return six_lock_tryupgrade(lock);
+       }
+}
+EXPORT_SYMBOL_GPL(six_trylock_convert);
+
+/**
+ * six_lock_increment - increase held lock count on a lock that is already held
+ * @lock:      lock to increment
+ * @type:      SIX_LOCK_read or SIX_LOCK_intent
+ *
+ * @lock must already be held, with a lock type that is greater than or equal to
+ * @type
+ *
+ * A corresponding six_unlock_type() call will be required for @lock to be fully
+ * unlocked.
+ */
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
+{
+       six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
+
+       /* XXX: assert already locked, and that we don't overflow: */
+
+       switch (type) {
+       case SIX_LOCK_read:
+               if (lock->readers) {
+                       this_cpu_inc(*lock->readers);
+               } else {
+                       EBUG_ON(!(atomic_read(&lock->state) &
+                                 (SIX_LOCK_HELD_read|
+                                  SIX_LOCK_HELD_intent)));
+                       atomic_add(l[type].lock_val, &lock->state);
+               }
+               break;
+       case SIX_LOCK_intent:
+               EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
+               lock->intent_lock_recurse++;
+               break;
+       case SIX_LOCK_write:
+               BUG();
+               break;
+       }
+}
+EXPORT_SYMBOL_GPL(six_lock_increment);
+
+/**
+ * six_lock_wakeup_all - wake up all waiters on @lock
+ * @lock:      lock to wake up waiters for
+ *
+ * Waking up waiters will cause them to re-run should_sleep_fn, which may then
+ * abort the lock operation.
+ *
+ * This function is never needed in a bug-free program; it's only useful in
+ * debug code, e.g. to determine if a cycle detector is at fault.
+ */
+void six_lock_wakeup_all(struct six_lock *lock)
+{
+       u32 state = atomic_read(&lock->state);
+       struct six_lock_waiter *w;
+
+       six_lock_wakeup(lock, state, SIX_LOCK_read);
+       six_lock_wakeup(lock, state, SIX_LOCK_intent);
+       six_lock_wakeup(lock, state, SIX_LOCK_write);
+
+       raw_spin_lock(&lock->wait_lock);
+       list_for_each_entry(w, &lock->wait_list, list)
+               wake_up_process(w->task);
+       raw_spin_unlock(&lock->wait_lock);
+}
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
+
+/**
+ * six_lock_counts - return held lock counts, for each lock type
+ * @lock:      lock to return counters for
+ *
+ * Return: the number of times a lock is held for read, intent and write.
+ */
+struct six_lock_count six_lock_counts(struct six_lock *lock)
+{
+       struct six_lock_count ret;
+
+       ret.n[SIX_LOCK_read]    = !lock->readers
+               ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
+               : pcpu_read_count(lock);
+       ret.n[SIX_LOCK_intent]  = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
+               lock->intent_lock_recurse;
+       ret.n[SIX_LOCK_write]   = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(six_lock_counts);
+
+/**
+ * six_lock_readers_add - directly manipulate reader count of a lock
+ * @lock:      lock to add/subtract readers for
+ * @nr:                reader count to add/subtract
+ *
+ * When an upper layer is implementing lock reentrancy, we may have both read
+ * and intent locks on the same lock.
+ *
+ * When we need to take a write lock, the read locks will cause self-deadlock,
+ * because six locks themselves do not track which read locks are held by the
+ * current thread and which are held by a different thread - they do no
+ * per-thread tracking of held locks.
+ *
+ * The upper layer that is tracking held locks may, however, if trylock() has
+ * failed, count up its own read locks, subtract them, take the write lock, and
+ * then re-add them.
+ *
+ * As in any other situation when taking a write lock, @lock must be held for
+ * intent one (or more) times, so @lock will never be left unlocked.
+ */
+void six_lock_readers_add(struct six_lock *lock, int nr)
+{
+       if (lock->readers) {
+               this_cpu_add(*lock->readers, nr);
+       } else {
+               EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
+               /* reader count starts at bit 0 */
+               atomic_add(nr, &lock->state);
+       }
+}
+EXPORT_SYMBOL_GPL(six_lock_readers_add);
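A minimal sketch of the pattern described above, assuming the caller tracks its own read locks; my_read_locks_held() is hypothetical and stands in for that tracking:

static void take_write_with_own_readers(struct six_lock *lock)
{
	unsigned nr = my_read_locks_held(lock);	/* readers held by this thread */

	/* intent is already held; hide our own readers so write won't self-deadlock */
	six_lock_readers_add(lock, -(int) nr);
	six_lock_write(lock, NULL, NULL);
	six_lock_readers_add(lock, (int) nr);

	/* write lock is now held, and this thread still owns its nr read locks */
}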
+
+/**
+ * six_lock_exit - release resources held by a lock prior to freeing
+ * @lock:      lock to exit
+ *
+ * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
+ * required to free the percpu read counts.
+ */
+void six_lock_exit(struct six_lock *lock)
+{
+       WARN_ON(lock->readers && pcpu_read_count(lock));
+       WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
+
+       free_percpu(lock->readers);
+       lock->readers = NULL;
+}
+EXPORT_SYMBOL_GPL(six_lock_exit);
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+                    struct lock_class_key *key, enum six_lock_init_flags flags)
+{
+       atomic_set(&lock->state, 0);
+       raw_spin_lock_init(&lock->wait_lock);
+       INIT_LIST_HEAD(&lock->wait_list);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+       lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+
+       /*
+        * Don't assume that we have real percpu variables available in
+        * userspace:
+        */
+#ifdef __KERNEL__
+       if (flags & SIX_LOCK_INIT_PCPU) {
+               /*
+                * We don't return an error here on memory allocation failure
+                * since percpu is an optimization, and locks will work with the
+                * same semantics in non-percpu mode: callers can check for
+                * failure if they wish by checking lock->readers, but generally
+                * will not want to treat it as an error.
+                */
+               lock->readers = alloc_percpu(unsigned);
+       }
+#endif
+}
+EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/libbcachefs/six.h b/libbcachefs/six.h
new file mode 100644 (file)
index 0000000..a7104ac
--- /dev/null
@@ -0,0 +1,386 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_SIX_H
+#define _LINUX_SIX_H
+
+/**
+ * DOC: SIX locks overview
+ *
+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
+ * but with an additional state: read/shared, intent, exclusive/write
+ *
+ * The purpose of the intent state is to allow for greater concurrency on tree
+ * structures without deadlocking. In general, a read can't be upgraded to a
+ * write lock without deadlocking, so an operation that updates multiple nodes
+ * will have to take write locks for the full duration of the operation.
+ *
+ * But by adding an intent state, which is exclusive with other intent locks but
+ * not with readers, we can take intent locks at the start of the operation,
+ * and then take write locks only for the actual update to each individual
+ * node, without deadlocking.
+ *
+ * Example usage:
+ *   six_lock_read(&foo->lock);
+ *   six_unlock_read(&foo->lock);
+ *
+ * An intent lock must be held before taking a write lock:
+ *   six_lock_intent(&foo->lock);
+ *   six_lock_write(&foo->lock);
+ *   six_unlock_write(&foo->lock);
+ *   six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ *   six_trylock_read()
+ *   six_trylock_intent()
+ *   six_trylock_write()
+ *
+ *   six_lock_downgrade()      convert from intent to read
+ *   six_lock_tryupgrade()     attempt to convert from read to intent, may fail
+ *
+ * There are also interfaces that take the lock type as an enum:
+ *
+ *   six_lock_type(&foo->lock, SIX_LOCK_read);
+ *   six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
+ *   six_lock_type(&foo->lock, SIX_LOCK_write);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_write);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_intent);
+ *
+ * Lock sequence numbers - unlock(), relock():
+ *
+ *   Locks embed sequence numbers, which are incremented on write lock/unlock.
+ *   This allows locks to be dropped and then retaken iff the state they protect
+ *   hasn't changed; this makes it much easier to avoid holding locks while e.g.
+ *   doing IO or allocating memory.
+ *
+ *   Example usage:
+ *     six_lock_read(&foo->lock);
+ *     u32 seq = six_lock_seq(&foo->lock);
+ *     six_unlock_read(&foo->lock);
+ *
+ *     some_operation_that_may_block();
+ *
+ *     if (six_relock_read(&foo->lock, seq)) { ... }
+ *
+ *   If the relock operation succeeds, it is as if the lock was never unlocked.
+ *
+ * Reentrancy:
+ *
+ *   Six locks are not by themselves reentrant, but have counters for both the
+ *   read and intent states that can be used to provide reentrancy by an upper
+ *   layer that tracks held locks. If a lock is known to already be held in the
+ *   read or intent state, six_lock_increment() can be used to bump the "lock
+ *   held in this state" counter, increasing the number of unlock calls that
+ *   will be required to fully unlock it.
+ *
+ *   Example usage:
+ *     six_lock_read(&foo->lock);
+ *     six_lock_increment(&foo->lock, SIX_LOCK_read);
+ *     six_unlock_read(&foo->lock);
+ *     six_unlock_read(&foo->lock);
+ *   foo->lock is now fully unlocked.
+ *
+ *   Since the intent state supersedes read, it's legal to increment the read
+ *   counter when holding an intent lock, but not the reverse.
+ *
+ *   A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
+ *   is not legal.
+ *
+ * should_sleep_fn:
+ *
+ *   There is a six_lock() variant that takes a function pointer that is called
+ *   immediately prior to schedule() when blocking, and may return an error to
+ *   abort.
+ *
+ *   One possible use for this feature is when objects being locked are part of
+ *   a cache and may be reused, and lock ordering is based on a property of the
+ *   object that will change when the object is reused - i.e. logical key order.
+ *
+ *   If looking up an object in the cache may race with object reuse, and lock
+ *   ordering is required to prevent deadlock, object reuse may change the
+ *   correct lock order for that object and cause a deadlock. should_sleep_fn
+ *   can be used to check if the object is still the object we want and avoid
+ *   this deadlock.
+ *
+ * Wait list entry interface:
+ *
+ *   There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
+ *   wait list entry. By embedding six_lock_waiter into another object, and by
+ *   traversing lock waitlists, it is then possible for an upper layer to
+ *   implement full cycle detection for deadlock avoidance.
+ *
+ *   should_sleep_fn should be used for invoking the cycle detector, walking the
+ *   graph of held locks to check for a deadlock. The upper layer must track
+ *   held locks for each thread, and each thread's held locks must be reachable
+ *   from its six_lock_waiter object.
+ *
+ *   six_lock_waiter() will add the wait object to the waitlist before retrying
+ *   the lock, and before calling should_sleep_fn, and the wait object will not
+ *   be removed from the waitlist until either the lock has been successfully
+ *   acquired, or we aborted because should_sleep_fn returned an error.
+ *
+ *   Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
+ *   have timestamps in strictly ascending order - this is so the timestamp can
+ *   be used as a cursor for lock graph traversal.
+ */
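A sketch of the unlock/relock pattern summarized above, written with the full function signatures from this header; struct foo and the blocking allocation are illustrative only:

#include <linux/slab.h>

struct foo {
	struct six_lock	lock;
	/* ... state protected by lock ... */
};

static void *foo_alloc_outside_lock(struct foo *foo, size_t size)
{
	void *buf;
	u32 seq;
retry:
	six_lock_read(&foo->lock, NULL, NULL);
	seq = six_lock_seq(&foo->lock);
	six_unlock_read(&foo->lock);

	buf = kmalloc(size, GFP_KERNEL);	/* may block */
	if (!buf)
		return NULL;

	if (!six_relock_read(&foo->lock, seq)) {
		/* a write lock was taken while we slept: start over */
		kfree(buf);
		goto retry;
	}

	/* as if the lock was never dropped: use foo's state here */
	six_unlock_read(&foo->lock);
	return buf;
}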
+
+#include <linux/lockdep.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+enum six_lock_type {
+       SIX_LOCK_read,
+       SIX_LOCK_intent,
+       SIX_LOCK_write,
+};
+
+struct six_lock {
+       atomic_t                state;
+       u32                     seq;
+       unsigned                intent_lock_recurse;
+       struct task_struct      *owner;
+       unsigned __percpu       *readers;
+       raw_spinlock_t          wait_lock;
+       struct list_head        wait_list;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       struct lockdep_map      dep_map;
+#endif
+};
+
+struct six_lock_waiter {
+       struct list_head        list;
+       struct task_struct      *task;
+       enum six_lock_type      lock_want;
+       bool                    lock_acquired;
+       u64                     start_time;
+};
+
+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
+
+void six_lock_exit(struct six_lock *lock);
+
+enum six_lock_init_flags {
+       SIX_LOCK_INIT_PCPU      = 1U << 0,
+};
+
+void __six_lock_init(struct six_lock *lock, const char *name,
+                    struct lock_class_key *key, enum six_lock_init_flags flags);
+
+/**
+ * six_lock_init - initialize a six lock
+ * @lock:      lock to initialize
+ * @flags:     optional flags, i.e. SIX_LOCK_INIT_PCPU
+ */
+#define six_lock_init(lock, flags)                                     \
+do {                                                                   \
+       static struct lock_class_key __key;                             \
+                                                                       \
+       __six_lock_init((lock), #lock, &__key, flags);                  \
+} while (0)
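A small usage sketch of six_lock_init(), assuming a lock embedded in a caller-defined structure (struct my_node is illustrative):

struct my_node {
	struct six_lock	lock;
	/* ... */
};

static void my_node_init(struct my_node *n)
{
	six_lock_init(&n->lock, 0);
	/*
	 * or, to get percpu reader counts in the kernel (see __six_lock_init()):
	 * six_lock_init(&n->lock, SIX_LOCK_INIT_PCPU);
	 */
}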
+
+/**
+ * six_lock_seq - obtain current lock sequence number
+ * @lock:      six_lock to obtain sequence number for
+ *
+ * @lock should be held for read or intent, and not write
+ *
+ * By saving the lock sequence number, we can unlock @lock and then (typically
+ * after some blocking operation) attempt to relock it: the relock will succeed
+ * if the sequence number hasn't changed, meaning no write locks have been taken
+ * and state corresponding to what @lock protects is still valid.
+ */
+static inline u32 six_lock_seq(const struct six_lock *lock)
+{
+       return lock->seq;
+}
+
+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_trylock_type - attempt to take a six lock without blocking
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       return six_trylock_ip(lock, type, _THIS_IP_);
+}
+
+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
+                      struct six_lock_waiter *wait,
+                      six_lock_should_sleep_fn should_sleep_fn, void *p,
+                      unsigned long ip);
+
+/**
+ * six_lock_waiter - take a lock, with full waitlist interface
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @wait:      pointer to wait object, which will be added to lock's waitlist
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *             to scheduling
+ * @p:         passed through to @should_sleep_fn
+ *
+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function
+ * for full documentation.
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
+                                 struct six_lock_waiter *wait,
+                                 six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+/**
+ * six_lock_ip - take a six lock lock
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *             to scheduling
+ * @p:         passed through to @should_sleep_fn
+ * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
+                             six_lock_should_sleep_fn should_sleep_fn, void *p,
+                             unsigned long ip)
+{
+       struct six_lock_waiter wait;
+
+       return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
+}
+
+/**
+ * six_lock_type - take a six lock lock
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior
+ *             to scheduling
+ * @p:         passed through to @should_sleep_fn
+ *
+ * Return: 0 on success, or the return code from @should_sleep_fn on failure.
+ */
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
+                               six_lock_should_sleep_fn should_sleep_fn, void *p)
+{
+       struct six_lock_waiter wait;
+
+       return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
+}
+
+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
+                  unsigned seq, unsigned long ip);
+
+/**
+ * six_relock_type - attempt to re-take a lock that was held previously
+ * @lock:      lock to take
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ * @seq:       lock sequence number obtained from six_lock_seq() while lock was
+ *             held previously
+ *
+ * Return: true on success, false on failure.
+ */
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
+                                  unsigned seq)
+{
+       return six_relock_ip(lock, type, seq, _THIS_IP_);
+}
+
+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
+
+/**
+ * six_unlock_type - drop a six lock
+ * @lock:      lock to unlock
+ * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
+ *
+ * When a lock is held multiple times (because six_lock_increment() was used),
+ * this decrements the 'lock held' counter by one.
+ *
+ * For example:
+ * six_lock_read(&foo->lock);                          read count 1
+ * six_lock_increment(&foo->lock, SIX_LOCK_read);      read count 2
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 1
+ * six_unlock_type(&foo->lock, SIX_LOCK_read);         read count 0
+ */
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
+{
+       six_unlock_ip(lock, type, _THIS_IP_);
+}
+
+#define __SIX_LOCK(type)                                               \
+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
+{                                                                      \
+       return six_trylock_ip(lock, SIX_LOCK_##type, ip);               \
+}                                                                      \
+                                                                       \
+static inline bool six_trylock_##type(struct six_lock *lock)           \
+{                                                                      \
+       return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_);        \
+}                                                                      \
+                                                                       \
+static inline int six_lock_ip_waiter_##type(struct six_lock *lock,     \
+                          struct six_lock_waiter *wait,                \
+                          six_lock_should_sleep_fn should_sleep_fn, void *p,\
+                          unsigned long ip)                            \
+{                                                                      \
+       return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
+}                                                                      \
+                                                                       \
+static inline int six_lock_ip_##type(struct six_lock *lock,            \
+                   six_lock_should_sleep_fn should_sleep_fn, void *p,  \
+                   unsigned long ip)                                   \
+{                                                                      \
+       return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
+}                                                                      \
+                                                                       \
+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
+{                                                                      \
+       return six_relock_ip(lock, SIX_LOCK_##type, seq, ip);           \
+}                                                                      \
+                                                                       \
+static inline bool six_relock_##type(struct six_lock *lock, u32 seq)   \
+{                                                                      \
+       return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_);    \
+}                                                                      \
+                                                                       \
+static inline int six_lock_##type(struct six_lock *lock,               \
+                                 six_lock_should_sleep_fn fn, void *p)\
+{                                                                      \
+       return six_lock_ip_##type(lock, fn, p, _THIS_IP_);              \
+}                                                                      \
+                                                                       \
+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)       \
+{                                                                      \
+       six_unlock_ip(lock, SIX_LOCK_##type, ip);                       \
+}                                                                      \
+                                                                       \
+static inline void six_unlock_##type(struct six_lock *lock)            \
+{                                                                      \
+       six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_);                \
+}
+
+__SIX_LOCK(read)
+__SIX_LOCK(intent)
+__SIX_LOCK(write)
+#undef __SIX_LOCK
+
+void six_lock_downgrade(struct six_lock *);
+bool six_lock_tryupgrade(struct six_lock *);
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
+                        enum six_lock_type);
+
+void six_lock_increment(struct six_lock *, enum six_lock_type);
+
+void six_lock_wakeup_all(struct six_lock *);
+
+struct six_lock_count {
+       unsigned n[3];
+};
+
+struct six_lock_count six_lock_counts(struct six_lock *);
+void six_lock_readers_add(struct six_lock *, int);
+
+#endif /* _LINUX_SIX_H */
diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c
new file mode 100644 (file)
index 0000000..b23550b
--- /dev/null
@@ -0,0 +1,1713 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "bkey_buf.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "errcode.h"
+#include "error.h"
+#include "fs.h"
+#include "snapshot.h"
+
+#include <linux/random.h>
+
+/*
+ * Snapshot trees:
+ *
+ * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
+ * exist to provide a stable identifier for the whole lifetime of a snapshot
+ * tree.
+ */
+
+void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
+                               struct bkey_s_c k)
+{
+       struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
+
+       prt_printf(out, "subvol %u root snapshot %u",
+                  le32_to_cpu(t.v->master_subvol),
+                  le32_to_cpu(t.v->root_snapshot));
+}
+
+int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
+                              enum bkey_invalid_flags flags,
+                              struct printbuf *err)
+{
+       int ret = 0;
+
+       bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+                        bkey_lt(k.k->p, POS(0, 1)), c, err,
+                        snapshot_tree_pos_bad,
+                        "bad pos");
+fsck_err:
+       return ret;
+}
+
+int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
+                             struct bch_snapshot_tree *s)
+{
+       int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
+                                         BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
+
+       if (bch2_err_matches(ret, ENOENT))
+               ret = -BCH_ERR_ENOENT_snapshot_tree;
+       return ret;
+}
+
+struct bkey_i_snapshot_tree *
+__bch2_snapshot_tree_create(struct btree_trans *trans)
+{
+       struct btree_iter iter;
+       int ret = bch2_bkey_get_empty_slot(trans, &iter,
+                       BTREE_ID_snapshot_trees, POS(0, U32_MAX));
+       struct bkey_i_snapshot_tree *s_t;
+
+       if (ret == -BCH_ERR_ENOSPC_btree_slot)
+               ret = -BCH_ERR_ENOSPC_snapshot_tree;
+       if (ret)
+               return ERR_PTR(ret);
+
+       s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
+       ret = PTR_ERR_OR_ZERO(s_t);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret ? ERR_PTR(ret) : s_t;
+}
+
+static int bch2_snapshot_tree_create(struct btree_trans *trans,
+                               u32 root_id, u32 subvol_id, u32 *tree_id)
+{
+       struct bkey_i_snapshot_tree *n_tree =
+               __bch2_snapshot_tree_create(trans);
+
+       if (IS_ERR(n_tree))
+               return PTR_ERR(n_tree);
+
+       n_tree->v.master_subvol = cpu_to_le32(subvol_id);
+       n_tree->v.root_snapshot = cpu_to_le32(root_id);
+       *tree_id = n_tree->k.p.offset;
+       return 0;
+}
+
+/* Snapshot nodes: */
+
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
+       struct snapshot_table *t;
+
+       rcu_read_lock();
+       t = rcu_dereference(c->snapshots);
+
+       while (id && id < ancestor)
+               id = __snapshot_t(t, id)->parent;
+       rcu_read_unlock();
+
+       return id == ancestor;
+}
+
+static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+       const struct snapshot_t *s = __snapshot_t(t, id);
+
+       if (s->skip[2] <= ancestor)
+               return s->skip[2];
+       if (s->skip[1] <= ancestor)
+               return s->skip[1];
+       if (s->skip[0] <= ancestor)
+               return s->skip[0];
+       return s->parent;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+       struct snapshot_table *t;
+       bool ret;
+
+       EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
+
+       rcu_read_lock();
+       t = rcu_dereference(c->snapshots);
+
+       while (id && id < ancestor - IS_ANCESTOR_BITMAP)
+               id = get_ancestor_below(t, id, ancestor);
+
+       if (id && id < ancestor) {
+               ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
+
+               EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
+       } else {
+               ret = id == ancestor;
+       }
+
+       rcu_read_unlock();
+
+       return ret;
+}
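
[Editorial note, not part of the patch] The ancestor walk above leans on two per-node caches: up to three skip pointers to strictly older ancestors, and a bitmap covering the next IS_ANCESTOR_BITMAP ancestors. The standalone sketch below (toy types, constants and table, all made up for illustration) shows the same idea: jump via the largest skip pointer that does not overshoot, then let the bitmap answer the final stretch.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define TOY_BITMAP      8       /* stands in for IS_ANCESTOR_BITMAP */
#define TOY_MAX_ID      32

struct toy_snapshot {
        uint32_t parent;        /* 0 == root */
        uint32_t skip[3];       /* ancestors, sorted ascending, 0 if unused */
        uint32_t is_ancestor;   /* bit n set: (id + n + 1) is an ancestor */
};

static struct toy_snapshot toy[TOY_MAX_ID + 1];

/* same shape as get_ancestor_below(): biggest skip that doesn't overshoot */
static uint32_t toy_ancestor_below(uint32_t id, uint32_t ancestor)
{
        const struct toy_snapshot *s = &toy[id];

        if (s->skip[2] && s->skip[2] <= ancestor)
                return s->skip[2];
        if (s->skip[1] && s->skip[1] <= ancestor)
                return s->skip[1];
        if (s->skip[0] && s->skip[0] <= ancestor)
                return s->skip[0];
        return s->parent;
}

static bool toy_is_ancestor(uint32_t id, uint32_t ancestor)
{
        /* jump via skip pointers until the bitmap can answer */
        while (id && id + TOY_BITMAP < ancestor)
                id = toy_ancestor_below(id, ancestor);

        if (id && id < ancestor)
                return toy[id].is_ancestor & (1U << (ancestor - id - 1));

        return id == ancestor;
}

int main(void)
{
        /* linear chain 1 -> 2 -> ... -> 20; IDs grow towards the root */
        for (uint32_t id = 1; id < 20; id++) {
                toy[id].parent  = id + 1;
                toy[id].skip[0] = id + 1;
                toy[id].skip[1] = id + 4 <= 20 ? id + 4 : 20;
                toy[id].skip[2] = 20;   /* long jump straight to the root */

                for (uint32_t a = id + 1; a <= 20 && a - id - 1 < TOY_BITMAP; a++)
                        toy[id].is_ancestor |= 1U << (a - id - 1);
        }

        assert(toy_is_ancestor(1, 20));         /* answered via skip pointers */
        assert(toy_is_ancestor(3, 7));          /* answered by the bitmap */
        assert(toy_is_ancestor(5, 5));          /* a node is its own ancestor */
        assert(!toy_is_ancestor(20, 1));        /* never the other way around */
        return 0;
}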
+
+static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+       size_t idx = U32_MAX - id;
+       size_t new_size;
+       struct snapshot_table *new, *old;
+
+       new_size = max(16UL, roundup_pow_of_two(idx + 1));
+
+       new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+       if (!new)
+               return NULL;
+
+       old = rcu_dereference_protected(c->snapshots, true);
+       if (old)
+               memcpy(new->s,
+                      rcu_dereference_protected(c->snapshots, true)->s,
+                      sizeof(new->s[0]) * c->snapshot_table_size);
+
+       rcu_assign_pointer(c->snapshots, new);
+       c->snapshot_table_size = new_size;
+       kvfree_rcu_mightsleep(old);
+
+       return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+}
+
+static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
+{
+       size_t idx = U32_MAX - id;
+
+       lockdep_assert_held(&c->snapshot_table_lock);
+
+       if (likely(idx < c->snapshot_table_size))
+               return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+
+       return __snapshot_t_mut(c, id);
+}
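
[Editorial note, not part of the patch] Snapshot IDs are allocated counting down from U32_MAX, so indexing the in-memory table by U32_MAX - id means the table only ever grows at the tail. A minimal userspace sketch of that mapping and the power-of-two resize (toy names throughout; no RCU, locking or 16-entry minimum):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct toy_entry { uint32_t parent, equiv; };

static struct toy_entry *table;
static size_t table_size;

static size_t toy_roundup_pow2(size_t n)
{
        size_t r = 1;

        while (r < n)
                r <<= 1;
        return r;
}

/* grow the table so that @id (counted down from UINT32_MAX) fits */
static struct toy_entry *toy_entry_mut(uint32_t id)
{
        size_t idx = UINT32_MAX - id;

        if (idx >= table_size) {
                size_t new_size = toy_roundup_pow2(idx + 1);
                struct toy_entry *new = calloc(new_size, sizeof(*new));

                if (!new)
                        return NULL;
                if (table)
                        memcpy(new, table, table_size * sizeof(*new));
                free(table);            /* the real code defers this via RCU */
                table = new;
                table_size = new_size;
        }

        return &table[idx];
}

int main(void)
{
        assert(toy_entry_mut(UINT32_MAX));              /* idx 0 */
        assert(toy_entry_mut(UINT32_MAX - 1000));       /* forces a resize */
        assert(table_size == 1024);
        free(table);
        return 0;
}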
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+                          struct bkey_s_c k)
+{
+       struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+       prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
+              BCH_SNAPSHOT_SUBVOL(s.v),
+              BCH_SNAPSHOT_DELETED(s.v),
+              le32_to_cpu(s.v->parent),
+              le32_to_cpu(s.v->children[0]),
+              le32_to_cpu(s.v->children[1]),
+              le32_to_cpu(s.v->subvol),
+              le32_to_cpu(s.v->tree));
+
+       if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
+               prt_printf(out, " depth %u skiplist %u %u %u",
+                          le32_to_cpu(s.v->depth),
+                          le32_to_cpu(s.v->skip[0]),
+                          le32_to_cpu(s.v->skip[1]),
+                          le32_to_cpu(s.v->skip[2]));
+}
+
+int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
+                         enum bkey_invalid_flags flags,
+                         struct printbuf *err)
+{
+       struct bkey_s_c_snapshot s;
+       u32 i, id;
+       int ret = 0;
+
+       bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
+                        bkey_lt(k.k->p, POS(0, 1)), c, err,
+                        snapshot_pos_bad,
+                        "bad pos");
+
+       s = bkey_s_c_to_snapshot(k);
+
+       id = le32_to_cpu(s.v->parent);
+       bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
+                        snapshot_parent_bad,
+                        "bad parent node (%u <= %llu)",
+                        id, k.k->p.offset);
+
+       bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
+                        snapshot_children_not_normalized,
+                        "children not normalized");
+
+       bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
+                        snapshot_child_duplicate,
+                        "duplicate child nodes");
+
+       for (i = 0; i < 2; i++) {
+               id = le32_to_cpu(s.v->children[i]);
+
+               bkey_fsck_err_on(id >= k.k->p.offset, c, err,
+                                snapshot_child_bad,
+                                "bad child node (%u >= %llu)",
+                                id, k.k->p.offset);
+       }
+
+       if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
+               bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
+                                le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
+                                snapshot_skiplist_not_normalized,
+                                "skiplist not normalized");
+
+               for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
+                       id = le32_to_cpu(s.v->skip[i]);
+
+                       bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
+                                        snapshot_skiplist_bad,
+                                        "bad skiplist node %u", id);
+               }
+       }
+fsck_err:
+       return ret;
+}
+
+static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+       struct snapshot_t *t = snapshot_t_mut(c, id);
+       u32 parent = id;
+
+       while ((parent = bch2_snapshot_parent_early(c, parent)) &&
+              parent - id - 1 < IS_ANCESTOR_BITMAP)
+               __set_bit(parent - id - 1, t->is_ancestor);
+}
+
+static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
+{
+       mutex_lock(&c->snapshot_table_lock);
+       __set_is_ancestor_bitmap(c, id);
+       mutex_unlock(&c->snapshot_table_lock);
+}
+
+int bch2_mark_snapshot(struct btree_trans *trans,
+                      enum btree_id btree, unsigned level,
+                      struct bkey_s_c old, struct bkey_s_c new,
+                      unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct snapshot_t *t;
+       u32 id = new.k->p.offset;
+       int ret = 0;
+
+       mutex_lock(&c->snapshot_table_lock);
+
+       t = snapshot_t_mut(c, id);
+       if (!t) {
+               ret = -BCH_ERR_ENOMEM_mark_snapshot;
+               goto err;
+       }
+
+       if (new.k->type == KEY_TYPE_snapshot) {
+               struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+               t->parent       = le32_to_cpu(s.v->parent);
+               t->children[0]  = le32_to_cpu(s.v->children[0]);
+               t->children[1]  = le32_to_cpu(s.v->children[1]);
+               t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+               t->tree         = le32_to_cpu(s.v->tree);
+
+               if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
+                       t->depth        = le32_to_cpu(s.v->depth);
+                       t->skip[0]      = le32_to_cpu(s.v->skip[0]);
+                       t->skip[1]      = le32_to_cpu(s.v->skip[1]);
+                       t->skip[2]      = le32_to_cpu(s.v->skip[2]);
+               } else {
+                       t->depth        = 0;
+                       t->skip[0]      = 0;
+                       t->skip[1]      = 0;
+                       t->skip[2]      = 0;
+               }
+
+               __set_is_ancestor_bitmap(c, id);
+
+               if (BCH_SNAPSHOT_DELETED(s.v)) {
+                       set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+                       if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
+                               bch2_delete_dead_snapshots_async(c);
+               }
+       } else {
+               memset(t, 0, sizeof(*t));
+       }
+err:
+       mutex_unlock(&c->snapshot_table_lock);
+       return ret;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+                        struct bch_snapshot *s)
+{
+       return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
+                                      BTREE_ITER_WITH_UPDATES, snapshot, s);
+}
+
+static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
+{
+       struct bch_snapshot v;
+       int ret;
+
+       if (!id)
+               return 0;
+
+       ret = bch2_snapshot_lookup(trans, id, &v);
+       if (bch2_err_matches(ret, ENOENT))
+               bch_err(trans->c, "snapshot node %u not found", id);
+       if (ret)
+               return ret;
+
+       return !BCH_SNAPSHOT_DELETED(&v);
+}
+
+/*
+ * If @k is a snapshot with just one live child, it's part of a linear chain,
+ * which we consider to be an equivalence class: after snapshot deletion
+ * cleanup, there should only be a single key at a given position within this
+ * equivalence class.
+ *
+ * This sets the equivalence class of @k to be the child's equivalence class, if
+ * it's part of such a linear chain: this correctly sets equivalence classes on
+ * startup if we run leaf to root (i.e. in natural key order).
+ */
+static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       unsigned i, nr_live = 0, live_idx = 0;
+       struct bkey_s_c_snapshot snap;
+       u32 id = k.k->p.offset, child[2];
+
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
+
+       snap = bkey_s_c_to_snapshot(k);
+
+       child[0] = le32_to_cpu(snap.v->children[0]);
+       child[1] = le32_to_cpu(snap.v->children[1]);
+
+       for (i = 0; i < 2; i++) {
+               int ret = bch2_snapshot_live(trans, child[i]);
+
+               if (ret < 0)
+                       return ret;
+
+               if (ret)
+                       live_idx = i;
+               nr_live += ret;
+       }
+
+       mutex_lock(&c->snapshot_table_lock);
+
+       snapshot_t_mut(c, id)->equiv = nr_live == 1
+               ? snapshot_t_mut(c, child[live_idx])->equiv
+               : id;
+
+       mutex_unlock(&c->snapshot_table_lock);
+
+       return 0;
+}
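
[Editorial note, not part of the patch] Since children always have smaller IDs than their parents, iterating the snapshots btree in increasing key order visits leaves before their ancestors, so a single pass suffices to collapse each linear chain onto its leaf-most live node. A standalone sketch of that pass over toy arrays (illustrative only, not the transaction code above):

#include <assert.h>
#include <stdint.h>

#define N 8     /* toy node count; IDs 1..N-1, children always have smaller IDs */

static uint32_t children[N][2];         /* 0 if absent */
static int      deleted[N];
static uint32_t equiv[N];

static void set_equiv_leaf_to_root(void)
{
        for (uint32_t id = 1; id < N; id++) {
                unsigned nr_live = 0, live = 0;

                for (unsigned i = 0; i < 2; i++) {
                        uint32_t c = children[id][i];

                        if (c && !deleted[c]) {
                                nr_live++;
                                live = c;
                        }
                }

                /* exactly one live child: join that child's class */
                equiv[id] = nr_live == 1 ? equiv[live] : id;
        }
}

int main(void)
{
        /* 7 -> 5 -> 2 is a linear chain (7 is the root); 6 is a deleted child of 7 */
        children[7][0] = 5;
        children[7][1] = 6;
        children[5][0] = 2;
        deleted[6] = 1;

        set_equiv_leaf_to_root();

        assert(equiv[2] == 2);
        assert(equiv[5] == 2);
        assert(equiv[7] == 2);  /* the whole chain collapses onto the leaf */
        assert(equiv[6] == 6);
        return 0;
}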
+
+/* fsck: */
+
+static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
+{
+       return snapshot_t(c, id)->children[child];
+}
+
+static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
+{
+       return bch2_snapshot_child(c, id, 0);
+}
+
+static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
+{
+       return bch2_snapshot_child(c, id, 1);
+}
+
+static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
+{
+       u32 n, parent;
+
+       n = bch2_snapshot_left_child(c, id);
+       if (n)
+               return n;
+
+       while ((parent = bch2_snapshot_parent(c, id))) {
+               n = bch2_snapshot_right_child(c, parent);
+               if (n && n != id)
+                       return n;
+               id = parent;
+       }
+
+       return 0;
+}
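
[Editorial note, not part of the patch] bch2_snapshot_tree_next() is a stack-free pre-order traversal: take the left child when there is one, otherwise climb until some ancestor offers an unvisited right child. A self-contained toy version over made-up arrays:

#include <assert.h>
#include <stdint.h>

#define N 16    /* toy nodes; 0 means "none" */

static uint32_t parent[N];
static uint32_t child[N][2];    /* [0] = left, [1] = right */

static uint32_t tree_next(uint32_t id)
{
        uint32_t n, p;

        n = child[id][0];
        if (n)
                return n;

        while ((p = parent[id])) {
                n = child[p][1];
                if (n && n != id)       /* came back up from the left side */
                        return n;
                id = p;                 /* right side done, keep climbing */
        }

        return 0;                       /* climbed past the root: done */
}

int main(void)
{
        /* root 10 has children 7 and 9; 7 has children 3 and 5 */
        child[10][0] = 7;  child[10][1] = 9;  parent[7] = parent[9] = 10;
        child[7][0]  = 3;  child[7][1]  = 5;  parent[3] = parent[5] = 7;

        uint32_t order[8], nr = 0;

        for (uint32_t id = 10; id; id = tree_next(id))
                order[nr++] = id;

        /* pre-order: 10, 7, 3, 5, 9 */
        assert(nr == 5);
        assert(order[0] == 10 && order[1] == 7 && order[2] == 3);
        assert(order[3] == 5  && order[4] == 9);
        return 0;
}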
+
+static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
+{
+       u32 id = snapshot_root;
+       u32 subvol = 0, s;
+
+       while (id) {
+               s = snapshot_t(c, id)->subvol;
+
+               if (s && (!subvol || s < subvol))
+                       subvol = s;
+
+               id = bch2_snapshot_tree_next(c, id);
+       }
+
+       return subvol;
+}
+
+static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
+                                           u32 snapshot_root, u32 *subvol_id)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_subvolume s;
+       bool found = false;
+       int ret;
+
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+                                    0, k, ret) {
+               if (k.k->type != KEY_TYPE_subvolume)
+                       continue;
+
+               s = bkey_s_c_to_subvolume(k);
+               if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
+                       continue;
+               if (!BCH_SUBVOLUME_SNAP(s.v)) {
+                       *subvol_id = s.k->p.offset;
+                       found = true;
+                       break;
+               }
+       }
+
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (!ret && !found) {
+               struct bkey_i_subvolume *u;
+
+               *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
+
+               u = bch2_bkey_get_mut_typed(trans, &iter,
+                                           BTREE_ID_subvolumes, POS(0, *subvol_id),
+                                           0, subvolume);
+               ret = PTR_ERR_OR_ZERO(u);
+               if (ret)
+                       return ret;
+
+               SET_BCH_SUBVOLUME_SNAP(&u->v, false);
+       }
+
+       return ret;
+}
+
+static int check_snapshot_tree(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c_snapshot_tree st;
+       struct bch_snapshot s;
+       struct bch_subvolume subvol;
+       struct printbuf buf = PRINTBUF;
+       u32 root_id;
+       int ret;
+
+       if (k.k->type != KEY_TYPE_snapshot_tree)
+               return 0;
+
+       st = bkey_s_c_to_snapshot_tree(k);
+       root_id = le32_to_cpu(st.v->root_snapshot);
+
+       ret = bch2_snapshot_lookup(trans, root_id, &s);
+       if (ret && !bch2_err_matches(ret, ENOENT))
+               goto err;
+
+       if (fsck_err_on(ret ||
+                       root_id != bch2_snapshot_root(c, root_id) ||
+                       st.k->p.offset != le32_to_cpu(s.tree),
+                       c, snapshot_tree_to_missing_snapshot,
+                       "snapshot tree points to missing/incorrect snapshot:\n  %s",
+                       (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+               ret = bch2_btree_delete_at(trans, iter, 0);
+               goto err;
+       }
+
+       ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
+                                false, 0, &subvol);
+       if (ret && !bch2_err_matches(ret, ENOENT))
+               goto err;
+
+       if (fsck_err_on(ret,
+                       c, snapshot_tree_to_missing_subvol,
+                       "snapshot tree points to missing subvolume:\n  %s",
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+           fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+                                               le32_to_cpu(subvol.snapshot),
+                                               root_id),
+                       c, snapshot_tree_to_wrong_subvol,
+                       "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
+           fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
+                       c, snapshot_tree_to_snapshot_subvol,
+                       "snapshot tree points to snapshot subvolume:\n  %s",
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
+               struct bkey_i_snapshot_tree *u;
+               u32 subvol_id;
+
+               ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+               if (ret)
+                       goto err;
+
+               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
+               ret = PTR_ERR_OR_ZERO(u);
+               if (ret)
+                       goto err;
+
+               u->v.master_subvol = cpu_to_le32(subvol_id);
+               st = snapshot_tree_i_to_s_c(u);
+       }
+err:
+fsck_err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+/*
+ * For each snapshot_tree, make sure it points to the root of a snapshot tree
+ * and that the snapshot entry points back to it, or delete it.
+ *
+ * Also, make sure it points to a subvolume within that snapshot tree, or
+ * correct it to point to the oldest subvolume within that snapshot tree.
+ */
+int bch2_check_snapshot_trees(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       ret = bch2_trans_run(c,
+               for_each_btree_key_commit(trans, iter,
+                       BTREE_ID_snapshot_trees, POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+               check_snapshot_tree(trans, &iter, k)));
+
+       if (ret)
+               bch_err(c, "error %i checking snapshot trees", ret);
+       return ret;
+}
+
+/*
+ * Look up the snapshot tree for @tree_id, find its root, and make sure
+ * @snap_id is a descendant:
+ */
+static int snapshot_tree_ptr_good(struct btree_trans *trans,
+                                 u32 snap_id, u32 tree_id)
+{
+       struct bch_snapshot_tree s_t;
+       int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+
+       if (bch2_err_matches(ret, ENOENT))
+               return 0;
+       if (ret)
+               return ret;
+
+       return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
+{
+       const struct snapshot_t *s;
+
+       if (!id)
+               return 0;
+
+       rcu_read_lock();
+       s = snapshot_t(c, id);
+       if (s->parent)
+               id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
+       rcu_read_unlock();
+
+       return id;
+}
+
+static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
+{
+       unsigned i;
+
+       for (i = 0; i < 3; i++)
+               if (!s.parent) {
+                       if (s.skip[i])
+                               return false;
+               } else {
+                       if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
+                               return false;
+               }
+
+       return true;
+}
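
[Editorial note, not part of the patch] The rule being checked above: every skip entry of a non-root node must name a genuine ancestor, while a root node must have all-zero skip entries. A standalone sketch of the same check over a toy parent array (illustrative, not the bcachefs API):

#include <assert.h>
#include <stdint.h>

#define N 16

static uint32_t parent[N];      /* 0 == root */

static int is_ancestor(uint32_t id, uint32_t ancestor)
{
        while (id && id < ancestor)
                id = parent[id];
        return id == ancestor;
}

/* every skip entry must name a real ancestor; a root has all-zero entries */
static int skiplist_good(uint32_t id, const uint32_t skip[3])
{
        for (unsigned i = 0; i < 3; i++) {
                if (!parent[id]) {
                        if (skip[i])
                                return 0;
                } else if (!is_ancestor(id, skip[i])) {
                        return 0;
                }
        }
        return 1;
}

int main(void)
{
        /* chain 2 -> 5 -> 9 */
        parent[2] = 5;
        parent[5] = 9;

        assert( skiplist_good(2, (uint32_t[]){ 5, 9, 9 }));
        assert(!skiplist_good(2, (uint32_t[]){ 5, 9, 11 }));    /* 11 isn't an ancestor */
        assert( skiplist_good(9, (uint32_t[]){ 0, 0, 0 }));     /* root: no skip entries */
        return 0;
}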
+
+/*
+ * The snapshot_tree pointer was incorrect: look up the root snapshot node,
+ * make sure its snapshot_tree pointer is correct (allocating a new one if
+ * necessary), then update this node's pointer to match the root's:
+ */
+static int snapshot_tree_ptr_repair(struct btree_trans *trans,
+                                   struct btree_iter *iter,
+                                   struct bkey_s_c k,
+                                   struct bch_snapshot *s)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter root_iter;
+       struct bch_snapshot_tree s_t;
+       struct bkey_s_c_snapshot root;
+       struct bkey_i_snapshot *u;
+       u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
+       int ret;
+
+       root = bch2_bkey_get_iter_typed(trans, &root_iter,
+                              BTREE_ID_snapshots, POS(0, root_id),
+                              BTREE_ITER_WITH_UPDATES, snapshot);
+       ret = bkey_err(root);
+       if (ret)
+               goto err;
+
+       tree_id = le32_to_cpu(root.v->tree);
+
+       ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
+       if (ret && !bch2_err_matches(ret, ENOENT))
+               return ret;
+
+       if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
+               u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
+               ret =   PTR_ERR_OR_ZERO(u) ?:
+                       bch2_snapshot_tree_create(trans, root_id,
+                               bch2_snapshot_tree_oldest_subvol(c, root_id),
+                               &tree_id);
+               if (ret)
+                       goto err;
+
+               u->v.tree = cpu_to_le32(tree_id);
+               if (k.k->p.offset == root_id)
+                       *s = u->v;
+       }
+
+       if (k.k->p.offset != root_id) {
+               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+               ret = PTR_ERR_OR_ZERO(u);
+               if (ret)
+                       goto err;
+
+               u->v.tree = cpu_to_le32(tree_id);
+               *s = u->v;
+       }
+err:
+       bch2_trans_iter_exit(trans, &root_iter);
+       return ret;
+}
+
+static int check_snapshot(struct btree_trans *trans,
+                         struct btree_iter *iter,
+                         struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_snapshot s;
+       struct bch_subvolume subvol;
+       struct bch_snapshot v;
+       struct bkey_i_snapshot *u;
+       u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
+       u32 real_depth;
+       struct printbuf buf = PRINTBUF;
+       bool should_have_subvol;
+       u32 i, id;
+       int ret = 0;
+
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
+
+       memset(&s, 0, sizeof(s));
+       memcpy(&s, k.v, bkey_val_bytes(k.k));
+
+       id = le32_to_cpu(s.parent);
+       if (id) {
+               ret = bch2_snapshot_lookup(trans, id, &v);
+               if (bch2_err_matches(ret, ENOENT))
+                       bch_err(c, "snapshot with nonexistent parent:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+               if (ret)
+                       goto err;
+
+               if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
+                   le32_to_cpu(v.children[1]) != k.k->p.offset) {
+                       bch_err(c, "snapshot parent %u missing pointer to child %llu",
+                               id, k.k->p.offset);
+                       ret = -EINVAL;
+                       goto err;
+               }
+       }
+
+       for (i = 0; i < 2 && s.children[i]; i++) {
+               id = le32_to_cpu(s.children[i]);
+
+               ret = bch2_snapshot_lookup(trans, id, &v);
+               if (bch2_err_matches(ret, ENOENT))
+                       bch_err(c, "snapshot node %llu has nonexistent child %u",
+                               k.k->p.offset, id);
+               if (ret)
+                       goto err;
+
+               if (le32_to_cpu(v.parent) != k.k->p.offset) {
+                       bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
+                               id, le32_to_cpu(v.parent), k.k->p.offset);
+                       ret = -EINVAL;
+                       goto err;
+               }
+       }
+
+       should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+               !BCH_SNAPSHOT_DELETED(&s);
+
+       if (should_have_subvol) {
+               id = le32_to_cpu(s.subvol);
+               ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
+               if (bch2_err_matches(ret, ENOENT))
+                       bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+               if (ret)
+                       goto err;
+
+               if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
+                       bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+                               k.k->p.offset);
+                       ret = -EINVAL;
+                       goto err;
+               }
+       } else {
+               if (fsck_err_on(s.subvol,
+                               c, snapshot_should_not_have_subvol,
+                               "snapshot should not point to subvol:\n  %s",
+                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+                       u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+                       ret = PTR_ERR_OR_ZERO(u);
+                       if (ret)
+                               goto err;
+
+                       u->v.subvol = 0;
+                       s = u->v;
+               }
+       }
+
+       ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
+       if (ret < 0)
+               goto err;
+
+       if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
+                       "snapshot points to missing/incorrect tree:\n  %s",
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
+               if (ret)
+                       goto err;
+       }
+       ret = 0;
+
+       real_depth = bch2_snapshot_depth(c, parent_id);
+
+       if (le32_to_cpu(s.depth) != real_depth &&
+           (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
+            fsck_err(c, snapshot_bad_depth,
+                     "snapshot with incorrect depth field, should be %u:\n  %s",
+                     real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+               ret = PTR_ERR_OR_ZERO(u);
+               if (ret)
+                       goto err;
+
+               u->v.depth = cpu_to_le32(real_depth);
+               s = u->v;
+       }
+
+       ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
+       if (ret < 0)
+               goto err;
+
+       if (!ret &&
+           (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
+            fsck_err(c, snapshot_bad_skiplist,
+                     "snapshot with bad skiplist field:\n  %s",
+                     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
+               ret = PTR_ERR_OR_ZERO(u);
+               if (ret)
+                       goto err;
+
+               for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
+                       u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
+
+               bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
+               s = u->v;
+       }
+       ret = 0;
+err:
+fsck_err:
+       printbuf_exit(&buf);
+       return ret;
+}
+
+int bch2_check_snapshots(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       /*
+        * We iterate backwards as checking/fixing the depth field requires that
+        * the parent's depth already be correct:
+        */
+       ret = bch2_trans_run(c,
+               for_each_btree_key_reverse_commit(trans, iter,
+                       BTREE_ID_snapshots, POS_MAX,
+                       BTREE_ITER_PREFETCH, k,
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+               check_snapshot(trans, &iter, k)));
+       if (ret)
+               bch_err_fn(c, ret);
+       return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+       struct btree_iter iter;
+       struct bkey_i_snapshot *s;
+       int ret = 0;
+
+       s = bch2_bkey_get_mut_typed(trans, &iter,
+                                   BTREE_ID_snapshots, POS(0, id),
+                                   0, snapshot);
+       ret = PTR_ERR_OR_ZERO(s);
+       if (unlikely(ret)) {
+               bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
+                                       trans->c, "missing snapshot %u", id);
+               return ret;
+       }
+
+       /* already deleted? */
+       if (BCH_SNAPSHOT_DELETED(&s->v))
+               goto err;
+
+       SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+       SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
+       s->v.subvol = 0;
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
+{
+       if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
+               swap(s->children[0], s->children[1]);
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+       struct btree_iter c_iter = (struct btree_iter) { NULL };
+       struct btree_iter tree_iter = (struct btree_iter) { NULL };
+       struct bkey_s_c_snapshot s;
+       u32 parent_id, child_id;
+       unsigned i;
+       int ret = 0;
+
+       s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+                                    BTREE_ITER_INTENT, snapshot);
+       ret = bkey_err(s);
+       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+                               "missing snapshot %u", id);
+
+       if (ret)
+               goto err;
+
+       BUG_ON(s.v->children[1]);
+
+       parent_id = le32_to_cpu(s.v->parent);
+       child_id = le32_to_cpu(s.v->children[0]);
+
+       if (parent_id) {
+               struct bkey_i_snapshot *parent;
+
+               parent = bch2_bkey_get_mut_typed(trans, &p_iter,
+                                    BTREE_ID_snapshots, POS(0, parent_id),
+                                    0, snapshot);
+               ret = PTR_ERR_OR_ZERO(parent);
+               bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+                                       "missing snapshot %u", parent_id);
+               if (unlikely(ret))
+                       goto err;
+
+               /* find entry in parent->children for node being deleted */
+               for (i = 0; i < 2; i++)
+                       if (le32_to_cpu(parent->v.children[i]) == id)
+                               break;
+
+               if (bch2_fs_inconsistent_on(i == 2, c,
+                                       "snapshot %u missing child pointer to %u",
+                                       parent_id, id))
+                       goto err;
+
+               parent->v.children[i] = cpu_to_le32(child_id);
+
+               normalize_snapshot_child_pointers(&parent->v);
+       }
+
+       if (child_id) {
+               struct bkey_i_snapshot *child;
+
+               child = bch2_bkey_get_mut_typed(trans, &c_iter,
+                                    BTREE_ID_snapshots, POS(0, child_id),
+                                    0, snapshot);
+               ret = PTR_ERR_OR_ZERO(child);
+               bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
+                                       "missing snapshot %u", child_id);
+               if (unlikely(ret))
+                       goto err;
+
+               child->v.parent = cpu_to_le32(parent_id);
+
+               if (!child->v.parent) {
+                       child->v.skip[0] = 0;
+                       child->v.skip[1] = 0;
+                       child->v.skip[2] = 0;
+               }
+       }
+
+       if (!parent_id) {
+               /*
+                * We're deleting the root of a snapshot tree: update the
+                * snapshot_tree entry to point to the new root, or delete it if
+                * this is the last snapshot ID in this tree:
+                */
+               struct bkey_i_snapshot_tree *s_t;
+
+               BUG_ON(s.v->children[1]);
+
+               s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
+                               BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
+                               0, snapshot_tree);
+               ret = PTR_ERR_OR_ZERO(s_t);
+               if (ret)
+                       goto err;
+
+               if (s.v->children[0]) {
+                       s_t->v.root_snapshot = s.v->children[0];
+               } else {
+                       s_t->k.type = KEY_TYPE_deleted;
+                       set_bkey_val_u64s(&s_t->k, 0);
+               }
+       }
+
+       ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+       bch2_trans_iter_exit(trans, &tree_iter);
+       bch2_trans_iter_exit(trans, &p_iter);
+       bch2_trans_iter_exit(trans, &c_iter);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
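
[Editorial note, not part of the patch] Deleting a snapshot node with at most one child is pointer surgery: the parent's child slot is redirected to the grandchild, the child's parent pointer to the grandparent, and a deleted root either hands the tree to its child or takes the snapshot_tree entry with it. A toy sketch of just the splice, with the btree transaction machinery stripped away (made-up names):

#include <assert.h>
#include <stdint.h>

#define N 16

static uint32_t parent[N];
static uint32_t children[N][2];         /* 0 if absent */
static uint32_t tree_root;

/* splice out @id, which must have at most one child */
static void node_delete(uint32_t id)
{
        uint32_t p = parent[id];
        uint32_t c = children[id][0];

        assert(!children[id][1]);

        if (p) {
                /* redirect the parent's pointer to the surviving child */
                for (unsigned i = 0; i < 2; i++)
                        if (children[p][i] == id)
                                children[p][i] = c;
        } else {
                /* deleting the root: the child (if any) becomes the new root */
                tree_root = c;
        }

        if (c)
                parent[c] = p;

        parent[id] = children[id][0] = children[id][1] = 0;
}

int main(void)
{
        /* chain 9 -> 6 -> 4, root is 9 */
        tree_root = 9;
        children[9][0] = 6; parent[6] = 9;
        children[6][0] = 4; parent[4] = 6;

        node_delete(6);                 /* splice out the middle node */
        assert(children[9][0] == 4 && parent[4] == 9);

        node_delete(9);                 /* delete the root */
        assert(tree_root == 4 && parent[4] == 0);
        return 0;
}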
+
+static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
+                         u32 *new_snapids,
+                         u32 *snapshot_subvols,
+                         unsigned nr_snapids)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_i_snapshot *n;
+       struct bkey_s_c k;
+       unsigned i, j;
+       u32 depth = bch2_snapshot_depth(c, parent);
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+                            POS_MIN, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       for (i = 0; i < nr_snapids; i++) {
+               k = bch2_btree_iter_prev_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (!k.k || !k.k->p.offset) {
+                       ret = -BCH_ERR_ENOSPC_snapshot_create;
+                       goto err;
+               }
+
+               n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
+               ret = PTR_ERR_OR_ZERO(n);
+               if (ret)
+                       goto err;
+
+               n->v.flags      = 0;
+               n->v.parent     = cpu_to_le32(parent);
+               n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
+               n->v.tree       = cpu_to_le32(tree);
+               n->v.depth      = cpu_to_le32(depth);
+
+               for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
+                       n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
+
+               bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
+               SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+               ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+                                        bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+               if (ret)
+                       goto err;
+
+               new_snapids[i]  = iter.pos.offset;
+
+               mutex_lock(&c->snapshot_table_lock);
+               snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
+               mutex_unlock(&c->snapshot_table_lock);
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/*
+ * Create new snapshot IDs as children of an existing snapshot ID:
+ */
+static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
+                             u32 *new_snapids,
+                             u32 *snapshot_subvols,
+                             unsigned nr_snapids)
+{
+       struct btree_iter iter;
+       struct bkey_i_snapshot *n_parent;
+       int ret = 0;
+
+       n_parent = bch2_bkey_get_mut_typed(trans, &iter,
+                       BTREE_ID_snapshots, POS(0, parent),
+                       0, snapshot);
+       ret = PTR_ERR_OR_ZERO(n_parent);
+       if (unlikely(ret)) {
+               if (bch2_err_matches(ret, ENOENT))
+                       bch_err(trans->c, "snapshot %u not found", parent);
+               return ret;
+       }
+
+       if (n_parent->v.children[0] || n_parent->v.children[1]) {
+               bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+               ret = -EINVAL;
+               goto err;
+       }
+
+       ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
+                            new_snapids, snapshot_subvols, nr_snapids);
+       if (ret)
+               goto err;
+
+       n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
+       n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
+       n_parent->v.subvol = 0;
+       SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+/*
+ * Create a snapshot node that is the root of a new tree:
+ */
+static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
+                             u32 *new_snapids,
+                             u32 *snapshot_subvols,
+                             unsigned nr_snapids)
+{
+       struct bkey_i_snapshot_tree *n_tree;
+       int ret;
+
+       n_tree = __bch2_snapshot_tree_create(trans);
+       ret =   PTR_ERR_OR_ZERO(n_tree) ?:
+               create_snapids(trans, 0, n_tree->k.p.offset,
+                            new_snapids, snapshot_subvols, nr_snapids);
+       if (ret)
+               return ret;
+
+       n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
+       n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
+       return 0;
+}
+
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+                             u32 *new_snapids,
+                             u32 *snapshot_subvols,
+                             unsigned nr_snapids)
+{
+       BUG_ON((parent == 0) != (nr_snapids == 1));
+       BUG_ON((parent != 0) != (nr_snapids == 2));
+
+       return parent
+               ? bch2_snapshot_node_create_children(trans, parent,
+                               new_snapids, snapshot_subvols, nr_snapids)
+               : bch2_snapshot_node_create_tree(trans,
+                               new_snapids, snapshot_subvols, nr_snapids);
+
+}
+
+/*
+ * If we have an unlinked inode in an internal snapshot node, and the inode
+ * really has been deleted in all child snapshots, how does this get cleaned up?
+ *
+ * First, there is the problem of how keys that have been overwritten in all
+ * child snapshots get deleted (unimplemented?), though inodes may perhaps be
+ * special.
+ *
+ * Also, an unlinked inode in an internal snapshot appears not to be getting
+ * deleted correctly if the inode doesn't exist in the leaf snapshots.
+ *
+ * Solution:
+ *
+ * For a key in an interior snapshot node that needs work requiring it to be
+ * mutated: iterate over all descendant leaf nodes and copy that key to the
+ * leaf nodes, where we can mutate it.
+ */
+
+static int snapshot_delete_key(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_s_c k,
+                              snapshot_id_list *deleted,
+                              snapshot_id_list *equiv_seen,
+                              struct bpos *last_pos)
+{
+       struct bch_fs *c = trans->c;
+       u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+       if (!bkey_eq(k.k->p, *last_pos))
+               equiv_seen->nr = 0;
+       *last_pos = k.k->p;
+
+       if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+           snapshot_list_has_id(equiv_seen, equiv)) {
+               return bch2_btree_delete_at(trans, iter,
+                                           BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+       } else {
+               return snapshot_list_add(c, equiv_seen, equiv);
+       }
+}
+
+static int move_key_to_correct_snapshot(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
+
+       /*
+        * When we have a linear chain of snapshot nodes, we consider
+        * those to form an equivalence class: we're going to collapse
+        * them all down to a single node, and keep the leaf-most node -
+        * which has the same id as the equivalence class id.
+        *
+        * If there are multiple keys in different snapshots at the same
+        * position, we're only going to keep the one in the newest
+        * snapshot - the rest have been overwritten and are redundant,
+        * and for the key we're going to keep we need to move it to the
+        * equivalence class ID if it's not there already.
+        */
+       if (equiv != k.k->p.snapshot) {
+               struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+               struct btree_iter new_iter;
+               int ret;
+
+               ret = PTR_ERR_OR_ZERO(new);
+               if (ret)
+                       return ret;
+
+               new->k.p.snapshot = equiv;
+
+               bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
+                                    BTREE_ITER_ALL_SNAPSHOTS|
+                                    BTREE_ITER_CACHED|
+                                    BTREE_ITER_INTENT);
+
+               ret =   bch2_btree_iter_traverse(&new_iter) ?:
+                       bch2_trans_update(trans, &new_iter, new,
+                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+                       bch2_btree_delete_at(trans, iter,
+                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               bch2_trans_iter_exit(trans, &new_iter);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
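
[Editorial note, not part of the patch] Putting snapshot_delete_key() and move_key_to_correct_snapshot() together: within one equivalence class, only the first key seen at a position (the leaf-most, i.e. newest, snapshot) survives, and it is renumbered to the class ID. A standalone sketch over toy keys (names and data are made up; not the btree code above):

#include <assert.h>
#include <stdint.h>

static uint32_t equiv[16];      /* equivalence class per snapshot, set by hand */

struct toy_key {
        uint32_t pos;           /* stand-in for the btree key position */
        uint32_t snapshot;
        int      live;
};

/*
 * Keys arrive sorted by (pos, snapshot); within one equivalence class only the
 * first (leaf-most, i.e. newest) key at a position survives, and it is
 * renumbered to the class ID.
 */
static void collapse_keys(struct toy_key *k, unsigned nr)
{
        uint32_t last_pos = UINT32_MAX;
        uint32_t seen[16];
        unsigned nr_seen = 0;

        for (unsigned i = 0; i < nr; i++) {
                uint32_t e = equiv[k[i].snapshot];
                int dup = 0;

                if (k[i].pos != last_pos)
                        nr_seen = 0;
                last_pos = k[i].pos;

                for (unsigned j = 0; j < nr_seen; j++)
                        dup |= seen[j] == e;

                if (dup) {
                        k[i].live = 0;          /* overwritten in a newer snapshot */
                } else {
                        seen[nr_seen++] = e;
                        k[i].snapshot = e;      /* move to the equivalence class ID */
                }
        }
}

int main(void)
{
        /* the chain 2 -> 5 -> 9 collapses onto leaf 2 */
        equiv[2] = equiv[5] = equiv[9] = 2;

        struct toy_key k[] = {
                { .pos = 100, .snapshot = 2, .live = 1 },       /* newest, kept */
                { .pos = 100, .snapshot = 5, .live = 1 },       /* overwritten */
                { .pos = 100, .snapshot = 9, .live = 1 },       /* overwritten */
                { .pos = 200, .snapshot = 5, .live = 1 },       /* kept, moved to 2 */
        };

        collapse_keys(k, 4);

        assert(k[0].live && k[0].snapshot == 2);
        assert(!k[1].live && !k[2].live);
        assert(k[3].live && k[3].snapshot == 2);
        return 0;
}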
+
+static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
+{
+       struct bkey_s_c_snapshot snap;
+       u32 children[2];
+       int ret;
+
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
+
+       snap = bkey_s_c_to_snapshot(k);
+       if (BCH_SNAPSHOT_DELETED(snap.v) ||
+           BCH_SNAPSHOT_SUBVOL(snap.v))
+               return 0;
+
+       children[0] = le32_to_cpu(snap.v->children[0]);
+       children[1] = le32_to_cpu(snap.v->children[1]);
+
+       ret   = bch2_snapshot_live(trans, children[0]) ?:
+               bch2_snapshot_live(trans, children[1]);
+       if (ret < 0)
+               return ret;
+       return !ret;
+}
+
+/*
+ * For a given snapshot, if it doesn't have a subvolume that points to it and
+ * it doesn't have child snapshot nodes, it's now redundant and we can mark it
+ * as deleted.
+ */
+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
+{
+       int ret = bch2_snapshot_needs_delete(trans, k);
+
+       return ret <= 0
+               ? ret
+               : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
+}
+
+static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
+                                               snapshot_id_list *skip)
+{
+       rcu_read_lock();
+       while (snapshot_list_has_id(skip, id))
+               id = __bch2_snapshot_parent(c, id);
+
+       while (n--) {
+               do {
+                       id = __bch2_snapshot_parent(c, id);
+               } while (snapshot_list_has_id(skip, id));
+       }
+       rcu_read_unlock();
+
+       return id;
+}
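
[Editorial note, not part of the patch] A toy version of the skip-aware parent walk: step up n ancestors while treating every ID on the skip list as already spliced out of the tree (made-up arrays, no RCU):

#include <assert.h>
#include <stdint.h>

#define N 16

static uint32_t parent[N];      /* 0 == no parent */

static int in_list(const uint32_t *list, unsigned nr, uint32_t id)
{
        for (unsigned i = 0; i < nr; i++)
                if (list[i] == id)
                        return 1;
        return 0;
}

/* nth ancestor of @id, pretending every ID in @skip has been deleted */
static uint32_t nth_parent_skip(uint32_t id, uint32_t n,
                                const uint32_t *skip, unsigned nr_skip)
{
        while (in_list(skip, nr_skip, id))
                id = parent[id];

        while (n--) {
                do {
                        id = parent[id];
                } while (in_list(skip, nr_skip, id));
        }

        return id;
}

int main(void)
{
        /* chain 3 -> 5 -> 7 -> 9 -> 11 (IDs grow towards the root) */
        parent[3] = 5; parent[5] = 7; parent[7] = 9; parent[9] = 11;

        uint32_t deleted[] = { 5, 9 };  /* pretend these are going away */

        assert(nth_parent_skip(3, 1, deleted, 2) == 7);
        assert(nth_parent_skip(3, 2, deleted, 2) == 11);
        return 0;
}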
+
+static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
+                                             struct btree_iter *iter, struct bkey_s_c k,
+                                             snapshot_id_list *deleted)
+{
+       struct bch_fs *c = trans->c;
+       u32 nr_deleted_ancestors = 0;
+       struct bkey_i_snapshot *s;
+       u32 *i;
+       int ret;
+
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
+
+       if (snapshot_list_has_id(deleted, k.k->p.offset))
+               return 0;
+
+       s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
+       ret = PTR_ERR_OR_ZERO(s);
+       if (ret)
+               return ret;
+
+       darray_for_each(*deleted, i)
+               nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
+
+       if (!nr_deleted_ancestors)
+               return 0;
+
+       le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
+
+       if (!s->v.depth) {
+               s->v.skip[0] = 0;
+               s->v.skip[1] = 0;
+               s->v.skip[2] = 0;
+       } else {
+               u32 depth = le32_to_cpu(s->v.depth);
+               u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
+
+               for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
+                       u32 id = le32_to_cpu(s->v.skip[j]);
+
+                       if (snapshot_list_has_id(deleted, id)) {
+                               id = bch2_snapshot_nth_parent_skip(c,
+                                                       parent,
+                                                       depth > 1
+                                                       ? get_random_u32_below(depth - 1)
+                                                       : 0,
+                                                       deleted);
+                               s->v.skip[j] = cpu_to_le32(id);
+                       }
+               }
+
+               bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
+       }
+
+       return bch2_trans_update(trans, iter, &s->k_i, 0);
+}
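
[Editorial note, not part of the patch] The depth fixup above subtracts one for every deleted ancestor. A minimal standalone illustration of that arithmetic over a toy parent/depth table (it ignores the skip-list repair also done above, and all names are made up):

#include <assert.h>
#include <stdint.h>

#define N 16

static uint32_t parent[N];
static uint32_t depth[N];       /* distance from the root */

static int is_ancestor(uint32_t id, uint32_t ancestor)
{
        while (id && id < ancestor)
                id = parent[id];
        return id == ancestor;
}

/* new depth of @id once every snapshot in @deleted has been spliced out */
static uint32_t fixed_depth(uint32_t id, const uint32_t *deleted, unsigned nr)
{
        uint32_t d = depth[id];

        for (unsigned i = 0; i < nr; i++)
                if (is_ancestor(id, deleted[i]))
                        d--;
        return d;
}

int main(void)
{
        /* chain 2 -> 4 -> 6 -> 8, depths 3, 2, 1, 0 */
        parent[2] = 4; parent[4] = 6; parent[6] = 8;
        depth[2] = 3; depth[4] = 2; depth[6] = 1; depth[8] = 0;

        uint32_t deleted[] = { 4, 6 };  /* two interior nodes go away */

        assert(fixed_depth(2, deleted, 2) == 1);        /* only the root remains above */
        assert(fixed_depth(8, deleted, 2) == 0);        /* the root is unaffected */
        return 0;
}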
+
+int bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+       struct btree_trans *trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_snapshot snap;
+       snapshot_id_list deleted = { 0 };
+       snapshot_id_list deleted_interior = { 0 };
+       u32 *i, id;
+       int ret = 0;
+
+       if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags))
+               return 0;
+
+       if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+               ret = bch2_fs_read_write_early(c);
+               if (ret) {
+                       bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+                       return ret;
+               }
+       }
+
+       trans = bch2_trans_get(c);
+
+       /*
+        * For every snapshot node: If we have no live children and it's not
+        * pointed to by a subvolume, delete it:
+        */
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
+                       POS_MIN, 0, k,
+                       NULL, NULL, 0,
+               bch2_delete_redundant_snapshot(trans, k));
+       if (ret) {
+               bch_err_msg(c, ret, "deleting redundant snapshots");
+               goto err;
+       }
+
+       ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+                                 POS_MIN, 0, k,
+               bch2_snapshot_set_equiv(trans, k));
+       if (ret) {
+               bch_err_msg(c, ret, "in bch2_snapshot_set_equiv");
+               goto err;
+       }
+
+       for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               snap = bkey_s_c_to_snapshot(k);
+               if (BCH_SNAPSHOT_DELETED(snap.v)) {
+                       ret = snapshot_list_add(c, &deleted, k.k->p.offset);
+                       if (ret)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (ret) {
+               bch_err_msg(c, ret, "walking snapshots");
+               goto err;
+       }
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               struct bpos last_pos = POS_MIN;
+               snapshot_id_list equiv_seen = { 0 };
+               struct disk_reservation res = { 0 };
+
+               if (!btree_type_has_snapshots(id))
+                       continue;
+
+               /*
+                * deleted inodes btree is maintained by a trigger on the inodes
+                * btree - no work for us to do here, and it's not safe to scan
+                * it because we'll see out of date keys due to the btree write
+                * buffer:
+                */
+               if (id == BTREE_ID_deleted_inodes)
+                       continue;
+
+               ret = for_each_btree_key_commit(trans, iter,
+                               id, POS_MIN,
+                               BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                               &res, NULL, BCH_TRANS_COMMIT_no_enospc,
+                       snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
+                     for_each_btree_key_commit(trans, iter,
+                               id, POS_MIN,
+                               BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                               &res, NULL, BCH_TRANS_COMMIT_no_enospc,
+                       move_key_to_correct_snapshot(trans, &iter, k));
+
+               bch2_disk_reservation_put(c, &res);
+               darray_exit(&equiv_seen);
+
+               if (ret) {
+                       bch_err_msg(c, ret, "deleting keys from dying snapshots");
+                       goto err;
+               }
+       }
+
+       bch2_trans_unlock(trans);
+       down_write(&c->snapshot_create_lock);
+
+       for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               u32 snapshot = k.k->p.offset;
+               u32 equiv = bch2_snapshot_equiv(c, snapshot);
+
+               if (equiv != snapshot)
+                       snapshot_list_add(c, &deleted_interior, snapshot);
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (ret)
+               goto err_create_lock;
+
+       /*
+        * Fixing children of deleted snapshots can't be done completely
+        * atomically; if we crash between here and when we delete the interior
+        * nodes, some depth fields will be off:
+        */
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
+                                 BTREE_ITER_INTENT, k,
+                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+               bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
+       if (ret)
+               goto err_create_lock;
+
+       darray_for_each(deleted, i) {
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(trans, *i));
+               if (ret) {
+                       bch_err_msg(c, ret, "deleting snapshot %u", *i);
+                       goto err_create_lock;
+               }
+       }
+
+       darray_for_each(deleted_interior, i) {
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(trans, *i));
+               if (ret) {
+                       bch_err_msg(c, ret, "deleting snapshot %u", *i);
+                       goto err_create_lock;
+               }
+       }
+err_create_lock:
+       up_write(&c->snapshot_create_lock);
+err:
+       darray_exit(&deleted_interior);
+       darray_exit(&deleted);
+       bch2_trans_put(trans);
+       if (ret)
+               bch_err_fn(c, ret);
+       return ret;
+}
+
+void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+
+       bch2_delete_dead_snapshots(c);
+       bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+void bch2_delete_dead_snapshots_async(struct bch_fs *c)
+{
+       if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
+           !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
+               bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
+}
+
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+                                      enum btree_id id,
+                                      struct bpos pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, id, pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while (1) {
+               k = bch2_btree_iter_prev(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               if (!k.k)
+                       break;
+
+               if (!bkey_eq(pos, k.k->p))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+                       ret = 1;
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
+{
+       const struct snapshot_t *s = snapshot_t(c, id);
+
+       return s->children[1] ?: s->children[0];
+}
+
+static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
+{
+       u32 child;
+
+       while ((child = bch2_snapshot_smallest_child(c, id)))
+               id = child;
+       return id;
+}
+
+static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
+                                              enum btree_id btree,
+                                              struct bkey_s_c interior_k,
+                                              u32 leaf_id, struct bpos *new_min_pos)
+{
+       struct btree_iter iter;
+       struct bpos pos = interior_k.k->p;
+       struct bkey_s_c k;
+       struct bkey_i *new;
+       int ret;
+
+       pos.snapshot = leaf_id;
+
+       bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto out;
+
+       /* key already overwritten in this snapshot? */
+       if (k.k->p.snapshot != interior_k.k->p.snapshot)
+               goto out;
+
+       if (bpos_eq(*new_min_pos, POS_MIN)) {
+               *new_min_pos = k.k->p;
+               new_min_pos->snapshot = leaf_id;
+       }
+
+       new = bch2_bkey_make_mut_noupdate(trans, interior_k);
+       ret = PTR_ERR_OR_ZERO(new);
+       if (ret)
+               goto out;
+
+       new->k.p.snapshot = leaf_id;
+       ret = bch2_trans_update(trans, &iter, new, 0);
+out:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
+                                         enum btree_id btree,
+                                         struct bkey_s_c k,
+                                         struct bpos *new_min_pos)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_buf sk;
+       u32 restart_count = trans->restart_count;
+       int ret = 0;
+
+       bch2_bkey_buf_init(&sk);
+       bch2_bkey_buf_reassemble(&sk, c, k);
+       k = bkey_i_to_s_c(sk.k);
+
+       *new_min_pos = POS_MIN;
+
+       for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
+            id < k.k->p.snapshot;
+            id++) {
+               if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
+                   !bch2_snapshot_is_leaf(c, id))
+                       continue;
+again:
+               ret =   btree_trans_too_many_iters(trans) ?:
+                       bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
+                       bch2_trans_commit(trans, NULL, NULL, 0);
+               if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+                       bch2_trans_begin(trans);
+                       goto again;
+               }
+
+               if (ret)
+                       break;
+       }
+
+       bch2_bkey_buf_exit(&sk, c);
+
+       return ret ?: trans_was_restarted(trans, restart_count);
+}
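
[Editorial note, not part of the patch] The propagation pass copies a key living at an interior snapshot down to every leaf descendant that has not already overridden it, so each leaf copy can then be mutated independently. A toy sketch of that idea with plain arrays standing in for the btree (illustrative only; names are made up):

#include <assert.h>
#include <stdint.h>

#define N 16

static uint32_t parent[N];
static unsigned nr_children[N];

static int is_ancestor(uint32_t id, uint32_t ancestor)
{
        while (id && id < ancestor)
                id = parent[id];
        return id == ancestor;
}

/*
 * "Propagate" a value living at interior snapshot @src to every leaf
 * descendant that hasn't overridden it; val[s] == 0 means "no key in s".
 */
static void propagate_to_leaves(uint32_t *val, uint32_t src)
{
        for (uint32_t id = 1; id < src; id++)
                if (!nr_children[id] &&         /* a leaf ... */
                    is_ancestor(id, src) &&     /* ... below @src ... */
                    !val[id])                   /* ... with no key of its own */
                        val[id] = val[src];
}

int main(void)
{
        /* 9 is the root; 9 -> {4, 7}; 7 -> {2, 5} */
        parent[4] = parent[7] = 9; nr_children[9] = 2;
        parent[2] = parent[5] = 7; nr_children[7] = 2;

        uint32_t val[N] = { 0 };
        val[7] = 123;           /* key written in interior snapshot 7 */
        val[2] = 456;           /* leaf 2 already has its own version */

        propagate_to_leaves(val, 7);

        assert(val[5] == 123);  /* copied to the untouched leaf */
        assert(val[2] == 456);  /* existing leaf key left alone */
        assert(val[4] == 0);    /* not a descendant of 7 */
        return 0;
}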
+
+static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c_snapshot snap;
+       int ret = 0;
+
+       if (k.k->type != KEY_TYPE_snapshot)
+               return 0;
+
+       snap = bkey_s_c_to_snapshot(k);
+       if (BCH_SNAPSHOT_DELETED(snap.v) ||
+           bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
+           (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
+               set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+               return 0;
+       }
+
+       return ret;
+}
+
+int bch2_snapshots_read(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       ret = bch2_trans_run(c,
+               for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k,
+                       bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+                       bch2_snapshot_set_equiv(trans, k) ?:
+                       bch2_check_snapshot_needs_deletion(trans, k)) ?:
+               for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k,
+                          (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
+       if (ret)
+               bch_err_fn(c, ret);
+       return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+       kfree(rcu_dereference_protected(c->snapshots, true));
+}
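A minimal caller sketch for bch2_propagate_key_to_snapshot_leaves(), following the usual bcachefs restart-and-retry convention. It is not part of the patch: the wrapper name and the extents btree are illustrative, and the key is assumed to live in caller-owned memory (e.g. a bkey_buf) so it remains valid across transaction restarts.

/* Hypothetical caller -- illustrative only, not part of this patch. */
static int propagate_example(struct btree_trans *trans, struct bkey_i *insert)
{
        struct bpos new_min_pos;
        int ret;
retry:
        ret = bch2_propagate_key_to_snapshot_leaves(trans, BTREE_ID_extents,
                                bkey_i_to_s_c(insert), &new_min_pos);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                bch2_trans_begin(trans);
                goto retry;
        }

        /*
         * On success, new_min_pos is POS_MIN if no copies were created,
         * otherwise the position of the first copy written to a leaf
         * snapshot; a caller can use it to reposition its iterators.
         */
        return ret;
}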
diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h
new file mode 100644 (file)
index 0000000..f09a22f
--- /dev/null
+++ b/libbcachefs/snapshot.h
@@ -0,0 +1,268 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_H
+#define _BCACHEFS_SNAPSHOT_H
+
+enum bkey_invalid_flags;
+
+void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_tree_invalid(struct bch_fs *, struct bkey_s_c,
+                              enum bkey_invalid_flags, struct printbuf *);
+
+#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) {       \
+       .key_invalid    = bch2_snapshot_tree_invalid,           \
+       .val_to_text    = bch2_snapshot_tree_to_text,           \
+       .min_val_size   = 8,                                    \
+})
+
+struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *);
+
+int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
+                         enum bkey_invalid_flags, struct printbuf *);
+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
+                      struct bkey_s_c, struct bkey_s_c, unsigned);
+
+#define bch2_bkey_ops_snapshot ((struct bkey_ops) {            \
+       .key_invalid    = bch2_snapshot_invalid,                \
+       .val_to_text    = bch2_snapshot_to_text,                \
+       .atomic_trigger = bch2_mark_snapshot,                   \
+       .min_val_size   = 24,                                   \
+})
+
+static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
+{
+       return &t->s[U32_MAX - id];
+}
+
+static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+       return __snapshot_t(rcu_dereference(c->snapshots), id);
+}
+
+static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
+{
+       rcu_read_lock();
+       id = snapshot_t(c, id)->tree;
+       rcu_read_unlock();
+
+       return id;
+}
+
+static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{
+       return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
+{
+       rcu_read_lock();
+       id = __bch2_snapshot_parent_early(c, id);
+       rcu_read_unlock();
+
+       return id;
+}
+
+static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       u32 parent = snapshot_t(c, id)->parent;
+
+       if (parent &&
+           snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+               panic("id %u depth=%u parent %u depth=%u\n",
+                     id, snapshot_t(c, id)->depth,
+                     parent, snapshot_t(c, parent)->depth);
+
+       return parent;
+#else
+       return snapshot_t(c, id)->parent;
+#endif
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+       rcu_read_lock();
+       id = __bch2_snapshot_parent(c, id);
+       rcu_read_unlock();
+
+       return id;
+}
+
+static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
+{
+       rcu_read_lock();
+       while (n--)
+               id = __bch2_snapshot_parent(c, id);
+       rcu_read_unlock();
+
+       return id;
+}
+
+u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32);
+
+static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
+{
+       u32 parent;
+
+       rcu_read_lock();
+       while ((parent = __bch2_snapshot_parent(c, id)))
+               id = parent;
+       rcu_read_unlock();
+
+       return id;
+}
+
+static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+       return snapshot_t(c, id)->equiv;
+}
+
+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
+{
+       rcu_read_lock();
+       id = __bch2_snapshot_equiv(c, id);
+       rcu_read_unlock();
+
+       return id;
+}
+
+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
+{
+       return id == bch2_snapshot_equiv(c, id);
+}
+
+static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+{
+       const struct snapshot_t *s;
+       bool ret;
+
+       rcu_read_lock();
+       s = snapshot_t(c, id);
+       ret = s->children[0];
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
+{
+       return !bch2_snapshot_is_internal_node(c, id);
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{
+       const struct snapshot_t *s;
+       u32 parent = __bch2_snapshot_parent(c, id);
+
+       if (!parent)
+               return 0;
+
+       s = snapshot_t(c, __bch2_snapshot_parent(c, id));
+       if (id == s->children[0])
+               return s->children[1];
+       if (id == s->children[1])
+               return s->children[0];
+       return 0;
+}
+
+static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
+{
+       u32 depth;
+
+       rcu_read_lock();
+       depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
+       rcu_read_unlock();
+
+       return depth;
+}
+
+bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
+
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+       return id == ancestor
+               ? true
+               : __bch2_snapshot_is_ancestor(c, id, ancestor);
+}
+
+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
+{
+       const struct snapshot_t *t;
+       bool ret;
+
+       rcu_read_lock();
+       t = snapshot_t(c, id);
+       ret = (t->children[0]|t->children[1]) != 0;
+       rcu_read_unlock();
+
+       return ret;
+}
+
+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
+{
+       u32 *i;
+
+       darray_for_each(*s, i)
+               if (*i == id)
+                       return true;
+       return false;
+}
+
+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+       u32 *i;
+
+       darray_for_each(*s, i)
+               if (bch2_snapshot_is_ancestor(c, id, *i))
+                       return true;
+       return false;
+}
+
+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+       int ret;
+
+       BUG_ON(snapshot_list_has_id(s, id));
+       ret = darray_push(s, id);
+       if (ret)
+               bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+       return ret;
+}
+
+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
+                        struct bch_snapshot *s);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+                            struct bch_subvolume *);
+
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+                             u32 *, u32 *, unsigned);
+
+int bch2_check_snapshot_trees(struct bch_fs *);
+int bch2_check_snapshots(struct bch_fs *);
+
+int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
+void bch2_delete_dead_snapshots_work(struct work_struct *);
+
+int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos);
+
+static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
+                                         enum btree_id id,
+                                         struct bpos pos)
+{
+       if (!btree_type_has_snapshots(id) ||
+           bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+               return 0;
+
+       return __bch2_key_has_snapshot_overwrites(trans, id, pos);
+}
+
+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
+                                         struct bkey_s_c, struct bpos *);
+
+int bch2_snapshots_read(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+
+#endif /* _BCACHEFS_SNAPSHOT_H */
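As a rough illustration of the accessors above (a hypothetical helper, not part of the patch): walk from a snapshot ID up to its tree root with bch2_snapshot_parent() and record the path in a snapshot_id_list; darray_init() is assumed to come from the tree's darray.h.

/* Hypothetical helper -- illustrative only, not part of this patch. */
static int collect_path_to_root(struct bch_fs *c, u32 id, snapshot_id_list *s)
{
        int ret = 0;

        darray_init(s);

        while (id && !ret) {
                ret = snapshot_list_add(c, s, id);  /* may reallocate the darray */
                id = bch2_snapshot_parent(c, id);   /* returns 0 once id is the root */
        }

        return ret;
}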
diff --git a/make-release-tarball.sh b/make-release-tarball.sh
new file mode 100755 (executable)
index 0000000..9986eac
--- /dev/null
+++ b/make-release-tarball.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+
+set -o errexit
+
+version=$1
+
+git checkout v$version
+git clean -xfd
+
+(cd rust-src; cargo license) > COPYING.rust-dependencies
+
+git ls-files|
+    tar --create --file bcachefs-tools-$version.tar -T -    \
+       --transform="s_^_bcachefs-tools-$version/_"
+
+tar --append --file bcachefs-tools-$version.tar                \
+    --transform="s_^_bcachefs-tools-$version/_"                        \
+    COPYING.rust-dependencies
+
+zstd -z --ultra                        bcachefs-tools-$version.tar
+
+gpg --armor --detach-sign      bcachefs-tools-$version.tar
+mv bcachefs-tools-$version.tar.asc bcachefs-tools-$version.tar.sign
+
+gpg --armor --sign             bcachefs-tools-$version.tar
+
+scp bcachefs-tools-$version.tar.zst    evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+scp bcachefs-tools-$version.tar.asc    evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+scp bcachefs-tools-$version.tar.sign   evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+
+cargo vendor --manifest-path rust-src/Cargo.toml
+
+mkdir .cargo
+cat > .cargo/config.toml <<-ZZ
+[source.crates-io]
+replace-with = "vendored-sources"
+
+[source."git+https://evilpiepirate.org/git/rust-bindgen.git"]
+git = "https://evilpiepirate.org/git/rust-bindgen.git"
+replace-with = "vendored-sources"
+
+[source.vendored-sources]
+directory = "vendor"
+ZZ
+
+cp bcachefs-tools-$version.tar bcachefs-tools-vendored-$version.tar
+tar --append --file bcachefs-tools-vendored-$version.tar       \
+    --transform="s_^_bcachefs-tools-$version/_"                        \
+    .cargo vendor
+
+zstd -z --ultra                        bcachefs-tools-vendored-$version.tar
+
+gpg --armor --detach-sign      bcachefs-tools-vendored-$version.tar
+mv bcachefs-tools-vendored-$version.tar.asc bcachefs-tools-vendored-$version.tar.sign
+
+gpg --armor --sign             bcachefs-tools-vendored-$version.tar
+
+scp bcachefs-tools-vendored-$version.tar.zst   evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+scp bcachefs-tools-vendored-$version.tar.asc   evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
+scp bcachefs-tools-vendored-$version.tar.sign  evilpiepirate.org:/var/www/htdocs/bcachefs-tools/
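For reference, a hedged sketch of the consumer side (not part of the script): the detached .sign signature covers the uncompressed tar, so decompress before verifying; the version number is only an example.

# Hypothetical verification/unpack steps -- adjust names and version as needed.
version=1.3.4
zstd -d -k bcachefs-tools-vendored-$version.tar.zst
gpg --verify bcachefs-tools-vendored-$version.tar.sign \
             bcachefs-tools-vendored-$version.tar
tar -xf bcachefs-tools-vendored-$version.tar

Because the vendored tarball carries .cargo/config.toml and the vendor/ directory, the Rust components should be buildable from it without network access.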
diff --git a/rust-src/src/cmd_completions.rs b/rust-src/src/cmd_completions.rs
new file mode 100644 (file)
index 0000000..5185969
--- /dev/null
+++ b/rust-src/src/cmd_completions.rs
@@ -0,0 +1,24 @@
+use crate::transform_c_args;
+use clap::{Command, CommandFactory, Parser};
+use clap_complete::{generate, Generator, Shell};
+use std::ffi::{c_char, c_int};
+use std::io;
+
+/// Generate shell completions
+#[derive(clap::Parser, Debug)]
+pub struct Cli {
+    shell: Shell,
+}
+
+fn print_completions<G: Generator>(gen: G, cmd: &mut Command) {
+    generate(gen, cmd, cmd.get_name().to_string(), &mut io::stdout());
+}
+
+#[no_mangle]
+#[allow(clippy::not_unsafe_ptr_arg_deref)]
+pub extern "C" fn cmd_completions(argc: c_int, argv: *const *const c_char) -> c_int {
+    transform_c_args!(argv, argc, argv);
+    let cli = Cli::parse_from(argv);
+    print_completions(cli.shell, &mut super::Cli::command());
+    0
+}
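A possible invocation, assuming the subcommand is exposed as "completions" (the CLI wiring is not shown in this patch); clap_complete's generate() writes the completion script for the requested shell to stdout:

# Hypothetical usage -- the subcommand name is inferred from cmd_completions().
bcachefs completions bash > /usr/share/bash-completion/completions/bcachefs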