git.sesse.net Git - bcachefs-tools-debian/commitdiff
New upstream release
author Jonathan Carter <jcc@debian.org>
Tue, 21 Nov 2023 15:32:00 +0000 (17:32 +0200)
committer Jonathan Carter <jcc@debian.org>
Tue, 21 Nov 2023 15:32:00 +0000 (17:32 +0200)
188 files changed:
INSTALL.md
Makefile
Makefile.compiler
bcachefs.c
build.nix
cmd_data.c
cmd_device.c
cmd_dump.c
cmd_format.c
cmd_fusemount.c
cmd_key.c
cmd_kill_btree_node.c
cmd_list_journal.c
cmd_migrate.c
cmd_version.c
cmds.h
crypto.c
debian/changelog
default.nix
flake.lock
flake.nix
fsck.bcachefs [deleted file]
include/linux/atomic.h
include/linux/bit_spinlock.h
include/linux/blkdev.h
include/linux/closure.h
include/linux/compiler.h
include/linux/generic-radix-tree.h
include/linux/kernel.h
include/linux/list.h
include/linux/page.h
include/linux/rcupdate.h
include/linux/sched.h
include/linux/six.h [deleted file]
include/linux/slab.h
libbcachefs.c
libbcachefs.h
libbcachefs/acl.c
libbcachefs/acl.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/backpointers.c
libbcachefs/backpointers.h
libbcachefs/bbpos.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.c
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/bkey_sort.c
libbcachefs/bkey_sort.h
libbcachefs/bset.c
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_locking.c
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c [deleted file]
libbcachefs/btree_write_buffer.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_waiting_for_journal.c
libbcachefs/chardev.c
libbcachefs/chardev.h
libbcachefs/checksum.c
libbcachefs/checksum.h
libbcachefs/compress.c
libbcachefs/compress.h
libbcachefs/counters.c
libbcachefs/darray.h
libbcachefs/data_update.c
libbcachefs/data_update.h
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/disk_groups.c
libbcachefs/disk_groups.h
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/errcode.c
libbcachefs/errcode.h
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-common.c
libbcachefs/fs-io.c
libbcachefs/fs-io.h
libbcachefs/fs-ioctl.c
libbcachefs/fs-ioctl.h
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/fsck.h
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c [deleted file]
libbcachefs/io.h [deleted file]
libbcachefs/io_types.h [deleted file]
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_sb.c
libbcachefs/journal_seq_blacklist.c
libbcachefs/journal_types.h
libbcachefs/lru.c
libbcachefs/lru.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/move_types.h
libbcachefs/movinggc.c
libbcachefs/nocow_locking.c
libbcachefs/nocow_locking.h
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/printbuf.c
libbcachefs/quota.c
libbcachefs/quota.h
libbcachefs/rebalance.c
libbcachefs/rebalance.h
libbcachefs/rebalance_types.h
libbcachefs/recovery.c
libbcachefs/recovery.h
libbcachefs/recovery_types.h
libbcachefs/reflink.c
libbcachefs/reflink.h
libbcachefs/replicas.c
libbcachefs/str_hash.h
libbcachefs/subvolume.c
libbcachefs/subvolume.h
libbcachefs/subvolume_types.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/tests.c
libbcachefs/trace.c
libbcachefs/trace.h
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/varint.c
libbcachefs/vstructs.h
libbcachefs/xattr.c
libbcachefs/xattr.h
linux/blkdev.c
linux/closure.c
linux/shrinker.c
linux/six.c [deleted file]
mkfs.bcachefs [deleted file]
mount.bcachefs [deleted file]
packaging/bcachefs-tools.spec
qcow2.c
rust-src/Cargo.lock
rust-src/Cargo.toml
rust-src/bch_bindgen/Cargo.lock
rust-src/bch_bindgen/Cargo.toml
rust-src/bch_bindgen/build.rs
rust-src/bch_bindgen/src/bcachefs.rs
rust-src/bch_bindgen/src/bkey.rs
rust-src/bch_bindgen/src/btree.rs
rust-src/bch_bindgen/src/lib.rs
rust-src/bch_bindgen/src/libbcachefs_wrapper.h
rust-src/src/cmd_list.rs
rust-src/src/cmd_mount.rs
rust-src/src/key.rs
rust-src/src/lib.rs
tools-util.c
tools-util.h

diff --git a/INSTALL.md b/INSTALL.md
index 94b28770777430ee0121d4e2ca276c3c37d1cd28..370fb8df7f83b1db6fd8225384df1e519eddad09 100644 (file)
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -16,7 +16,7 @@ Build dependencies:
  * zlib1g
 
 In addition a recent Rust toolchain is required (rustc, cargo), either by using
-[rustup](https://rustup.rs/) or make sure to use a distribution where rustc (>=1.64)
+[rustup](https://rustup.rs/) or make sure to use a distribution where rustc (>=1.65)
 is available.
 
 Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with
diff --git a/Makefile b/Makefile
index c77c0c51c2a0c1be872b5011ffedaca274455aff..61a624558e5876c9ab624699403d67474600037a 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,9 @@
+VERSION=1.3.3
+
 PREFIX?=/usr/local
 PKG_CONFIG?=pkg-config
 INSTALL=install
+LN=ln
 
 ifeq ("$(origin V)", "command line")
   BUILD_VERBOSE = $(V)
@@ -34,14 +37,17 @@ CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC                  \
        $(EXTRA_CFLAGS)
 LDFLAGS+=$(CFLAGS) $(EXTRA_LDFLAGS)
 
-CARGO_ARGS=
+ifdef CARGO_TOOLCHAIN_VERSION
+  CARGO_TOOLCHAIN = +$(CARGO_TOOLCHAIN_VERSION)
+endif
+
+CARGO_ARGS=${CARGO_TOOLCHAIN}
 CARGO=cargo $(CARGO_ARGS)
 CARGO_PROFILE=release
 # CARGO_PROFILE=debug
 
 CARGO_BUILD_ARGS=--$(CARGO_PROFILE)
 CARGO_BUILD=$(CARGO) build $(CARGO_BUILD_ARGS)
-VERSION?=$(shell git describe --dirty=+ 2>/dev/null || echo v0.1-nogit)
 
 include Makefile.compiler
 
@@ -148,12 +154,15 @@ install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs
 install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
 install: bcachefs
        $(INSTALL) -m0755 -D bcachefs      -t $(DESTDIR)$(ROOT_SBINDIR)
-       $(INSTALL) -m0755    fsck.bcachefs    $(DESTDIR)$(ROOT_SBINDIR)
-       $(INSTALL) -m0755    mkfs.bcachefs    $(DESTDIR)$(ROOT_SBINDIR)
-       $(INSTALL) -m0755    mount.bcachefs   $(DESTDIR)$(ROOT_SBINDIR)
        $(INSTALL) -m0644 -D bcachefs.8    -t $(DESTDIR)$(PREFIX)/share/man/man8/
        $(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT)
        $(INSTALL) -m0755 -D initramfs/hook   $(DESTDIR)$(INITRAMFS_HOOK)
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mkfs.fuse.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/fsck.fuse.bcachefs
+       $(LN) -sfr $(DESTDIR)$(ROOT_SBINDIR)/bcachefs $(DESTDIR)$(ROOT_SBINDIR)/mount.fuse.bcachefs
 
        sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK)
        echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
@@ -178,6 +187,11 @@ bcachefs-principles-of-operation.pdf: doc/bcachefs-principles-of-operation.tex
 
 doc: bcachefs-principles-of-operation.pdf
 
+.PHONY: cargo-update-msrv
+cargo-update-msrv:
+       cargo +nightly generate-lockfile --manifest-path rust-src/Cargo.toml -Zmsrv-policy
+       cargo +nightly generate-lockfile --manifest-path rust-src/bch_bindgen/Cargo.toml -Zmsrv-policy
+
 .PHONY: update-bcachefs-sources
 update-bcachefs-sources:
        git rm -rf --ignore-unmatch libbcachefs
@@ -192,10 +206,6 @@ update-bcachefs-sources:
        git add include/linux/xxhash.h
        cp $(LINUX_DIR)/lib/xxhash.c linux/
        git add linux/xxhash.c
-       cp $(LINUX_DIR)/kernel/locking/six.c linux/
-       git add linux/six.c
-       cp $(LINUX_DIR)/include/linux/six.h include/linux/
-       git add include/linux/six.h
        cp $(LINUX_DIR)/include/linux/list_nulls.h include/linux/
        git add include/linux/list_nulls.h
        cp $(LINUX_DIR)/include/linux/poison.h include/linux/
diff --git a/Makefile.compiler b/Makefile.compiler
index 7aa1fbc4aafef69327bf3b0ca51a80c1d46492a0..8fcb427405a6f17f61655a6d0881c433f22e1dd6 100644 (file)
--- a/Makefile.compiler
+++ b/Makefile.compiler
@@ -32,13 +32,13 @@ try-run = $(shell set -e;           \
 # Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,)
 
 as-option = $(call try-run,\
-       $(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
+       $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2))
 
 # as-instr
 # Usage: aflags-y += $(call as-instr,instr,option1,option2)
 
 as-instr = $(call try-run,\
-       printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
+       printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
 
 # __cc-option
 # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586)
@@ -72,7 +72,3 @@ clang-min-version = $(call test-ge, $(CONFIG_CLANG_VERSION), $1)
 # ld-option
 # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y)
 ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3))
-
-# ld-ifversion
-# Usage:  $(call ld-ifversion, -ge, 22252, y)
-ld-ifversion = $(shell [ $(CONFIG_LD_VERSION)0 $(1) $(2)0 ] && echo $(3) || echo $(4))
diff --git a/bcachefs.c b/bcachefs.c
index a3fe6d8247e4e8969978391c3d0a288d480cbaa6..4efe29edaad25c0b3fb96865e0a5f4907b3f1d55 100644 (file)
--- a/bcachefs.c
+++ b/bcachefs.c
@@ -84,6 +84,7 @@ static void usage(void)
             "\n"
             "Commands for operating on files in a bcachefs filesystem:\n"
             "  setattr                  Set various per file attributes\n"
+            "\n"
             "Debug:\n"
             "These commands work on offline, unmounted filesystems\n"
             "  dump                     Dump filesystem metadata to a qcow2 image\n"
@@ -92,7 +93,13 @@ static void usage(void)
 #endif
             "  list_journal             List contents of journal\n"
             "\n"
+            "FUSE:\n"
+            "  fusemount                Mount a filesystem via FUSE\n"
+            "\n"
             "Miscellaneous:\n"
+#ifndef BCACHEFS_NO_RUST
+            "  completions              Generate shell completions\n"
+#endif
             "  version                  Display the version of the invoked bcachefs tool\n");
 }
 
@@ -184,6 +191,24 @@ int main(int argc, char *argv[])
 
        full_cmd = argv[0];
 
+       /* Are we being called via a symlink? */
+
+       if (strstr(full_cmd, "mkfs"))
+               return cmd_format(argc, argv);
+
+       if (strstr(full_cmd, "fsck"))
+               return cmd_fsck(argc, argv);
+
+#ifdef BCACHEFS_FUSE
+       if (strstr(full_cmd, "mount.fuse"))
+               return cmd_fusemount(argc, argv);
+#endif
+
+#ifndef BCACHEFS_NO_RUST
+       if (strstr(full_cmd, "mount"))
+               return cmd_mount(argc, argv);
+#endif
+
        setvbuf(stdout, NULL, _IOLBF, 0);
 
        char *cmd = pop_cmd(&argc, argv);
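
The symlinks installed by the new Makefile rules work because the tool is now a multicall binary: it inspects the name it was invoked under and dispatches before normal argument parsing, as in the hunk above. A minimal standalone sketch of the same argv[0] dispatch pattern (the cmd_* bodies here are hypothetical stand-ins, not the real entry points):

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical stand-ins for the real cmd_* entry points. */
    static int cmd_format(int argc, char *argv[]) { puts("format"); return 0; }
    static int cmd_fsck(int argc, char *argv[])   { puts("fsck");   return 0; }

    int main(int argc, char *argv[])
    {
        const char *full_cmd = argv[0];

        /* Called as mkfs.bcachefs or fsck.bcachefs through a symlink? */
        if (strstr(full_cmd, "mkfs"))
            return cmd_format(argc, argv);
        if (strstr(full_cmd, "fsck"))
            return cmd_fsck(argc, argv);

        fprintf(stderr, "usage: %s <subcommand>\n", full_cmd);
        return 1;
    }

Note the ordering in the real code: "mount.fuse" is tested before the bare "mount", since strstr() on "mount" alone would also match mount.fuse.bcachefs.
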
@@ -249,10 +274,10 @@ int main(int argc, char *argv[])
        if (!strcmp(cmd, "setattr"))
                return cmd_setattr(argc, argv);
 #ifndef BCACHEFS_NO_RUST
-       if (!strcmp(cmd, "mount")) {
-               cmd_mount(argc, argv);
-               return 0;
-       }
+       if (!strcmp(cmd, "mount"))
+               return cmd_mount(argc, argv);
+    if (strstr(cmd, "completions"))
+        return cmd_completions(argc, argv);
 #endif
 
 #ifdef BCACHEFS_FUSE
diff --git a/build.nix b/build.nix
index 5cf07de25e2e32906e4da09f02e94f8b63d07054..e05dad4243c1f42657099e0b3fdf00de3958985c 100644 (file)
--- a/build.nix
+++ b/build.nix
@@ -1,28 +1,10 @@
-{ lib
-, stdenv
-, pkg-config
-, attr
-, libuuid
-, libsodium
-, keyutils
-, liburcu
-, zlib
-, libaio
-, udev
-, zstd
-, lz4
-, nix-gitignore
-, rustPlatform
-, rustc
-, cargo
- }:
-
+{ lib, stdenv, pkg-config, attr, libuuid, libsodium, keyutils, liburcu, zlib
+, libaio, udev, zstd, lz4, nix-gitignore, rustPlatform, rustc, cargo, }:
 let
-  src = nix-gitignore.gitignoreSource [] ./. ;
+  src = nix-gitignore.gitignoreSource [ ] ./.;
 
   commit = lib.strings.substring 0 7 (builtins.readFile ./.bcachefs_revision);
   version = "git-${commit}";
-
 in stdenv.mkDerivation {
   inherit src version;
 
@@ -61,12 +43,14 @@ in stdenv.mkDerivation {
     };
   };
 
-  makeFlags = [
-    "PREFIX=${placeholder "out"}"
-    "VERSION=${commit}"
-  ];
+  makeFlags = [ "DESTDIR=${placeholder "out"}" "PREFIX=" "VERSION=${commit}" ];
 
   dontStrip = true;
   checkPhase = "./bcachefs version";
   doCheck = true;
+
+  meta = {
+    mainProgram = "bcachefs";
+    license = lib.licenses.gpl2Only;
+  };
 }
diff --git a/cmd_data.c b/cmd_data.c
index 160eb918b8947a6e7d265860e26e7e55f91e17af..6d709883720d7a5dc137105d99cc234056e860a0 100644 (file)
--- a/cmd_data.c
+++ b/cmd_data.c
@@ -103,7 +103,7 @@ int cmd_data_job(int argc, char *argv[])
                switch (opt) {
                case 'b':
                        op.start_btree = read_string_list_or_die(optarg,
-                                               bch2_btree_ids, "btree id");
+                                               __bch2_btree_ids, "btree id");
                        op.end_btree = op.start_btree;
                        break;
                case 's':
diff --git a/cmd_device.c b/cmd_device.c
index c59d37094761dccb5d83b9905d34cce451714c75..1cb31ab858422f7f5646aeb2ee7d3d7dd38f6c35 100644 (file)
--- a/cmd_device.c
+++ b/cmd_device.c
@@ -16,6 +16,7 @@
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/journal.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super-io.h"
 #include "cmds.h"
 #include "libbcachefs.h"
@@ -112,7 +113,9 @@ int cmd_device_add(int argc, char *argv[])
 
        struct bchfs_handle fs = bcache_fs_open(fs_path);
 
-       dev_opts.fd = open_for_format(dev_opts.path, force);
+       int ret = open_for_format(&dev_opts, force);
+       if (ret)
+               die("Error opening %s: %s", dev_opts.path, strerror(-ret));
 
        struct bch_opt_strs fs_opt_strs;
        memset(&fs_opt_strs, 0, sizeof(fs_opt_strs));
@@ -129,8 +132,8 @@ int cmd_device_add(int argc, char *argv[])
                                        format_opts,
                                        &dev_opts, 1);
        free(sb);
-       fsync(dev_opts.fd);
-       close(dev_opts.fd);
+       fsync(dev_opts.bdev->bd_buffered_fd);
+       close(dev_opts.bdev->bd_buffered_fd);
 
        bchu_disk_add(fs, dev_opts.path);
        return 0;
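
open_for_format() now takes the whole dev_opts, stashes the opened block device in it, and reports failure as a negative errno instead of returning a file descriptor, which is why the caller prints strerror(-ret). A small sketch of that kernel-style return convention (open_checked() is a hypothetical helper, not part of the tools):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>

    /* Kernel-style convention: >= 0 is a file descriptor, < 0 is -errno. */
    static int open_checked(const char *path, int flags)
    {
        int fd = open(path, flags);

        return fd < 0 ? -errno : fd;
    }

    int demo(const char *path)
    {
        int ret = open_checked(path, O_RDWR);

        if (ret < 0)
            fprintf(stderr, "Error opening %s: %s\n", path, strerror(-ret));
        return ret;
    }
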
@@ -413,7 +416,7 @@ int cmd_device_set_state(int argc, char *argv[])
                if (ret)
                        die("error opening %s: %s", dev_str, bch2_err_str(ret));
 
-               struct bch_member *m = bch2_sb_get_members(sb.sb)->members + sb.sb->dev_idx;
+               struct bch_member *m = bch2_members_v2_get_mut(sb.sb, sb.sb->dev_idx);
 
                SET_BCH_MEMBER_STATE(m, new_state);
 
@@ -483,7 +486,7 @@ int cmd_device_resize(int argc, char *argv[])
 
        char *size_arg = arg_pop();
        if (!size_arg)
-               size = get_size(dev, dev_fd);
+               size = get_size(dev_fd);
        else if (bch2_strtoull_h(size_arg, &size))
                die("invalid size");
 
@@ -509,16 +512,11 @@ int cmd_device_resize(int argc, char *argv[])
                if (idx >= sb->nr_devices)
                        die("error reading superblock: dev idx >= sb->nr_devices");
 
-               struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
-               if (!mi)
-                       die("error reading superblock: no member info");
+               struct bch_member m = bch2_sb_member_get(sb, idx);
 
-               /* could also just read this out of sysfs... meh */
-               struct bch_member *m = mi->members + idx;
+               u64 nbuckets = size / le16_to_cpu(m.bucket_size);
 
-               u64 nbuckets = size / le16_to_cpu(m->bucket_size);
-
-               if (nbuckets < le64_to_cpu(m->nbuckets))
+               if (nbuckets < le64_to_cpu(m.nbuckets))
                        die("Shrinking not supported yet");
 
                printf("resizing %s to %llu buckets\n", dev, nbuckets);
@@ -615,14 +613,9 @@ int cmd_device_resize_journal(int argc, char *argv[])
                if (idx >= sb->nr_devices)
                        die("error reading superblock: dev idx >= sb->nr_devices");
 
-               struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
-               if (!mi)
-                       die("error reading superblock: no member info");
-
-               /* could also just read this out of sysfs... meh */
-               struct bch_member *m = mi->members + idx;
+               struct bch_member m = bch2_sb_member_get(sb, idx);
 
-               u64 nbuckets = size / le16_to_cpu(m->bucket_size);
+               u64 nbuckets = size / le16_to_cpu(m.bucket_size);
 
                printf("resizing journal on %s to %llu buckets\n", dev, nbuckets);
                bchu_disk_resize_journal(fs, idx, nbuckets);
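
bch2_sb_member_get() hands back a struct bch_member by value where the old code kept a pointer into the superblock's members array, plausibly because members_v2 entries are variable-sized on disk and can no longer be indexed as a plain C array. A sketch of the copy-out pattern under that assumption (struct member and member_get() are illustrative, not the bcachefs definitions):

    #include <stdint.h>
    #include <string.h>

    /* Illustrative stand-in for struct bch_member, not the real layout. */
    struct member {
        uint64_t nbuckets;
        uint16_t bucket_size;
    };

    /*
     * With a variable per-entry stride, pointer arithmetic on a fixed
     * struct type is wrong; copy the entry out by value instead.
     */
    static struct member member_get(const void *table, size_t stride, unsigned idx)
    {
        struct member m = {0};

        memcpy(&m, (const char *)table + (size_t)idx * stride,
               stride < sizeof(m) ? stride : sizeof(m));
        return m;
    }
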
diff --git a/cmd_dump.c b/cmd_dump.c
index cc25a6a3134d9e27a7c4cbf656821caa746c390f..0d34923360fbe2977a8e9a3fc0f812367e716acb 100644 (file)
--- a/cmd_dump.c
+++ b/cmd_dump.c
@@ -1,4 +1,5 @@
 #include <fcntl.h>
+#include <getopt.h>
 #include <string.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -12,6 +13,7 @@
 #include "libbcachefs/btree_iter.h"
 #include "libbcachefs/error.h"
 #include "libbcachefs/extents.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super.h"
 
 static void dump_usage(void)
@@ -21,9 +23,9 @@ static void dump_usage(void)
             "\n"
             "Options:\n"
             "  -o output     Output qcow2 image(s)\n"
-            "  -f            Force; overwrite when needed\n"
-            "  -j            Dump entire journal, not just dirty entries\n"
-            "  -h            Display this help and exit\n"
+            "  -f, --force   Force; overwrite when needed\n"
+            "  --nojournal   Don't dump entire journal, just dirty entries\n"
+            "  -h, --help    Display this help and exit\n"
             "Report bugs to <linux-bcachefs@vger.kernel.org>");
 }
 
@@ -59,13 +61,11 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
        for (i = 0; i < BTREE_ID_NR; i++) {
                const struct bch_extent_ptr *ptr;
                struct bkey_ptrs_c ptrs;
-               struct btree_trans trans;
+               struct btree_trans *trans = bch2_trans_get(c);
                struct btree_iter iter;
                struct btree *b;
 
-               bch2_trans_init(&trans, c, 0, 0);
-
-               __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
+               __for_each_btree_node(trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
                        struct btree_node_iter iter;
                        struct bkey u;
                        struct bkey_s_c k;
@@ -95,8 +95,8 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
                                                  btree_bytes(c));
                }
 
-               bch2_trans_iter_exit(&trans, &iter);
-               bch2_trans_exit(&trans);
+               bch2_trans_iter_exit(trans, &iter);
+               bch2_trans_put(trans);
        }
 
        qcow2_write_image(ca->disk_sb.bdev->bd_buffered_fd, fd, &data,
@@ -106,20 +106,29 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd,
 
 int cmd_dump(int argc, char *argv[])
 {
+       static const struct option longopts[] = {
+               { "force",              no_argument,            NULL, 'f' },
+               { "nojournal",          no_argument,            NULL, 'j' },
+               { "verbose",            no_argument,            NULL, 'v' },
+               { "help",               no_argument,            NULL, 'h' },
+               { NULL }
+       };
        struct bch_opts opts = bch2_opts_empty();
        struct bch_dev *ca;
        char *out = NULL;
        unsigned i, nr_devices = 0;
-       bool force = false, entire_journal = false;
+       bool force = false, entire_journal = true;
        int fd, opt;
 
+       opt_set(opts, read_only,        true);
        opt_set(opts, nochanges,        true);
        opt_set(opts, norecovery,       true);
        opt_set(opts, degraded,         true);
        opt_set(opts, errors,           BCH_ON_ERROR_continue);
        opt_set(opts, fix_errors,       FSCK_FIX_no);
 
-       while ((opt = getopt(argc, argv, "o:fjvh")) != -1)
+       while ((opt = getopt_long(argc, argv, "o:fvh",
+                                 longopts, NULL)) != -1)
                switch (opt) {
                case 'o':
                        out = optarg;
@@ -128,7 +137,7 @@ int cmd_dump(int argc, char *argv[])
                        force = true;
                        break;
                case 'j':
-                       entire_journal = true;
+                       entire_journal = false;
                        break;
                case 'v':
                        opt_set(opts, verbose, true);
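
Note that 'j' is gone from the optstring but --nojournal still delivers 'j' via the val field of its struct option entry, and the default has flipped: the entire journal is now dumped unless --nojournal is given. A minimal sketch of a long-only flag wired through getopt_long() this way (an illustrative program, not the tools' code):

    #include <getopt.h>
    #include <stdbool.h>
    #include <stdio.h>

    int main(int argc, char *argv[])
    {
        static const struct option longopts[] = {
            /* 'j' is absent from the optstring below, so this option is
             * long-only, yet getopt_long() still returns 'j' for it. */
            { "nojournal", no_argument, NULL, 'j' },
            { NULL }
        };
        bool entire_journal = true; /* new default: dump everything */
        int opt;

        while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
            if (opt == 'j')
                entire_journal = false;

        printf("entire_journal=%d\n", entire_journal);
        return 0;
    }
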
diff --git a/cmd_format.c b/cmd_format.c
index 26a1cd9f725837cb50b5b82d42339b91453220b7..f0a4b6a54f51d4d21b61daada543555561dbf19f 100644 (file)
--- a/cmd_format.c
+++ b/cmd_format.c
@@ -119,6 +119,7 @@ int cmd_format(int argc, char *argv[])
        struct format_opts opts = format_opts_default();
        struct dev_opts dev_opts = dev_opts_default(), *dev;
        bool force = false, no_passphrase = false, quiet = false, initialize = true, verbose = false;
+       bool unconsumed_dev_option = false;
        unsigned v;
        int opt;
 
@@ -162,6 +163,7 @@ int cmd_format(int argc, char *argv[])
                case O_fs_size:
                        if (bch2_strtoull_h(optarg, &dev_opts.size))
                                die("invalid filesystem size");
+                       unconsumed_dev_option = true;
                        break;
                case O_superblock_size:
                        if (bch2_strtouint_h(optarg, &opts.superblock_size))
@@ -172,23 +174,28 @@ int cmd_format(int argc, char *argv[])
                case O_bucket_size:
                        if (bch2_strtoull_h(optarg, &dev_opts.bucket_size))
                                die("bad bucket_size %s", optarg);
+                       unconsumed_dev_option = true;
                        break;
                case O_label:
                case 'l':
                        dev_opts.label = optarg;
+                       unconsumed_dev_option = true;
                        break;
                case O_discard:
                        dev_opts.discard = true;
+                       unconsumed_dev_option = true;
                        break;
                case O_data_allowed:
                        dev_opts.data_allowed =
                                read_flag_list_or_die(optarg,
                                        bch2_data_types, "data type");
+                       unconsumed_dev_option = true;
                        break;
                case O_durability:
                        if (kstrtouint(optarg, 10, &dev_opts.durability) ||
                            dev_opts.durability > BCH_REPLICAS_MAX)
                                die("invalid durability");
+                       unconsumed_dev_option = true;
                        break;
                case O_version:
                        if (kstrtouint(optarg, 10, &opts.version))
@@ -202,6 +209,7 @@ int cmd_format(int argc, char *argv[])
                        dev_opts.path = optarg;
                        darray_push(&devices, dev_opts);
                        dev_opts.size = 0;
+                       unconsumed_dev_option = false;
                        break;
                case O_quiet:
                case 'q':
@@ -219,6 +227,9 @@ int cmd_format(int argc, char *argv[])
                        break;
                }
 
+       if (unconsumed_dev_option)
+               die("Options for devices apply to subsequent devices; got a device option with no device");
+
        if (opts.version != bcachefs_metadata_version_current)
                initialize = false;
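
Device options in bcachefs format are positional: each applies to the next device on the command line, and the new flag catches options left dangling after the last device. Illustrative invocations under that rule:

    bcachefs format --label=ssd /dev/sda --label=hdd /dev/sdb   # OK: each label precedes its device
    bcachefs format /dev/sda --discard                          # dies: no device follows --discard
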
 
@@ -230,8 +241,11 @@ int cmd_format(int argc, char *argv[])
                initialize = false;
        }
 
-       darray_for_each(devices, dev)
-               dev->fd = open_for_format(dev->path, force);
+       darray_for_each(devices, dev) {
+               int ret = open_for_format(dev, force);
+               if (ret)
+                       die("Error opening %s: %s", dev_opts.path, strerror(-ret));
+       }
 
        struct bch_sb *sb =
                bch2_format(fs_opt_strs,
@@ -245,7 +259,7 @@ int cmd_format(int argc, char *argv[])
 
                buf.human_readable_units = true;
 
-               bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members);
+               bch2_sb_to_text(&buf, sb, false, 1 << BCH_SB_FIELD_members_v2);
                printf("%s", buf.buf);
 
                printbuf_exit(&buf);
@@ -305,8 +319,9 @@ int cmd_show_super(int argc, char *argv[])
                { "help",                       0, NULL, 'h' },
                { NULL }
        };
-       unsigned fields = 1 << BCH_SB_FIELD_members;
+       unsigned fields = 0;
        bool print_layout = false;
+       bool print_default_fields = true;
        int opt;
 
        while ((opt = getopt_long(argc, argv, "f:lh", longopts, NULL)) != -1)
@@ -316,6 +331,7 @@ int cmd_show_super(int argc, char *argv[])
                                ? ~0
                                : read_flag_list_or_die(optarg,
                                        bch2_sb_fields, "superblock field");
+                       print_default_fields = false;
                        break;
                case 'l':
                        print_layout = true;
@@ -342,6 +358,13 @@ int cmd_show_super(int argc, char *argv[])
        if (ret)
                die("Error opening %s: %s", dev, bch2_err_str(ret));
 
+       if (print_default_fields) {
+               fields |= bch2_sb_field_get(sb.sb, members_v2)
+                       ? 1 << BCH_SB_FIELD_members_v2
+                       : 1 << BCH_SB_FIELD_members_v1;
+               fields |= BCH_SB_FIELD_errors;
+       }
+
        struct printbuf buf = PRINTBUF;
 
        buf.human_readable_units = true;
diff --git a/cmd_fusemount.c b/cmd_fusemount.c
index 4470f83844a128b9f83f79d091c788b0d976c43c..46dbb9deee6458cc41c73e93a22d447e10b03fb4 100644 (file)
--- a/cmd_fusemount.c
+++ b/cmd_fusemount.c
@@ -21,7 +21,8 @@
 #include "libbcachefs/error.h"
 #include "libbcachefs/fs-common.h"
 #include "libbcachefs/inode.h"
-#include "libbcachefs/io.h"
+#include "libbcachefs/io_read.h"
+#include "libbcachefs/io_write.h"
 #include "libbcachefs/opts.h"
 #include "libbcachefs/super.h"
 
@@ -33,9 +34,9 @@
 /* XXX cut and pasted from fsck.c */
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
-static inline u64 map_root_ino(u64 ino)
+static inline subvol_inum map_root_ino(u64 ino)
 {
-       return ino == 1 ? 4096 : ino;
+       return (subvol_inum) { 1, ino == 1 ? 4096 : ino };
 }
 
 static inline u64 unmap_root_ino(u64 ino)
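
FUSE hands the filesystem bare inode numbers, but bcachefs lookups are now keyed by (subvolume, inode) pairs, so map_root_ino() also pins every request to subvolume 1. A sketch with the pair type's assumed shape (inferred from its use in this diff; the real definition lives in the bcachefs headers):

    typedef unsigned int       u32;
    typedef unsigned long long u64;

    /* Assumed shape of the pair, per its use in this diff. */
    typedef struct {
        u32 subvol;
        u64 inum;
    } subvol_inum;

    static inline subvol_inum map_root_ino(u64 ino)
    {
        /* FUSE's root is inode 1; the bcachefs root is inode 4096 in
         * subvolume 1, so remap at the boundary. */
        return (subvol_inum) { 1, ino == 1 ? 4096 : ino };
    }
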
@@ -92,19 +93,18 @@ static void bcachefs_fuse_destroy(void *arg)
        bch2_fs_stop(c);
 }
 
-static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir_ino,
                                 const char *name)
 {
+       subvol_inum dir = map_root_ino(dir_ino);
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked bi;
        struct qstr qstr = QSTR(name);
-       u64 inum;
+       subvol_inum inum;
        int ret;
 
        fuse_log(FUSE_LOG_DEBUG, "fuse_lookup(dir=%llu name=%s)\n",
-                dir, name);
-
-       dir = map_root_ino(dir);
+                dir.inum, name);
 
        ret = bch2_inode_find_by_inum(c, dir, &bi);
        if (ret) {
@@ -114,8 +114,8 @@ static void bcachefs_fuse_lookup(fuse_req_t req, fuse_ino_t dir,
 
        struct bch_hash_info hash_info = bch2_hash_info_init(c, &bi);
 
-       inum = bch2_dirent_lookup(c, dir, &hash_info, &qstr);
-       if (!inum) {
+       ret = bch2_dirent_lookup(c, dir, &hash_info, &qstr, &inum);
+       if (ret) {
                struct fuse_entry_param e = {
                        .attr_timeout   = DBL_MAX,
                        .entry_timeout  = DBL_MAX,
@@ -139,20 +139,17 @@ err:
        fuse_reply_err(req, -ret);
 }
 
-static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t inum,
+static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t ino,
                                  struct fuse_file_info *fi)
 {
+       subvol_inum inum = map_root_ino(ino);
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked bi;
        struct stat attr;
-       int ret;
 
-       fuse_log(FUSE_LOG_DEBUG, "fuse_getattr(inum=%llu)\n",
-                inum);
+       fuse_log(FUSE_LOG_DEBUG, "fuse_getattr(inum=%llu)\n", inum.inum);
 
-       inum = map_root_ino(inum);
-
-       ret = bch2_inode_find_by_inum(c, inum, &bi);
+       int ret = bch2_inode_find_by_inum(c, inum, &bi);
        if (ret) {
                fuse_log(FUSE_LOG_DEBUG, "fuse_getattr error %i\n", ret);
                fuse_reply_err(req, -ret);
@@ -165,28 +162,27 @@ static void bcachefs_fuse_getattr(fuse_req_t req, fuse_ino_t inum,
        fuse_reply_attr(req, &attr, DBL_MAX);
 }
 
-static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t inum,
+static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t ino,
                                  struct stat *attr, int to_set,
                                  struct fuse_file_info *fi)
 {
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked inode_u;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        u64 now;
        int ret;
 
-       fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_setattr(%llu, %x)\n",
-                inum, to_set);
+       subvol_inum inum = map_root_ino(ino);
 
-       inum = map_root_ino(inum);
+       fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_setattr(%llu, %x)\n", inum.inum, to_set);
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
        now = bch2_current_time(c);
 
-       ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+       ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -208,15 +204,15 @@ retry:
                inode_u.bi_mtime = now;
        /* TODO: CTIME? */
 
-       ret   = bch2_inode_write(&trans, &iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
+       ret   = bch2_inode_write(trans, &iter, &inode_u) ?:
+               bch2_trans_commit(trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL);
 err:
-        bch2_trans_iter_exit(&trans, &iter);
+        bch2_trans_iter_exit(trans, &iter);
        if (ret == -EINTR)
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (!ret) {
                *attr = inode_to_stat(c, &inode_u);
@@ -226,34 +222,37 @@ err:
        }
 }
 
-static int do_create(struct bch_fs *c, u64 dir,
+static int do_create(struct bch_fs *c, subvol_inum dir,
                     const char *name, mode_t mode, dev_t rdev,
                     struct bch_inode_unpacked *new_inode)
 {
        struct qstr qstr = QSTR(name);
        struct bch_inode_unpacked dir_u;
-
-       dir = map_root_ino(dir);
+       uid_t uid = 0;
+       gid_t gid = 0;
 
        bch2_inode_init_early(c, new_inode);
 
        return bch2_trans_do(c, NULL, NULL, 0,
-                       bch2_create_trans(&trans,
+                       bch2_create_trans(trans,
                                dir, &dir_u,
                                new_inode, &qstr,
-                               0, 0, mode, rdev, NULL, NULL));
+                               uid, gid, mode, rdev, NULL, NULL,
+                               (subvol_inum) { 0 }, 0));
 }
 
-static void bcachefs_fuse_mknod(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_mknod(fuse_req_t req, fuse_ino_t dir_ino,
                                const char *name, mode_t mode,
                                dev_t rdev)
 {
+       subvol_inum dir = map_root_ino(dir_ino);
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked new_inode;
        int ret;
 
        fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_mknod(%llu, %s, %x, %x)\n",
-                dir, name, mode, rdev);
+                dir.inum, name, mode, rdev);
+
        ret = do_create(c, dir, name, mode, rdev, &new_inode);
        if (ret)
                goto err;
@@ -277,21 +276,19 @@ static void bcachefs_fuse_mkdir(fuse_req_t req, fuse_ino_t dir,
        bcachefs_fuse_mknod(req, dir, name, mode, 0);
 }
 
-static void bcachefs_fuse_unlink(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_unlink(fuse_req_t req, fuse_ino_t dir_ino,
                                 const char *name)
 {
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked dir_u, inode_u;
        struct qstr qstr = QSTR(name);
-       int ret;
+       subvol_inum dir = map_root_ino(dir_ino);
 
-       fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_unlink(%llu, %s)\n", dir, name);
+       fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_unlink(%llu, %s)\n", dir.inum, name);
 
-       dir = map_root_ino(dir);
-
-       ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
-                           bch2_unlink_trans(&trans, dir, &dir_u,
-                                             &inode_u, &qstr));
+       int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+                           bch2_unlink_trans(trans, dir, &dir_u,
+                                             &inode_u, &qstr, false));
 
        fuse_reply_err(req, -ret);
 }
@@ -301,14 +298,12 @@ static void bcachefs_fuse_rmdir(fuse_req_t req, fuse_ino_t dir,
 {
        fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_rmdir(%llu, %s)\n", dir, name);
 
-       dir = map_root_ino(dir);
-
        bcachefs_fuse_unlink(req, dir, name);
 }
 
 static void bcachefs_fuse_rename(fuse_req_t req,
-                                fuse_ino_t src_dir, const char *srcname,
-                                fuse_ino_t dst_dir, const char *dstname,
+                                fuse_ino_t src_dir_ino, const char *srcname,
+                                fuse_ino_t dst_dir_ino, const char *dstname,
                                 unsigned flags)
 {
        struct bch_fs *c = fuse_req_userdata(req);
@@ -316,18 +311,17 @@ static void bcachefs_fuse_rename(fuse_req_t req,
        struct bch_inode_unpacked src_inode_u, dst_inode_u;
        struct qstr dst_name = QSTR(srcname);
        struct qstr src_name = QSTR(dstname);
+       subvol_inum src_dir = map_root_ino(src_dir_ino);
+       subvol_inum dst_dir = map_root_ino(dst_dir_ino);
        int ret;
 
        fuse_log(FUSE_LOG_DEBUG,
                 "bcachefs_fuse_rename(%llu, %s, %llu, %s, %x)\n",
-                src_dir, srcname, dst_dir, dstname, flags);
-
-       src_dir = map_root_ino(src_dir);
-       dst_dir = map_root_ino(dst_dir);
+                src_dir.inum, srcname, dst_dir.inum, dstname, flags);
 
        /* XXX handle overwrites */
        ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_rename_trans(&trans,
+               bch2_rename_trans(trans,
                                  src_dir, &src_dir_u,
                                  dst_dir, &dst_dir_u,
                                  &src_inode_u, &dst_inode_u,
@@ -337,22 +331,22 @@ static void bcachefs_fuse_rename(fuse_req_t req,
        fuse_reply_err(req, -ret);
 }
 
-static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t inum,
-                              fuse_ino_t newparent, const char *newname)
+static void bcachefs_fuse_link(fuse_req_t req, fuse_ino_t ino,
+                              fuse_ino_t newparent_ino, const char *newname)
 {
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked dir_u, inode_u;
        struct qstr qstr = QSTR(newname);
+       subvol_inum newparent   = map_root_ino(newparent_ino);
+       subvol_inum inum        = map_root_ino(ino);
        int ret;
 
        fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_link(%llu, %llu, %s)\n",
-                inum, newparent, newname);
-
-       newparent = map_root_ino(newparent);
+                inum.inum, newparent.inum, newname);
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-                           bch2_link_trans(&trans, newparent,
-                                           inum, &dir_u, &inode_u, &qstr));
+                           bch2_link_trans(trans, newparent, &dir_u,
+                                           inum, &inode_u, &qstr));
 
        if (!ret) {
                struct fuse_entry_param e = inode_to_entry(c, &inode_u);
@@ -375,22 +369,20 @@ static void bcachefs_fuse_open(fuse_req_t req, fuse_ino_t inum,
 static void userbio_init(struct bio *bio, struct bio_vec *bv,
                         void *buf, size_t size)
 {
-       bio_init(bio, bv, 1);
+       bio_init(bio, NULL, bv, 1, 0);
        bio->bi_iter.bi_size    = size;
        bv->bv_page             = buf;
        bv->bv_len              = size;
        bv->bv_offset           = 0;
 }
 
-static int get_inode_io_opts(struct bch_fs *c, u64 inum,
-                            struct bch_io_opts *opts)
+static int get_inode_io_opts(struct bch_fs *c, subvol_inum inum, struct bch_io_opts *opts)
 {
        struct bch_inode_unpacked inode;
        if (bch2_inode_find_by_inum(c, inum, &inode))
                return -EINVAL;
 
-       *opts = bch2_opts_to_inode_opts(c->opts);
-       bch2_io_opts_apply(opts, bch2_inode_opts_get(&inode));
+       bch2_inode_opts_get(opts, c, &inode);
        return 0;
 }
 
@@ -448,7 +440,7 @@ static size_t align_fix_up_bytes(const struct fuse_align_io *align,
 /*
  * Read aligned data.
  */
-static int read_aligned(struct bch_fs *c, fuse_ino_t inum, size_t aligned_size,
+static int read_aligned(struct bch_fs *c, subvol_inum inum, size_t aligned_size,
                        off_t aligned_offset, void *buf)
 {
        BUG_ON(aligned_size & (block_bytes(c) - 1));
@@ -478,10 +470,11 @@ static int read_aligned(struct bch_fs *c, fuse_ino_t inum, size_t aligned_size,
        return -blk_status_to_errno(rbio.bio.bi_status);
 }
 
-static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t inum,
+static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t ino,
                               size_t size, off_t offset,
                               struct fuse_file_info *fi)
 {
+       subvol_inum inum = map_root_ino(ino);
        struct bch_fs *c = fuse_req_userdata(req);
 
        fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_read(%llu, %zd, %lld)\n",
@@ -520,43 +513,43 @@ static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t inum,
        free(buf);
 }
 
-static int inode_update_times(struct bch_fs *c, fuse_ino_t inum)
+static int inode_update_times(struct bch_fs *c, subvol_inum inum)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bch_inode_unpacked inode_u;
        int ret = 0;
        u64 now;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
        now = bch2_current_time(c);
 
-       ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
+       ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        inode_u.bi_mtime = now;
        inode_u.bi_ctime = now;
 
-       ret = bch2_inode_write(&trans, &iter, &inode_u);
+       ret = bch2_inode_write(trans, &iter, &inode_u);
        if (ret)
                goto err;
 
-       ret = bch2_trans_commit(&trans, NULL, NULL,
+       ret = bch2_trans_commit(trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL);
 
 err:
-        bch2_trans_iter_exit(&trans, &iter);
+        bch2_trans_iter_exit(trans, &iter);
        if (ret == -EINTR)
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
-static int write_aligned(struct bch_fs *c, fuse_ino_t inum,
+static int write_aligned(struct bch_fs *c, subvol_inum inum,
                         struct bch_io_opts io_opts, void *buf,
                         size_t aligned_size, off_t aligned_offset,
                         off_t new_i_size, size_t *written_out)
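
The hunk above shows this release's most repeated mechanical change: a stack-allocated struct btree_trans set up with bch2_trans_init() and torn down with bch2_trans_exit() becomes a pointer obtained from bch2_trans_get() and released with bch2_trans_put(). Condensed to a skeleton (not compilable on its own; the types and bch2_trans_*() calls come from libbcachefs, and do_one_update() is hypothetical):

    static int update_example(struct bch_fs *c)
    {
        struct btree_trans *trans = bch2_trans_get(c);
        int ret;
    retry:
        bch2_trans_begin(trans);

        ret = do_one_update(trans);
        if (ret == -EINTR) /* lock restart requested, begin again */
            goto retry;

        bch2_trans_put(trans);
        return ret;
    }
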
@@ -576,7 +569,8 @@ static int write_aligned(struct bch_fs *c, fuse_ino_t inum,
        op.write_point  = writepoint_hashed(0);
        op.nr_replicas  = io_opts.data_replicas;
        op.target       = io_opts.foreground_target;
-       op.pos          = POS(inum, aligned_offset >> 9);
+       op.subvol       = inum.subvol;
+       op.pos          = POS(inum.inum, aligned_offset >> 9);
        op.new_i_size   = new_i_size;
 
        userbio_init(&op.wbio.bio, &bv, buf, aligned_size);
@@ -597,11 +591,12 @@ static int write_aligned(struct bch_fs *c, fuse_ino_t inum,
        return op.error;
 }
 
-static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t inum,
+static void bcachefs_fuse_write(fuse_req_t req, fuse_ino_t ino,
                                const char *buf, size_t size,
                                off_t offset,
                                struct fuse_file_info *fi)
 {
+       subvol_inum inum = map_root_ino(ino);
        struct bch_fs *c        = fuse_req_userdata(req);
        struct bch_io_opts      io_opts;
        size_t                  aligned_written;
@@ -686,24 +681,23 @@ err:
 }
 
 static void bcachefs_fuse_symlink(fuse_req_t req, const char *link,
-                                 fuse_ino_t dir, const char *name)
+                                 fuse_ino_t dir_ino, const char *name)
 {
+       subvol_inum dir = map_root_ino(dir_ino);
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked new_inode;
        size_t link_len = strlen(link);
        int ret;
 
        fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_symlink(%s, %llu, %s)\n",
-                link, dir, name);
-
-       dir = map_root_ino(dir);
+                link, dir.inum, name);
 
        ret = do_create(c, dir, name, S_IFLNK|S_IRWXUGO, 0, &new_inode);
        if (ret)
                goto err;
 
        struct bch_io_opts io_opts;
-       ret = get_inode_io_opts(c, new_inode.bi_inum, &io_opts);
+       ret = get_inode_io_opts(c, dir, &io_opts);
        if (ret)
                goto err;
 
@@ -715,8 +709,10 @@ static void bcachefs_fuse_symlink(fuse_req_t req, const char *link,
        memset(aligned_buf, 0, align.size);
        memcpy(aligned_buf, link, link_len); /* already terminated */
 
+       subvol_inum inum = (subvol_inum) { dir.subvol, new_inode.bi_inum };
+
        size_t aligned_written;
-       ret = write_aligned(c, new_inode.bi_inum, io_opts, aligned_buf,
+       ret = write_aligned(c, inum, io_opts, aligned_buf,
                            align.size, align.start, link_len + 1,
                            &aligned_written);
        free(aligned_buf);
@@ -727,7 +723,7 @@ static void bcachefs_fuse_symlink(fuse_req_t req, const char *link,
        size_t written = align_fix_up_bytes(&align, aligned_written);
        BUG_ON(written != link_len + 1); // TODO: handle short
 
-       ret = inode_update_times(c, new_inode.bi_inum);
+       ret = inode_update_times(c, inum);
        if (ret)
                goto err;
 
@@ -741,12 +737,13 @@ err:
        fuse_reply_err(req, -ret);
 }
 
-static void bcachefs_fuse_readlink(fuse_req_t req, fuse_ino_t inum)
+static void bcachefs_fuse_readlink(fuse_req_t req, fuse_ino_t ino)
 {
+       subvol_inum inum = map_root_ino(ino);
        struct bch_fs *c = fuse_req_userdata(req);
        char *buf = NULL;
 
-       fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readlink(%llu)\n", inum);
+       fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readlink(%llu)\n", inum.inum);
 
        struct bch_inode_unpacked bi;
        int ret = bch2_inode_find_by_inum(c, inum, &bi);
@@ -898,10 +895,11 @@ static bool handle_dots(struct fuse_dir_context *ctx, fuse_ino_t dir)
        return true;
 }
 
-static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir_ino,
                                  size_t size, off_t off,
                                  struct fuse_file_info *fi)
 {
+       subvol_inum dir = map_root_ino(dir_ino);
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked bi;
        char *buf = calloc(size, 1);
@@ -915,9 +913,7 @@ static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir,
        int ret = 0;
 
        fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_readdir(dir=%llu, size=%zu, "
-                "off=%lld)\n", dir, size, off);
-
-       dir = map_root_ino(dir);
+                "off=%lld)\n", dir.inum, size, off);
 
        ret = bch2_inode_find_by_inum(c, dir, &bi);
        if (ret)
@@ -928,7 +924,7 @@ static void bcachefs_fuse_readdir(fuse_req_t req, fuse_ino_t dir,
                goto reply;
        }
 
-       if (!handle_dots(&ctx, dir))
+       if (!handle_dots(&ctx, dir.inum))
                goto reply;
 
        ret = bch2_readdir(c, dir, &ctx.ctx);
@@ -1012,16 +1008,17 @@ static void bcachefs_fuse_removexattr(fuse_req_t req, fuse_ino_t inum,
 }
 #endif
 
-static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir,
+static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir_ino,
                                 const char *name, mode_t mode,
                                 struct fuse_file_info *fi)
 {
+       subvol_inum dir = map_root_ino(dir_ino);
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked new_inode;
        int ret;
 
        fuse_log(FUSE_LOG_DEBUG, "bcachefs_fuse_create(%llu, %s, %x)\n",
-                dir, name, mode);
+                dir.inum, name, mode);
 
        ret = do_create(c, dir, name, mode, 0, &new_inode);
        if (ret)
@@ -1032,7 +1029,6 @@ static void bcachefs_fuse_create(fuse_req_t req, fuse_ino_t dir,
        return;
 err:
        fuse_reply_err(req, -ret);
-
 }
 
 #if 0
@@ -1222,6 +1218,17 @@ int cmd_fusemount(int argc, char *argv[])
        }
        tokenize_devices(&ctx);
 
+       struct printbuf fsname = PRINTBUF;
+       prt_printf(&fsname, "fsname=");
+       for (i = 0; i < ctx.nr_devices; ++i) {
+               if (i)
+                       prt_str(&fsname, ":");
+               prt_str(&fsname, ctx.devices[i]);
+       }
+
+       fuse_opt_add_arg(&args, "-o");
+       fuse_opt_add_arg(&args, fsname.buf);
+
        /* Open bch */
        printf("Opening bcachefs filesystem on:\n");
        for (i = 0; i < ctx.nr_devices; ++i)
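
The new loop passes a single -o fsname=... option to FUSE so the mount shows its backing devices as the source in /proc/mounts. A small illustration of the string the printbuf loop produces (build_fsname() is illustrative, not the tools' code):

    #include <stdio.h>

    /*
     * Joins device paths with ':' behind "fsname=", yielding e.g.
     * "fsname=/dev/sda:/dev/sdb" for a two-device filesystem.
     */
    static void build_fsname(char *out, size_t outsz, char **devs, unsigned nr)
    {
        size_t n = snprintf(out, outsz, "fsname=");

        for (unsigned i = 0; i < nr && n < outsz; i++)
            n += snprintf(out + n, outsz - n, "%s%s", i ? ":" : "", devs[i]);
    }
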
diff --git a/cmd_key.c b/cmd_key.c
index e8c3eeaf6ba3aacf93ea1b7bf333b617613f7d91..96206c4c36b5a62cce4cc27fd445a383bdfa87df 100644 (file)
--- a/cmd_key.c
+++ b/cmd_key.c
@@ -92,7 +92,7 @@ int cmd_set_passphrase(int argc, char *argv[])
        if (IS_ERR(c))
                die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c)));
 
-       struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+       struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
        if (!crypt)
                die("Filesystem does not have encryption enabled");
 
@@ -111,6 +111,7 @@ int cmd_set_passphrase(int argc, char *argv[])
                die("error encrypting key");
        crypt->key = new_key;
 
+       bch2_revoke_key(c->disk_sb.sb);
        bch2_write_super(c);
        bch2_fs_stop(c);
        return 0;
@@ -129,7 +130,7 @@ int cmd_remove_passphrase(int argc, char *argv[])
        if (IS_ERR(c))
                die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c)));
 
-       struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+       struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
        if (!crypt)
                die("Filesystem does not have encryption enabled");
 
diff --git a/cmd_kill_btree_node.c b/cmd_kill_btree_node.c
index a8915a1f7c2921f78de255a1a904a2d5508b1751..21b78f1cd5cd6c1d16ec4e000fb89a45aeff8798 100644 (file)
--- a/cmd_kill_btree_node.c
+++ b/cmd_kill_btree_node.c
@@ -11,6 +11,7 @@
 #include "libbcachefs/btree_iter.h"
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/error.h"
+#include "libbcachefs/sb-members.h"
 #include "libbcachefs/super.h"
 
 static void kill_btree_node_usage(void)
@@ -40,7 +41,7 @@ int cmd_kill_btree_node(int argc, char *argv[])
                switch (opt) {
                case 'b':
                        btree_id = read_string_list_or_die(optarg,
-                                               bch2_btree_ids, "btree id");
+                                               __bch2_btree_ids, "btree id");
                        break;
                case 'l':
                        if (kstrtouint(optarg, 10, &level) || level >= BTREE_MAX_DEPTH)
@@ -63,7 +64,7 @@ int cmd_kill_btree_node(int argc, char *argv[])
        if (IS_ERR(c))
                die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c)));
 
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct btree *b;
        int ret;
@@ -73,9 +74,7 @@ int cmd_kill_btree_node(int argc, char *argv[])
        if (ret)
                die("error %s from posix_memalign", bch2_err_str(ret));
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       __for_each_btree_node(&trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
+       __for_each_btree_node(trans, iter, btree_id, POS_MIN, 0, level, 0, b, ret) {
                if (b->c.level != level)
                        continue;
 
@@ -112,8 +111,8 @@ int cmd_kill_btree_node(int argc, char *argv[])
                bch_err(c, "node at specified index not found");
        ret = EXIT_FAILURE;
 done:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
 
        bch2_fs_stop(c);
        return ret;
diff --git a/cmd_list_journal.c b/cmd_list_journal.c
index 655bfe2e686e430416850c0d1fe9a54182371923..70f7e669207ef9d3f7600f75586e7fb983479089 100644 (file)
--- a/cmd_list_journal.c
+++ b/cmd_list_journal.c
@@ -252,7 +252,7 @@ int cmd_list_journal(int argc, char *argv[])
                        darray_push(&transaction_filter, bbpos_parse(optarg));
                        break;
                case 'k':
-                       darray_push(&key_filter, read_string_list_or_die(optarg, bch2_btree_ids, "btree id"));
+                       darray_push(&key_filter, read_string_list_or_die(optarg, __bch2_btree_ids, "btree id"));
                        break;
                case 'v':
                        opt_set(opts, verbose, true);
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 3958ba6bdd23ee8c8d42c51c612767cde06c9a74..cde1fce4397c725baafa3326d10d54799b65ed5f 100644 (file)
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -33,7 +33,7 @@
 #include "libbcachefs/errcode.h"
 #include "libbcachefs/fs-common.h"
 #include "libbcachefs/inode.h"
-#include "libbcachefs/io.h"
+#include "libbcachefs/io_write.h"
 #include "libbcachefs/replicas.h"
 #include "libbcachefs/str_hash.h"
 #include "libbcachefs/super.h"
@@ -126,7 +126,7 @@ static void update_inode(struct bch_fs *c,
        bch2_inode_pack(&packed, inode);
        packed.inode.k.p.snapshot = U32_MAX;
        ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
-                               NULL, NULL, 0);
+                               NULL, 0);
        if (ret)
                die("error updating inode: %s", bch2_err_str(ret));
 }
@@ -140,7 +140,7 @@ static void create_link(struct bch_fs *c,
        struct bch_inode_unpacked inode;
 
        int ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_link_trans(&trans,
+               bch2_link_trans(trans,
                                (subvol_inum) { 1, parent->bi_inum }, &parent_u,
                                (subvol_inum) { 1, inum }, &inode, &qstr));
        if (ret)
@@ -159,7 +159,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
        bch2_inode_init_early(c, &new_inode);
 
        int ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_create_trans(&trans,
+               bch2_create_trans(trans,
                                  (subvol_inum) { 1, parent->bi_inum }, parent,
                                  &new_inode, &qstr,
                                  uid, gid, mode, rdev, NULL, NULL,
@@ -232,7 +232,7 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
                struct bch_inode_unpacked inode_u;
 
                int ret = bch2_trans_do(c, NULL, NULL, 0,
-                               bch2_xattr_set(&trans,
+                               bch2_xattr_set(trans,
                                               (subvol_inum) { 1, dst->bi_inum },
                                               &inode_u, &hash_info, attr,
                                               val, val_size, h->flags, 0));
@@ -339,8 +339,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                        die("error reserving space in new filesystem: %s",
                            bch2_err_str(ret));
 
-               ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
-                                       &res, NULL, 0);
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, 0);
                if (ret)
                        die("btree insert error %s", bch2_err_str(ret));
 
@@ -670,20 +669,24 @@ static int migrate_fs(const char          *fs_path,
        struct dev_opts dev = dev_opts_default();
 
        dev.path = dev_t_to_path(stat.st_dev);
-       dev.fd = xopen(dev.path, O_RDWR);
+       dev.bdev = blkdev_get_by_path(dev.path, BLK_OPEN_READ|BLK_OPEN_WRITE, &dev, NULL);
 
-       opt_set(fs_opts, block_size, get_blocksize(dev.path, dev.fd));
+       opt_set(fs_opts, block_size, get_blocksize(dev.bdev->bd_buffered_fd));
 
        char *file_path = mprintf("%s/bcachefs", fs_path);
        printf("Creating new filesystem on %s in space reserved at %s\n",
               dev.path, file_path);
 
-       bch2_pick_bucket_size(fs_opts, &dev);
+       dev.size        = get_size(dev.bdev->bd_buffered_fd);
+       dev.bucket_size = bch2_pick_bucket_size(fs_opts, &dev);
+       dev.nbuckets    = dev.size / dev.bucket_size;
+
+       bch2_check_bucket_size(fs_opts, &dev);
 
        u64 bcachefs_inum;
        ranges extents = reserve_new_fs_space(file_path,
                                fs_opts.block_size >> 9,
-                               get_size(dev.path, dev.fd) / 5,
+                               get_size(dev.bdev->bd_buffered_fd) / 5,
                                &bcachefs_inum, stat.st_dev, force);
 
        find_superblock_space(extents, format_opts, &dev);
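
The sizing that bch2_pick_bucket_size() used to do internally is now spelled out at the call site: fetch the device size, pick a bucket size, then derive the bucket count. With illustrative numbers, a 1 TiB device and 256 KiB buckets give nbuckets = 2^40 / 2^18 = 2^22 = 4,194,304.
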
diff --git a/cmd_version.c b/cmd_version.c
index 3fb4b6e2c503427d6acddbfec02ee65bc367edd0..5fe30e5e328d033c323301d0f941caac8ccfd2fd 100644 (file)
--- a/cmd_version.c
+++ b/cmd_version.c
@@ -4,6 +4,6 @@
 
 int cmd_version(int argc, char *argv[])
 {
-       printf("bcachefs tool version %s\n", VERSION_STRING);
+       printf("%s\n", VERSION_STRING);
        return 0;
 }
diff --git a/cmds.h b/cmds.h
index 96216b217de7bc1bdf834144be3d67905d4b1655..5b3f5f55d8b9717085d4b1078e48ab40d9a8b8bb 100644 (file)
--- a/cmds.h
+++ b/cmds.h
@@ -13,13 +13,6 @@ int cmd_format(int argc, char *argv[]);
 int cmd_show_super(int argc, char *argv[]);
 int cmd_set_option(int argc, char *argv[]);
 
-#if 0
-int cmd_assemble(int argc, char *argv[]);
-int cmd_incremental(int argc, char *argv[]);
-int cmd_run(int argc, char *argv[]);
-int cmd_stop(int argc, char *argv[]);
-#endif
-
 int cmd_fs_usage(int argc, char *argv[]);
 
 int device_usage(void);
@@ -60,6 +53,7 @@ int cmd_subvolume_delete(int argc, char *argv[]);
 int cmd_subvolume_snapshot(int argc, char *argv[]);
 
 int cmd_fusemount(int argc, char *argv[]);
-void cmd_mount(int agc, char *argv[]);
+int cmd_mount(int argc, char *argv[]);
+int cmd_completions(int argc, char *argv[]);
 
 #endif /* _CMDS_H */
diff --git a/crypto.c b/crypto.c
index 4e4d15a90fe90d7fdbf051cd122c85353c162f69..32671bd84c332e90deb0510743443faf41df6c45 100644 (file)
--- a/crypto.c
+++ b/crypto.c
@@ -105,7 +105,7 @@ bool bch2_sb_is_encrypted(struct bch_sb *sb)
 {
        struct bch_sb_field_crypt *crypt;
 
-       return (crypt = bch2_sb_get_crypt(sb)) &&
+       return (crypt = bch2_sb_field_get(sb, crypt)) &&
                bch2_key_is_encrypted(&crypt->key);
 }
 
@@ -113,7 +113,7 @@ void bch2_passphrase_check(struct bch_sb *sb, const char *passphrase,
                           struct bch_key *passphrase_key,
                           struct bch_encrypted_key *sb_key)
 {
-       struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(sb);
+       struct bch_sb_field_crypt *crypt = bch2_sb_field_get(sb, crypt);
        if (!crypt)
                die("filesystem is not encrypted");
 
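Both call sites above move from the per-field helper bch2_sb_get_crypt() to a generic bch2_sb_field_get() macro keyed on the field name. A plausible shape for such a macro, assuming a type-erased lookup helper along the lines of bch2_sb_field_get_id() (the real definition lives in libbcachefs and may differ):

	#define bch2_sb_field_get(_sb, _name)				\
		((struct bch_sb_field_##_name *)			\
		 bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name))
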
index a1bfec309e4d3e15b5abbe042382aedec8bb324d..f1e4e5e6df80d7a5bc5a446320a990bb5cec44bf 100644 (file)
@@ -1,3 +1,9 @@
+bcachefs-tools (24+really1.3.4-1) unstable; urgency=medium
+
+  * New upstream release
+
+ -- Jonathan Carter <jcc@debian.org>  Tue, 21 Nov 2023 17:26:13 +0200
+
 bcachefs-tools (24+really1.2-1) unstable; urgency=medium
 
   * New upstream release (Closes: #1054613)
index 2cccff28d51c567520e8d511704079b4cc85e07d..80aeb43314821924503d8a287892550224c96268 100644 (file)
@@ -1,10 +1,6 @@
-(import
-  (
-    let lock = builtins.fromJSON (builtins.readFile ./flake.lock); in
-    fetchTarball {
-      url = "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz";
-      sha256 = lock.nodes.flake-compat.locked.narHash;
-    }
-  )
-  { src = ./.; }
-).defaultNix
+(import (let lock = builtins.fromJSON (builtins.readFile ./flake.lock);
+in fetchTarball {
+  url =
+    "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz";
+  sha256 = lock.nodes.flake-compat.locked.narHash;
+}) { src = ./.; }).defaultNix
index 899f2971b904e4b88b52d0c675ef56d810b873d1..413959e45eeea8be21f42e79145114dae1f94fb6 100644 (file)
@@ -3,11 +3,11 @@
     "flake-compat": {
       "flake": false,
       "locked": {
-        "lastModified": 1673956053,
-        "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=",
+        "lastModified": 1696426674,
+        "narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
         "owner": "edolstra",
         "repo": "flake-compat",
-        "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9",
+        "rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
         "type": "github"
       },
       "original": {
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1686592866,
-        "narHash": "sha256-riGg89eWhXJcPNrQGcSwTEEm7CGxWC06oSX44hajeMw=",
+        "lastModified": 1698924604,
+        "narHash": "sha256-GCFbkl2tj8fEZBZCw3Tc0AkGo0v+YrQlohhEGJ/X4s0=",
         "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "0eeebd64de89e4163f4d3cf34ffe925a5cf67a05",
+        "rev": "fa804edfb7869c9fb230e174182a8a1a7e512c40",
         "type": "github"
       },
       "original": {
         "systems": "systems"
       },
       "locked": {
-        "lastModified": 1685518550,
-        "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
+        "lastModified": 1694529238,
+        "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
         "owner": "numtide",
         "repo": "flake-utils",
-        "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
+        "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
         "type": "github"
       },
       "original": {
index 0f8e90d26dcc3337e1adc33b5d0b5ed2efb9576b..c4185e77eac03aadec9497c2ea0abaedef400096 100644 (file)
--- a/flake.nix
+++ b/flake.nix
   };
 
   outputs = { self, nixpkgs, utils, ... }:
-    utils.lib.eachDefaultSystem (system:
+    {
+      overlays.default = final: prev: {
+        bcachefs = final.callPackage ./build.nix { };
+      };
+    } // utils.lib.eachDefaultSystem (system:
       let
-        pkgs = nixpkgs.legacyPackages.${system};
-        bcachefs = pkgs.callPackage ./build.nix {};
+        pkgs = import nixpkgs {
+          inherit system;
+          overlays = [ self.overlays.default ];
+        };
       in {
         packages = {
-          default = bcachefs;
+          inherit (pkgs) bcachefs;
+          default = pkgs.bcachefs;
         };
+
+        formatter = pkgs.nixfmt;
+
+        devShells.default = pkgs.callPackage ({ mkShell, rustc, cargo, gnumake
+          , gcc, clang, pkg-config, libuuid, libsodium, keyutils, liburcu, zlib
+          , libaio, zstd, lz4, udev, bcachefs }:
+          mkShell {
+            LIBCLANG_PATH = "${clang.cc.lib}/lib";
+            inherit (bcachefs) nativeBuildInputs buildInputs;
+          }) { };
       });
 }
diff --git a/fsck.bcachefs b/fsck.bcachefs
deleted file mode 100755 (executable)
index f8de4a8..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-SDIR="$(readlink -f "$0")"
-exec "${SDIR%/*}/bcachefs" fsck "$@"
index f4d047c1505e207049956ba63511b8a7be0681f0..2c983cd4efb871822a0ef5652235759d3bcb4b86 100644 (file)
@@ -47,6 +47,7 @@ typedef struct {
 #define smp_rmb()                      cmm_smp_rmb()
 #define smp_mb()                       cmm_smp_mb()
 #define smp_read_barrier_depends()     cmm_smp_read_barrier_depends()
+#define smp_acquire__after_ctrl_dep()  cmm_smp_mb()
 
 #else /* C11_ATOMICS */
 
@@ -205,6 +206,11 @@ static inline i_type a_type##_dec_return(a_type##_t *v)                    \
        return __ATOMIC_DEC_RETURN(&v->counter);                        \
 }                                                                      \
                                                                        \
+static inline i_type a_type##_dec_return_release(a_type##_t *v)                \
+{                                                                      \
+       return __ATOMIC_SUB_RETURN_RELEASE(1, &v->counter);             \
+}                                                                      \
+                                                                       \
 static inline void a_type##_inc(a_type##_t *v)                         \
 {                                                                      \
        __ATOMIC_INC(&v->counter);                                      \
@@ -318,6 +324,12 @@ static inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
        return atomic64_cmpxchg(v, old, new);
 }
 
+static inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v)
+{
+       smp_mb__before_atomic();
+       return atomic64_sub_return(i, v);
+}
+
 #endif
 
 #endif /* __TOOLS_LINUX_ATOMIC_H */
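
The new release-ordered helpers follow this header's two-tier scheme: the fallback branch, shown above for atomic64_sub_return_release(), issues a full barrier before a relaxed RMW, which is strictly stronger than release ordering, while the C11 branch can map straight to a release-ordered builtin. A sketch of the assumed C11 side (the actual definition sits in the C11_ATOMICS branch of this header and may differ):

	#define __ATOMIC_SUB_RETURN_RELEASE(i, v)			\
		__atomic_sub_fetch((v), (i), __ATOMIC_RELEASE)
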
index 62b91afaf9e0bb275f6cf5666ac53f77937a7a7d..873f08c2e24c8de10d15c800f97ebfb3dbca5263 100644 (file)
@@ -6,38 +6,78 @@
 #include <linux/futex.h>
 #include <urcu/futex.h>
 
+/*
+ * The futex wait op wants an explicit 32-bit address and value. If the bitmap
+ * used for the spinlock is 64-bit, cast down and pass the right 32-bit region
+ * for the in-kernel checks. The value is the copy that has already been read
+ * from the atomic op.
+ *
+ * The futex wake op interprets the value as the number of waiters to wake (up
+ * to INT_MAX), so pass that along directly.
+ */
+static inline void do_futex(int nr, unsigned long *addr, unsigned long v, int futex_flags)
+{
+       u32 *addr32 = (u32 *) addr;
+       u32 *v32 = (u32 *) &v;
+       int shift = 0;
+
+       futex_flags |= FUTEX_PRIVATE_FLAG;
+
+#if BITS_PER_LONG == 64
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+       shift = (nr >= 32) ? 1 : 0;
+#else
+       shift = (nr < 32) ? 1 : 0;
+#endif
+#endif
+       if (shift) {
+               addr32 += shift;
+               v32 += shift;
+       }
+       /*
+        * The shift to determine the futex address may have cast away a
+        * literal wake count value. The value is capped to INT_MAX and thus
+        * always in the low bytes of v regardless of bit nr. Copy in the wake
+        * count to whatever 32-bit range was selected.
+        */
+       if (futex_flags == FUTEX_WAKE_PRIVATE)
+               *v32 = (u32) v;
+       futex(addr32, futex_flags, *v32, NULL, NULL, 0);
+}
+
 static inline void bit_spin_lock(int nr, unsigned long *_addr)
 {
-       u32 mask, *addr = ((u32 *) _addr) + (nr / 32), v;
+       unsigned long mask;
+       unsigned long *addr = _addr + (nr / BITS_PER_LONG);
+       unsigned long v;
 
-       nr &= 31;
-       mask = 1U << nr;
+       nr &= BITS_PER_LONG - 1;
+       mask = 1UL << nr;
 
        while (1) {
                v = __atomic_fetch_or(addr, mask, __ATOMIC_ACQUIRE);
                if (!(v & mask))
                        break;
 
-               futex(addr, FUTEX_WAIT|FUTEX_PRIVATE_FLAG, v, NULL, NULL, 0);
+               do_futex(nr, addr, v, FUTEX_WAIT);
        }
 }
 
 static inline void bit_spin_wake(int nr, unsigned long *_addr)
 {
-       u32 *addr = ((u32 *) _addr) + (nr / 32);
-
-       futex(addr, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, INT_MAX, NULL, NULL, 0);
+       do_futex(nr, _addr, INT_MAX, FUTEX_WAKE);
 }
 
 static inline void bit_spin_unlock(int nr, unsigned long *_addr)
 {
-       u32 mask, *addr = ((u32 *) _addr) + (nr / 32);
+       unsigned long mask;
+       unsigned long *addr = _addr + (nr / BITS_PER_LONG);
 
-       nr &= 31;
-       mask = 1U << nr;
+       nr &= BITS_PER_LONG - 1;
+       mask = 1UL << nr;
 
        __atomic_and_fetch(addr, ~mask, __ATOMIC_RELEASE);
-       futex(addr, FUTEX_WAKE|FUTEX_PRIVATE_FLAG, INT_MAX, NULL, NULL, 0);
+       do_futex(nr, addr, INT_MAX, FUTEX_WAKE);
 }
 
 #endif /* __LINUX_BIT_SPINLOCK_H */
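
The rework widens the lock bitmap operations from u32 to unsigned long, while the futex syscall remains strictly 32-bit, so do_futex() has to point the kernel at whichever half-word actually contains the lock bit. The endianness test reduces to a small address computation; a sketch under the same BITS_PER_LONG == 64 assumption:

	/* Which 32-bit half of *addr holds bit nr (mirrors do_futex() above): */
	static inline u32 *futex_word(unsigned long *addr, int nr)
	{
	#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
		return (u32 *) addr + (nr >= 32);	/* bits 32..63 sit in the second u32 */
	#else
		return (u32 *) addr + (nr < 32);	/* bits 0..31 sit in the second u32 */
	#endif
	}
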
index 7d378ab2cdf0a16587de7de7e74acf03e2e0cfff..39143117c1a9bf1e22a8cec790eb1e88d426257d 100644 (file)
@@ -6,6 +6,8 @@
 #include <linux/kobject.h>
 #include <linux/types.h>
 
+#define MAX_LFS_FILESIZE       ((loff_t)LLONG_MAX)
+
 #define BIO_MAX_VECS   256U
 
 typedef unsigned fmode_t;
@@ -21,30 +23,20 @@ struct user_namespace;
 #define MINOR(dev)     ((unsigned int) ((dev) & MINORMASK))
 #define MKDEV(ma,mi)   (((ma) << MINORBITS) | (mi))
 
-/* file is open for reading */
-#define FMODE_READ             ((__force fmode_t)0x1)
-/* file is open for writing */
-#define FMODE_WRITE            ((__force fmode_t)0x2)
-/* file is seekable */
-#define FMODE_LSEEK            ((__force fmode_t)0x4)
-/* file can be accessed using pread */
-#define FMODE_PREAD            ((__force fmode_t)0x8)
-/* file can be accessed using pwrite */
-#define FMODE_PWRITE           ((__force fmode_t)0x10)
-/* File is opened for execution with sys_execve / sys_uselib */
-#define FMODE_EXEC             ((__force fmode_t)0x20)
-/* File is opened with O_NDELAY (only set for block devices) */
-#define FMODE_NDELAY           ((__force fmode_t)0x40)
-/* File is opened with O_EXCL (only set for block devices) */
-#define FMODE_EXCL             ((__force fmode_t)0x80)
-/* File is opened using open(.., 3, ..) and is writeable only for ioctls
-   (specialy hack for floppy.c) */
-#define FMODE_WRITE_IOCTL      ((__force fmode_t)0x100)
-/* 32bit hashes as llseek() offset (for directories) */
-#define FMODE_32BITHASH         ((__force fmode_t)0x200)
-/* 64bit hashes as llseek() offset (for directories) */
-#define FMODE_64BITHASH         ((__force fmode_t)0x400)
-#define FMODE_BUFFERED         ((__force fmode_t)0x800)
+typedef unsigned int __bitwise blk_mode_t;
+
+/* open for reading */
+#define BLK_OPEN_READ          ((__force blk_mode_t)(1 << 0))
+/* open for writing */
+#define BLK_OPEN_WRITE         ((__force blk_mode_t)(1 << 1))
+/* open exclusively (vs other exclusive openers) */
+#define BLK_OPEN_EXCL          ((__force blk_mode_t)(1 << 2))
+/* opened with O_NDELAY */
+#define BLK_OPEN_NDELAY                ((__force blk_mode_t)(1 << 3))
+/* open for "writes" only for ioctls (special hack for floppy.c) */
+#define BLK_OPEN_WRITE_IOCTL   ((__force blk_mode_t)(1 << 4))
+
+#define BLK_OPEN_BUFFERED      ((__force blk_mode_t)(1 << 5))
 
 struct inode {
        unsigned long           i_ino;
@@ -93,9 +85,14 @@ int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsig
 unsigned bdev_logical_block_size(struct block_device *bdev);
 sector_t get_capacity(struct gendisk *disk);
 
-void blkdev_put(struct block_device *bdev, fmode_t mode);
+struct blk_holder_ops {
+        void (*mark_dead)(struct block_device *bdev);
+};
+
+void blkdev_put(struct block_device *bdev, void *holder);
 void bdput(struct block_device *bdev);
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder);
+struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+                                       void *holder, const struct blk_holder_ops *hop);
 int lookup_bdev(const char *path, dev_t *);
 
 struct super_block {
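
blkdev_get_by_path() thus takes the newer kernel-style arguments: a blk_mode_t instead of an fmode_t, plus a holder cookie and optional blk_holder_ops, with blkdev_put() receiving the same holder rather than a mode. A hypothetical caller under this shim (ERR_PTR-style returns are assumed):

	static void open_example(void)
	{
		static int holder;	/* any stable address can serve as the holder */
		struct block_device *bdev =
			blkdev_get_by_path("/dev/vda", BLK_OPEN_READ|BLK_OPEN_WRITE,
					   &holder, NULL);

		if (IS_ERR(bdev))
			return;
		/* ... I/O through bdev ... */
		blkdev_put(bdev, &holder);	/* same holder cookie as on open */
	}
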
index 722a586bb22444418d31eb80d9d25e89de5d72a2..de7bb47d8a46ace38d95a81ed6df231d91ac725b 100644 (file)
@@ -154,6 +154,7 @@ struct closure {
        struct closure          *parent;
 
        atomic_t                remaining;
+       bool                    closure_get_happened;
 
 #ifdef CONFIG_DEBUG_CLOSURES
 #define CLOSURE_MAGIC_DEAD     0xc054dead
@@ -185,7 +186,11 @@ static inline unsigned closure_nr_remaining(struct closure *cl)
  */
 static inline void closure_sync(struct closure *cl)
 {
-       if (closure_nr_remaining(cl) != 1)
+#ifdef CONFIG_DEBUG_CLOSURES
+       BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+
+       if (cl->closure_get_happened)
                __closure_sync(cl);
 }
 
@@ -233,8 +238,6 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
        closure_set_ip(cl);
        cl->fn = fn;
        cl->wq = wq;
-       /* between atomic_dec() in closure_put() */
-       smp_mb__before_atomic();
 }
 
 static inline void closure_queue(struct closure *cl)
@@ -259,6 +262,8 @@ static inline void closure_queue(struct closure *cl)
  */
 static inline void closure_get(struct closure *cl)
 {
+       cl->closure_get_happened = true;
+
 #ifdef CONFIG_DEBUG_CLOSURES
        BUG_ON((atomic_inc_return(&cl->remaining) &
                CLOSURE_REMAINING_MASK) <= 1);
@@ -281,6 +286,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
                closure_get(parent);
 
        atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+       cl->closure_get_happened = false;
 
        closure_debug_create(cl);
        closure_set_ip(cl);
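
Removing the barrier in set_closure_fn() is paired with the new closure_get_happened flag: closure_sync() now takes the slow __closure_sync() path only when a reference was actually handed out, rather than inferring that from the remaining count. A sketch of the pattern this speeds up (assuming closure_init_stack() clears the flag the same way closure_init() does above; start_async_work() is hypothetical and must eventually closure_put()):

	extern void start_async_work(struct closure *);	/* hypothetical */

	static void maybe_async(struct closure *cl, bool async)
	{
		closure_init_stack(cl);

		if (async) {
			closure_get(cl);	/* sets closure_get_happened */
			start_async_work(cl);
		}

		closure_sync(cl);	/* cheap no-op if closure_get() never ran */
	}
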
index 39df1f16ffc0250cce32a425f11125190299d886..577869062fe27a5602047375c4e297aeb529c710 100644 (file)
 #define unreachable()          __builtin_unreachable()
 #define __same_type(a, b)      __builtin_types_compatible_p(typeof(a), typeof(b))
 #define fallthrough            __attribute__((__fallthrough__))
+#define __noreturn             __attribute__((__noreturn__))
+
+#ifndef __counted_by
+#define __counted_by(nr)
+#endif
 
 #define ___PASTE(a,b) a##b
 #define __PASTE(a,b) ___PASTE(a,b)
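
The fallback no-op __counted_by keeps newer kernel sources building against this shim; where the attribute is supported, it ties a flexible array member to its element-count field so fortified accesses can be bounds-checked. Usage looks like:

	struct tbl {
		u16	nr;
		u32	entries[] __counted_by(nr);	/* nr bounds entries[] */
	};
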
index c74b7376990d53301bfbd55acb414dbb491d8184..8474131647388b954812515930c81b0c83315f3a 100644 (file)
@@ -191,8 +191,8 @@ void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *,
                                size_t, size_t);
 
 /**
- * genradix_iter_peek - get first entry at or below iterator's current
- *                     position
+ * genradix_iter_peek_prev - get first entry at or below iterator's current
+ *                          position
  * @_iter:     a genradix_iter
  * @_radix:    genradix being iterated over
  *
index 35a7207e0495c08d950a991abbebebdf03b9a782..f9a5712938101e176864214c5ef6480275d0303b 100644 (file)
@@ -278,4 +278,7 @@ static inline void dump_stack(void) {}
 #define unsafe_memcpy(dst, src, bytes, justification)          \
        memcpy(dst, src, bytes)
 
+#define DECLARE_FLEX_ARRAY(TYPE, NAME) \
+       __DECLARE_FLEX_ARRAY(TYPE, NAME)
+
 #endif
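
DECLARE_FLEX_ARRAY() wraps a flexible array member so it can appear where a bare "TYPE name[];" declaration is rejected, most notably as the sole member of a union or of an otherwise empty struct. A usage sketch (the struct here is illustrative):

	struct packet {
		size_t	len;
		union {
			u64	align;			/* force 8-byte alignment */
			DECLARE_FLEX_ARRAY(u8, data);	/* payload of len bytes */
		};
	};
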
index bdd09efa7968534149b7c47ec27d5983309424b8..d176d0d3485e5929f31a76b14ac5eb8008fa2fce 100644 (file)
@@ -98,4 +98,15 @@ static inline void hlist_del_init(struct hlist_node *n)
             pos;                                                       \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
 
+static inline size_t list_count_nodes(struct list_head *head)
+{
+       struct list_head *pos;
+       size_t count = 0;
+
+       list_for_each(pos, head)
+               count++;
+
+       return count;
+}
+
 #endif /* _LIST_LIST_H */
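
list_count_nodes() is a plain O(n) walk with no locking of its own; callers should hold whatever lock protects the list and cache the result rather than recount in a loop. A minimal usage sketch:

	static size_t queue_depth(struct list_head *q)
	{
		return list_count_nodes(q);	/* visits every node once */
	}
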
index e2dda66d116d61b5cec84e137f42d219bad1f0f4..111e5e68827cccc09dfd830b54639e47ef998966 100644 (file)
@@ -26,6 +26,9 @@ struct page;
 #define kmap_atomic(page)              page_address(page)
 #define kunmap_atomic(addr)            do {} while (0)
 
+#define kmap_local_page(page)          page_address(page)
+#define kunmap_local(addr)             do {} while (0)
+
 #define PageHighMem(page)              false
 
 static const char zero_page[PAGE_SIZE];
index ef03253158ed3f37d78b7357378a462c609746df..ec5f478f2a78ace640da4fe1cadc58222bfa3fe9 100644 (file)
@@ -12,7 +12,7 @@
 #define rcu_access_pointer(p)          READ_ONCE(p)
 
 #define kfree_rcu(ptr, rcu_head)       kfree(ptr) /* XXX */
-#define kvfree_rcu(ptr)                        kfree(ptr) /* XXX */
+#define kvfree_rcu_mightsleep(ptr)     kfree(ptr) /* XXX */
 
 #define RCU_INIT_POINTER(p, v)         WRITE_ONCE(p, v)
 
index c5c8e3ac10ee7fe64daab7359e85898810ccd25d..7afb6d54bb34101e3b78a9d02ce1196ffa1fbaf6 100644 (file)
@@ -107,7 +107,12 @@ extern __thread struct task_struct *current;
 #define set_current_state(state_value)                 \
        smp_store_mb(current->state, (state_value))
 
-#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
+static inline struct task_struct *get_task_struct(struct task_struct *task)
+{
+       atomic_inc(&task->usage);
+       return task;
+}
 
 extern void __put_task_struct(struct task_struct *t);
 
@@ -146,6 +151,14 @@ static inline u64 ktime_get_seconds(void)
        return ts.tv_sec;
 }
 
+static inline u64 ktime_get_real_ns(void)
+{
+       struct timespec ts;
+
+       clock_gettime(CLOCK_REALTIME, &ts);
+       return timespec_to_ns(&ts);
+}
+
 static inline u64 ktime_get_real_seconds(void)
 {
        struct timespec ts;
diff --git a/include/linux/six.h b/include/linux/six.h
deleted file mode 100644 (file)
index 394da42..0000000
+++ /dev/null
@@ -1,388 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-
-#ifndef _LINUX_SIX_H
-#define _LINUX_SIX_H
-
-/**
- * DOC: SIX locks overview
- *
- * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores
- * but with an additional state: read/shared, intent, exclusive/write
- *
- * The purpose of the intent state is to allow for greater concurrency on tree
- * structures without deadlocking. In general, a read can't be upgraded to a
- * write lock without deadlocking, so an operation that updates multiple nodes
- * will have to take write locks for the full duration of the operation.
- *
- * But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at the start of the operation,
- * and then take write locks only for the actual update to each individual
- * node, without deadlocking.
- *
- * Example usage:
- *   six_lock_read(&foo->lock);
- *   six_unlock_read(&foo->lock);
- *
- * An intent lock must be held before taking a write lock:
- *   six_lock_intent(&foo->lock);
- *   six_lock_write(&foo->lock);
- *   six_unlock_write(&foo->lock);
- *   six_unlock_intent(&foo->lock);
- *
- * Other operations:
- *   six_trylock_read()
- *   six_trylock_intent()
- *   six_trylock_write()
- *
- *   six_lock_downgrade()      convert from intent to read
- *   six_lock_tryupgrade()     attempt to convert from read to intent, may fail
- *
- * There are also interfaces that take the lock type as an enum:
- *
- *   six_lock_type(&foo->lock, SIX_LOCK_read);
- *   six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent)
- *   six_lock_type(&foo->lock, SIX_LOCK_write);
- *   six_unlock_type(&foo->lock, SIX_LOCK_write);
- *   six_unlock_type(&foo->lock, SIX_LOCK_intent);
- *
- * Lock sequence numbers - unlock(), relock():
- *
- *   Locks embed sequence numbers, which are incremented on write lock/unlock.
- *   This allows locks to be dropped and then retaken iff the state they protect
- *   hasn't changed; this makes it much easier to avoid holding locks while e.g.
- *   doing IO or allocating memory.
- *
- *   Example usage:
- *     six_lock_read(&foo->lock);
- *     u32 seq = six_lock_seq(&foo->lock);
- *     six_unlock_read(&foo->lock);
- *
- *     some_operation_that_may_block();
- *
- *     if (six_relock_read(&foo->lock, seq)) { ... }
- *
- *   If the relock operation succeeds, it is as if the lock was never unlocked.
- *
- * Reentrancy:
- *
- *   Six locks are not by themselves reentrant, but have counters for both the
- *   read and intent states that can be used to provide reentrancy by an upper
- *   layer that tracks held locks. If a lock is known to already be held in the
- *   read or intent state, six_lock_increment() can be used to bump the "lock
- *   held in this state" counter, increasing the number of unlock calls that
- *   will be required to fully unlock it.
- *
- *   Example usage:
- *     six_lock_read(&foo->lock);
- *     six_lock_increment(&foo->lock, SIX_LOCK_read);
- *     six_unlock_read(&foo->lock);
- *     six_unlock_read(&foo->lock);
- *   foo->lock is now fully unlocked.
- *
- *   Since the intent state supersedes read, it's legal to increment the read
- *   counter when holding an intent lock, but not the reverse.
- *
- *   A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write)
- *   is not legal.
- *
- * should_sleep_fn:
- *
- *   There is a six_lock() variant that takes a function pointer that is called
- *   immediately prior to schedule() when blocking, and may return an error to
- *   abort.
- *
- *   One possible use for this feature is when objects being locked are part of
- *   a cache and may be reused, and lock ordering is based on a property of the
- *   object that will change when the object is reused - i.e. logical key order.
- *
- *   If looking up an object in the cache may race with object reuse, and lock
- *   ordering is required to prevent deadlock, object reuse may change the
- *   correct lock order for that object and cause a deadlock. should_sleep_fn
- *   can be used to check if the object is still the object we want and avoid
- *   this deadlock.
- *
- * Wait list entry interface:
- *
- *   There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a
- *   wait list entry. By embedding six_lock_waiter into another object, and by
- *   traversing lock waitlists, it is then possible for an upper layer to
- *   implement full cycle detection for deadlock avoidance.
- *
- *   should_sleep_fn should be used for invoking the cycle detector, walking the
- *   graph of held locks to check for a deadlock. The upper layer must track
- *   held locks for each thread, and each thread's held locks must be reachable
- *   from its six_lock_waiter object.
- *
- *   six_lock_waiter() will add the wait object to the waitlist before retrying
- *   the lock and before calling should_sleep_fn; the wait object will not be
- *   removed from the waitlist until either the lock has been successfully
- *   acquired, or we aborted because should_sleep_fn returned an error.
- *
- *   Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will
- *   have timestamps in strictly ascending order - this is so the timestamp can
- *   be used as a cursor for lock graph traversal.
- */
-
-#include <linux/lockdep.h>
-#include <linux/osq_lock.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-
-enum six_lock_type {
-       SIX_LOCK_read,
-       SIX_LOCK_intent,
-       SIX_LOCK_write,
-};
-
-struct six_lock {
-       atomic_t                state;
-       u32                     seq;
-       unsigned                intent_lock_recurse;
-       struct task_struct      *owner;
-       unsigned __percpu       *readers;
-       struct optimistic_spin_queue osq;
-       raw_spinlock_t          wait_lock;
-       struct list_head        wait_list;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lockdep_map      dep_map;
-#endif
-};
-
-struct six_lock_waiter {
-       struct list_head        list;
-       struct task_struct      *task;
-       enum six_lock_type      lock_want;
-       bool                    lock_acquired;
-       u64                     start_time;
-};
-
-typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
-
-void six_lock_exit(struct six_lock *lock);
-
-enum six_lock_init_flags {
-       SIX_LOCK_INIT_PCPU      = 1U << 0,
-};
-
-void __six_lock_init(struct six_lock *lock, const char *name,
-                    struct lock_class_key *key, enum six_lock_init_flags flags);
-
-/**
- * six_lock_init - initialize a six lock
- * @lock:      lock to initialize
- * @flags:     optional flags, i.e. SIX_LOCK_INIT_PCPU
- */
-#define six_lock_init(lock, flags)                                     \
-do {                                                                   \
-       static struct lock_class_key __key;                             \
-                                                                       \
-       __six_lock_init((lock), #lock, &__key, flags);                  \
-} while (0)
-
-/**
- * six_lock_seq - obtain current lock sequence number
- * @lock:      six_lock to obtain sequence number for
- *
- * @lock should be held for read or intent, and not write
- *
- * By saving the lock sequence number, we can unlock @lock and then (typically
- * after some blocking operation) attempt to relock it: the relock will succeed
- * if the sequence number hasn't changed, meaning no write locks have been taken
- * and state corresponding to what @lock protects is still valid.
- */
-static inline u32 six_lock_seq(const struct six_lock *lock)
-{
-       return lock->seq;
-}
-
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_trylock_type - attempt to take a six lock without blocking
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
-{
-       return six_trylock_ip(lock, type, _THIS_IP_);
-}
-
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
-                      struct six_lock_waiter *wait,
-                      six_lock_should_sleep_fn should_sleep_fn, void *p,
-                      unsigned long ip);
-
-/**
- * six_lock_waiter - take a lock, with full waitlist interface
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait:      pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *             to scheduling
- * @p:         passed through to @should_sleep_fn
- *
- * This is a convenience wrapper around six_lock_ip_waiter(), see that function
- * for full documentation.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type,
-                                 struct six_lock_waiter *wait,
-                                 six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-       return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-/**
- * six_lock_ip - take a six lock
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *             to scheduling
- * @p:         passed through to @should_sleep_fn
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type,
-                             six_lock_should_sleep_fn should_sleep_fn, void *p,
-                             unsigned long ip)
-{
-       struct six_lock_waiter wait;
-
-       return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip);
-}
-
-/**
- * six_lock_type - take a six lock
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *             to scheduling
- * @p:         passed through to @should_sleep_fn
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
-                               six_lock_should_sleep_fn should_sleep_fn, void *p)
-{
-       struct six_lock_waiter wait;
-
-       return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_);
-}
-
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
-                  unsigned seq, unsigned long ip);
-
-/**
- * six_relock_type - attempt to re-take a lock that was held previously
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq:       lock sequence number obtained from six_lock_seq() while lock was
- *             held previously
- *
- * Return: true on success, false on failure.
- */
-static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
-                                  unsigned seq)
-{
-       return six_relock_ip(lock, type, seq, _THIS_IP_);
-}
-
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip);
-
-/**
- * six_unlock_type - drop a six lock
- * @lock:      lock to unlock
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock);                          read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read);      read count 2
- * six_lock_unlock(&foo->lock, SIX_LOCK_read);         read count 1
- * six_lock_unlock(&foo->lock, SIX_LOCK_read);         read count 0
- */
-static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
-       six_unlock_ip(lock, type, _THIS_IP_);
-}
-
-#define __SIX_LOCK(type)                                               \
-static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\
-{                                                                      \
-       return six_trylock_ip(lock, SIX_LOCK_##type, ip);               \
-}                                                                      \
-                                                                       \
-static inline bool six_trylock_##type(struct six_lock *lock)           \
-{                                                                      \
-       return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_);        \
-}                                                                      \
-                                                                       \
-static inline int six_lock_ip_waiter_##type(struct six_lock *lock,     \
-                          struct six_lock_waiter *wait,                \
-                          six_lock_should_sleep_fn should_sleep_fn, void *p,\
-                          unsigned long ip)                            \
-{                                                                      \
-       return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\
-}                                                                      \
-                                                                       \
-static inline int six_lock_ip_##type(struct six_lock *lock,            \
-                   six_lock_should_sleep_fn should_sleep_fn, void *p,  \
-                   unsigned long ip)                                   \
-{                                                                      \
-       return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\
-}                                                                      \
-                                                                       \
-static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\
-{                                                                      \
-       return six_relock_ip(lock, SIX_LOCK_##type, seq, ip);           \
-}                                                                      \
-                                                                       \
-static inline bool six_relock_##type(struct six_lock *lock, u32 seq)   \
-{                                                                      \
-       return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_);    \
-}                                                                      \
-                                                                       \
-static inline int six_lock_##type(struct six_lock *lock,               \
-                                 six_lock_should_sleep_fn fn, void *p)\
-{                                                                      \
-       return six_lock_ip_##type(lock, fn, p, _THIS_IP_);              \
-}                                                                      \
-                                                                       \
-static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip)       \
-{                                                                      \
-       six_unlock_ip(lock, SIX_LOCK_##type, ip);                       \
-}                                                                      \
-                                                                       \
-static inline void six_unlock_##type(struct six_lock *lock)            \
-{                                                                      \
-       six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_);                \
-}
-
-__SIX_LOCK(read)
-__SIX_LOCK(intent)
-__SIX_LOCK(write)
-#undef __SIX_LOCK
-
-void six_lock_downgrade(struct six_lock *);
-bool six_lock_tryupgrade(struct six_lock *);
-bool six_trylock_convert(struct six_lock *, enum six_lock_type,
-                        enum six_lock_type);
-
-void six_lock_increment(struct six_lock *, enum six_lock_type);
-
-void six_lock_wakeup_all(struct six_lock *);
-
-struct six_lock_count {
-       unsigned n[3];
-};
-
-struct six_lock_count six_lock_counts(struct six_lock *);
-void six_lock_readers_add(struct six_lock *, int);
-
-#endif /* _LINUX_SIX_H */
index 25ccf1a742049bebcac5a1d87a9340cb8f34a477..ca0c7934d3bbc0443e54b52e188dfd6d87d3d6c9 100644 (file)
@@ -27,7 +27,8 @@ static inline void *kmalloc_noprof(size_t size, gfp_t flags)
 
        for (i = 0; i < 10; i++) {
                if (size) {
-                       size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE);
+                       size_t alignment = min_t(size_t, PAGE_SIZE,
+                                                rounddown_pow_of_two(size));
                        alignment = max(sizeof(void *), alignment);
                        if (posix_memalign(&p, alignment, size))
                                p = NULL;
index bac772b22b332c2f1a49e7fbfeb129a216f56a84..123109f8dcbefaa695b560638d065c1f1c4c7ad5 100644 (file)
@@ -64,44 +64,44 @@ static u64 min_size(unsigned bucket_size)
        return BCH_MIN_NR_NBUCKETS * bucket_size;
 }
 
-void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
+u64 bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
 {
-       if (!dev->size)
-               dev->size = get_size(dev->path, dev->fd);
+       u64 bucket_size;
 
-       if (!dev->bucket_size) {
-               if (dev->size < min_size(opts.block_size))
-                       die("cannot format %s, too small (%llu bytes, min %llu)",
-                           dev->path, dev->size, min_size(opts.block_size));
+       if (dev->size < min_size(opts.block_size))
+               die("cannot format %s, too small (%llu bytes, min %llu)",
+                   dev->path, dev->size, min_size(opts.block_size));
 
-               /* Bucket size must be >= block size: */
-               dev->bucket_size = opts.block_size;
+       /* Bucket size must be >= block size: */
+       bucket_size = opts.block_size;
 
-               /* Bucket size must be >= btree node size: */
-               if (opt_defined(opts, btree_node_size))
-                       dev->bucket_size = max_t(unsigned, dev->bucket_size,
-                                                opts.btree_node_size);
+       /* Bucket size must be >= btree node size: */
+       if (opt_defined(opts, btree_node_size))
+               bucket_size = max_t(unsigned, bucket_size,
+                                        opts.btree_node_size);
 
-               /* Want a bucket size of at least 128k, if possible: */
-               dev->bucket_size = max(dev->bucket_size, 128ULL << 10);
+       /* Want a bucket size of at least 128k, if possible: */
+       bucket_size = max(bucket_size, 128ULL << 10);
 
-               if (dev->size >= min_size(dev->bucket_size)) {
-                       unsigned scale = max(1,
-                               ilog2(dev->size / min_size(dev->bucket_size)) / 4);
+       if (dev->size >= min_size(bucket_size)) {
+               unsigned scale = max(1,
+                       ilog2(dev->size / min_size(bucket_size)) / 4);
 
-                       scale = rounddown_pow_of_two(scale);
+               scale = rounddown_pow_of_two(scale);
 
-                       /* max bucket size 1 mb */
-                       dev->bucket_size = min(dev->bucket_size * scale, 1ULL << 20);
-               } else {
-                       do {
-                               dev->bucket_size /= 2;
-                       } while (dev->size < min_size(dev->bucket_size));
-               }
+               /* max bucket size 1 mb */
+               bucket_size = min(bucket_size * scale, 1ULL << 20);
+       } else {
+               do {
+                       bucket_size /= 2;
+               } while (dev->size < min_size(bucket_size));
        }
 
-       dev->nbuckets = dev->size / dev->bucket_size;
+       return bucket_size;
+}
 
+void bch2_check_bucket_size(struct bch_opts opts, struct dev_opts *dev)
+{
        if (dev->bucket_size < opts.block_size)
                die("Bucket size (%llu) cannot be smaller than block size (%u)",
                    dev->bucket_size, opts.block_size);
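
bch2_pick_bucket_size() is now a pure function of the options and the device size, with the sanity checks split out into bch2_check_bucket_size(). The scaling step is easiest to see with numbers; taking BCH_MIN_NR_NBUCKETS as 64 purely for illustration (the real constant is defined elsewhere):

	/*
	 * 1 TiB device, 4 KiB blocks, no btree_node_size override:
	 *
	 *   bucket_size = max(4 KiB, 128 KiB)                     = 128 KiB
	 *   min_size(128 KiB) = 64 * 128 KiB                      = 8 MiB
	 *   scale = rounddown_pow_of_two(ilog2(1 TiB / 8 MiB) / 4)
	 *         = rounddown_pow_of_two(17 / 4)                  = 4
	 *   bucket_size = min(128 KiB * 4, 1 MiB)                 = 512 KiB
	 *   nbuckets    = 1 TiB / 512 KiB                         = 2097152
	 */
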
@@ -150,13 +150,12 @@ struct bch_sb *bch2_format(struct bch_opt_strs    fs_opt_strs,
 {
        struct bch_sb_handle sb = { NULL };
        struct dev_opts *i;
-       struct bch_sb_field_members *mi;
        unsigned max_dev_block_size = 0;
        unsigned opt_id;
+       u64 min_bucket_size = U64_MAX;
 
        for (i = devs; i < devs + nr_devs; i++)
-               max_dev_block_size = max(max_dev_block_size,
-                                        get_blocksize(i->path, i->fd));
+               max_dev_block_size = max(max_dev_block_size, get_blocksize(i->bdev->bd_buffered_fd));
 
        /* calculate block size: */
        if (!opt_defined(fs_opts, block_size)) {
@@ -165,9 +164,24 @@ struct bch_sb *bch2_format(struct bch_opt_strs     fs_opt_strs,
                die("blocksize too small: %u, must be greater than device blocksize %u",
                    fs_opts.block_size, max_dev_block_size);
 
+       /* get device size, if it wasn't specified: */
+       for (i = devs; i < devs + nr_devs; i++)
+               if (!i->size)
+                       i->size = get_size(i->bdev->bd_buffered_fd);
+
        /* calculate bucket sizes: */
        for (i = devs; i < devs + nr_devs; i++)
-               bch2_pick_bucket_size(fs_opts, i);
+               min_bucket_size = min(min_bucket_size,
+                       i->bucket_size ?: bch2_pick_bucket_size(fs_opts, i));
+
+       for (i = devs; i < devs + nr_devs; i++)
+               if (!i->bucket_size)
+                       i->bucket_size = min_bucket_size;
+
+       for (i = devs; i < devs + nr_devs; i++) {
+               i->nbuckets = i->size / i->bucket_size;
+               bch2_check_bucket_size(fs_opts, i);
+       }
 
        /* calculate btree node size: */
        if (!opt_defined(fs_opts, btree_node_size)) {
@@ -193,7 +207,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
        sb.sb->nr_devices       = nr_devs;
 
        if (opts.version == bcachefs_metadata_version_current)
-               sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+               sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
 
        uuid_generate(sb.sb->uuid.b);
 
@@ -222,12 +236,13 @@ struct bch_sb *bch2_format(struct bch_opt_strs    fs_opt_strs,
        sb.sb->time_precision   = cpu_to_le32(1);
 
        /* Member info: */
-       mi = bch2_sb_resize_members(&sb,
+       struct bch_sb_field_members_v2 *mi =
+               bch2_sb_field_resize(&sb, members_v2,
                        (sizeof(*mi) + sizeof(struct bch_member) *
                        nr_devs) / sizeof(u64));
-
+       mi->member_bytes = cpu_to_le16(sizeof(struct bch_member));
        for (i = devs; i < devs + nr_devs; i++) {
-               struct bch_member *m = mi->members + (i - devs);
+               struct bch_member *m = bch2_members_v2_get_mut(sb.sb, (i - devs));
 
                uuid_generate(m->uuid.b);
                m->nbuckets     = cpu_to_le64(i->nbuckets);
@@ -255,9 +270,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
                 * Recompute mi and m after each sb modification: its location
                 * in memory may have changed due to reallocation.
                 */
-               mi = bch2_sb_get_members(sb.sb);
-               m = mi->members + (i - devs);
-
+               m = bch2_members_v2_get_mut(sb.sb, (i - devs));
                SET_BCH_MEMBER_GROUP(m, idx + 1);
        }
 
@@ -273,12 +286,14 @@ struct bch_sb *bch2_format(struct bch_opt_strs    fs_opt_strs,
        /* Crypt: */
        if (opts.encrypted) {
                struct bch_sb_field_crypt *crypt =
-                       bch2_sb_resize_crypt(&sb, sizeof(*crypt) / sizeof(u64));
+                       bch2_sb_field_resize(&sb, crypt, sizeof(*crypt) / sizeof(u64));
 
                bch_sb_crypt_init(sb.sb, crypt, opts.passphrase);
                SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1);
        }
 
+       bch2_sb_members_cpy_v2_v1(&sb);
+
        for (i = devs; i < devs + nr_devs; i++) {
                u64 size_sectors = i->size >> 9;
 
@@ -312,12 +327,12 @@ struct bch_sb *bch2_format(struct bch_opt_strs    fs_opt_strs,
                        /* Zero start of disk */
                        static const char zeroes[BCH_SB_SECTOR << 9];
 
-                       xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0,
+                       xpwrite(i->bdev->bd_buffered_fd, zeroes, BCH_SB_SECTOR << 9, 0,
                                "zeroing start of disk");
                }
 
-               bch2_super_write(i->fd, sb.sb);
-               close(i->fd);
+               bch2_super_write(i->bdev->bd_buffered_fd, sb.sb);
+               close(i->bdev->bd_buffered_fd);
        }
 
        return sb.sb;
@@ -510,7 +525,7 @@ int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd)
                case BCH_DATA_btree:
                case BCH_DATA_user:
                        printf(" %s:%llu:%llu",
-                              bch2_btree_ids[e.p.btree_id],
+                              bch2_btree_id_str(e.p.btree_id),
                               e.p.pos.inode,
                               e.p.pos.offset);
                }
index ba5d380768e48db25daa2438e6240ccaccc8e530..a16ae8669bdf331d697f31ec6c05a1a6670dccc2 100644 (file)
@@ -52,7 +52,7 @@ static inline struct format_opts format_opts_default()
 }
 
 struct dev_opts {
-       int             fd;
+       struct block_device *bdev;
        char            *path;
        u64             size;           /* bytes*/
        u64             bucket_size;    /* bytes */
@@ -75,7 +75,9 @@ static inline struct dev_opts dev_opts_default()
        };
 }
 
-void bch2_pick_bucket_size(struct bch_opts, struct dev_opts *);
+u64 bch2_pick_bucket_size(struct bch_opts, struct dev_opts *);
+void bch2_check_bucket_size(struct bch_opts, struct dev_opts *);
+
 struct bch_sb *bch2_format(struct bch_opt_strs,
                           struct bch_opts,
                           struct format_opts, struct dev_opts *, size_t);
index b1a488860678cd9242f84e7ab5ddd2b1724cd26f..f3809897f00a7d5c98c7f33f36bc2fd587939dcc 100644 (file)
@@ -1,18 +1,71 @@
 // SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
 
 #include "bcachefs.h"
 
-#include <linux/fs.h>
+#include "acl.h"
+#include "xattr.h"
+
 #include <linux/posix_acl.h>
+
+static const char * const acl_types[] = {
+       [ACL_USER_OBJ]  = "user_obj",
+       [ACL_USER]      = "user",
+       [ACL_GROUP_OBJ] = "group_obj",
+       [ACL_GROUP]     = "group",
+       [ACL_MASK]      = "mask",
+       [ACL_OTHER]     = "other",
+       NULL,
+};
+
+void bch2_acl_to_text(struct printbuf *out, const void *value, size_t size)
+{
+       const void *p, *end = value + size;
+
+       if (!value ||
+           size < sizeof(bch_acl_header) ||
+           ((bch_acl_header *)value)->a_version != cpu_to_le32(BCH_ACL_VERSION))
+               return;
+
+       p = value + sizeof(bch_acl_header);
+       while (p < end) {
+               const bch_acl_entry *in = p;
+               unsigned tag = le16_to_cpu(in->e_tag);
+
+               prt_str(out, acl_types[tag]);
+
+               switch (tag) {
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       p += sizeof(bch_acl_entry_short);
+                       break;
+               case ACL_USER:
+                       prt_printf(out, " uid %u", le32_to_cpu(in->e_id));
+                       p += sizeof(bch_acl_entry);
+                       break;
+               case ACL_GROUP:
+                       prt_printf(out, " gid %u", le32_to_cpu(in->e_id));
+                       p += sizeof(bch_acl_entry);
+                       break;
+               }
+
+               prt_printf(out, " %o", le16_to_cpu(in->e_perm));
+
+               if (p != end)
+                       prt_char(out, ' ');
+       }
+}
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
+#include "fs.h"
+
+#include <linux/fs.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 
-#include "acl.h"
-#include "fs.h"
-#include "xattr.h"
-
 static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
 {
        return sizeof(bch_acl_header) +
@@ -226,18 +279,16 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
        struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c_xattr xattr;
        struct posix_acl *acl = NULL;
        struct bkey_s_c k;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
+       ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
                        &hash, inode_inum(inode), &search, 0);
        if (ret) {
                if (!bch2_err_matches(ret, ENOENT))
@@ -253,7 +304,7 @@ retry:
        }
 
        xattr = bkey_s_c_to_xattr(k);
-       acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v),
+       acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
                        le16_to_cpu(xattr.v->x_val_len));
 
        if (!IS_ERR(acl))
@@ -262,8 +313,8 @@ out:
        if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return acl;
 }
 
@@ -303,7 +354,7 @@ int bch2_set_acl(struct mnt_idmap *idmap,
 {
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl;
@@ -311,12 +362,11 @@ int bch2_set_acl(struct mnt_idmap *idmap,
        int ret;
 
        mutex_lock(&inode->ei_update_lock);
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
        acl = _acl;
 
-       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+       ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
                              BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
@@ -329,30 +379,30 @@ retry:
                        goto btree_err;
        }
 
-       ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
+       ret = bch2_set_acl_trans(trans, inode_inum(inode), &inode_u, acl, type);
        if (ret)
                goto btree_err;
 
        inode_u.bi_ctime        = bch2_current_time(c);
        inode_u.bi_mode         = mode;
 
-       ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL, NULL, 0);
+       ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+               bch2_trans_commit(trans, NULL, NULL, 0);
 btree_err:
-       bch2_trans_iter_exit(&trans, &inode_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
 
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (unlikely(ret))
                goto err;
 
-       bch2_inode_update_after_write(&trans, inode, &inode_u,
+       bch2_inode_update_after_write(trans, inode, &inode_u,
                                      ATTR_CTIME|ATTR_MODE);
 
        set_cached_acl(&inode->v, type, acl);
 err:
-       bch2_trans_exit(&trans);
        mutex_unlock(&inode->ei_update_lock);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -367,7 +417,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
        struct btree_iter iter;
        struct bkey_s_c_xattr xattr;
        struct bkey_i_xattr *new;
-       struct posix_acl *acl;
+       struct posix_acl *acl = NULL;
        struct bkey_s_c k;
        int ret;
 
@@ -377,9 +427,10 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
                return bch2_err_matches(ret, ENOENT) ? 0 : ret;
 
        k = bch2_btree_iter_peek_slot(&iter);
-       xattr = bkey_s_c_to_xattr(k);
+       ret = bkey_err(k);
        if (ret)
                goto err;
+       xattr = bkey_s_c_to_xattr(k);
 
        acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
                        le16_to_cpu(xattr.v->x_val_len));
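
The acl.c hunks also carry the tree-wide transaction API conversion: on-stack struct btree_trans objects set up with bch2_trans_init()/bch2_trans_exit() become handles obtained from bch2_trans_get() and released with bch2_trans_put(). The mechanical shape of the conversion, as applied throughout this file:

	/* before: */
	struct btree_trans trans;
	bch2_trans_init(&trans, c, 0, 0);
	/* ... pass &trans around ... */
	bch2_trans_exit(&trans);

	/* after: */
	struct btree_trans *trans = bch2_trans_get(c);
	/* ... pass trans around ... */
	bch2_trans_put(trans);
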
index bb21d8d696a2fc3806d9ee1e353999ccd424d99b..27e7eec0f278c63784ec9520ffc31c7fad8cd7eb 100644 (file)
@@ -7,8 +7,6 @@ struct bch_hash_info;
 struct bch_inode_info;
 struct posix_acl;
 
-#ifdef CONFIG_BCACHEFS_POSIX_ACL
-
 #define BCH_ACL_VERSION        0x0001
 
 typedef struct {
@@ -26,6 +24,10 @@ typedef struct {
        __le32          a_version;
 } bch_acl_header;
 
+void bch2_acl_to_text(struct printbuf *, const void *, size_t);
+
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+
 struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
 
 int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
index 7bf2a50653eb70629a380edf5daefb2310eb64df..113273b214645ff5ac43f508ed2d168ccd1c1743 100644 (file)
@@ -192,146 +192,109 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
        return DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
-int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
 {
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+       int ret = 0;
 
        /* allow for unknown fields */
-       if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
-               prt_printf(err, "incorrect value size (%zu < %u)",
-                      bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
+                        alloc_v1_val_size_bad,
+                        "incorrect value size (%zu < %u)",
+                        bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
+fsck_err:
+       return ret;
 }
 
-int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
 {
        struct bkey_alloc_unpacked u;
+       int ret = 0;
 
-       if (bch2_alloc_unpack_v2(&u, k)) {
-               prt_printf(err, "unpack error");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
+                        alloc_v2_unpack_error,
+                        "unpack error");
+fsck_err:
+       return ret;
 }
 
-int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
 {
        struct bkey_alloc_unpacked u;
+       int ret = 0;
 
-       if (bch2_alloc_unpack_v3(&u, k)) {
-               prt_printf(err, "unpack error");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
+                        alloc_v2_unpack_error,
+                        "unpack error");
+fsck_err:
+       return ret;
 }
 
-int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         enum bkey_invalid_flags flags,
-                         struct printbuf *err)
+int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
+                         enum bkey_invalid_flags flags, struct printbuf *err)
 {
        struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
-       int rw = flags & WRITE;
-
-       if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
-               prt_printf(err, "bad val size (%u > %lu)",
-                      alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
-           BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
-               prt_printf(err, "invalid backpointers_start");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (rw == WRITE &&
-           !(flags & BKEY_INVALID_JOURNAL) &&
-           c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) {
-               unsigned i, bp_len = 0;
+       int ret = 0;
 
-               for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++)
-                       bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len;
+       bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err,
+                        alloc_v4_val_size_bad,
+                        "bad val size (%u > %zu)",
+                        alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
 
-               if (bp_len > a.v->dirty_sectors) {
-                       prt_printf(err, "too many backpointers");
-                       return -BCH_ERR_invalid_bkey;
-               }
-       }
+       bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
+                        BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
+                        alloc_v4_backpointers_start_bad,
+                        "invalid backpointers_start");
 
-       if (rw == WRITE) {
-               if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
-                       prt_printf(err, "invalid data type (got %u should be %u)",
-                              a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
-                       return -BCH_ERR_invalid_bkey;
-               }
+       bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
+                        alloc_key_data_type_bad,
+                        "invalid data type (got %u should be %u)",
+                        a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
 
-               switch (a.v->data_type) {
-               case BCH_DATA_free:
-               case BCH_DATA_need_gc_gens:
-               case BCH_DATA_need_discard:
-                       if (a.v->dirty_sectors ||
-                           a.v->cached_sectors ||
-                           a.v->stripe) {
-                               prt_printf(err, "empty data type free but have data");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-                       break;
-               case BCH_DATA_sb:
-               case BCH_DATA_journal:
-               case BCH_DATA_btree:
-               case BCH_DATA_user:
-               case BCH_DATA_parity:
-                       if (!a.v->dirty_sectors) {
-                               prt_printf(err, "data_type %s but dirty_sectors==0",
-                                      bch2_data_types[a.v->data_type]);
-                               return -BCH_ERR_invalid_bkey;
-                       }
-                       break;
-               case BCH_DATA_cached:
-                       if (!a.v->cached_sectors ||
-                           a.v->dirty_sectors ||
-                           a.v->stripe) {
-                               prt_printf(err, "data type inconsistency");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-
-                       if (!a.v->io_time[READ] &&
-                           c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) {
-                               prt_printf(err, "cached bucket with read_time == 0");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-                       break;
-               case BCH_DATA_stripe:
-                       if (!a.v->stripe) {
-                               prt_printf(err, "data_type %s but stripe==0",
-                                      bch2_data_types[a.v->data_type]);
-                               return -BCH_ERR_invalid_bkey;
-                       }
-                       break;
-               }
+       switch (a.v->data_type) {
+       case BCH_DATA_free:
+       case BCH_DATA_need_gc_gens:
+       case BCH_DATA_need_discard:
+               bkey_fsck_err_on(a.v->dirty_sectors ||
+                                a.v->cached_sectors ||
+                                a.v->stripe, c, err,
+                                alloc_key_empty_but_have_data,
+                                "empty data type free but have data");
+               break;
+       case BCH_DATA_sb:
+       case BCH_DATA_journal:
+       case BCH_DATA_btree:
+       case BCH_DATA_user:
+       case BCH_DATA_parity:
+               bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
+                                alloc_key_dirty_sectors_0,
+                                "data_type %s but dirty_sectors==0",
+                                bch2_data_types[a.v->data_type]);
+               break;
+       case BCH_DATA_cached:
+               bkey_fsck_err_on(!a.v->cached_sectors ||
+                                a.v->dirty_sectors ||
+                                a.v->stripe, c, err,
+                                alloc_key_cached_inconsistency,
+                                "data type inconsistency");
+
+               bkey_fsck_err_on(!a.v->io_time[READ] &&
+                                c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
+                                c, err,
+                                alloc_key_cached_but_read_time_zero,
+                                "cached bucket with read_time == 0");
+               break;
+       case BCH_DATA_stripe:
+               break;
        }
-
-       return 0;
-}
-
-static inline u64 swab40(u64 x)
-{
-       return (((x & 0x00000000ffULL) << 32)|
-               ((x & 0x000000ff00ULL) << 16)|
-               ((x & 0x0000ff0000ULL) >>  0)|
-               ((x & 0x00ff000000ULL) >> 16)|
-               ((x & 0xff00000000ULL) >> 32));
+fsck_err:
+       return ret;
 }
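
The pattern above repeats for every key validator in this patch: an open-coded prt_printf() plus `return -BCH_ERR_invalid_bkey` becomes a single bkey_fsck_err_on() call that also records a named, per-error counter in the superblock, which is why each call site now carries an identifier such as alloc_v4_val_size_bad and each converted function ends in a `fsck_err:` label. A rough sketch of the macro shape these call sites imply (the real definition lives in libbcachefs/error.h; the bch2_sb_error_count() call here is an assumption based on the new error names):

/*
 * Sketch only: prints into the caller's printbuf, bumps the named
 * superblock error counter, and bails out through the caller's
 * fsck_err label, using the caller's 'ret' variable.
 */
#define bkey_fsck_err(c, _err_msg, _err_type, ...)			\
do {									\
	prt_printf(_err_msg, __VA_ARGS__);				\
	bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type);		\
	ret = -BCH_ERR_invalid_bkey;					\
	goto fsck_err;							\
} while (0)

#define bkey_fsck_err_on(cond, c, _err_msg, _err_type, ...)		\
do {									\
	if (unlikely(cond))						\
		bkey_fsck_err(c, _err_msg, _err_type, __VA_ARGS__);	\
} while (0)
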
 
 void bch2_alloc_v4_swab(struct bkey_s k)
@@ -347,6 +310,7 @@ void bch2_alloc_v4_swab(struct bkey_s k)
        a->io_time[1]           = swab64(a->io_time[1]);
        a->stripe               = swab32(a->stripe);
        a->nr_external_backpointers = swab32(a->nr_external_backpointers);
+       a->fragmentation_lru    = swab64(a->fragmentation_lru);
 
        bps = alloc_v4_backpointers(a);
        for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
@@ -544,17 +508,18 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
                : 0;
 }
 
-int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
                             enum bkey_invalid_flags flags,
                             struct printbuf *err)
 {
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
-               prt_printf(err, "bad val size (%lu != %zu)",
-                      bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
 
-       return 0;
+       bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
+                        bucket_gens_val_size_bad,
+                        "bad val size (%zu != %zu)",
+                        bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
+fsck_err:
+       return ret;
 }
 
 void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
@@ -571,7 +536,7 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke
 
 int bch2_bucket_gens_init(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_alloc_v4 a;
@@ -582,9 +547,7 @@ int bch2_bucket_gens_init(struct bch_fs *c)
        u8 gen;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                /*
                 * Not a fsck error because this is checked/repaired by
@@ -597,10 +560,10 @@ int bch2_bucket_gens_init(struct bch_fs *c)
                pos = alloc_gens_pos(iter.pos, &offset);
 
                if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
-                       ret = commit_do(&trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_LAZY_RW,
-                               __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+                       ret = commit_do(trans, NULL, NULL,
+                                       BCH_TRANS_COMMIT_no_enospc|
+                                       BCH_TRANS_COMMIT_lazy_rw,
+                               bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
                        if (ret)
                                break;
                        have_bucket_gens_key = false;
@@ -614,15 +577,15 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 
                g.v.gens[offset] = gen;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (have_bucket_gens_key && !ret)
-               ret = commit_do(&trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_LAZY_RW,
-                       __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0));
+               ret = commit_do(trans, NULL, NULL,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               BCH_TRANS_COMMIT_lazy_rw,
+                       bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
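
This hunk also introduces the transaction-lifecycle conversion that repeats through the rest of the patch: on-stack transactions set up with bch2_trans_init() and torn down with bch2_trans_exit() become pointers obtained from bch2_trans_get() and released with bch2_trans_put(), and every helper now takes the pointer directly instead of `&trans`. Schematically (a sketch of the pattern, not a real function):

static int example_pattern(struct bch_fs *c)
{
	/* old: struct btree_trans trans; bch2_trans_init(&trans, c, 0, 0); */
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;

	/* iterators, commits etc. now take 'trans' directly */

	bch2_trans_put(trans);		/* old: bch2_trans_exit(&trans); */
	return ret;
}
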
@@ -631,20 +594,19 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 
 int bch2_alloc_read(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
        int ret;
 
        down_read(&c->gc_lock);
-       bch2_trans_init(&trans, c, 0, 0);
 
        if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
                const struct bch_bucket_gens *g;
                u64 b;
 
-               for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+               for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
                                   BTREE_ITER_PREFETCH, k, ret) {
                        u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
                        u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
@@ -668,11 +630,11 @@ int bch2_alloc_read(struct bch_fs *c)
                             b++)
                                *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
                }
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
        } else {
                struct bch_alloc_v4 a;
 
-               for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+               for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
                                   BTREE_ITER_PREFETCH, k, ret) {
                        /*
                         * Not a fsck error because this is checked/repaired by
@@ -685,10 +647,10 @@ int bch2_alloc_read(struct bch_fs *c)
 
                        *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
                }
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        up_read(&c->gc_lock);
 
        if (ret)
@@ -753,7 +715,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans,
                        "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
                        "  for %s",
                        set ? "setting" : "clearing",
-                       bch2_btree_ids[btree],
+                       bch2_btree_id_str(btree),
                        iter.pos.inode,
                        iter.pos.offset,
                        bch2_bkey_types[old.k->type],
@@ -1012,6 +974,7 @@ int bch2_check_alloc_key(struct btree_trans *trans,
        int ret;
 
        if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
+                       alloc_key_to_missing_dev_bucket,
                        "alloc key for invalid device:bucket %llu:%llu",
                        alloc_k.k->p.inode, alloc_k.k->p.offset))
                return bch2_btree_delete_at(trans, alloc_iter, 0);
@@ -1031,7 +994,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 
        if (k.k->type != discard_key_type &&
            (c->opts.reconstruct_alloc ||
-            fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
+            fsck_err(c, need_discard_key_wrong,
+                     "incorrect key in need_discard btree (got %s should be %s)\n"
                      "  %s",
                      bch2_bkey_types[k.k->type],
                      bch2_bkey_types[discard_key_type],
@@ -1061,7 +1025,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 
        if (k.k->type != freespace_key_type &&
            (c->opts.reconstruct_alloc ||
-            fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
+            fsck_err(c, freespace_key_wrong,
+                     "incorrect key in freespace btree (got %s should be %s)\n"
                      "  %s",
                      bch2_bkey_types[k.k->type],
                      bch2_bkey_types[freespace_key_type],
@@ -1092,7 +1057,8 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 
        if (a->gen != alloc_gen(k, gens_offset) &&
            (c->opts.reconstruct_alloc ||
-            fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n"
+            fsck_err(c, bucket_gens_key_wrong,
+                     "incorrect gen in bucket_gens btree (got %u should be %u)\n"
                      "  %s",
                      alloc_gen(k, gens_offset), a->gen,
                      (printbuf_reset(&buf),
@@ -1150,7 +1116,8 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
 
        if (k.k->type != KEY_TYPE_set &&
            (c->opts.reconstruct_alloc ||
-            fsck_err(c, "hole in alloc btree missing in freespace btree\n"
+            fsck_err(c, freespace_hole_missing,
+                     "hole in alloc btree missing in freespace btree\n"
                      "  device %llu buckets %llu-%llu",
                      freespace_iter->pos.inode,
                      freespace_iter->pos.offset,
@@ -1213,6 +1180,7 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
 
                for (i = gens_offset; i < gens_end_offset; i++) {
                        if (fsck_err_on(g.v.gens[i], c,
+                                       bucket_gens_hole_wrong,
                                        "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
                                        bucket_gens_pos_to_alloc(k.k->p, i).inode,
                                        bucket_gens_pos_to_alloc(k.k->p, i).offset,
@@ -1223,15 +1191,15 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
                }
 
                if (need_update) {
-                       struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g));
+                       struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
 
-                       ret = PTR_ERR_OR_ZERO(k);
+                       ret = PTR_ERR_OR_ZERO(u);
                        if (ret)
                                goto err;
 
-                       memcpy(k, &g, sizeof(g));
+                       memcpy(u, &g, sizeof(g));
 
-                       ret = bch2_trans_update(trans, bucket_gens_iter, k, 0);
+                       ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
                        if (ret)
                                goto err;
                }
@@ -1270,8 +1238,9 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr
                return ret;
 
        if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+                       need_discard_freespace_key_to_invalid_dev_bucket,
                        "entry in %s btree for nonexistant dev:bucket %llu:%llu",
-                       bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
+                       bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
                goto delete;
 
        a = bch2_alloc_to_v4(alloc_k, &a_convert);
@@ -1279,9 +1248,10 @@ static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_tr
        if (fsck_err_on(a->data_type != state ||
                        (state == BCH_DATA_free &&
                         genbits != alloc_freespace_genbits(*a)), c,
+                       need_discard_freespace_key_bad,
                        "%s\n  incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
                        (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
-                       bch2_btree_ids[iter->btree_id],
+                       bch2_btree_id_str(iter->btree_id),
                        iter->pos.inode,
                        iter->pos.offset,
                        a->data_type == state,
@@ -1297,7 +1267,7 @@ delete:
        ret =   bch2_btree_delete_extent_at(trans, iter,
                        iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                       BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+                       BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw);
        goto out;
 }
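
Here and throughout the patch, the commit flags move out of the BTREE_INSERT_* namespace: BTREE_INSERT_NOFAIL becomes BCH_TRANS_COMMIT_no_enospc and BTREE_INSERT_LAZY_RW becomes BCH_TRANS_COMMIT_lazy_rw, a pure rename as far as this diff shows. The post-rename commit call used repeatedly in the fsck paths:

ret = bch2_trans_commit(trans, NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc|
			BCH_TRANS_COMMIT_lazy_rw);
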
 
@@ -1308,7 +1278,7 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans,
        if (!btree_id_is_extents(iter->btree_id)) {
                return __bch2_check_discard_freespace_key(trans, iter);
        } else {
-               int ret;
+               int ret = 0;
 
                while (!bkey_eq(iter->pos, end) &&
                       !(ret = btree_trans_too_many_iters(trans) ?:
@@ -1346,6 +1316,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
        dev_exists = bch2_dev_exists2(c, k.k->p.inode);
        if (!dev_exists) {
                if (fsck_err_on(!dev_exists, c,
+                               bucket_gens_to_invalid_dev,
                                "bucket_gens key for invalid device:\n  %s",
                                (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                        ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1356,6 +1327,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
        ca = bch_dev_bkey_exists(c, k.k->p.inode);
        if (fsck_err_on(end <= ca->mi.first_bucket ||
                        start >= ca->mi.nbuckets, c,
+                       bucket_gens_to_invalid_buckets,
                        "bucket_gens key for invalid buckets:\n  %s",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                ret = bch2_btree_delete_at(trans, iter, 0);
@@ -1364,6 +1336,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
 
        for (b = start; b < ca->mi.first_bucket; b++)
                if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+                               bucket_gens_nonzero_for_invalid_buckets,
                                "bucket_gens key has nonzero gen for invalid bucket")) {
                        g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
                        need_update = true;
@@ -1371,21 +1344,21 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans,
 
        for (b = ca->mi.nbuckets; b < end; b++)
                if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
+                               bucket_gens_nonzero_for_invalid_buckets,
                                "bucket_gens key has nonzero gen for invalid bucket")) {
                        g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
                        need_update = true;
                }
 
        if (need_update) {
-               struct bkey_i *k;
+               struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));
 
-               k = bch2_trans_kmalloc(trans, sizeof(g));
-               ret = PTR_ERR_OR_ZERO(k);
+               ret = PTR_ERR_OR_ZERO(u);
                if (ret)
                        goto out;
 
-               memcpy(k, &g, sizeof(g));
-               ret = bch2_trans_update(trans, iter, k, 0);
+               memcpy(u, &g, sizeof(g));
+               ret = bch2_trans_update(trans, iter, u, 0);
        }
 out:
 fsck_err:
@@ -1395,27 +1368,25 @@ fsck_err:
 
 int bch2_check_alloc_info(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
        struct bkey hole;
        struct bkey_s_c k;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
                             BTREE_ITER_PREFETCH);
-       bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
+       bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
                             BTREE_ITER_PREFETCH);
-       bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
+       bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
                             BTREE_ITER_PREFETCH);
-       bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
+       bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
                             BTREE_ITER_PREFETCH);
 
        while (1) {
                struct bpos next;
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
                ret = bkey_err(k);
@@ -1428,7 +1399,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
                if (k.k->type) {
                        next = bpos_nosnap_successor(k.k->p);
 
-                       ret = bch2_check_alloc_key(&trans,
+                       ret = bch2_check_alloc_key(trans,
                                                   k, &iter,
                                                   &discard_iter,
                                                   &freespace_iter,
@@ -1438,11 +1409,11 @@ int bch2_check_alloc_info(struct bch_fs *c)
                } else {
                        next = k.k->p;
 
-                       ret = bch2_check_alloc_hole_freespace(&trans,
+                       ret = bch2_check_alloc_hole_freespace(trans,
                                                    bkey_start_pos(k.k),
                                                    &next,
                                                    &freespace_iter) ?:
-                               bch2_check_alloc_hole_bucket_gens(&trans,
+                               bch2_check_alloc_hole_bucket_gens(trans,
                                                    bkey_start_pos(k.k),
                                                    &next,
                                                    &bucket_gens_iter);
@@ -1450,9 +1421,9 @@ int bch2_check_alloc_info(struct bch_fs *c)
                                goto bkey_err;
                }
 
-               ret = bch2_trans_commit(&trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_LAZY_RW);
+               ret = bch2_trans_commit(trans, NULL, NULL,
+                                       BCH_TRANS_COMMIT_no_enospc|
+                                       BCH_TRANS_COMMIT_lazy_rw);
                if (ret)
                        goto bkey_err;
 
@@ -1463,29 +1434,29 @@ bkey_err:
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &bucket_gens_iter);
-       bch2_trans_iter_exit(&trans, &freespace_iter);
-       bch2_trans_iter_exit(&trans, &discard_iter);
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &bucket_gens_iter);
+       bch2_trans_iter_exit(trans, &freespace_iter);
+       bch2_trans_iter_exit(trans, &discard_iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret < 0)
                goto err;
 
-       ret = for_each_btree_key2(&trans, iter,
+       ret = for_each_btree_key2(trans, iter,
                        BTREE_ID_need_discard, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-               bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
-             for_each_btree_key2(&trans, iter,
+               bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+             for_each_btree_key2(trans, iter,
                        BTREE_ID_freespace, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-               bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?:
-             for_each_btree_key_commit(&trans, iter,
+               bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
+             for_each_btree_key_commit(trans, iter,
                        BTREE_ID_bucket_gens, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-               bch2_check_bucket_gens_key(&trans, &iter, k));
+                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+               bch2_check_bucket_gens_key(trans, &iter, k));
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -1524,11 +1495,13 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
                return ret;
 
        if (fsck_err_on(!a->io_time[READ], c,
+                       alloc_key_cached_but_read_time_zero,
                        "cached bucket with read_time 0\n"
                        "  %s",
                (printbuf_reset(&buf),
                 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
            fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+                       alloc_key_to_missing_lru_entry,
                        "missing lru entry\n"
                        "  %s",
                        (printbuf_reset(&buf),
@@ -1571,10 +1544,10 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
        int ret = 0;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
                                POS_MIN, BTREE_ITER_PREFETCH, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-                       bch2_check_alloc_to_lru_ref(&trans, &iter)));
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+                       bch2_check_alloc_to_lru_ref(trans, &iter)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
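
bch2_trans_run(), used here and in the other fsck entry points, wraps the get/put pair around a single transactional expression and is what binds the name `trans` that the expression refers to. Approximately (a sketch; the real macro lives with the btree iterator helpers):

/* Sketch: why 'trans' can appear unbound inside bch2_trans_run(c, ...) */
#define bch2_trans_run(_c, _do)						\
({									\
	struct btree_trans *trans = bch2_trans_get(_c);			\
	int _ret = (_do);						\
	bch2_trans_put(trans);						\
	_ret;								\
})
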
@@ -1682,7 +1655,7 @@ write:
        ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BCH_WATERMARK_btree|
-                                 BTREE_INSERT_NOFAIL);
+                                 BCH_TRANS_COMMIT_no_enospc);
        if (ret)
                goto out;
 
@@ -1699,29 +1672,25 @@ out:
 static void bch2_do_discards_work(struct work_struct *work)
 {
        struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
        struct bpos discard_pos_done = POS_MAX;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        /*
         * We're doing the commit in bch2_discard_one_bucket instead of using
         * for_each_btree_key_commit() so that we can increment counters after
         * successful commit:
         */
-       ret = for_each_btree_key2(&trans, iter,
-                       BTREE_ID_need_discard, POS_MIN, 0, k,
-               bch2_discard_one_bucket(&trans, &iter, &discard_pos_done,
-                                       &seen,
-                                       &open,
-                                       &need_journal_commit,
-                                       &discarded));
-
-       bch2_trans_exit(&trans);
+       ret = bch2_trans_run(c,
+               for_each_btree_key2(trans, iter,
+                               BTREE_ID_need_discard, POS_MIN, 0, k,
+                       bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
+                                               &seen,
+                                               &open,
+                                               &need_journal_commit,
+                                               &discarded)));
 
        if (need_journal_commit * 2 > seen)
                bch2_journal_flush_async(&c->journal, NULL);
@@ -1791,7 +1760,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
                                BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BCH_WATERMARK_btree|
-                                 BTREE_INSERT_NOFAIL);
+                                 BCH_TRANS_COMMIT_no_enospc);
        if (ret)
                goto out;
 
@@ -1827,15 +1796,13 @@ static void bch2_do_invalidates_work(struct work_struct *work)
 {
        struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
        struct bch_dev *ca;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        unsigned i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       ret = bch2_btree_write_buffer_flush(&trans);
+       ret = bch2_btree_write_buffer_flush(trans);
        if (ret)
                goto err;
 
@@ -1843,11 +1810,11 @@ static void bch2_do_invalidates_work(struct work_struct *work)
                s64 nr_to_invalidate =
                        should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
 
-               ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru,
+               ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
                                lru_pos(ca->dev_idx, 0, 0),
                                lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
                                BTREE_ITER_INTENT, k,
-                       invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate));
+                       invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));
 
                if (ret < 0) {
                        percpu_ref_put(&ca->ref);
@@ -1855,7 +1822,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
                }
        }
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
@@ -1866,34 +1833,36 @@ void bch2_do_invalidates(struct bch_fs *c)
                bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
 }
 
-static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
-                                  unsigned long *last_updated)
+int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
+                           u64 bucket_start, u64 bucket_end)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey hole;
-       struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets);
+       struct bpos end = POS(ca->dev_idx, bucket_end);
        struct bch_member *m;
+       unsigned long last_updated = jiffies;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       BUG_ON(bucket_start > bucket_end);
+       BUG_ON(bucket_end > ca->mi.nbuckets);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc,
-                            POS(ca->dev_idx, ca->mi.first_bucket),
-                            BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+               POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
+               BTREE_ITER_PREFETCH);
        /*
         * Scan the alloc btree for every bucket on @ca, and add buckets to the
         * freespace/need_discard/need_gc_gens btrees as needed:
         */
        while (1) {
-               if (*last_updated + HZ * 10 < jiffies) {
+               if (last_updated + HZ * 10 < jiffies) {
                        bch_info(ca, "%s: currently at %llu/%llu",
                                 __func__, iter.pos.offset, ca->mi.nbuckets);
-                       *last_updated = jiffies;
+                       last_updated = jiffies;
                }
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                if (bkey_ge(iter.pos, end)) {
                        ret = 0;
@@ -1913,10 +1882,10 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
                        struct bch_alloc_v4 a_convert;
                        const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
 
-                       ret =   bch2_bucket_do_index(&trans, k, a, true) ?:
-                               bch2_trans_commit(&trans, NULL, NULL,
-                                                 BTREE_INSERT_LAZY_RW|
-                                                 BTREE_INSERT_NOFAIL);
+                       ret =   bch2_bucket_do_index(trans, k, a, true) ?:
+                               bch2_trans_commit(trans, NULL, NULL,
+                                                 BCH_TRANS_COMMIT_lazy_rw|
+                                                 BCH_TRANS_COMMIT_no_enospc);
                        if (ret)
                                goto bkey_err;
 
@@ -1924,7 +1893,7 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
                } else {
                        struct bkey_i *freespace;
 
-                       freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace));
+                       freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
                        ret = PTR_ERR_OR_ZERO(freespace);
                        if (ret)
                                goto bkey_err;
@@ -1934,10 +1903,10 @@ static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
                        freespace->k.p          = k.k->p;
                        freespace->k.size       = k.k->size;
 
-                       ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?:
-                               bch2_trans_commit(&trans, NULL, NULL,
-                                                 BTREE_INSERT_LAZY_RW|
-                                                 BTREE_INSERT_NOFAIL);
+                       ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
+                               bch2_trans_commit(trans, NULL, NULL,
+                                                 BCH_TRANS_COMMIT_lazy_rw|
+                                                 BCH_TRANS_COMMIT_no_enospc);
                        if (ret)
                                goto bkey_err;
 
@@ -1950,16 +1919,16 @@ bkey_err:
                        break;
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
 
        if (ret < 0) {
-               bch_err(ca, "error initializing free space: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "initializing free space");
                return ret;
        }
 
        mutex_lock(&c->sb_lock);
-       m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
+       m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
        SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
        mutex_unlock(&c->sb_lock);
 
@@ -1972,7 +1941,6 @@ int bch2_fs_freespace_init(struct bch_fs *c)
        unsigned i;
        int ret = 0;
        bool doing_init = false;
-       unsigned long last_updated = jiffies;
 
        /*
         * We can crash during the device add path, so we need to check this on
@@ -1988,7 +1956,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
                        doing_init = true;
                }
 
-               ret = bch2_dev_freespace_init(c, ca, &last_updated);
+               ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
                if (ret) {
                        percpu_ref_put(&ca->ref);
                        bch_err_fn(c, ret);
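
Since bch2_dev_freespace_init() now takes bucket_start/bucket_end and is exported (see the header hunk below), callers can index just a slice of a device's buckets instead of always walking first_bucket..nbuckets. A hypothetical caller after growing a device, where old_nbuckets is an assumed local holding the previous bucket count:

/* Hypothetical: only index the buckets added by a resize. */
int ret = bch2_dev_freespace_init(c, ca, old_nbuckets, ca->mi.nbuckets);
if (ret)
	bch_err_msg(ca, ret, "initializing freespace for resized device");
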
@@ -2109,6 +2077,17 @@ void bch2_recalc_capacity(struct bch_fs *c)
        closure_wake_up(&c->freelist_wait);
 }
 
+u64 bch2_min_rw_member_capacity(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+       u64 ret = U64_MAX;
+
+       for_each_rw_member(ca, c, i)
+               ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
+       return ret;
+}
+
 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
 {
        struct open_bucket *ob;
index c0914feb54b5faecaca91dbb845880456947a84f..73faf99a222aac3b33035432666e4d9b272c6fe9 100644 (file)
@@ -149,13 +149,13 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s
 
 int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
 
-int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v1_invalid(struct bch_fs *, struct bkey_s_c,
                          enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v2_invalid(struct bch_fs *, struct bkey_s_c,
                          enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v3_invalid(struct bch_fs *, struct bkey_s_c,
                          enum bkey_invalid_flags, struct printbuf *);
-int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_alloc_v4_invalid(struct bch_fs *, struct bkey_s_c,
                          enum bkey_invalid_flags, struct printbuf *);
 void bch2_alloc_v4_swab(struct bkey_s);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -193,7 +193,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
        .min_val_size   = 48,                           \
 })
 
-int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_bucket_gens_invalid(struct bch_fs *, struct bkey_s_c,
                             enum bkey_invalid_flags, struct printbuf *);
 void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
@@ -245,9 +245,11 @@ static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct
        return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a));
 }
 
+int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64);
 int bch2_fs_freespace_init(struct bch_fs *);
 
 void bch2_recalc_capacity(struct bch_fs *);
+u64 bch2_min_rw_member_capacity(struct bch_fs *);
 
 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
index 1f4c5b38562d9cabb4b886b794c9750d7e64b1a8..b85c7765272f6e4ae5e8aceb5a4bbaa89c535912 100644 (file)
@@ -25,7 +25,7 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "movinggc.h"
 #include "nocow_locking.h"
@@ -399,12 +399,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans,
                        struct bucket_alloc_state *s,
                        struct closure *cl)
 {
-       struct btree_iter iter;
-       struct bkey_s_c k;
+       struct btree_iter iter, citer;
+       struct bkey_s_c k, ck;
        struct open_bucket *ob = NULL;
-       u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
-       u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
+       u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
+       u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
+       u64 alloc_cursor = alloc_start;
        int ret;
+
+       /*
+        * Scan with an uncached iterator to avoid polluting the key cache. An
+        * uncached iter will return a cached key if one exists, but if not
+        * there is no other underlying protection for the associated key cache
+        * slot. To avoid racing bucket allocations, look up the cached key slot
+        * of any likely allocation candidate before attempting to proceed with
+        * the allocation. This provides proper exclusion on the associated
+        * bucket.
+        */
 again:
        for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
                           BTREE_ITER_SLOTS, k, ret) {
@@ -419,25 +430,38 @@ again:
                        continue;
 
                a = bch2_alloc_to_v4(k, &a_convert);
-
                if (a->data_type != BCH_DATA_free)
                        continue;
 
+               /* now check the cached key to serialize concurrent allocs of the bucket */
+               ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
+               ret = bkey_err(ck);
+               if (ret)
+                       break;
+
+               a = bch2_alloc_to_v4(ck, &a_convert);
+               if (a->data_type != BCH_DATA_free)
+                       goto next;
+
                s->buckets_seen++;
 
                ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
+next:
+               citer.path->preserve = false;
+               bch2_trans_iter_exit(trans, &citer);
                if (ob)
                        break;
        }
        bch2_trans_iter_exit(trans, &iter);
 
+       alloc_cursor = iter.pos.offset;
        ca->alloc_cursor = alloc_cursor;
 
        if (!ob && ret)
                ob = ERR_PTR(ret);
 
-       if (!ob && alloc_cursor > alloc_start) {
-               alloc_cursor = alloc_start;
+       if (!ob && alloc_start > first_bucket) {
+               alloc_cursor = alloc_start = first_bucket;
                goto again;
        }
 
@@ -502,9 +526,14 @@ again:
 }
 
 /**
- * bch_bucket_alloc - allocate a single bucket from a specific device
+ * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
+ * @trans:     transaction object
+ * @ca:                device to allocate from
+ * @watermark: how important is this allocation?
+ * @cl:                if not NULL, closure to be used to wait if buckets not available
+ * @usage:     for returning the current device usage
  *
- * Returns index of bucket on success, 0 on failure
+ * Returns:    an open_bucket on success, or an ERR_PTR() on failure.
  */
 static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
                                      struct bch_dev *ca,
@@ -597,7 +626,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
        struct open_bucket *ob;
 
        bch2_trans_do(c, NULL, NULL, 0,
-                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark,
+                     PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
                                                        cl, &usage)));
        return ob;
 }
@@ -775,7 +804,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
        struct dev_alloc_list devs_sorted;
        struct ec_stripe_head *h;
        struct open_bucket *ob;
-       struct bch_dev *ca;
        unsigned i, ec_idx;
        int ret = 0;
 
@@ -805,8 +833,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans,
                }
        goto out_put_head;
 got_bucket:
-       ca = bch_dev_bkey_exists(c, ob->dev);
-
        ob->ec_idx      = ec_idx;
        ob->ec          = h->s;
        ec_stripe_new_get(h->s, STRIPE_REF_io);
@@ -989,7 +1015,6 @@ retry_blocking:
                        cl = _cl;
                        goto retry_blocking;
                }
-
        }
 
        return ret;
@@ -1031,6 +1056,19 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
        return ret < 0 ? ret : 0;
 }
 
+/**
+ * should_drop_bucket - check if this open_bucket should go away
+ * @ob:                open_bucket to predicate on
+ * @c:         filesystem handle
+ * @ca:                if set, we're killing buckets for a particular device
+ * @ec:                if true, we're shutting down erasure coding and killing all ec
+ *             open_buckets
+ *             if neither @ca nor @ec is set, every open_bucket matches
+ * Returns: true if we should kill this open_bucket
+ *
+ * We're killing open_buckets because we're shutting down a device, erasure
+ * coding, or the entire filesystem - check if this open_bucket matches:
+ */
 static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
                               struct bch_dev *ca, bool ec)
 {
@@ -1516,25 +1554,47 @@ static const char * const bch2_write_point_states[] = {
        NULL
 };
 
+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
+                                    struct write_point *wp)
+{
+       struct open_bucket *ob;
+       unsigned i;
+
+       prt_printf(out, "%lu: ", wp->write_point);
+       prt_human_readable_u64(out, wp->sectors_allocated);
+
+       prt_printf(out, " last wrote: ");
+       bch2_pr_time_units(out, sched_clock() - wp->last_used);
+
+       for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
+               prt_printf(out, " %s: ", bch2_write_point_states[i]);
+               bch2_pr_time_units(out, wp->time[i]);
+       }
+
+       prt_newline(out);
+
+       printbuf_indent_add(out, 2);
+       open_bucket_for_each(c, &wp->ptrs, ob, i)
+               bch2_open_bucket_to_text(out, c, ob);
+       printbuf_indent_sub(out, 2);
+}
+
 void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct write_point *wp;
-       unsigned i;
 
+       prt_str(out, "Foreground write points\n");
        for (wp = c->write_points;
             wp < c->write_points + ARRAY_SIZE(c->write_points);
-            wp++) {
-               prt_printf(out, "%lu: ", wp->write_point);
-               prt_human_readable_u64(out, wp->sectors_allocated);
+            wp++)
+               bch2_write_point_to_text(out, c, wp);
 
-               prt_printf(out, " last wrote: ");
-               bch2_pr_time_units(out, sched_clock() - wp->last_used);
+       prt_str(out, "Copygc write point\n");
+       bch2_write_point_to_text(out, c, &c->copygc_write_point);
 
-               for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
-                       prt_printf(out, " %s: ", bch2_write_point_states[i]);
-                       bch2_pr_time_units(out, wp->time[i]);
-               }
+       prt_str(out, "Rebalance write point\n");
+       bch2_write_point_to_text(out, c, &c->rebalance_write_point);
 
-               prt_newline(out);
-       }
+       prt_str(out, "Btree write point\n");
+       bch2_write_point_to_text(out, c, &c->btree_write_point);
 }
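
The factored-out bch2_write_point_to_text() lets this report cover the copygc, rebalance, and btree write points alongside the foreground ones. A caller drives it the same way the other *_to_text() helpers in this patch are driven (sketch, using the printbuf idioms visible above):

struct printbuf buf = PRINTBUF;

bch2_write_points_to_text(&buf, c);
printk(KERN_INFO "%s", buf.buf);	/* or hand buf.buf to sysfs */
printbuf_exit(&buf);
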
index fee195f7eabfce5b6ef4dea079a8c98b88c9eebb..7aaeec44c7466cfd8aebf4d7daa9c099ff751fd2 100644 (file)
@@ -5,7 +5,7 @@
 #include "bcachefs.h"
 #include "alloc_types.h"
 #include "extents.h"
-#include "super.h"
+#include "sb-members.h"
 
 #include <linux/hash.h>
 
index 804a843f23c16ea4100baba6690d589ab98368e1..b91b7a46105608d089828db3bd65d1cc359475af 100644 (file)
@@ -105,7 +105,7 @@ struct write_point {
                struct dev_stripe_state stripe;
 
                u64                     sectors_allocated;
-       } __attribute__((__aligned__(SMP_CACHE_BYTES)));
+       } __aligned(SMP_CACHE_BYTES);
 
        struct {
                struct work_struct      index_update_work;
@@ -116,7 +116,7 @@ struct write_point {
                enum write_point_state  state;
                u64                     last_state_change;
                u64                     time[WRITE_POINT_STATE_NR];
-       } __attribute__((__aligned__(SMP_CACHE_BYTES)));
+       } __aligned(SMP_CACHE_BYTES);
 };
 
 struct write_point_specifier {
index 8747c5e19f9997f1a11b0c32c093c1b3ba602f17..4c8bcf23bb27194875191f0214a583b2340de3db 100644 (file)
@@ -5,6 +5,7 @@
 #include "backpointers.h"
 #include "btree_cache.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "btree_write_buffer.h"
 #include "error.h"
 
@@ -37,25 +38,26 @@ static bool extent_matches_bp(struct bch_fs *c,
        return false;
 }
 
-int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
                             enum bkey_invalid_flags flags,
                             struct printbuf *err)
 {
        struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
        struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
+       int ret = 0;
 
-       if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) {
-               prt_str(err, "backpointer at wrong pos");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+                        c, err,
+                        backpointer_pos_wrong,
+                        "backpointer at wrong pos");
+fsck_err:
+       return ret;
 }
 
 void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp)
 {
        prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=",
-              bch2_btree_ids[bp->btree_id],
+              bch2_btree_id_str(bp->btree_id),
               bp->level,
               (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT),
               (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT),
@@ -76,7 +78,7 @@ void bch2_backpointer_swab(struct bkey_s k)
 {
        struct bkey_s_backpointer bp = bkey_s_to_backpointer(k);
 
-       bp.v->bucket_offset     = swab32(bp.v->bucket_offset);
+       bp.v->bucket_offset     = swab40(bp.v->bucket_offset);
        bp.v->bucket_len        = swab32(bp.v->bucket_len);
        bch2_bpos_swab(&bp.v->pos);
 }
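
The swab fix above matters because bucket_offset is a 40-bit bitfield, so neither swab32() nor swab64() byte-reverses it correctly. swab40(), deleted from alloc_background.c earlier in this patch and presumably now shared from a header, reverses exactly five bytes:

static inline u64 swab40(u64 x)
{
	return (((x & 0x00000000ffULL) << 32)|
		((x & 0x000000ff00ULL) << 16)|
		((x & 0x0000ff0000ULL) >>  0)|
		((x & 0x00ff000000ULL) >> 16)|
		((x & 0xff00000000ULL) >> 32));
}
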
@@ -219,18 +221,22 @@ out:
 static void backpointer_not_found(struct btree_trans *trans,
                                  struct bpos bp_pos,
                                  struct bch_backpointer bp,
-                                 struct bkey_s_c k,
-                                 const char *thing_it_points_to)
+                                 struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
        struct printbuf buf = PRINTBUF;
        struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
 
+       /*
+        * If we're using the btree write buffer, the backpointer we were
+        * looking at may have already been deleted - failure to find what it
+        * pointed to is not an error:
+        */
        if (likely(!bch2_backpointers_no_use_write_buffer))
                return;
 
        prt_printf(&buf, "backpointer doesn't match %s it points to:\n  ",
-                  thing_it_points_to);
+                  bp.level ? "btree node" : "extent");
        prt_printf(&buf, "bucket: ");
        bch2_bpos_to_text(&buf, bucket);
        prt_printf(&buf, "\n  ");
@@ -256,56 +262,37 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans,
                                         struct bch_backpointer bp,
                                         unsigned iter_flags)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_root *r = bch2_btree_id_root(c, bp.btree_id);
-       struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
-       struct bkey_s_c k;
-
-       bch2_trans_node_iter_init(trans, iter,
-                                 bp.btree_id,
-                                 bp.pos,
-                                 0,
-                                 min(bp.level, r->level),
-                                 iter_flags);
-       k = bch2_btree_iter_peek_slot(iter);
-       if (bkey_err(k)) {
-               bch2_trans_iter_exit(trans, iter);
-               return k;
-       }
-
-       if (bp.level == r->level + 1)
-               k = bkey_i_to_s_c(&r->key);
-
-       if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
-               return k;
-
-       bch2_trans_iter_exit(trans, iter);
+       if (likely(!bp.level)) {
+               struct bch_fs *c = trans->c;
+               struct bpos bucket = bp_pos_to_bucket(c, bp_pos);
+               struct bkey_s_c k;
+
+               bch2_trans_node_iter_init(trans, iter,
+                                         bp.btree_id,
+                                         bp.pos,
+                                         0, 0,
+                                         iter_flags);
+               k = bch2_btree_iter_peek_slot(iter);
+               if (bkey_err(k)) {
+                       bch2_trans_iter_exit(trans, iter);
+                       return k;
+               }
 
-       if (unlikely(bch2_backpointers_no_use_write_buffer)) {
-               if (bp.level) {
-                       struct btree *b;
+               if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp))
+                       return k;
 
-                       /*
-                        * If a backpointer for a btree node wasn't found, it may be
-                        * because it was overwritten by a new btree node that hasn't
-                        * been written out yet - backpointer_get_node() checks for
-                        * this:
-                        */
-                       b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
-                       if (!IS_ERR_OR_NULL(b))
-                               return bkey_i_to_s_c(&b->key);
+               bch2_trans_iter_exit(trans, iter);
+               backpointer_not_found(trans, bp_pos, bp, k);
+               return bkey_s_c_null;
+       } else {
+               struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp);
 
+               if (IS_ERR_OR_NULL(b)) {
                        bch2_trans_iter_exit(trans, iter);
-
-                       if (IS_ERR(b))
-                               return bkey_s_c_err(PTR_ERR(b));
-                       return bkey_s_c_null;
+                       return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null;
                }
-
-               backpointer_not_found(trans, bp_pos, bp, k, "extent");
+               return bkey_i_to_s_c(&b->key);
        }
-
-       return bkey_s_c_null;
 }
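
After this restructuring, bch2_backpointer_get_key() has three outcomes: an error, bkey_s_c_null when the target no longer exists (a stale backpointer the write buffer had yet to delete), or a key with a live iterator the caller must exit. A sketch of consuming it, assuming bp_pos and bp come from the surrounding fsck context:

struct btree_iter iter;
struct bkey_s_c k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
int ret = bkey_err(k);

if (!ret && k.k) {
	/* ... use the extent (or btree node key) ... */
	bch2_trans_iter_exit(trans, &iter);
}
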
 
 struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
@@ -326,19 +313,20 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans,
                                  bp.level - 1,
                                  0);
        b = bch2_btree_iter_peek_node(iter);
-       if (IS_ERR(b))
+       if (IS_ERR_OR_NULL(b))
                goto err;
 
-       if (b && extent_matches_bp(c, bp.btree_id, bp.level,
-                                  bkey_i_to_s_c(&b->key),
-                                  bucket, bp))
+       BUG_ON(b->c.level != bp.level - 1);
+
+       if (extent_matches_bp(c, bp.btree_id, bp.level,
+                             bkey_i_to_s_c(&b->key),
+                             bucket, bp))
                return b;
 
-       if (b && btree_node_will_make_reachable(b)) {
+       if (btree_node_will_make_reachable(b)) {
                b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node);
        } else {
-               backpointer_not_found(trans, bp_pos, bp,
-                                     bkey_i_to_s_c(&b->key), "btree node");
+               backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key));
                b = NULL;
        }
 err:
@@ -351,20 +339,18 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
 {
        struct bch_fs *c = trans->c;
        struct btree_iter alloc_iter = { NULL };
-       struct bch_dev *ca;
        struct bkey_s_c alloc_k;
        struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c,
-                       "backpointer for mising device:\n%s",
+                       backpointer_to_missing_device,
+                       "backpointer for missing device:\n%s",
                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                ret = bch2_btree_delete_at(trans, bp_iter, 0);
                goto out;
        }
 
-       ca = bch_dev_bkey_exists(c, k.k->p.inode);
-
        alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc,
                                     bp_pos_to_bucket(c, k.k->p), 0);
        ret = bkey_err(alloc_k);
@@ -372,6 +358,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
                goto out;
 
        if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c,
+                       backpointer_to_missing_alloc,
                        "backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
                        alloc_iter.pos.inode, alloc_iter.pos.offset,
                        (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
@@ -393,10 +380,10 @@ int bch2_check_btree_backpointers(struct bch_fs *c)
        int ret;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                        BTREE_ID_backpointers, POS_MIN, 0, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-                 bch2_check_btree_backpointer(&trans, &iter, k)));
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+                 bch2_check_btree_backpointer(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -456,39 +443,32 @@ fsck_err:
        return ret;
 missing:
        prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
-              bch2_btree_ids[bp.btree_id], bp.level);
+              bch2_btree_id_str(bp.btree_id), bp.level);
        bch2_bkey_val_to_text(&buf, c, orig_k);
        prt_printf(&buf, "\nbp pos ");
        bch2_bpos_to_text(&buf, bp_iter.pos);
 
        if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
            c->opts.reconstruct_alloc ||
-           fsck_err(c, "%s", buf.buf))
+           fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
                ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
 
        goto out;
 }
 
 static int check_extent_to_backpointers(struct btree_trans *trans,
-                                       struct btree_iter *iter,
+                                       enum btree_id btree, unsigned level,
                                        struct bpos bucket_start,
                                        struct bpos bucket_end,
-                                       struct bpos_level *last_flushed)
+                                       struct bpos_level *last_flushed,
+                                       struct bkey_s_c k)
 {
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs;
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       struct bkey_s_c k;
        int ret;
 
-       k = bch2_btree_iter_peek_all_levels(iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-       if (!k.k)
-               return 0;
-
        ptrs = bch2_bkey_ptrs_c(k);
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                struct bpos bucket_pos;
@@ -497,7 +477,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
                if (p.ptr.cached)
                        continue;
 
-               bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+               bch2_extent_ptr_to_bp(c, btree, level,
                                      k, p, &bucket_pos, &bp);
 
                ret = check_bp_exists(trans, bucket_pos, bp, k,
@@ -514,44 +494,33 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans,
                                            enum btree_id btree_id,
                                            struct bpos bucket_start,
                                            struct bpos bucket_end,
-                                           struct bpos_level *last_flushed)
+                                           struct bpos_level *last_flushed,
+                                           int *level)
 {
        struct bch_fs *c = trans->c;
-       struct btree_root *r = bch2_btree_id_root(c, btree_id);
        struct btree_iter iter;
        struct btree *b;
        struct bkey_s_c k;
-       struct bkey_ptrs_c ptrs;
-       struct extent_ptr_decoded p;
-       const union bch_extent_entry *entry;
        int ret;
-
-       bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0);
+retry:
+       bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
+                                 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
        b = bch2_btree_iter_peek_node(&iter);
        ret = PTR_ERR_OR_ZERO(b);
        if (ret)
                goto err;
 
-       BUG_ON(b != btree_node_root(c, b));
-
-       k = bkey_i_to_s_c(&b->key);
-       ptrs = bch2_bkey_ptrs_c(k);
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-               struct bpos bucket_pos;
-               struct bch_backpointer bp;
-
-               if (p.ptr.cached)
-                       continue;
+       if (b != btree_node_root(c, b)) {
+               bch2_trans_iter_exit(trans, &iter);
+               goto retry;
+       }
 
-               bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
-                                     k, p, &bucket_pos, &bp);
+       *level = b->c.level;
 
-               ret = check_bp_exists(trans, bucket_pos, bp, k,
+       k = bkey_i_to_s_c(&b->key);
+       ret = check_extent_to_backpointers(trans, btree_id, b->c.level + 1,
                                      bucket_start, bucket_end,
-                                     last_flushed);
-               if (ret)
-                       goto err;
-       }
+                                     last_flushed, k);
 err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
@@ -629,43 +598,49 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        enum btree_id btree_id;
-       struct bpos_level last_flushed = { UINT_MAX };
+       struct bkey_s_c k;
+       struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
        int ret = 0;
 
        for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
-               unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
-
-               bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
-                                         depth,
-                                         BTREE_ITER_ALL_LEVELS|
-                                         BTREE_ITER_PREFETCH);
-
-               do {
-                       ret = commit_do(trans, NULL, NULL,
-                                       BTREE_INSERT_LAZY_RW|
-                                       BTREE_INSERT_NOFAIL,
-                                       check_extent_to_backpointers(trans, &iter,
-                                                               bucket_start, bucket_end,
-                                                               &last_flushed));
-                       if (ret)
-                               break;
-               } while (!bch2_btree_iter_advance(&iter));
-
-               bch2_trans_iter_exit(trans, &iter);
-
-               if (ret)
-                       break;
+               int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
 
                ret = commit_do(trans, NULL, NULL,
-                               BTREE_INSERT_LAZY_RW|
-                               BTREE_INSERT_NOFAIL,
+                               BCH_TRANS_COMMIT_lazy_rw|
+                               BCH_TRANS_COMMIT_no_enospc,
                                check_btree_root_to_backpointers(trans, btree_id,
                                                        bucket_start, bucket_end,
-                                                       &last_flushed));
+                                                       &last_flushed, &level));
                if (ret)
-                       break;
+                       return ret;
+
+               while (level >= depth) {
+                       bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+                                                 level,
+                                                 BTREE_ITER_PREFETCH);
+                       for_each_btree_key_continue(trans, iter, BTREE_ITER_PREFETCH, k, ret) {
+                               ret = commit_do(trans, NULL, NULL,
+                                               BCH_TRANS_COMMIT_lazy_rw|
+                                               BCH_TRANS_COMMIT_no_enospc,
+                                       check_extent_to_backpointers(trans, btree_id, level,
+                                                                    bucket_start, bucket_end,
+                                                                    &last_flushed, k));
+                               if (ret)
+                                       break;
+
+                               if (bpos_eq(iter.pos, SPOS_MAX))
+                                       break;
+                       }
+                       bch2_trans_iter_exit(trans, &iter);
+
+                       if (ret)
+                               return ret;
+
+                       --level;
+               }
        }
-       return ret;
+
+       return 0;
 }
 
 static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
@@ -706,7 +681,7 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
 
                --btree_nodes;
                if (!btree_nodes) {
-                       *end = alloc_k.k->p;
+                       *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
                        break;
                }
 
@@ -726,13 +701,12 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
 
 int bch2_check_extents_to_backpointers(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bpos start = POS_MIN, end;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
        while (1) {
-               ret = bch2_get_alloc_in_memory_pos(&trans, start, &end);
+               ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
                if (ret)
                        break;
 
@@ -752,13 +726,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
                        printbuf_exit(&buf);
                }
 
-               ret = bch2_check_extents_to_backpointers_pass(&trans, start, end);
+               ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
                if (ret || bpos_eq(end, SPOS_MAX))
                        break;
 
                start = bpos_successor(end);
        }
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
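
The hunks above also switch callers from an on-stack struct btree_trans to a heap-allocated handle. A minimal sketch of the new lifecycle, using only names that appear in this patch:

	struct btree_trans *trans = bch2_trans_get(c);

	/* transactional work against trans ... */

	bch2_trans_put(trans);	/* replaces the old bch2_trans_init()/bch2_trans_exit() pair */
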
@@ -797,7 +771,9 @@ static int check_one_backpointer(struct btree_trans *trans,
        }
 
        if (fsck_err_on(!k.k, c,
-                       "backpointer for missing extent\n  %s",
+                       backpointer_to_missing_ptr,
+                       "backpointer for missing %s\n  %s",
+                       bp.v->level ? "btree node" : "extent",
                        (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) {
                ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p);
                goto out;
@@ -819,7 +795,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 
        return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
                                  POS_MIN, BTREE_ITER_PREFETCH, k,
-                                 NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+                                 NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
                check_one_backpointer(trans, start, end,
                                      bkey_s_c_to_backpointer(k),
                                      &last_flushed_pos));
@@ -827,13 +803,12 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
 
 int bch2_check_backpointers_to_extents(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
        while (1) {
-               ret = bch2_get_btree_in_memory_pos(&trans,
+               ret = bch2_get_btree_in_memory_pos(trans,
                                                   (1U << BTREE_ID_extents)|
                                                   (1U << BTREE_ID_reflink),
                                                   ~0,
@@ -859,13 +834,13 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
                        printbuf_exit(&buf);
                }
 
-               ret = bch2_check_backpointers_to_extents_pass(&trans, start, end);
+               ret = bch2_check_backpointers_to_extents_pass(trans, start, end);
                if (ret || !bbpos_cmp(end, BBPOS_MAX))
                        break;
 
                start = bbpos_successor(end);
        }
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
index 547e0617602ab21049571e171eca313ab968a65c..ab866feeaf660f497cc58ddf73a2692ab32865ac 100644
@@ -7,7 +7,16 @@
 #include "buckets.h"
 #include "super.h"
 
-int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k,
+static inline u64 swab40(u64 x)
+{
+       return (((x & 0x00000000ffULL) << 32)|
+               ((x & 0x000000ff00ULL) << 16)|
+               ((x & 0x0000ff0000ULL) >>  0)|
+               ((x & 0x00ff000000ULL) >> 16)|
+               ((x & 0xff00000000ULL) >> 32));
+}
+
+int bch2_backpointer_invalid(struct bch_fs *, struct bkey_s_c k,
                             enum bkey_invalid_flags, struct printbuf *);
 void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *);
 void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
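
swab40() byte-reverses the 40-bit bucket field carried in backpointer keys, presumably for bkey byte-swapping on big-endian hosts. A quick standalone sanity check — hypothetical test code, assuming swab40() from above is in scope with u64 spelled uint64_t:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		/* bytes 11 22 33 44 55 reversed are 55 44 33 22 11 */
		assert(swab40(0x1122334455ULL) == 0x5544332211ULL);
		/* byte reversal is an involution */
		assert(swab40(swab40(0xdeadbeef00ULL)) == 0xdeadbeef00ULL);
		return 0;
	}
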
index 1fbed1f8378d1aeeca4d781f9fa3c64583e2a833..be2edced52133e6592092d5d8e20c643cd8b372a 100644
@@ -2,20 +2,9 @@
 #ifndef _BCACHEFS_BBPOS_H
 #define _BCACHEFS_BBPOS_H
 
+#include "bbpos_types.h"
 #include "bkey_methods.h"
-
-struct bbpos {
-       enum btree_id           btree;
-       struct bpos             pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
-       return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN      BBPOS(0, POS_MIN)
-#define BBPOS_MAX      BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#include "btree_cache.h"
 
 static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
 {
@@ -40,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos)
 
 static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos)
 {
-       prt_str(out, bch2_btree_ids[pos.btree]);
+       prt_str(out, bch2_btree_id_str(pos.btree));
        prt_char(out, ':');
        bch2_bpos_to_text(out, pos.pos);
 }
index e1f1e8e871a81259104a0e16c43fdba739995aa2..3117ab4426a74ae8262f79ccaffc325d28ba4254 100644
 #include "nocow_locking_types.h"
 #include "opts.h"
 #include "recovery_types.h"
+#include "sb-errors_types.h"
 #include "seqmutex.h"
 #include "util.h"
 
@@ -293,9 +294,17 @@ do {                                                                       \
        printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
 
 #define bch_err_fn(_c, _ret)                                           \
-        bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret))
-#define bch_err_msg(_c, _ret, _msg)                                    \
-        bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret))
+do {                                                                   \
+       if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+               bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
+#define bch_err_msg(_c, _ret, _msg, ...)                               \
+do {                                                                   \
+       if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+               bch_err(_c, "%s(): error " _msg " %s", __func__,        \
+                       ##__VA_ARGS__, bch2_err_str(_ret));             \
+} while (0)
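
Both macros now swallow transaction restarts — expected control flow in bcachefs, not real failures — and bch_err_msg() grows format arguments. An illustrative (hypothetical) call site:

	ret = bch2_trans_commit(trans, NULL, NULL, 0);
	/* prints only when ret is set and is not a transaction restart: */
	bch_err_msg(c, ret, "committing to btree %s", bch2_btree_id_str(btree));
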
 
 #define bch_verbose(c, fmt, ...)                                       \
 do {                                                                   \
@@ -371,7 +380,7 @@ BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
 #ifndef CONFIG_BCACHEFS_DEBUG
-#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
+#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
 BCH_DEBUG_PARAMS_DEBUG()
 #undef BCH_DEBUG_PARAM
 #endif
@@ -392,7 +401,9 @@ BCH_DEBUG_PARAMS_DEBUG()
        x(journal_flush_write)                  \
        x(journal_noflush_write)                \
        x(journal_flush_seq)                    \
-       x(blocked_journal)                      \
+       x(blocked_journal_low_on_space)         \
+       x(blocked_journal_low_on_pin)           \
+       x(blocked_journal_max_in_flight)        \
        x(blocked_allocate)                     \
        x(blocked_allocate_open_bucket)         \
        x(nocow_lock_contended)
@@ -410,6 +421,7 @@ enum bch_time_stats {
 #include "buckets_types.h"
 #include "buckets_waiting_for_journal_types.h"
 #include "clock_types.h"
+#include "disk_groups_types.h"
 #include "ec_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
@@ -454,6 +466,8 @@ enum gc_phase {
        GC_PHASE_BTREE_bucket_gens,
        GC_PHASE_BTREE_snapshot_trees,
        GC_PHASE_BTREE_deleted_inodes,
+       GC_PHASE_BTREE_logged_ops,
+       GC_PHASE_BTREE_rebalance_work,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -491,6 +505,8 @@ struct bch_dev {
         * Committed by bch2_write_super() -> bch_fs_mi_update()
         */
        struct bch_member_cpu   mi;
+       atomic64_t              errors[BCH_MEMBER_ERROR_NR];
+
        __uuid_t                uuid;
        char                    name[BDEVNAME_SIZE];
 
@@ -569,7 +585,7 @@ enum {
        BCH_FS_INITIAL_GC_UNFIXED,      /* kill when we enumerate fsck errors */
        BCH_FS_NEED_ANOTHER_GC,
 
-       BCH_FS_HAVE_DELETED_SNAPSHOTS,
+       BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
 
        /* errors: */
        BCH_FS_ERROR,
@@ -603,7 +619,7 @@ struct journal_seq_blacklist_table {
                u64             start;
                u64             end;
                bool            dirty;
-       }                       entries[0];
+       }                       entries[];
 };
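
The recurring [0] -> [] conversions in this patch replace GNU zero-length arrays with C99 flexible array members, which modern compilers can bounds-check. A hedged allocation sketch — assuming the table keeps its element count in an nr field — using the kernel's struct_size() helper:

	struct journal_seq_blacklist_table *t =
		kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
	if (!t)
		return -ENOMEM;
	t->nr = nr;	/* sizeof(*t) covers the header only; struct_size() adds nr entries */
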
 
 struct journal_keys {
@@ -626,8 +642,8 @@ struct journal_keys {
        size_t                  size;
 };
 
-struct btree_path_buf {
-       struct btree_path       *path;
+struct btree_trans_buf {
+       struct btree_trans      *trans;
 };
 
 #define REPLICAS_DELTA_LIST_MAX        (1U << 16)
@@ -737,6 +753,7 @@ struct bch_fs {
        struct snapshot_table __rcu *snapshots;
        size_t                  snapshot_table_size;
        struct mutex            snapshot_table_lock;
+       struct rw_semaphore     snapshot_create_lock;
 
        struct work_struct      snapshot_delete_work;
        struct work_struct      snapshot_wait_for_pagecache_and_delete_work;
@@ -786,9 +803,9 @@ struct bch_fs {
        /* btree_iter.c: */
        struct seqmutex         btree_trans_lock;
        struct list_head        btree_trans_list;
-       mempool_t               btree_paths_pool;
+       mempool_t               btree_trans_pool;
        mempool_t               btree_trans_mem_pool;
-       struct btree_path_buf  __percpu *btree_paths_bufs;
+       struct btree_trans_buf  __percpu        *btree_trans_bufs;
 
        struct srcu_struct      btree_trans_barrier;
        bool                    btree_trans_barrier_initialized;
@@ -928,9 +945,6 @@ struct bch_fs {
        struct list_head        moving_context_list;
        struct mutex            moving_context_lock;
 
-       struct list_head        data_progress_list;
-       struct mutex            data_progress_lock;
-
        /* REBALANCE */
        struct bch_fs_rebalance rebalance;
 
@@ -981,11 +995,6 @@ struct bch_fs {
        struct bio_set          dio_read_bioset;
        struct bio_set          nocow_flush_bioset;
 
-       /* ERRORS */
-       struct list_head        fsck_errors;
-       struct mutex            fsck_error_lock;
-       bool                    fsck_alloc_err;
-
        /* QUOTAS */
        struct bch_memquota_type quotas[QTYP_NR];
 
@@ -995,6 +1004,7 @@ struct bch_fs {
        enum bch_recovery_pass  curr_recovery_pass;
        /* bitmap of explicitly enabled recovery passes: */
        u64                     recovery_passes_explicit;
+       u64                     recovery_passes_complete;
 
        /* DEBUG JUNK */
        struct dentry           *fs_debug_dir;
@@ -1033,6 +1043,14 @@ struct bch_fs {
        struct bch2_time_stats  times[BCH_TIME_STAT_NR];
 
        struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
+
+       /* ERRORS */
+       struct list_head        fsck_error_msgs;
+       struct mutex            fsck_error_msgs_lock;
+       bool                    fsck_alloc_msgs_err;
+
+       bch_sb_errors_cpu       fsck_error_counts;
+       struct mutex            fsck_error_counts_lock;
 };
 
 extern struct wait_queue_head bch2_read_only_wait;
@@ -1139,22 +1157,6 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
        return dev < c->sb.nr_devices && c->devs[dev];
 }
 
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
-                                                 enum bch_recovery_pass pass)
-{
-       c->recovery_passes_explicit |= BIT_ULL(pass);
-
-       if (c->curr_recovery_pass >= pass) {
-               c->curr_recovery_pass = pass;
-               return -BCH_ERR_restart_recovery;
-       } else {
-               return 0;
-       }
-}
-
 #define BKEY_PADDED_ONSTACK(key, pad)                          \
        struct { struct bkey_i key; __u64 key ## _pad[pad]; }
 
index 5ec218ee356947140c4e91f32a38c65c6609db54..0a750953ff921b9d62d9fd1918da27d375c2c6dc 100644
@@ -83,8 +83,8 @@ typedef uuid_t __uuid_t;
 #endif
 
 #define BITMASK(name, type, field, offset, end)                                \
-static const unsigned  name##_OFFSET = offset;                         \
-static const unsigned  name##_BITS = (end - offset);                   \
+static const __maybe_unused unsigned   name##_OFFSET = offset;         \
+static const __maybe_unused unsigned   name##_BITS = (end - offset);   \
                                                                        \
 static inline __u64 name(const type *k)                                        \
 {                                                                      \
@@ -98,9 +98,9 @@ static inline void SET_##name(type *k, __u64 v)                               \
 }
 
 #define LE_BITMASK(_bits, name, type, field, offset, end)              \
-static const unsigned  name##_OFFSET = offset;                         \
-static const unsigned  name##_BITS = (end - offset);                   \
-static const __u##_bits        name##_MAX = (1ULL << (end - offset)) - 1;      \
+static const __maybe_unused unsigned   name##_OFFSET = offset;         \
+static const __maybe_unused unsigned   name##_BITS = (end - offset);   \
+static const __maybe_unused __u##_bits name##_MAX = (1ULL << (end - offset)) - 1;\
                                                                        \
 static inline __u64 name(const type *k)                                        \
 {                                                                      \
@@ -370,7 +370,9 @@ static inline void bkey_init(struct bkey *k)
        x(backpointer,          28)                     \
        x(inode_v3,             29)                     \
        x(bucket_gens,          30)                     \
-       x(snapshot_tree,        31)
+       x(snapshot_tree,        31)                     \
+       x(logged_op_truncate,   32)                     \
+       x(logged_op_finsert,    33)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -611,31 +613,17 @@ struct bch_extent_stripe_ptr {
 #endif
 };
 
-struct bch_extent_reservation {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:6,
-                               unused:22,
-                               replicas:4,
-                               generation:32;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   generation:32,
-                               replicas:4,
-                               unused:22,
-                               type:6;
-#endif
-};
-
 struct bch_extent_rebalance {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:7,
-                               unused:33,
-                               compression:8,
+       __u64                   type:6,
+                               unused:34,
+                               compression:8, /* enum bch_compression_opt */
                                target:16;
 #elif defined (__BIG_ENDIAN_BITFIELD)
        __u64                   target:16,
                                compression:8,
-                               unused:33,
-                               type:7;
+                               unused:34,
+                               type:6;
 #endif
 };
 
@@ -723,7 +711,7 @@ struct bch_inode {
        __le64                  bi_hash_seed;
        __le32                  bi_flags;
        __le16                  bi_mode;
-       __u8                    fields[0];
+       __u8                    fields[];
 } __packed __aligned(8);
 
 struct bch_inode_v2 {
@@ -733,7 +721,7 @@ struct bch_inode_v2 {
        __le64                  bi_hash_seed;
        __le64                  bi_flags;
        __le16                  bi_mode;
-       __u8                    fields[0];
+       __u8                    fields[];
 } __packed __aligned(8);
 
 struct bch_inode_v3 {
@@ -745,7 +733,7 @@ struct bch_inode_v3 {
        __le64                  bi_sectors;
        __le64                  bi_size;
        __le64                  bi_version;
-       __u8                    fields[0];
+       __u8                    fields[];
 } __packed __aligned(8);
 
 #define INODEv3_FIELDS_START_INITIAL   6
@@ -836,34 +824,30 @@ enum inode_opt_id {
        Inode_opt_nr,
 };
 
-enum {
-       /*
-        * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
-        * flags)
-        */
-       __BCH_INODE_SYNC                = 0,
-       __BCH_INODE_IMMUTABLE           = 1,
-       __BCH_INODE_APPEND              = 2,
-       __BCH_INODE_NODUMP              = 3,
-       __BCH_INODE_NOATIME             = 4,
-
-       __BCH_INODE_I_SIZE_DIRTY        = 5,
-       __BCH_INODE_I_SECTORS_DIRTY     = 6,
-       __BCH_INODE_UNLINKED            = 7,
-       __BCH_INODE_BACKPTR_UNTRUSTED   = 8,
-
-       /* bits 20+ reserved for packed fields below: */
-};
-
-#define BCH_INODE_SYNC         (1 << __BCH_INODE_SYNC)
-#define BCH_INODE_IMMUTABLE    (1 << __BCH_INODE_IMMUTABLE)
-#define BCH_INODE_APPEND       (1 << __BCH_INODE_APPEND)
-#define BCH_INODE_NODUMP       (1 << __BCH_INODE_NODUMP)
-#define BCH_INODE_NOATIME      (1 << __BCH_INODE_NOATIME)
-#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
-#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
-#define BCH_INODE_UNLINKED     (1 << __BCH_INODE_UNLINKED)
-#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
+#define BCH_INODE_FLAGS()                      \
+       x(sync,                         0)      \
+       x(immutable,                    1)      \
+       x(append,                       2)      \
+       x(nodump,                       3)      \
+       x(noatime,                      4)      \
+       x(i_size_dirty,                 5)      \
+       x(i_sectors_dirty,              6)      \
+       x(unlinked,                     7)      \
+       x(backptr_untrusted,            8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n)        BCH_INODE_##t = 1U << n,
+       BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n)        __BCH_INODE_##t = n,
+       BCH_INODE_FLAGS()
+#undef x
+};
 
 LE32_BITMASK(INODE_STR_HASH,   struct bch_inode, bi_flags, 20, 24);
 LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 31);
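
With the flag list expressed once as an x-macro and expanded twice, the mask-valued and bit-number enums can no longer drift apart. Expanded, the first entries read roughly:

	enum bch_inode_flags {
		BCH_INODE_sync		= 1U << 0,
		BCH_INODE_immutable	= 1U << 1,
		/* ... */
	};

	enum __bch_inode_flags {
		__BCH_INODE_sync	= 0,
		__BCH_INODE_immutable	= 1,
		/* ... */
	};
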
@@ -916,9 +900,7 @@ struct bch_dirent {
 #define DT_SUBVOL      16
 #define BCH_DT_MAX     17
 
-#define BCH_NAME_MAX   ((unsigned) (U8_MAX * sizeof(__u64) -           \
-                        sizeof(struct bkey) -                          \
-                        offsetof(struct bch_dirent, d_name)))
+#define BCH_NAME_MAX   512
 
 /* Xattrs */
 
@@ -1099,20 +1081,20 @@ struct bch_reflink_v {
        struct bch_val          v;
        __le64                  refcount;
        union bch_extent_entry  start[0];
-       __u64                   _data[0];
+       __u64                   _data[];
 } __packed __aligned(8);
 
 struct bch_indirect_inline_data {
        struct bch_val          v;
        __le64                  refcount;
-       u8                      data[0];
+       u8                      data[];
 };
 
 /* Inline data */
 
 struct bch_inline_data {
        struct bch_val          v;
-       u8                      data[0];
+       u8                      data[];
 };
 
 /* Subvolumes: */
@@ -1126,6 +1108,11 @@ struct bch_subvolume {
        __le32                  flags;
        __le32                  snapshot;
        __le64                  inode;
+       /*
+        * Snapshot subvolumes form a tree, separate from the snapshot nodes
+        * tree - if this subvolume is a snapshot, this is the ID of the
+        * subvolume it was created from:
+        */
        __le32                  parent;
        __le32                  pad;
        bch_le128               otime;
@@ -1147,6 +1134,7 @@ struct bch_snapshot {
        __le32                  parent;
        __le32                  children[2];
        __le32                  subvol;
+       /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
        __le32                  tree;
        __le32                  depth;
        __le32                  skip[3];
@@ -1179,6 +1167,33 @@ struct bch_lru {
 
 #define LRU_ID_STRIPES         (1U << 16)
 
+/* Logged operations btree: */
+
+struct bch_logged_op_truncate {
+       struct bch_val          v;
+       __le32                  subvol;
+       __le32                  pad;
+       __le64                  inum;
+       __le64                  new_i_size;
+};
+
+enum logged_op_finsert_state {
+       LOGGED_OP_FINSERT_start,
+       LOGGED_OP_FINSERT_shift_extents,
+       LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+       struct bch_val          v;
+       __u8                    state;
+       __u8                    pad[3];
+       __le32                  subvol;
+       __le64                  inum;
+       __le64                  dst_offset;
+       __le64                  src_offset;
+       __le64                  pos;
+};
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1189,7 +1204,7 @@ struct bch_sb_field {
 
 #define BCH_SB_FIELDS()                                \
        x(journal,      0)                      \
-       x(members,      1)                      \
+       x(members_v1,   1)                      \
        x(crypt,        2)                      \
        x(replicas_v0,  3)                      \
        x(quota,        4)                      \
@@ -1198,7 +1213,9 @@ struct bch_sb_field {
        x(replicas,     7)                      \
        x(journal_seq_blacklist, 8)             \
        x(journal_v2,   9)                      \
-       x(counters,     10)
+       x(counters,     10)                     \
+       x(members_v2,   11)                     \
+       x(errors,       12)
 
 enum bch_sb_field_type {
 #define x(f, nr)       BCH_SB_FIELD_##f = nr,
@@ -1219,7 +1236,7 @@ enum bch_sb_field_type {
 
 struct bch_sb_field_journal {
        struct bch_sb_field     field;
-       __le64                  buckets[0];
+       __le64                  buckets[];
 };
 
 struct bch_sb_field_journal_v2 {
@@ -1228,13 +1245,38 @@ struct bch_sb_field_journal_v2 {
        struct bch_sb_field_journal_v2_entry {
                __le64          start;
                __le64          nr;
-       }                       d[0];
+       }                       d[];
 };
 
-/* BCH_SB_FIELD_members: */
+/* BCH_SB_FIELD_members_v1: */
 
 #define BCH_MIN_NR_NBUCKETS    (1 << 6)
 
+#define BCH_IOPS_MEASUREMENTS()                        \
+       x(seqread,      0)                      \
+       x(seqwrite,     1)                      \
+       x(randread,     2)                      \
+       x(randwrite,    3)
+
+enum bch_iops_measurement {
+#define x(t, n) BCH_IOPS_##t = n,
+       BCH_IOPS_MEASUREMENTS()
+#undef x
+       BCH_IOPS_NR
+};
+
+#define BCH_MEMBER_ERROR_TYPES()               \
+       x(read,         0)                      \
+       x(write,        1)                      \
+       x(checksum,     2)
+
+enum bch_member_error_type {
+#define x(t, n) BCH_MEMBER_ERROR_##t = n,
+       BCH_MEMBER_ERROR_TYPES()
+#undef x
+       BCH_MEMBER_ERROR_NR
+};
+
 struct bch_member {
        __uuid_t                uuid;
        __le64                  nbuckets;       /* device size */
@@ -1243,17 +1285,23 @@ struct bch_member {
        __le32                  pad;
        __le64                  last_mount;     /* time_t */
 
-       __le64                  flags[2];
+       __le64                  flags;
+       __le32                  iops[4];
+       __le64                  errors[BCH_MEMBER_ERROR_NR];
+       __le64                  errors_at_reset[BCH_MEMBER_ERROR_NR];
+       __le64                  errors_reset_time;
 };
 
-LE64_BITMASK(BCH_MEMBER_STATE,         struct bch_member, flags[0],  0,  4)
+#define BCH_MEMBER_V1_BYTES    56
+
+LE64_BITMASK(BCH_MEMBER_STATE,         struct bch_member, flags,  0,  4)
 /* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
-LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags[0], 14, 15)
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags[0], 15, 20)
-LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags[0], 20, 28)
-LE64_BITMASK(BCH_MEMBER_DURABILITY,    struct bch_member, flags[0], 28, 30)
+LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags, 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags, 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags, 20, 28)
+LE64_BITMASK(BCH_MEMBER_DURABILITY,    struct bch_member, flags, 28, 30)
 LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
-                                       struct bch_member, flags[0], 30, 31)
+                                       struct bch_member, flags, 30, 31)
 
 #if 0
 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
@@ -1273,9 +1321,16 @@ enum bch_member_state {
        BCH_MEMBER_STATE_NR
 };
 
-struct bch_sb_field_members {
+struct bch_sb_field_members_v1 {
+       struct bch_sb_field     field;
+       struct bch_member       _members[]; //Members are now variable size
+};
+
+struct bch_sb_field_members_v2 {
        struct bch_sb_field     field;
-       struct bch_member       members[0];
+       __le16                  member_bytes; //size of single member entry
+       u8                      pad[6];
+       struct bch_member       _members[];
 };
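
Because members_v2 records member_bytes, struct bch_member can grow in later format revisions without yet another field type; entries must then be addressed by byte offset rather than array index. A hedged accessor sketch (hypothetical helper name):

	static inline struct bch_member *
	members_v2_entry(struct bch_sb_field_members_v2 *mi, unsigned i)
	{
		return (void *) mi->_members + (size_t) i * le16_to_cpu(mi->member_bytes);
	}
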
 
 /* BCH_SB_FIELD_crypt: */
@@ -1373,19 +1428,19 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
 struct bch_replicas_entry_v0 {
        __u8                    data_type;
        __u8                    nr_devs;
-       __u8                    devs[0];
+       __u8                    devs[];
 } __packed;
 
 struct bch_sb_field_replicas_v0 {
        struct bch_sb_field     field;
-       struct bch_replicas_entry_v0 entries[0];
+       struct bch_replicas_entry_v0 entries[];
 } __packed __aligned(8);
 
 struct bch_replicas_entry {
        __u8                    data_type;
        __u8                    nr_devs;
        __u8                    nr_required;
-       __u8                    devs[0];
+       __u8                    devs[];
 } __packed;
 
 #define replicas_entry_bytes(_i)                                       \
@@ -1393,7 +1448,7 @@ struct bch_replicas_entry {
 
 struct bch_sb_field_replicas {
        struct bch_sb_field     field;
-       struct bch_replicas_entry entries[0];
+       struct bch_replicas_entry entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_quota: */
@@ -1428,7 +1483,7 @@ LE64_BITMASK(BCH_GROUP_PARENT,            struct bch_disk_group, flags[0], 6, 24)
 
 struct bch_sb_field_disk_groups {
        struct bch_sb_field     field;
-       struct bch_disk_group   entries[0];
+       struct bch_disk_group   entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_counters */
@@ -1521,7 +1576,7 @@ enum bch_persistent_counters {
 
 struct bch_sb_field_counters {
        struct bch_sb_field     field;
-       __le64                  d[0];
+       __le64                  d[];
 };
 
 /*
@@ -1535,10 +1590,8 @@ struct jset_entry {
        __u8                    type; /* designates what this jset holds */
        __u8                    pad[3];
 
-       union {
-               struct bkey_i   start[0];
-               __u64           _data[0];
-       };
+       struct bkey_i           start[0];
+       __u64                   _data[];
 };
 
 struct bch_sb_field_clean {
@@ -1549,10 +1602,8 @@ struct bch_sb_field_clean {
        __le16                  _write_clock;
        __le64                  journal_seq;
 
-       union {
-               struct jset_entry start[0];
-               __u64           _data[0];
-       };
+       struct jset_entry       start[0];
+       __u64                   _data[];
 };
 
 struct journal_seq_blacklist_entry {
@@ -1562,13 +1613,20 @@ struct journal_seq_blacklist_entry {
 
 struct bch_sb_field_journal_seq_blacklist {
        struct bch_sb_field     field;
+       struct journal_seq_blacklist_entry start[];
+};
 
-       union {
-               struct journal_seq_blacklist_entry start[0];
-               __u64           _data[0];
-       };
+struct bch_sb_field_errors {
+       struct bch_sb_field     field;
+       struct bch_sb_field_error_entry {
+               __le64          v;
+               __le64          last_error_time;
+       }                       entries[];
 };
 
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID,    struct bch_sb_field_error_entry, v,  0, 16);
+LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR,    struct bch_sb_field_error_entry, v, 16, 64);
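
Per the bitmasks above, each persistent error entry packs its error ID into the low 16 bits of v and the occurrence count into the upper 48. Reading one back — hypothetical helper, sketch only:

	static void error_entry_to_text(struct printbuf *out,
					struct bch_sb_field_error_entry *e)
	{
		prt_printf(out, "error %llu: %llu occurrences, last at %llu",
			   BCH_SB_ERROR_ENTRY_ID(e),
			   BCH_SB_ERROR_ENTRY_NR(e),
			   le64_to_cpu(e->last_error_time));
	}
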
+
 /* Superblock: */
 
 /*
@@ -1631,7 +1689,9 @@ struct bch_sb_field_journal_seq_blacklist {
        x(snapshot_skiplists,           BCH_VERSION(1,  1),             \
          BIT_ULL(BCH_RECOVERY_PASS_check_snapshots))                   \
        x(deleted_inodes,               BCH_VERSION(1,  2),             \
-         BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
+         BIT_ULL(BCH_RECOVERY_PASS_check_inodes))                      \
+       x(rebalance_work,               BCH_VERSION(1,  3),             \
+         BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
 
 enum bcachefs_metadata_version {
        bcachefs_metadata_version_min = 9,
@@ -1641,7 +1701,8 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_max
 };
 
-static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+static const __maybe_unused
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
 
@@ -1702,10 +1763,8 @@ struct bch_sb {
 
        struct bch_sb_layout    layout;
 
-       union {
-               struct bch_sb_field start[0];
-               __le64          _data[0];
-       };
+       struct bch_sb_field     start[0];
+       __le64                  _data[];
 } __packed __aligned(8);
 
 /*
@@ -1950,7 +2009,7 @@ enum bch_csum_type {
        BCH_CSUM_NR
 };
 
-static const unsigned bch_crc_bytes[] = {
+static const __maybe_unused unsigned bch_crc_bytes[] = {
        [BCH_CSUM_none]                         = 0,
        [BCH_CSUM_crc32c_nonzero]               = 4,
        [BCH_CSUM_crc32c]                       = 4,
@@ -2182,10 +2241,8 @@ struct jset {
        __le64                  last_seq;
 
 
-       union {
-               struct jset_entry start[0];
-               __u64           _data[0];
-       };
+       struct jset_entry       start[0];
+       __u64                   _data[];
 } __packed __aligned(8);
 
 LE32_BITMASK(JSET_CSUM_TYPE,   struct jset, flags, 0, 4);
@@ -2199,7 +2256,8 @@ LE32_BITMASK(JSET_NO_FLUSH,       struct jset, flags, 5, 6);
 enum btree_id_flags {
        BTREE_ID_EXTENTS        = BIT(0),
        BTREE_ID_SNAPSHOTS      = BIT(1),
-       BTREE_ID_DATA           = BIT(2),
+       BTREE_ID_SNAPSHOT_FIELD = BIT(2),
+       BTREE_ID_DATA           = BIT(3),
 };
 
 #define BCH_BTREE_IDS()                                                                \
@@ -2254,8 +2312,13 @@ enum btree_id_flags {
          BIT_ULL(KEY_TYPE_bucket_gens))                                        \
        x(snapshot_trees,       15,     0,                                      \
          BIT_ULL(KEY_TYPE_snapshot_tree))                                      \
-       x(deleted_inodes,       16,     BTREE_ID_SNAPSHOTS,                     \
-         BIT_ULL(KEY_TYPE_set))
+       x(deleted_inodes,       16,     BTREE_ID_SNAPSHOT_FIELD,                \
+         BIT_ULL(KEY_TYPE_set))                                                \
+       x(logged_ops,           17,     0,                                      \
+         BIT_ULL(KEY_TYPE_logged_op_truncate)|                                 \
+         BIT_ULL(KEY_TYPE_logged_op_finsert))                                  \
+       x(rebalance_work,       18,     BTREE_ID_SNAPSHOT_FIELD,                \
+         BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
@@ -2290,10 +2353,8 @@ struct bset {
        __le16                  version;
        __le16                  u64s; /* count of d[] in u64s */
 
-       union {
-               struct bkey_packed start[0];
-               __u64           _data[0];
-       };
+       struct bkey_packed      start[0];
+       __u64                   _data[];
 } __packed __aligned(8);
 
 LE32_BITMASK(BSET_CSUM_TYPE,   struct bset, flags, 0, 4);
index ee7ba700e75f4ee3afbac3ab241c7d6d9b1278c3..abdb05507d162c7c06bb89ce96bf67f6484207a7 100644
@@ -7,14 +7,6 @@
 #include "bset.h"
 #include "util.h"
 
-#undef EBUG_ON
-
-#ifdef DEBUG_BKEYS
-#define EBUG_ON(cond)          BUG_ON(cond)
-#else
-#define EBUG_ON(cond)
-#endif
-
 const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
 
 void bch2_bkey_packed_to_binary_text(struct printbuf *out,
@@ -135,7 +127,7 @@ static void pack_state_finish(struct pack_state *state,
                              struct bkey_packed *k)
 {
        EBUG_ON(state->p <  k->_data);
-       EBUG_ON(state->p >= k->_data + state->format->key_u64s);
+       EBUG_ON(state->p >= (u64 *) k->_data + state->format->key_u64s);
 
        *state->p = state->w;
 }
@@ -184,6 +176,28 @@ static u64 get_inc_field(struct unpack_state *state, unsigned field)
        return v + offset;
 }
 
+__always_inline
+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v)
+{
+       unsigned bits = state->format->bits_per_field[field];
+
+       if (bits) {
+               if (bits > state->bits) {
+                       bits -= state->bits;
+                       /* avoid shift by 64 if bits is 64 - bits is never 0 here: */
+                       state->w |= (v >> 1) >> (bits - 1);
+
+                       *state->p = state->w;
+                       state->p = next_word(state->p);
+                       state->w = 0;
+                       state->bits = 64;
+               }
+
+               state->bits -= bits;
+               state->w |= v << state->bits;
+       }
+}
+
 __always_inline
 static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
 {
@@ -198,20 +212,7 @@ static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
        if (fls64(v) > bits)
                return false;
 
-       if (bits > state->bits) {
-               bits -= state->bits;
-               /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
-               state->w |= (v >> 1) >> (bits - 1);
-
-               *state->p = state->w;
-               state->p = next_word(state->p);
-               state->w = 0;
-               state->bits = 64;
-       }
-
-       state->bits -= bits;
-       state->w |= v << state->bits;
-
+       __set_inc_field(state, field, v);
        return true;
 }
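
The word-crossing pack logic shared by set_inc_field() and set_inc_field_lossy() now lives in __set_inc_field(). A worked trace of the interesting case, a 12-bit field with only 8 bits left in the current word:

	/*
	 * bits = 12, state->bits = 8:
	 *   bits -= state->bits          -> 4 bits will spill into the next word
	 *   state->w |= (v >> 1) >> 3    -> top 8 bits of v fill out this word;
	 *                                   the two-step shift sidesteps the
	 *                                   undefined v >> 64 when bits == 64
	 *   word flushed; state->bits = 64
	 *   state->bits -= 4
	 *   state->w |= v << 60          -> low 4 bits start the next word at
	 *                                   its most significant end
	 */
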
 
@@ -307,9 +308,14 @@ struct bpos __bkey_unpack_pos(const struct bkey_format *format,
 
 /**
  * bch2_bkey_pack_key -- pack just the key, not the value
+ * @out:       packed result
+ * @in:                key to pack
+ * @format:    format of packed result
+ *
+ * Returns: true on success, false on failure
  */
 bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
-                  const struct bkey_format *format)
+                       const struct bkey_format *format)
 {
        struct pack_state state = pack_state_init(format, out);
        u64 *w = out->_data;
@@ -335,9 +341,12 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
 
 /**
  * bch2_bkey_unpack -- unpack the key and the value
+ * @b:         btree node of @src key (for packed format)
+ * @dst:       unpacked result
+ * @src:       packed input
  */
 void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
-                const struct bkey_packed *src)
+                     const struct bkey_packed *src)
 {
        __bkey_unpack_key(b, &dst->k, src);
 
@@ -348,19 +357,24 @@ void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
 
 /**
  * bch2_bkey_pack -- pack the key and the value
+ * @dst:       packed result
+ * @src:       unpacked input
+ * @format:    format of packed result
+ *
+ * Returns: true on success, false on failure
  */
-bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
-              const struct bkey_format *format)
+bool bch2_bkey_pack(struct bkey_packed *dst, const struct bkey_i *src,
+                   const struct bkey_format *format)
 {
        struct bkey_packed tmp;
 
-       if (!bch2_bkey_pack_key(&tmp, &in->k, format))
+       if (!bch2_bkey_pack_key(&tmp, &src->k, format))
                return false;
 
-       memmove_u64s((u64 *) out + format->key_u64s,
-                    &in->v,
-                    bkey_val_u64s(&in->k));
-       memcpy_u64s_small(out, &tmp, format->key_u64s);
+       memmove_u64s((u64 *) dst + format->key_u64s,
+                    &src->v,
+                    bkey_val_u64s(&src->k));
+       memcpy_u64s_small(dst, &tmp, format->key_u64s);
 
        return true;
 }
@@ -380,19 +394,7 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
                ret = false;
        }
 
-       if (bits > state->bits) {
-               bits -= state->bits;
-               state->w |= (v >> 1) >> (bits - 1);
-
-               *state->p = state->w;
-               state->p = next_word(state->p);
-               state->w = 0;
-               state->bits = 64;
-       }
-
-       state->bits -= bits;
-       state->w |= v << state->bits;
-
+       __set_inc_field(state, field, v);
        return ret;
 }
 
@@ -435,6 +437,24 @@ static bool bkey_packed_successor(struct bkey_packed *out,
 
        return false;
 }
+
+static bool bkey_format_has_too_big_fields(const struct bkey_format *f)
+{
+       for (unsigned i = 0; i < f->nr_fields; i++) {
+               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+               u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+               u64 packed_max = f->bits_per_field[i]
+                       ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+                       : 0;
+               u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+               if (packed_max + field_offset < packed_max ||
+                   packed_max + field_offset > unpacked_max)
+                       return true;
+       }
+
+       return false;
+}
 #endif
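
The packed_max + field_offset < packed_max test above is the usual unsigned wrap-around check, extracted here for clarity (sketch):

	static inline bool u64_add_overflows(u64 a, u64 b)
	{
		return a + b < a;	/* unsigned addition wrapped iff the sum fell below an operand */
	}
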
 
 /*
@@ -515,7 +535,8 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
 
                BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
                BUG_ON(bkey_packed_successor(&successor, b, *out) &&
-                      bkey_cmp_left_packed(b, &successor, &orig) < 0);
+                      bkey_cmp_left_packed(b, &successor, &orig) < 0 &&
+                      !bkey_format_has_too_big_fields(f));
        }
 #endif
 
@@ -583,8 +604,10 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
 
        /* allow for extent merging: */
        if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
-               ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
-               bits += 4;
+               unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]);
+
+               ret.bits_per_field[BKEY_FIELD_SIZE] += b;
+               bits += b;
        }
 
        ret.key_u64s = DIV_ROUND_UP(bits, 64);
@@ -604,40 +627,74 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
                }
        }
 
-       EBUG_ON(bch2_bkey_format_validate(&ret));
+#ifdef CONFIG_BCACHEFS_DEBUG
+       {
+               struct printbuf buf = PRINTBUF;
+
+               BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf));
+               printbuf_exit(&buf);
+       }
+#endif
        return ret;
 }
 
-const char *bch2_bkey_format_validate(struct bkey_format *f)
+int bch2_bkey_format_invalid(struct bch_fs *c,
+                            struct bkey_format *f,
+                            enum bkey_invalid_flags flags,
+                            struct printbuf *err)
 {
        unsigned i, bits = KEY_PACKED_BITS_START;
 
-       if (f->nr_fields != BKEY_NR_FIELDS)
-               return "incorrect number of fields";
+       if (f->nr_fields != BKEY_NR_FIELDS) {
+               prt_printf(err, "incorrect number of fields: got %u, should be %u",
+                          f->nr_fields, BKEY_NR_FIELDS);
+               return -BCH_ERR_invalid;
+       }
 
        /*
         * Verify that the packed format can't represent fields larger than the
         * unpacked format:
         */
        for (i = 0; i < f->nr_fields; i++) {
-               unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
-               u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
-               u64 packed_max = f->bits_per_field[i]
-                       ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
-                       : 0;
-               u64 field_offset = le64_to_cpu(f->field_offset[i]);
-
-               if (packed_max + field_offset < packed_max ||
-                   packed_max + field_offset > unpacked_max)
-                       return "field too large";
+               if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) {
+                       unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
+                       u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
+                       u64 packed_max = f->bits_per_field[i]
+                               ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1))
+                               : 0;
+                       u64 field_offset = le64_to_cpu(f->field_offset[i]);
+
+                       if (packed_max + field_offset < packed_max ||
+                           packed_max + field_offset > unpacked_max) {
+                               prt_printf(err, "field %u too large: %llu + %llu > %llu",
+                                          i, packed_max, field_offset, unpacked_max);
+                               return -BCH_ERR_invalid;
+                       }
+               }
 
                bits += f->bits_per_field[i];
        }
 
-       if (f->key_u64s != DIV_ROUND_UP(bits, 64))
-               return "incorrect key_u64s";
+       if (f->key_u64s != DIV_ROUND_UP(bits, 64)) {
+               prt_printf(err, "incorrect key_u64s: got %u, should be %u",
+                          f->key_u64s, DIV_ROUND_UP(bits, 64));
+               return -BCH_ERR_invalid;
+       }
 
-       return NULL;
+       return 0;
+}
+
+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f)
+{
+       prt_printf(out, "u64s %u fields ", f->key_u64s);
+
+       for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) {
+               if (i)
+                       prt_str(out, ", ");
+               prt_printf(out, "%u:%llu",
+                          f->bits_per_field[i],
+                          le64_to_cpu(f->field_offset[i]));
+       }
 }
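
Where callers of the old bch2_bkey_format_validate() compared against a returned string, the replacement hands back an error code and appends the reason to a printbuf, as the CONFIG_BCACHEFS_DEBUG block above already shows. A hedged caller sketch:

	struct printbuf buf = PRINTBUF;

	if (bch2_bkey_format_invalid(c, &f, 0, &buf))
		bch_err(c, "invalid bkey format: %s", buf.buf);
	printbuf_exit(&buf);
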
 
 /*
index e81fb3e00c602dfca2e544ac85de7b4584c5b92d..831be01809f2c9271d4db159377decd1b8686bb6 100644
@@ -9,6 +9,12 @@
 #include "util.h"
 #include "vstructs.h"
 
+enum bkey_invalid_flags {
+       BKEY_INVALID_WRITE              = (1U << 0),
+       BKEY_INVALID_COMMIT             = (1U << 1),
+       BKEY_INVALID_JOURNAL            = (1U << 2),
+};
+
 #if 0
 
 /*
@@ -46,7 +52,7 @@ struct bkey_s {
 
 static inline struct bkey_i *bkey_next(struct bkey_i *k)
 {
-       return (struct bkey_i *) (k->_data + k->k.u64s);
+       return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
 }
 
 #define bkey_val_u64s(_k)      ((_k)->u64s - BKEY_U64s)
@@ -86,19 +92,15 @@ enum bkey_lr_packed {
 #define bkey_lr_packed(_l, _r)                                         \
        ((_l)->format + ((_r)->format << 1))
 
-#define bkey_copy(_dst, _src)                                  \
-do {                                                           \
-       BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) &&         \
-                    !type_is(_dst, struct bkey_packed *));     \
-       BUILD_BUG_ON(!type_is(_src, struct bkey_i *) &&         \
-                    !type_is(_src, struct bkey_packed *));     \
-       EBUG_ON((u64 *) (_dst) > (u64 *) (_src) &&              \
-               (u64 *) (_dst) < (u64 *) (_src) +               \
-               ((struct bkey *) (_src))->u64s);                \
-                                                               \
-       memcpy_u64s_small((_dst), (_src),                       \
-                         ((struct bkey *) (_src))->u64s);      \
-} while (0)
+static inline void bkey_p_copy(struct bkey_packed *dst, const struct bkey_packed *src)
+{
+       memcpy_u64s_small(dst, src, src->u64s);
+}
+
+static inline void bkey_copy(struct bkey_i *dst, const struct bkey_i *src)
+{
+       memcpy_u64s_small(dst, src, src->k.u64s);
+}
 
 struct btree;
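
Splitting the bkey_copy() macro into two inlines moves the packed/unpacked distinction from BUILD_BUG_ON checks into the type system; mixing pointer kinds now fails to compile. Illustrative fragments:

	bkey_copy(dst, src);		/* both struct bkey_i *      */
	bkey_p_copy(dst_p, src_p);	/* both struct bkey_packed * */
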
 
@@ -391,7 +393,7 @@ static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
 }
 
 #define bkeyp_val(_format, _k)                                         \
-        ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
+        ((struct bch_val *) ((u64 *) (_k)->_data + bkeyp_key_u64s(_format, _k)))
 
 extern const struct bkey_format bch2_bkey_format_current;
 
@@ -726,7 +728,7 @@ static inline unsigned high_word_offset(const struct bkey_format *f)
 #error edit for your odd byteorder.
 #endif
 
-#define high_word(f, k)                ((k)->_data + high_word_offset(f))
+#define high_word(f, k)                ((u64 *) (k)->_data + high_word_offset(f))
 #define next_word(p)           nth_word(p, 1)
 #define prev_word(p)           nth_word(p, -1)
 
@@ -769,6 +771,8 @@ static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const s
 
 void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
 struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
-const char *bch2_bkey_format_validate(struct bkey_format *);
+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *,
+                            enum bkey_invalid_flags, struct printbuf *);
+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *);
 
 #endif /* _BCACHEFS_BKEY_H */
index 90557f4c156db96853fded059e13c09fa2f3dd42..761f5e33b1e69e94ca0aaaa41a9825e496b5840f 100644
@@ -3,6 +3,7 @@
 #include "bcachefs.h"
 #include "backpointers.h"
 #include "bkey_methods.h"
+#include "btree_cache.h"
 #include "btree_types.h"
 #include "alloc_background.h"
 #include "dirent.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
+#include "io_misc.h"
 #include "lru.h"
 #include "quota.h"
 #include "reflink.h"
+#include "snapshot.h"
 #include "subvolume.h"
 #include "xattr.h"
 
@@ -23,8 +26,8 @@ const char * const bch2_bkey_types[] = {
        NULL
 };
 
-static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                              unsigned flags, struct printbuf *err)
+static int deleted_key_invalid(struct bch_fs *c, struct bkey_s_c k,
+                              enum bkey_invalid_flags flags, struct printbuf *err)
 {
        return 0;
 }
@@ -37,24 +40,25 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
        .key_invalid = deleted_key_invalid,             \
 })
 
-static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                unsigned flags, struct printbuf *err)
+static int empty_val_key_invalid(struct bch_fs *c, struct bkey_s_c k,
+                                enum bkey_invalid_flags flags, struct printbuf *err)
 {
-       if (bkey_val_bytes(k.k)) {
-               prt_printf(err, "incorrect value size (%zu != 0)",
-                      bkey_val_bytes(k.k));
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       int ret = 0;
+
+       bkey_fsck_err_on(bkey_val_bytes(k.k), c, err,
+                        bkey_val_size_nonzero,
+                        "incorrect value size (%zu != 0)",
+                        bkey_val_bytes(k.k));
+fsck_err:
+       return ret;
 }
 
 #define bch2_bkey_ops_error ((struct bkey_ops) {       \
        .key_invalid = empty_val_key_invalid,           \
 })
 
-static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                  unsigned flags, struct printbuf *err)
+static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
+                                  enum bkey_invalid_flags flags, struct printbuf *err)
 {
        return 0;
 }
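From here on, the open-coded if/prt_printf/return blocks are replaced by bkey_fsck_err_on(), which records the error and jumps to a local fsck_err label, funnelling every failure in a function through one exit path. Roughly how such a macro can be built; this sketch always bails out, whereas the real macro also threads an error-type enum through the fsck machinery and can continue when the error is being fixed:

	#include <stdio.h>

	/*
	 * Sketch: report, set the enclosing function's 'ret', jump to its
	 * fsck_err label. Relies on both existing at the expansion site.
	 */
	#define fsck_err_on(cond, errcode, ...)				\
	do {								\
		if (cond) {						\
			fprintf(stderr, __VA_ARGS__);			\
			fputc('\n', stderr);				\
			ret = (errcode);				\
			goto fsck_err;					\
		}							\
	} while (0)

	static int check_val_size(unsigned bytes)
	{
		int ret = 0;

		fsck_err_on(bytes != 0, -22 /* stand-in errcode */,
			    "incorrect value size (%u != 0)", bytes);
	fsck_err:
		return ret;
	}

	int main(void)
	{
		printf("ret = %d\n", check_val_size(8));
		return 0;
	}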
@@ -68,8 +72,8 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k,
        .key_invalid = empty_val_key_invalid,           \
 })
 
-static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                                       unsigned flags, struct printbuf *err)
+static int key_type_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
+                                       enum bkey_invalid_flags flags, struct printbuf *err)
 {
        return 0;
 }
@@ -89,18 +93,6 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
        .val_to_text    = key_type_inline_data_to_text, \
 })
 
-static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                               unsigned flags, struct printbuf *err)
-{
-       if (bkey_val_bytes(k.k)) {
-               prt_printf(err, "incorrect value size (%zu != %zu)",
-                      bkey_val_bytes(k.k), sizeof(struct bch_cookie));
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
-}
-
 static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 {
        bch2_key_resize(l.k, l.k->size + r.k->size);
@@ -108,7 +100,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
 }
 
 #define bch2_bkey_ops_set ((struct bkey_ops) {         \
-       .key_invalid    = key_type_set_invalid,         \
+       .key_invalid    = empty_val_key_invalid,        \
        .key_merge      = key_type_set_merge,           \
 })
 
@@ -119,7 +111,6 @@ const struct bkey_ops bch2_bkey_ops[] = {
 };
 
 const struct bkey_ops bch2_bkey_null_ops = {
-       .min_val_size = U8_MAX,
 };
 
 int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -127,84 +118,95 @@ int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k,
                          struct printbuf *err)
 {
        const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type);
+       int ret = 0;
 
-       if (bkey_val_bytes(k.k) < ops->min_val_size) {
-               prt_printf(err, "bad val size (%zu < %u)",
-                          bkey_val_bytes(k.k), ops->min_val_size);
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(bkey_val_bytes(k.k) < ops->min_val_size, c, err,
+                        bkey_val_size_too_small,
+                        "bad val size (%zu < %u)",
+                        bkey_val_bytes(k.k), ops->min_val_size);
 
        if (!ops->key_invalid)
                return 0;
 
-       return ops->key_invalid(c, k, flags, err);
+       ret = ops->key_invalid(c, k, flags, err);
+fsck_err:
+       return ret;
 }
 
 static u64 bch2_key_types_allowed[] = {
-#define x(name, nr, flags, keys)       [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
-       BCH_BTREE_IDS()
-#undef x
        [BKEY_TYPE_btree] =
                BIT_ULL(KEY_TYPE_deleted)|
                BIT_ULL(KEY_TYPE_btree_ptr)|
                BIT_ULL(KEY_TYPE_btree_ptr_v2),
+#define x(name, nr, flags, keys)       [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys,
+       BCH_BTREE_IDS()
+#undef x
 };
 
+const char *bch2_btree_node_type_str(enum btree_node_type type)
+{
+       return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
+}
+
 int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                        enum btree_node_type type,
                        enum bkey_invalid_flags flags,
                        struct printbuf *err)
 {
-       if (k.k->u64s < BKEY_U64s) {
-               prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
 
-       if (flags & BKEY_INVALID_COMMIT  &&
-           !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) {
-               prt_printf(err, "invalid key type for btree %s (%s)",
-                          bch2_btree_ids[type], bch2_bkey_types[k.k->type]);
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(k.k->u64s < BKEY_U64s, c, err,
+                        bkey_u64s_too_small,
+                        "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s);
 
-       if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
-               if (k.k->size == 0) {
-                       prt_printf(err, "size == 0");
-                       return -BCH_ERR_invalid_bkey;
-               }
+       if (type >= BKEY_TYPE_NR)
+               return 0;
 
-               if (k.k->size > k.k->p.offset) {
-                       prt_printf(err, "size greater than offset (%u > %llu)",
-                              k.k->size, k.k->p.offset);
-                       return -BCH_ERR_invalid_bkey;
-               }
+       bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+                        !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
+                        bkey_invalid_type_for_btree,
+                        "invalid key type for btree %s (%s)",
+                        bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+
+       if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+               bkey_fsck_err_on(k.k->size == 0, c, err,
+                                bkey_extent_size_zero,
+                                "size == 0");
+
+               bkey_fsck_err_on(k.k->size > k.k->p.offset, c, err,
+                                bkey_extent_size_greater_than_offset,
+                                "size greater than offset (%u > %llu)",
+                                k.k->size, k.k->p.offset);
        } else {
-               if (k.k->size) {
-                       prt_printf(err, "size != 0");
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(k.k->size, c, err,
+                                bkey_size_nonzero,
+                                "size != 0");
        }
 
        if (type != BKEY_TYPE_btree) {
-               if (!btree_type_has_snapshots((enum btree_id) type) &&
-                   k.k->p.snapshot) {
-                       prt_printf(err, "nonzero snapshot");
-                       return -BCH_ERR_invalid_bkey;
-               }
-
-               if (btree_type_has_snapshots((enum btree_id) type) &&
-                   !k.k->p.snapshot) {
-                       prt_printf(err, "snapshot == 0");
-                       return -BCH_ERR_invalid_bkey;
+               enum btree_id btree = type - 1;
+
+               if (btree_type_has_snapshots(btree)) {
+                       bkey_fsck_err_on(!k.k->p.snapshot, c, err,
+                                        bkey_snapshot_zero,
+                                        "snapshot == 0");
+               } else if (!btree_type_has_snapshot_field(btree)) {
+                       bkey_fsck_err_on(k.k->p.snapshot, c, err,
+                                        bkey_snapshot_nonzero,
+                                        "nonzero snapshot");
+               } else {
+                       /*
+                        * btree uses snapshot field but it's not required to be
+                        * nonzero
+                        */
                }
 
-               if (bkey_eq(k.k->p, POS_MAX)) {
-                       prt_printf(err, "key at POS_MAX");
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(bkey_eq(k.k->p, POS_MAX), c, err,
+                                bkey_at_pos_max,
+                                "key at POS_MAX");
        }
-
-       return 0;
+fsck_err:
+       return ret;
 }
 
 int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -216,20 +218,20 @@ int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                bch2_bkey_val_invalid(c, k, flags, err);
 }
 
-int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k,
-                           struct printbuf *err)
+int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b,
+                           struct bkey_s_c k, struct printbuf *err)
 {
-       if (bpos_lt(k.k->p, b->data->min_key)) {
-               prt_printf(err, "key before start of btree node");
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
 
-       if (bpos_gt(k.k->p, b->data->max_key)) {
-               prt_printf(err, "key past end of btree node");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(bpos_lt(k.k->p, b->data->min_key), c, err,
+                        bkey_before_start_of_btree_node,
+                        "key before start of btree node");
 
-       return 0;
+       bkey_fsck_err_on(bpos_gt(k.k->p, b->data->max_key), c, err,
+                        bkey_after_end_of_btree_node,
+                        "key past end of btree node");
+fsck_err:
+       return ret;
 }
 
 void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
@@ -367,7 +369,6 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
 {
        const struct bkey_ops *ops;
        struct bkey uk;
-       struct bkey_s u;
        unsigned nr_compat = 5;
        int i;
 
@@ -432,7 +433,9 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
                }
 
                break;
-       case 4:
+       case 4: {
+               struct bkey_s u;
+
                if (!bkey_packed(k)) {
                        u = bkey_i_to_s(packed_to_bkey(k));
                } else {
@@ -449,6 +452,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
                if (ops->compat)
                        ops->compat(btree_id, version, big_endian, write, u);
                break;
+       }
        default:
                BUG();
        }
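The braces added around case 4 are what make the struct bkey_s u declaration legal there: before C23 a case label must be followed by a statement, not a declaration, and the block also scopes u to that one arm. The rule in miniature:

	#include <stdio.h>

	static int f(int nr)
	{
		switch (nr) {
		case 4: {		/* braces open a scope for the local */
			int u = nr * 2;
			return u;
		}
		default:
			return 0;
		}
	}

	int main(void)
	{
		printf("%d\n", f(4));
		return 0;
	}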
index d7b63769068c7464e709d1724d4a53c39c3bc4f4..912adadfb4dd40a3435d7f6a82eba365a750fa67 100644 (file)
@@ -13,12 +13,6 @@ enum btree_node_type;
 extern const char * const bch2_bkey_types[];
 extern const struct bkey_ops bch2_bkey_null_ops;
 
-enum bkey_invalid_flags {
-       BKEY_INVALID_WRITE              = (1U << 0),
-       BKEY_INVALID_COMMIT             = (1U << 1),
-       BKEY_INVALID_JOURNAL            = (1U << 2),
-};
-
 /*
  * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If
  * invalid, entire key will be deleted.
@@ -27,7 +21,7 @@ enum bkey_invalid_flags {
  * being read or written; more aggressive checks can be enabled when rw == WRITE.
  */
 struct bkey_ops {
-       int             (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k,
+       int             (*key_invalid)(struct bch_fs *c, struct bkey_s_c k,
                                       enum bkey_invalid_flags flags, struct printbuf *err);
        void            (*val_to_text)(struct printbuf *, struct bch_fs *,
                                       struct bkey_s_c);
@@ -61,7 +55,8 @@ int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
                        enum bkey_invalid_flags, struct printbuf *);
 int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type,
                      enum bkey_invalid_flags, struct printbuf *);
-int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *);
+int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *,
+                           struct bkey_s_c, struct printbuf *);
 
 void bch2_bpos_to_text(struct printbuf *, struct bpos);
 void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
@@ -98,7 +93,6 @@ static inline int bch2_mark_key(struct btree_trans *trans,
 enum btree_update_flags {
        __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
        __BTREE_UPDATE_NOJOURNAL,
-       __BTREE_UPDATE_PREJOURNAL,
        __BTREE_UPDATE_KEY_CACHE_RECLAIM,
 
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
@@ -113,7 +107,6 @@ enum btree_update_flags {
 
 #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
 #define BTREE_UPDATE_NOJOURNAL         (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_PREJOURNAL                (1U << __BTREE_UPDATE_PREJOURNAL)
 #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
 
 #define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
@@ -125,16 +118,6 @@ enum btree_update_flags {
 #define BTREE_TRIGGER_BUCKET_INVALIDATE        (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
 #define BTREE_TRIGGER_NOATOMIC         (1U << __BTREE_TRIGGER_NOATOMIC)
 
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
-       ((1U << KEY_TYPE_alloc)|                \
-        (1U << KEY_TYPE_alloc_v2)|             \
-        (1U << KEY_TYPE_alloc_v3)|             \
-        (1U << KEY_TYPE_alloc_v4)|             \
-        (1U << KEY_TYPE_stripe)|               \
-        (1U << KEY_TYPE_inode)|                \
-        (1U << KEY_TYPE_inode_v2)|             \
-        (1U << KEY_TYPE_snapshot))
-
 static inline int bch2_trans_mark_key(struct btree_trans *trans,
                                      enum btree_id btree_id, unsigned level,
                                      struct bkey_s_c old, struct bkey_i *new,
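BTREE_UPDATE_PREJOURNAL is retired here, but the surrounding idiom is worth noting: an enum of bit positions that continues from __BTREE_ITER_FLAGS_END, paired with #define masks, lets iterator flags and update/trigger flags share one flags word without colliding. A compact sketch of the idiom with invented flag names:

	#include <stdio.h>

	/* first flag namespace */
	enum { __ITER_A, __ITER_B, __ITER_FLAGS_END };

	/* second namespace continues where the first left off */
	enum {
		__UPDATE_NOJOURNAL = __ITER_FLAGS_END,
		__UPDATE_KEY_CACHE_RECLAIM,
	};

	#define UPDATE_NOJOURNAL		(1U << __UPDATE_NOJOURNAL)
	#define UPDATE_KEY_CACHE_RECLAIM	(1U << __UPDATE_KEY_CACHE_RECLAIM)

	int main(void)
	{
		unsigned flags = UPDATE_NOJOURNAL;

		printf("nojournal set: %d\n", !!(flags & UPDATE_NOJOURNAL));
		printf("reclaim set:   %d\n", !!(flags & UPDATE_KEY_CACHE_RECLAIM));
		return 0;
	}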
index b9aa027c881b14458394ae8fdad6d93f53e1cbbf..bcca9e76a0b4bf40f20903c856e7559e60b87da1 100644 (file)
@@ -106,7 +106,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
        while ((k = sort_iter_peek(iter))) {
                if (!bkey_deleted(k) &&
                    !should_drop_next_key(iter)) {
-                       bkey_copy(out, k);
+                       bkey_p_copy(out, k);
                        btree_keys_account_key_add(&nr, 0, out);
                        out = bkey_p_next(out);
                }
@@ -137,7 +137,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
                        continue;
 
                if (!transform)
-                       bkey_copy(out, in);
+                       bkey_p_copy(out, in);
                else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
                                             ? in_f : &bch2_bkey_format_current, in))
                        out->format = KEY_FORMAT_LOCAL_BTREE;
@@ -191,7 +191,7 @@ unsigned bch2_sort_keys(struct bkey_packed *dst,
                        memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in));
                        set_bkeyp_val_u64s(f, out, 0);
                } else {
-                       bkey_copy(out, in);
+                       bkey_p_copy(out, in);
                }
                out->needs_whiteout |= needs_whiteout;
                out = bkey_p_next(out);
index 79cf11d1b4e7e69d5e13512b3d5d7a8b14f39c42..7c0f0b160f18533302ebc0e5568c7599f2f6cffd 100644 (file)
@@ -9,14 +9,24 @@ struct sort_iter {
 
        struct sort_iter_set {
                struct bkey_packed *k, *end;
-       } data[MAX_BSETS + 1];
+       } data[];
 };
 
-static inline void sort_iter_init(struct sort_iter *iter, struct btree *b)
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b, unsigned size)
 {
        iter->b = b;
        iter->used = 0;
-       iter->size = ARRAY_SIZE(iter->data);
+       iter->size = size;
+}
+
+struct sort_iter_stack {
+       struct sort_iter        iter;
+       struct sort_iter_set    sets[MAX_BSETS + 1];
+};
+
+static inline void sort_iter_stack_init(struct sort_iter_stack *iter, struct btree *b)
+{
+       sort_iter_init(&iter->iter, b, ARRAY_SIZE(iter->sets));
 }
 
 static inline void sort_iter_add(struct sort_iter *iter,
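sort_iter's data[] becomes a flexible array member sized by the new sort_iter_init() argument, and on-stack users switch to sort_iter_stack, which lays a fixed sets[] array directly after the header so &stack.iter has real storage behind it. The same trick, self-contained; embedding a struct with a flexible array like this is a GNU/kernel pattern rather than strict ISO C, and the element type and length here are stand-ins (the real MAX_BSETS is defined elsewhere):

	#include <stdio.h>

	struct item { int k; };

	struct sort_iter {
		unsigned	used, size;
		struct item	data[];		/* flexible array member */
	};

	/* stack wrapper: header immediately followed by real storage */
	struct sort_iter_stack {
		struct sort_iter	iter;
		struct item		sets[4];	/* MAX_BSETS + 1 in the real code */
	};

	static void sort_iter_init(struct sort_iter *iter, unsigned size)
	{
		iter->used = 0;
		iter->size = size;
	}

	int main(void)
	{
		struct sort_iter_stack s;

		sort_iter_init(&s.iter, sizeof(s.sets) / sizeof(s.sets[0]));
		/* data[] begins where sets[] lives, as in the kernel pattern */
		s.iter.data[0] = (struct item) { .k = 42 };
		s.iter.used = 1;
		printf("size=%u first=%d\n", s.iter.size, s.iter.data[0].k);
		return 0;
	}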
index bcdf28f39b9c3db2b1b3f14b093348f3a0412da6..bb73ba9017b006e7fe181e19b7cccfe8494c1339 100644 (file)
@@ -172,10 +172,10 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
                printk(KERN_ERR "iter was:");
 
                btree_node_iter_for_each(_iter, set) {
-                       struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
-                       struct bset_tree *t = bch2_bkey_to_bset(b, k);
+                       struct bkey_packed *k2 = __btree_node_offset_to_key(b, set->k);
+                       struct bset_tree *t = bch2_bkey_to_bset(b, k2);
                        printk(" [%zi %zi]", t - b->set,
-                              k->_data - bset(b, t)->_data);
+                              k2->_data - bset(b, t)->_data);
                }
                panic("\n");
        }
@@ -232,7 +232,7 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
 {
        struct bset_tree *t = bch2_bkey_to_bset(b, where);
        struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
-       struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
+       struct bkey_packed *next = (void *) ((u64 *) where->_data + clobber_u64s);
        struct printbuf buf1 = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
 #if 0
@@ -300,7 +300,8 @@ static unsigned bkey_float_byte_offset(unsigned idx)
 }
 
 struct ro_aux_tree {
-       struct bkey_float       f[0];
+       u8                      nothing[0];
+       struct bkey_float       f[];
 };
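The u8-style nothing[0] placeholder exists because C forbids a flexible array member from being a struct's only member; a zero-length array (a GNU extension) satisfies the rule without changing the layout. An illustration, assuming a simplified bkey_float:

	#include <stdio.h>
	#include <stddef.h>

	struct bkey_float { unsigned key_offset; };

	/*
	 * struct ro_aux_tree { struct bkey_float f[]; };  -- rejected:
	 * a flexible array member cannot be the struct's only member.
	 */
	struct ro_aux_tree {
		unsigned char		nothing[0];	/* zero-size placeholder (GNU C) */
		struct bkey_float	f[];		/* the real, flexible payload */
	};

	int main(void)
	{
		printf("sizeof = %zu, f starts at %zu\n",
		       sizeof(struct ro_aux_tree),
		       offsetof(struct ro_aux_tree, f));
		return 0;
	}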
 
 struct rw_aux_tree {
@@ -476,7 +477,7 @@ static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
 {
        unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
 
-       return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
+       return (void *) ((u64 *) tree_to_bkey(b, t, j)->_data - prev_u64s);
 }
 
 static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
@@ -1010,8 +1011,8 @@ void bch2_bset_insert(struct btree *b,
                btree_keys_account_key_add(&b->nr, t - b->set, src);
 
        if (src->u64s != clobber_u64s) {
-               u64 *src_p = where->_data + clobber_u64s;
-               u64 *dst_p = where->_data + src->u64s;
+               u64 *src_p = (u64 *) where->_data + clobber_u64s;
+               u64 *dst_p = (u64 *) where->_data + src->u64s;
 
                EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
                        (int) clobber_u64s - src->u64s);
@@ -1037,7 +1038,7 @@ void bch2_bset_delete(struct btree *b,
                      unsigned clobber_u64s)
 {
        struct bset_tree *t = bset_tree_last(b);
-       u64 *src_p = where->_data + clobber_u64s;
+       u64 *src_p = (u64 *) where->_data + clobber_u64s;
        u64 *dst_p = where->_data;
 
        bch2_bset_verify_rw_aux_tree(b, t);
@@ -1188,7 +1189,7 @@ struct bkey_packed *__bch2_bset_search(struct btree *b,
        case BSET_RO_AUX_TREE:
                return bset_search_tree(b, t, search, lossy_packed_search);
        default:
-               unreachable();
+               BUG();
        }
 }
 
@@ -1268,9 +1269,13 @@ static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
 }
 
 /**
- * bch_btree_node_iter_init - initialize a btree node iterator, starting from a
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
  * given position
  *
+ * @iter:      iterator to initialize
+ * @b:         btree node to search
+ * @search:    search key
+ *
  * Main entry point to the lookup code for individual btree nodes:
  *
  * NOTE:
index 13c88d9533e5cdf5fa6bf58a397b5d71284494a3..84636ad034fa84cdfaecf654cc9b17eeb79bd04a 100644 (file)
@@ -525,7 +525,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
        bc->shrink.seeks                = 4;
-       ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
+       ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name);
        if (ret)
                goto err;
 
@@ -795,7 +795,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
        six_unlock_intent(&b->c.lock);
 
        /* Unlock before doing IO: */
-       if (trans && sync)
+       if (path && sync)
                bch2_trans_unlock_noassert(trans);
 
        bch2_btree_node_read(c, b, sync);
@@ -832,12 +832,12 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
               "btree node header doesn't match ptr\n"
               "btree %s level %u\n"
               "ptr: ",
-              bch2_btree_ids[b->c.btree_id], b->c.level);
+              bch2_btree_id_str(b->c.btree_id), b->c.level);
        bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
 
        prt_printf(&buf, "\nheader: btree %s level %llu\n"
               "min ",
-              bch2_btree_ids[BTREE_NODE_ID(b->data)],
+              bch2_btree_id_str(BTREE_NODE_ID(b->data)),
               BTREE_NODE_LEVEL(b->data));
        bch2_bpos_to_text(&buf, b->data->min_key);
 
@@ -934,7 +934,7 @@ retry:
        }
 
        if (unlikely(need_relock)) {
-               int ret = bch2_trans_relock(trans) ?:
+               ret = bch2_trans_relock(trans) ?:
                        bch2_btree_path_relock_intent(trans, path);
                if (ret) {
                        six_unlock_type(&b->c.lock, lock_type);
@@ -965,11 +965,20 @@ retry:
 }
 
 /**
- * bch_btree_node_get - find a btree node in the cache and lock it, reading it
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
  * in from disk if necessary.
  *
+ * @trans:     btree transaction object
+ * @path:      btree_path being traversed
+ * @k:         pointer to btree node (generally KEY_TYPE_btree_ptr_v2)
+ * @level:     level of btree node being looked up (0 == leaf node)
+ * @lock_type: SIX_LOCK_read or SIX_LOCK_intent
+ * @trace_ip:  ip of caller of btree iterator code (i.e. caller of bch2_btree_iter_peek())
+ *
  * The btree node will have either a read or a write lock held, depending on
  * the @write parameter.
+ *
+ * Returns: btree node or ERR_PTR()
  */
 struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                  const struct bkey_i *k, unsigned level,
@@ -1016,28 +1025,8 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
        }
 
        if (unlikely(btree_node_read_in_flight(b))) {
-               u32 seq = six_lock_seq(&b->c.lock);
-
                six_unlock_type(&b->c.lock, lock_type);
-               bch2_trans_unlock(trans);
-
-               bch2_btree_node_wait_on_read(b);
-
-               /*
-                * should_be_locked is not set on this path yet, so we need to
-                * relock it specifically:
-                */
-               if (trans) {
-                       int ret = bch2_trans_relock(trans) ?:
-                               bch2_btree_path_relock_intent(trans, path);
-                       if (ret) {
-                               BUG_ON(!trans->restarted);
-                               return ERR_PTR(ret);
-                       }
-               }
-
-               if (!six_relock_type(&b->c.lock, lock_type, seq))
-                       return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
+               return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip);
        }
 
        prefetch(b->aux_data);
@@ -1211,10 +1200,22 @@ wait_on_io:
        six_unlock_intent(&b->c.lock);
 }
 
-void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
-                            const struct btree *b)
+const char *bch2_btree_id_str(enum btree_id btree)
+{
+       return btree < BTREE_ID_NR ? __bch2_btree_ids[btree] : "(unknown)";
+}
+
+void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
+{
+       prt_printf(out, "%s level %u/%u\n  ",
+              bch2_btree_id_str(b->c.btree_id),
+              b->c.level,
+              bch2_btree_id_root(c, b->c.btree_id)->level);
+       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+}
+
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b)
 {
-       const struct bkey_format *f = &b->format;
        struct bset_stats stats;
 
        memset(&stats, 0, sizeof(stats));
@@ -1228,9 +1229,13 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
        prt_printf(out, ":\n"
               "    ptrs: ");
        bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
+       prt_newline(out);
+
+       prt_printf(out,
+              "    format: ");
+       bch2_bkey_format_to_text(out, &b->format);
 
-       prt_printf(out, "\n"
-              "    format: u64s %u fields %u %u %u %u %u\n"
+       prt_printf(out,
               "    unpack fn len: %u\n"
               "    bytes used %zu/%zu (%zu%% full)\n"
               "    sib u64s: %u, %u (merge threshold %u)\n"
@@ -1238,12 +1243,6 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
               "    nr unpacked keys %u\n"
               "    floats %zu\n"
               "    failed unpacked %zu\n",
-              f->key_u64s,
-              f->bits_per_field[0],
-              f->bits_per_field[1],
-              f->bits_per_field[2],
-              f->bits_per_field[3],
-              f->bits_per_field[4],
               b->unpack_fn_len,
               b->nr.live_u64s * sizeof(u64),
               btree_bytes(c) - sizeof(struct btree_node),
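The bch2_btree_ids[...] lookups scattered through these files are replaced by bch2_btree_id_str(), which bounds-checks the id so a corrupt or future btree id prints "(unknown)" rather than indexing past the table. The shape of the helper, with placeholder table contents:

	#include <stdio.h>

	enum btree_id { BTREE_ID_extents, BTREE_ID_inodes, BTREE_ID_NR };

	static const char * const btree_ids[] = {
		[BTREE_ID_extents]	= "extents",
		[BTREE_ID_inodes]	= "inodes",
	};

	static const char *btree_id_str(enum btree_id btree)
	{
		return (unsigned) btree < BTREE_ID_NR
			? btree_ids[btree] : "(unknown)";
	}

	int main(void)
	{
		printf("%s\n", btree_id_str(BTREE_ID_inodes));
		printf("%s\n", btree_id_str((enum btree_id) 99));	/* out of range */
		return 0;
	}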
index 00c9b92183e7969839d48240e3e4b05498a98ed7..bfe1d7482cbc8d5205ddb87edf801e251c6206a2 100644 (file)
@@ -123,8 +123,9 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b)
        return bch2_btree_id_root(c, b->c.btree_id)->b;
 }
 
-void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
-                            const struct btree *);
+const char *bch2_btree_id_str(enum btree_id);
+void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *);
 void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *);
 
 #endif /* _BCACHEFS_BTREE_CACHE_H */
index 49e9822dda1dd8d07b7be885aefdefe649345d34..c4922bd30fafa52990cffca0ab19761fa28fc97d 100644 (file)
@@ -9,6 +9,7 @@
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "bkey_buf.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update_interior.h"
@@ -43,7 +44,7 @@
 static bool should_restart_for_topology_repair(struct bch_fs *c)
 {
        return c->opts.fix_errors != FSCK_FIX_no &&
-               !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
+               !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology));
 }
 
 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
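The fix above changes the test from recovery_passes_explicit to recovery_passes_complete: restart for topology repair only when that pass has not already run, rather than when it merely was not requested. Both fields are 64-bit masks indexed by pass number; a sketch of the bookkeeping, with simplified names:

	#include <stdio.h>
	#include <stdint.h>

	#define BIT_ULL(nr)	(1ULL << (nr))

	enum { PASS_check_alloc, PASS_check_topology, PASS_NR };

	struct fs {
		uint64_t passes_explicit;	/* passes requested to run */
		uint64_t passes_complete;	/* passes that have finished */
	};

	static int should_restart_for_topology_repair(struct fs *c)
	{
		/* restart only if the repair pass hasn't already completed */
		return !(c->passes_complete & BIT_ULL(PASS_check_topology));
	}

	int main(void)
	{
		struct fs c = { 0, 0 };

		printf("restart? %d\n", should_restart_for_topology_repair(&c));
		c.passes_complete |= BIT_ULL(PASS_check_topology);
		printf("restart? %d\n", should_restart_for_topology_repair(&c));
		return 0;
	}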
@@ -94,15 +95,15 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                        bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
 
                        if (__fsck_err(c,
-                                 FSCK_CAN_FIX|
-                                 FSCK_CAN_IGNORE|
-                                 FSCK_NO_RATELIMIT,
-                                 "btree node with incorrect min_key at btree %s level %u:\n"
-                                 "  prev %s\n"
-                                 "  cur %s",
-                                 bch2_btree_ids[b->c.btree_id], b->c.level,
-                                 buf1.buf, buf2.buf) &&
-                           should_restart_for_topology_repair(c)) {
+                                      FSCK_CAN_FIX|
+                                      FSCK_CAN_IGNORE|
+                                      FSCK_NO_RATELIMIT,
+                                      btree_node_topology_bad_min_key,
+                                      "btree node with incorrect min_key at btree %s level %u:\n"
+                                      "  prev %s\n"
+                                      "  cur %s",
+                                      bch2_btree_id_str(b->c.btree_id), b->c.level,
+                                      buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
                                bch_info(c, "Halting mark and sweep to start topology repair pass");
                                ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
                                goto err;
@@ -121,14 +122,12 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
                bch2_bpos_to_text(&buf2, node_end);
 
-               if (__fsck_err(c,
-                         FSCK_CAN_FIX|
-                         FSCK_CAN_IGNORE|
-                         FSCK_NO_RATELIMIT,
+               if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
+                         btree_node_topology_bad_max_key,
                          "btree node with incorrect max_key at btree %s level %u:\n"
                          "  %s\n"
                          "  expected %s",
-                         bch2_btree_ids[b->c.btree_id], b->c.level,
+                         bch2_btree_id_str(b->c.btree_id), b->c.level,
                          buf1.buf, buf2.buf) &&
                    should_restart_for_topology_repair(c)) {
                        bch_info(c, "Halting mark and sweep to start topology repair pass");
@@ -286,10 +285,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
 
                if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
                                                cur->data->min_key), c,
+                               btree_node_topology_overwritten_by_next_node,
                                "btree node overwritten by next node at btree %s level %u:\n"
                                "  node %s\n"
                                "  next %s",
-                               bch2_btree_ids[b->c.btree_id], b->c.level,
+                               bch2_btree_id_str(b->c.btree_id), b->c.level,
                                buf1.buf, buf2.buf)) {
                        ret = DROP_PREV_NODE;
                        goto out;
@@ -297,10 +297,11 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
 
                if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
                                                 bpos_predecessor(cur->data->min_key)), c,
+                               btree_node_topology_bad_max_key,
                                "btree node with incorrect max_key at btree %s level %u:\n"
                                "  node %s\n"
                                "  next %s",
-                               bch2_btree_ids[b->c.btree_id], b->c.level,
+                               bch2_btree_id_str(b->c.btree_id), b->c.level,
                                buf1.buf, buf2.buf))
                        ret = set_node_max(c, prev,
                                           bpos_predecessor(cur->data->min_key));
@@ -309,20 +310,22 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
 
                if (mustfix_fsck_err_on(bpos_ge(expected_start,
                                                cur->data->max_key), c,
+                               btree_node_topology_overwritten_by_prev_node,
                                "btree node overwritten by prev node at btree %s level %u:\n"
                                "  prev %s\n"
                                "  node %s",
-                               bch2_btree_ids[b->c.btree_id], b->c.level,
+                               bch2_btree_id_str(b->c.btree_id), b->c.level,
                                buf1.buf, buf2.buf)) {
                        ret = DROP_THIS_NODE;
                        goto out;
                }
 
                if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
+                               btree_node_topology_bad_min_key,
                                "btree node with incorrect min_key at btree %s level %u:\n"
                                "  prev %s\n"
                                "  node %s",
-                               bch2_btree_ids[b->c.btree_id], b->c.level,
+                               bch2_btree_id_str(b->c.btree_id), b->c.level,
                                buf1.buf, buf2.buf))
                        ret = set_node_min(c, cur, expected_start);
        }
@@ -343,10 +346,11 @@ static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
        bch2_bpos_to_text(&buf2, b->key.k.p);
 
        if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
+                               btree_node_topology_bad_max_key,
                        "btree node with incorrect max_key at btree %s level %u:\n"
                        "  %s\n"
                        "  expected %s",
-                       bch2_btree_ids[b->c.btree_id], b->c.level,
+                       bch2_btree_id_str(b->c.btree_id), b->c.level,
                        buf1.buf, buf2.buf)) {
                ret = set_node_max(c, child, b->key.k.p);
                if (ret)
@@ -395,9 +399,10 @@ again:
                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
 
                if (mustfix_fsck_err_on(ret == -EIO, c,
+                               btree_node_unreadable,
                                "Topology repair: unreadable btree node at btree %s level %u:\n"
                                "  %s",
-                               bch2_btree_ids[b->c.btree_id],
+                               bch2_btree_id_str(b->c.btree_id),
                                b->c.level - 1,
                                buf.buf)) {
                        bch2_btree_node_evict(trans, cur_k.k);
@@ -503,9 +508,10 @@ again:
        bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
 
        if (mustfix_fsck_err_on(!have_child, c,
+                       btree_node_topology_interior_node_empty,
                        "empty interior btree node at btree %s level %u\n"
                        "  %s",
-                       bch2_btree_ids[b->c.btree_id],
+                       bch2_btree_id_str(b->c.btree_id),
                        b->c.level, buf.buf))
                ret = DROP_THIS_NODE;
 err:
@@ -528,14 +534,12 @@ fsck_err:
 
 int bch2_check_topology(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree *b;
        unsigned i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for (i = 0; i < btree_id_nr_alive(c)&& !ret; i++) {
+       for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
 
                if (!r->alive)
@@ -545,8 +549,8 @@ int bch2_check_topology(struct bch_fs *c)
                if (btree_node_fake(b))
                        continue;
 
-               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
-               ret = bch2_btree_repair_topology_recurse(&trans, b);
+               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+               ret = bch2_btree_repair_topology_recurse(trans, b);
                six_unlock_read(&b->c.lock);
 
                if (ret == DROP_THIS_NODE) {
@@ -555,7 +559,7 @@ int bch2_check_topology(struct bch_fs *c)
                }
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
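This file also picks up the tree-wide conversion from a stack-allocated struct btree_trans initialized with bch2_trans_init()/bch2_trans_exit() to pointers obtained from bch2_trans_get() and released with bch2_trans_put(). The calling convention, reduced to its shape; the allocation below is a plain calloc stand-in, whereas the real implementation reuses cached transaction objects:

	#include <stdio.h>
	#include <stdlib.h>

	struct btree_trans { int restarted; /* ... */ };

	static struct btree_trans *trans_get(void)
	{
		/* stand-in: the real code recycles transaction objects */
		return calloc(1, sizeof(struct btree_trans));
	}

	static void trans_put(struct btree_trans *trans)
	{
		free(trans);
	}

	static int check_topology(void)
	{
		struct btree_trans *trans = trans_get();
		int ret = 0;

		if (!trans)
			return -1;

		/* ... use trans for btree work ... */

		trans_put(trans);
		return ret;
	}

	int main(void)
	{
		printf("ret = %d\n", check_topology());
		return 0;
	}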
@@ -565,8 +569,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                               struct bkey_s_c *k)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
-       const union bch_extent_entry *entry;
+       struct bkey_ptrs_c ptrs_c = bch2_bkey_ptrs_c(*k);
+       const union bch_extent_entry *entry_c;
        struct extent_ptr_decoded p = { 0 };
        bool do_update = false;
        struct printbuf buf = PRINTBUF;
@@ -576,14 +580,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
         * XXX
         * use check_bucket_ref here
         */
-       bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
+       bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
                struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-               enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
+               enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
 
                if (!g->gen_valid &&
                    (c->opts.reconstruct_alloc ||
-                    fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+                    fsck_err(c, ptr_to_missing_alloc_key,
+                             "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
                              "while marking %s",
                              p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
                              bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@@ -600,7 +605,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 
                if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
                    (c->opts.reconstruct_alloc ||
-                    fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+                    fsck_err(c, ptr_gen_newer_than_bucket_gen,
+                             "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
                              "while marking %s",
                              p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
                              bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@@ -621,7 +627,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 
                if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
                    (c->opts.reconstruct_alloc ||
-                    fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+                    fsck_err(c, ptr_gen_newer_than_bucket_gen,
+                             "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
                              "while marking %s",
                              p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
                              bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@@ -632,7 +639,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 
                if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
                    (c->opts.reconstruct_alloc ||
-                    fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+                    fsck_err(c, stale_dirty_ptr,
+                             "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
                              "while marking %s",
                              p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
                              bch2_data_types[ptr_data_type(k->k, &p.ptr)],
@@ -646,6 +654,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 
                if (fsck_err_on(bucket_data_type(g->data_type) &&
                                bucket_data_type(g->data_type) != data_type, c,
+                               ptr_bucket_data_type_mismatch,
                                "bucket %u:%zu different types of data in same bucket: %s, %s\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
@@ -665,6 +674,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                        struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
 
                        if (fsck_err_on(!m || !m->alive, c,
+                                       ptr_to_missing_stripe,
                                        "pointer to nonexistent stripe %llu\n"
                                        "while marking %s",
                                        (u64) p.ec.idx,
@@ -673,6 +683,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
                                do_update = true;
 
                        if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c,
+                                       ptr_to_incorrect_stripe,
                                        "pointer does not match stripe %llu\n"
                                        "while marking %s",
                                        (u64) p.ec.idx,
@@ -812,6 +823,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
                        goto err;
 
                if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
+                               bkey_version_in_future,
                                "key version number higher than recorded: %llu > %llu",
                                k->k->version.lo,
                                atomic64_read(&c->key_version)))
@@ -969,9 +981,10 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
                                          FSCK_CAN_FIX|
                                          FSCK_CAN_IGNORE|
                                          FSCK_NO_RATELIMIT,
+                                         btree_node_read_error,
                                          "Unreadable btree node at btree %s level %u:\n"
                                          "  %s",
-                                         bch2_btree_ids[b->c.btree_id],
+                                         bch2_btree_id_str(b->c.btree_id),
                                          b->c.level - 1,
                                          (printbuf_reset(&buf),
                                           bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) &&
@@ -1026,6 +1039,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
        printbuf_reset(&buf);
        bch2_bpos_to_text(&buf, b->data->min_key);
        if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c,
+                               btree_root_bad_min_key,
                        "btree root with incorrect min_key: %s", buf.buf)) {
                bch_err(c, "repair unimplemented");
                ret = -BCH_ERR_fsck_repair_unimplemented;
@@ -1035,6 +1049,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
        printbuf_reset(&buf);
        bch2_bpos_to_text(&buf, b->data->max_key);
        if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c,
+                               btree_root_bad_max_key,
                        "btree root with incorrect max_key: %s", buf.buf)) {
                bch_err(c, "repair unimplemented");
                ret = -BCH_ERR_fsck_repair_unimplemented;
@@ -1067,15 +1082,13 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 
 static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        enum btree_id ids[BTREE_ID_NR];
        unsigned i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        if (initial)
-               trans.is_initial_gc = true;
+               trans->is_initial_gc = true;
 
        for (i = 0; i < BTREE_ID_NR; i++)
                ids[i] = i;
@@ -1083,22 +1096,22 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 
        for (i = 0; i < BTREE_ID_NR && !ret; i++)
                ret = initial
-                       ? bch2_gc_btree_init(&trans, ids[i], metadata_only)
-                       : bch2_gc_btree(&trans, ids[i], initial, metadata_only);
+                       ? bch2_gc_btree_init(trans, ids[i], metadata_only)
+                       : bch2_gc_btree(trans, ids[i], initial, metadata_only);
 
        for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) {
                if (!bch2_btree_id_root(c, i)->alive)
                        continue;
 
                ret = initial
-                       ? bch2_gc_btree_init(&trans, i, metadata_only)
-                       : bch2_gc_btree(&trans, i, initial, metadata_only);
+                       ? bch2_gc_btree_init(trans, i, metadata_only)
+                       : bch2_gc_btree(trans, i, initial, metadata_only);
        }
 
        if (ret < 0)
                bch_err_fn(c, ret);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -1213,24 +1226,16 @@ static int bch2_gc_done(struct bch_fs *c,
 
        percpu_down_write(&c->mark_lock);
 
-#define copy_field(_f, _msg, ...)                                      \
+#define copy_field(_err, _f, _msg, ...)                                        \
        if (dst->_f != src->_f &&                                       \
            (!verify ||                                                 \
-            fsck_err(c, _msg ": got %llu, should be %llu"              \
+            fsck_err(c, _err, _msg ": got %llu, should be %llu"        \
                      , ##__VA_ARGS__, dst->_f, src->_f)))              \
                dst->_f = src->_f
-#define copy_stripe_field(_f, _msg, ...)                               \
-       if (dst->_f != src->_f &&                                       \
-           (!verify ||                                                 \
-            fsck_err(c, "stripe %zu has wrong "_msg                    \
-                     ": got %u, should be %u",                         \
-                     iter.pos, ##__VA_ARGS__,                          \
-                     dst->_f, src->_f)))                               \
-               dst->_f = src->_f
-#define copy_dev_field(_f, _msg, ...)                                  \
-       copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
-#define copy_fs_field(_f, _msg, ...)                                   \
-       copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+#define copy_dev_field(_err, _f, _msg, ...)                            \
+       copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+#define copy_fs_field(_err, _f, _msg, ...)                             \
+       copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
 
        for (i = 0; i < ARRAY_SIZE(c->usage); i++)
                bch2_fs_usage_acc_to_base(c, i);
@@ -1241,14 +1246,18 @@ static int bch2_gc_done(struct bch_fs *c,
                        bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
                                             dev_usage_u64s());
 
-               copy_dev_field(buckets_ec,              "buckets_ec");
-
                for (i = 0; i < BCH_DATA_NR; i++) {
-                       copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
-                       copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
-                       copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+                       copy_dev_field(dev_usage_buckets_wrong,
+                                      d[i].buckets,    "%s buckets", bch2_data_types[i]);
+                       copy_dev_field(dev_usage_sectors_wrong,
+                                      d[i].sectors,    "%s sectors", bch2_data_types[i]);
+                       copy_dev_field(dev_usage_fragmented_wrong,
+                                      d[i].fragmented, "%s fragmented", bch2_data_types[i]);
                }
-       };
+
+               copy_dev_field(dev_usage_buckets_ec_wrong,
+                              buckets_ec,              "buckets_ec");
+       }
 
        {
                unsigned nr = fs_usage_u64s(c);
@@ -1256,17 +1265,24 @@ static int bch2_gc_done(struct bch_fs *c,
                struct bch_fs_usage *src = (void *)
                        bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
 
-               copy_fs_field(hidden,           "hidden");
-               copy_fs_field(btree,            "btree");
+               copy_fs_field(fs_usage_hidden_wrong,
+                             hidden,           "hidden");
+               copy_fs_field(fs_usage_btree_wrong,
+                             btree,            "btree");
 
                if (!metadata_only) {
-                       copy_fs_field(data,     "data");
-                       copy_fs_field(cached,   "cached");
-                       copy_fs_field(reserved, "reserved");
-                       copy_fs_field(nr_inodes,"nr_inodes");
+                       copy_fs_field(fs_usage_data_wrong,
+                                     data,     "data");
+                       copy_fs_field(fs_usage_cached_wrong,
+                                     cached,   "cached");
+                       copy_fs_field(fs_usage_reserved_wrong,
+                                     reserved, "reserved");
+                       copy_fs_field(fs_usage_nr_inodes_wrong,
+                                     nr_inodes,"nr_inodes");
 
                        for (i = 0; i < BCH_REPLICAS_MAX; i++)
-                               copy_fs_field(persistent_reserved[i],
+                               copy_fs_field(fs_usage_persistent_reserved_wrong,
+                                             persistent_reserved[i],
                                              "persistent_reserved[%i]", i);
                }
 
@@ -1282,7 +1298,8 @@ static int bch2_gc_done(struct bch_fs *c,
                        printbuf_reset(&buf);
                        bch2_replicas_entry_to_text(&buf, e);
 
-                       copy_fs_field(replicas[i], "%s", buf.buf);
+                       copy_fs_field(fs_usage_replicas_wrong,
+                                     replicas[i], "%s", buf.buf);
                }
        }
 
@@ -1418,6 +1435,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 
        if (c->opts.reconstruct_alloc ||
            fsck_err_on(new.data_type != gc.data_type, c,
+                       alloc_key_data_type_wrong,
                        "bucket %llu:%llu gen %u has wrong data_type"
                        ": got %s, should be %s",
                        iter->pos.inode, iter->pos.offset,
@@ -1426,9 +1444,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
                        bch2_data_types[gc.data_type]))
                new.data_type = gc.data_type;
 
-#define copy_bucket_field(_f)                                          \
+#define copy_bucket_field(_errtype, _f)                                        \
        if (c->opts.reconstruct_alloc ||                                \
-           fsck_err_on(new._f != gc._f, c,                             \
+           fsck_err_on(new._f != gc._f, c, _errtype,                   \
                        "bucket %llu:%llu gen %u data type %s has wrong " #_f   \
                        ": got %u, should be %u",                       \
                        iter->pos.inode, iter->pos.offset,              \
@@ -1437,11 +1455,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
                        new._f, gc._f))                                 \
                new._f = gc._f;                                         \
 
-       copy_bucket_field(gen);
-       copy_bucket_field(dirty_sectors);
-       copy_bucket_field(cached_sectors);
-       copy_bucket_field(stripe_redundancy);
-       copy_bucket_field(stripe);
+       copy_bucket_field(alloc_key_gen_wrong,
+                         gen);
+       copy_bucket_field(alloc_key_dirty_sectors_wrong,
+                         dirty_sectors);
+       copy_bucket_field(alloc_key_cached_sectors_wrong,
+                         cached_sectors);
+       copy_bucket_field(alloc_key_stripe_wrong,
+                         stripe);
+       copy_bucket_field(alloc_key_stripe_redundancy_wrong,
+                         stripe_redundancy);
 #undef copy_bucket_field
 
        if (!bch2_alloc_v4_cmp(*old, new))
@@ -1468,37 +1491,35 @@ fsck_err:
 
 static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
        unsigned i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for_each_member_device(ca, c, i) {
-               ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+               ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
                                POS(ca->dev_idx, ca->mi.first_bucket),
                                BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
-                               NULL, NULL, BTREE_INSERT_LAZY_RW,
-                       bch2_alloc_write_key(&trans, &iter, k, metadata_only));
+                               NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+                       bch2_alloc_write_key(trans, &iter, k, metadata_only));
 
                if (ret < 0) {
-                       bch_err(c, "error writing alloc info: %s", bch2_err_str(ret));
+                       bch_err_fn(c, ret);
                        percpu_ref_put(&ca->ref);
                        break;
                }
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret < 0 ? ret : 0;
 }
 
 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 {
        struct bch_dev *ca;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bucket *g;
@@ -1514,17 +1535,16 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
                if (!buckets) {
                        percpu_ref_put(&ca->ref);
                        bch_err(c, "error allocating ca->buckets[gc]");
-                       return -BCH_ERR_ENOMEM_gc_alloc_start;
+                       ret = -BCH_ERR_ENOMEM_gc_alloc_start;
+                       goto err;
                }
 
                buckets->first_bucket   = ca->mi.first_bucket;
                buckets->nbuckets       = ca->mi.nbuckets;
                rcu_assign_pointer(ca->buckets_gc, buckets);
-       };
-
-       bch2_trans_init(&trans, c, 0, 0);
+       }
 
-       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ca = bch_dev_bkey_exists(c, k.k->p.inode);
                g = gc_bucket(ca, k.k->p.offset);
@@ -1545,13 +1565,11 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
                        g->stripe_redundancy    = a->stripe_redundancy;
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
-
-       bch2_trans_exit(&trans);
-
+       bch2_trans_iter_exit(trans, &iter);
+err:
+       bch2_trans_put(trans);
        if (ret)
-               bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret));
-
+               bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1574,7 +1592,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
                        g->dirty_sectors = 0;
                        g->cached_sectors = 0;
                }
-       };
+       }
 }
 
 static int bch2_gc_write_reflink_key(struct btree_trans *trans,
@@ -1603,6 +1621,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
        }
 
        if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+                       reflink_v_refcount_wrong,
                        "reflink key has wrong refcount:\n"
                        "  %s\n"
                        "  should be %u",
@@ -1626,7 +1645,7 @@ fsck_err:
 
 static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        size_t idx = 0;
@@ -1635,23 +1654,23 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
        if (metadata_only)
                return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
-       ret = for_each_btree_key_commit(&trans, iter,
+       ret = for_each_btree_key_commit(trans, iter,
                        BTREE_ID_reflink, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_NOFAIL,
-               bch2_gc_write_reflink_key(&trans, &iter, k, &idx));
+                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+               bch2_gc_write_reflink_key(trans, &iter, k, &idx));
 
        c->reflink_gc_nr = 0;
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int bch2_gc_reflink_start(struct bch_fs *c,
                                 bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct reflink_gc *r;
@@ -1660,10 +1679,10 @@ static int bch2_gc_reflink_start(struct bch_fs *c,
        if (metadata_only)
                return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
        c->reflink_gc_nr = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                const __le64 *refcount = bkey_refcount_c(k);
 
@@ -1681,9 +1700,9 @@ static int bch2_gc_reflink_start(struct bch_fs *c,
                r->size         = k.k->size;
                r->refcount     = 0;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -1728,7 +1747,8 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
        if (bad)
                bch2_bkey_val_to_text(&buf, c, k);
 
-       if (fsck_err_on(bad, c, "%s", buf.buf)) {
+       if (fsck_err_on(bad, c, stripe_sector_count_wrong,
+                       "%s", buf.buf)) {
                struct bkey_i_stripe *new;
 
                new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
@@ -1750,7 +1770,7 @@ fsck_err:
 
 static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
@@ -1758,15 +1778,15 @@ static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
        if (metadata_only)
                return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
-       ret = for_each_btree_key_commit(&trans, iter,
+       ret = for_each_btree_key_commit(trans, iter,
                        BTREE_ID_stripes, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_NOFAIL,
-               bch2_gc_write_stripes_key(&trans, &iter, k));
+                       NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+               bch2_gc_write_stripes_key(trans, &iter, k));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -1778,6 +1798,12 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
 /**
  * bch2_gc - walk _all_ references to buckets, and recompute them:
  *
+ * @c:                 filesystem object
+ * @initial:           are we in recovery?
+ * @metadata_only:     are we just checking metadata references, or everything?
+ *
+ * Returns: 0 on success, or standard errcode on failure
+ *
  * Order matters here:
  *  - Concurrent GC relies on the fact that we have a total ordering for
  *    everything that GC walks - see  gc_will_visit_node(),
@@ -1946,7 +1972,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
 
 int bch2_gc_gens(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
@@ -1964,22 +1990,20 @@ int bch2_gc_gens(struct bch_fs *c)
 
        trace_and_count(c, gc_gens_start, c);
        down_read(&c->gc_lock);
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
        for_each_member_device(ca, c, i) {
-               struct bucket_gens *gens;
+               struct bucket_gens *gens = bucket_gens(ca);
 
                BUG_ON(ca->oldest_gen);
 
-               ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+               ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
                if (!ca->oldest_gen) {
                        percpu_ref_put(&ca->ref);
                        ret = -BCH_ERR_ENOMEM_gc_gens;
                        goto err;
                }
 
-               gens = bucket_gens(ca);
-
                for (b = gens->first_bucket;
                     b < gens->nbuckets; b++)
                        ca->oldest_gen[b] = gens->b[b];
@@ -1987,33 +2011,31 @@ int bch2_gc_gens(struct bch_fs *c)
 
        for (i = 0; i < BTREE_ID_NR; i++)
                if (btree_type_has_ptrs(i)) {
-                       struct btree_iter iter;
-                       struct bkey_s_c k;
-
                        c->gc_gens_btree = i;
                        c->gc_gens_pos = POS_MIN;
-                       ret = for_each_btree_key_commit(&trans, iter, i,
+
+                       ret = for_each_btree_key_commit(trans, iter, i,
                                        POS_MIN,
                                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                                        k,
                                        NULL, NULL,
-                                       BTREE_INSERT_NOFAIL,
-                               gc_btree_gens_key(&trans, &iter, k));
+                                       BCH_TRANS_COMMIT_no_enospc,
+                               gc_btree_gens_key(trans, &iter, k));
                        if (ret && !bch2_err_matches(ret, EROFS))
-                               bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret));
+                               bch_err_fn(c, ret);
                        if (ret)
                                goto err;
                }
 
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
                        POS_MIN,
                        BTREE_ITER_PREFETCH,
                        k,
                        NULL, NULL,
-                       BTREE_INSERT_NOFAIL,
-               bch2_alloc_write_oldest_gen(&trans, &iter, k));
+                       BCH_TRANS_COMMIT_no_enospc,
+               bch2_alloc_write_oldest_gen(trans, &iter, k));
        if (ret && !bch2_err_matches(ret, EROFS))
-               bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret));
+               bch_err_fn(c, ret);
        if (ret)
                goto err;
 
@@ -2030,7 +2052,7 @@ err:
                ca->oldest_gen = NULL;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        up_read(&c->gc_lock);
        mutex_unlock(&c->gc_gens_lock);
        return ret;
@@ -2085,7 +2107,7 @@ static int bch2_gc_thread(void *arg)
                ret = bch2_gc_gens(c);
 #endif
                if (ret < 0)
-                       bch_err(c, "btree gc failed: %s", bch2_err_str(ret));
+                       bch_err_fn(c, ret);
 
                debug_check_no_locks_held();
        }
@@ -2115,7 +2137,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
        p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
        if (IS_ERR(p)) {
-               bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p)));
+               bch_err_fn(c, PTR_ERR(p));
                return PTR_ERR(p);
        }
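
The recurring change through btree_gc.c above is the transaction-lifecycle conversion: an on-stack struct btree_trans set up with bch2_trans_init() and torn down with bch2_trans_exit() becomes a pointer obtained from bch2_trans_get() and released with bch2_trans_put(), with early returns rewritten as goto err so the transaction is released on every path. The same sweep renames commit flags (BTREE_INSERT_NOFAIL becomes BCH_TRANS_COMMIT_no_enospc, BTREE_INSERT_LAZY_RW becomes BCH_TRANS_COMMIT_lazy_rw) and moves error reporting to bch_err_fn(). A minimal, self-contained C sketch of the cleanup pattern follows; the names here are illustrative stand-ins, not the real bcachefs helpers:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct trans { int depth; };                     /* stand-in for struct btree_trans */

    static struct trans *trans_get(void)             /* stand-in for bch2_trans_get()   */
    {
            return calloc(1, sizeof(struct trans));
    }

    static void trans_put(struct trans *trans)       /* stand-in for bch2_trans_put()   */
    {
            free(trans);
    }

    static int gc_alloc_start(int simulate_enomem)
    {
            struct trans *trans = trans_get();       /* was: struct trans t; trans_init(&t); */
            int ret = 0;

            if (simulate_enomem) {
                    ret = -ENOMEM;                   /* was a bare "return -ENOMEM;"         */
                    goto err;
            }

            printf("walking alloc btree\n");
    err:
            trans_put(trans);                        /* single release point on every path   */
            return ret;
    }

    int main(void)
    {
            return gc_alloc_start(0) ? 1 : 0;
    }

Because bch2_trans_get() now runs at declaration time, the old early return in bch2_gc_alloc_start() would have leaked the transaction; hence the new goto err in that hunk.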
 
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index b45e382f7055b1a28a045707bf1116b04a31abc8..607575f83a00232b67d026a0423f9d486f044f2e 100644
--- a/libbcachefs/btree_gc.h
+++ b/libbcachefs/btree_gc.h
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_BTREE_GC_H
 #define _BCACHEFS_BTREE_GC_H
 
+#include "bkey.h"
 #include "btree_types.h"
 
 int bch2_check_topology(struct bch_fs *);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index c049876ee80be3cc4ca5e7813ddd42e1a9194490..1f73ee0ee359bbf3c7434e2254ea2361432d968d 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
 #include "debug.h"
 #include "error.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
+#include "recovery.h"
 #include "super-io.h"
 #include "trace.h"
 
@@ -105,8 +106,8 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
                vpfree(p, size);
 }
 
-static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
-                                      bool *used_mempool)
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
+                               bool *used_mempool)
 {
        unsigned flags = memalloc_nofs_save();
        void *p;
@@ -114,7 +115,7 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
        BUG_ON(size > btree_bytes(c));
 
        *used_mempool = false;
-       p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT);
+       p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
        if (!p) {
                *used_mempool = true;
                p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -122,8 +123,6 @@ static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size,
        memalloc_nofs_restore(flags);
        return p;
 }
-#define btree_bounce_alloc(_c, _size, _used_mempool)           \
-       alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool))
 
 static void sort_bkey_ptrs(const struct btree *bt,
                           struct bkey_packed **ptrs, unsigned nr)
@@ -185,7 +184,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
        k = new_whiteouts;
 
        while (ptrs != ptrs_end) {
-               bkey_copy(k, *ptrs);
+               bkey_p_copy(k, *ptrs);
                k = bkey_p_next(k);
                ptrs++;
        }
@@ -261,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
                        n = bkey_p_next(k);
 
                        if (!bkey_deleted(k)) {
-                               bkey_copy(out, k);
+                               bkey_p_copy(out, k);
                                out = bkey_p_next(out);
                        } else {
                                BUG_ON(k->needs_whiteout);
@@ -293,7 +292,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
                            bool filter_whiteouts)
 {
        struct btree_node *out;
-       struct sort_iter sort_iter;
+       struct sort_iter_stack sort_iter;
        struct bset_tree *t;
        struct bset *start_bset = bset(b, &b->set[start_idx]);
        bool used_mempool = false;
@@ -302,13 +301,13 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        bool sorting_entire_node = start_idx == 0 &&
                end_idx == b->nsets;
 
-       sort_iter_init(&sort_iter, b);
+       sort_iter_stack_init(&sort_iter, b);
 
        for (t = b->set + start_idx;
             t < b->set + end_idx;
             t++) {
                u64s += le16_to_cpu(bset(b, t)->u64s);
-               sort_iter_add(&sort_iter,
+               sort_iter_add(&sort_iter.iter,
                              btree_bkey_first(b, t),
                              btree_bkey_last(b, t));
        }
@@ -321,7 +320,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 
        start_time = local_clock();
 
-       u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
+       u64s = bch2_sort_keys(out->keys.start, &sort_iter.iter, filter_whiteouts);
 
        out->keys.u64s = cpu_to_le16(u64s);
 
@@ -337,7 +336,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
        start_bset->journal_seq = cpu_to_le64(seq);
 
        if (sorting_entire_node) {
-               unsigned u64s = le16_to_cpu(out->keys.u64s);
+               u64s = le16_to_cpu(out->keys.u64s);
 
                BUG_ON(bytes != btree_bytes(c));
 
@@ -411,8 +410,6 @@ void bch2_btree_sort_into(struct bch_fs *c,
        bch2_verify_btree_nr_keys(dst);
 }
 
-#define SORT_CRIT      (4096 / sizeof(u64))
-
 /*
  * We're about to add another bset to the btree node, so if there's currently
  * too many bsets - sort some of them together:
@@ -513,16 +510,6 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
                bch2_trans_node_reinit_iter(trans, b);
 }
 
-static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
-                         struct btree *b)
-{
-       prt_printf(out, "%s level %u/%u\n  ",
-              bch2_btree_ids[b->c.btree_id],
-              b->c.level,
-              bch2_btree_id_root(c, b->c.btree_id)->level);
-       bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
-}
-
 static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
                          struct bch_dev *ca,
                          struct btree *b, struct bset *i,
@@ -535,7 +522,7 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
        if (ca)
                prt_printf(out, "on %s ", ca->name);
        prt_printf(out, "at btree ");
-       btree_pos_to_text(out, c, b);
+       bch2_btree_pos_to_text(out, c, b);
 
        prt_printf(out, "\n  node offset %u", b->written);
        if (i)
@@ -543,42 +530,19 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
        prt_str(out, ": ");
 }
 
-enum btree_err_type {
-       /*
-        * We can repair this locally, and we're after the checksum check so
-        * there's no need to try another replica:
-        */
-       BTREE_ERR_FIXABLE,
-       /*
-        * We can repair this if we have to, but we should try reading another
-        * replica if we can:
-        */
-       BTREE_ERR_WANT_RETRY,
-       /*
-        * Read another replica if we have one, otherwise consider the whole
-        * node bad:
-        */
-       BTREE_ERR_MUST_RETRY,
-       BTREE_ERR_BAD_NODE,
-       BTREE_ERR_INCOMPATIBLE,
-};
-
-enum btree_validate_ret {
-       BTREE_RETRY_READ = 64,
-};
-
-static int __btree_err(enum btree_err_type type,
+__printf(9, 10)
+static int __btree_err(int ret,
                       struct bch_fs *c,
                       struct bch_dev *ca,
                       struct btree *b,
                       struct bset *i,
                       int write,
                       bool have_retry,
+                      enum bch_sb_error_id err_type,
                       const char *fmt, ...)
 {
        struct printbuf out = PRINTBUF;
        va_list args;
-       int ret = -BCH_ERR_fsck_fix;
 
        btree_err_msg(&out, c, ca, b, i, b->written, write);
 
@@ -594,27 +558,32 @@ static int __btree_err(enum btree_err_type type,
                goto out;
        }
 
-       if (!have_retry && type == BTREE_ERR_WANT_RETRY)
-               type = BTREE_ERR_FIXABLE;
-       if (!have_retry && type == BTREE_ERR_MUST_RETRY)
-               type = BTREE_ERR_BAD_NODE;
+       if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+               ret = -BCH_ERR_btree_node_read_err_fixable;
+       if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+               ret = -BCH_ERR_btree_node_read_err_bad_node;
+
+       if (ret != -BCH_ERR_btree_node_read_err_fixable)
+               bch2_sb_error_count(c, err_type);
 
-       switch (type) {
-       case BTREE_ERR_FIXABLE:
-               mustfix_fsck_err(c, "%s", out.buf);
+       switch (ret) {
+       case -BCH_ERR_btree_node_read_err_fixable:
+               ret = bch2_fsck_err(c, FSCK_CAN_FIX, err_type, "%s", out.buf);
+               if (ret != -BCH_ERR_fsck_fix &&
+                   ret != -BCH_ERR_fsck_ignore)
+                       goto fsck_err;
                ret = -BCH_ERR_fsck_fix;
                break;
-       case BTREE_ERR_WANT_RETRY:
-       case BTREE_ERR_MUST_RETRY:
+       case -BCH_ERR_btree_node_read_err_want_retry:
+       case -BCH_ERR_btree_node_read_err_must_retry:
                bch2_print_string_as_lines(KERN_ERR, out.buf);
-               ret = BTREE_RETRY_READ;
                break;
-       case BTREE_ERR_BAD_NODE:
+       case -BCH_ERR_btree_node_read_err_bad_node:
                bch2_print_string_as_lines(KERN_ERR, out.buf);
                bch2_topology_error(c);
                ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
                break;
-       case BTREE_ERR_INCOMPATIBLE:
+       case -BCH_ERR_btree_node_read_err_incompatible:
                bch2_print_string_as_lines(KERN_ERR, out.buf);
                ret = -BCH_ERR_fsck_errors_not_fixed;
                break;
@@ -627,12 +596,17 @@ fsck_err:
        return ret;
 }
 
-#define btree_err(type, c, ca, b, i, msg, ...)                         \
+#define btree_err(type, c, ca, b, i, _err_type, msg, ...)              \
 ({                                                                     \
-       int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\
+       int _ret = __btree_err(type, c, ca, b, i, write, have_retry,    \
+                              BCH_FSCK_ERR_##_err_type,                \
+                              msg, ##__VA_ARGS__);                     \
                                                                        \
-       if (_ret != -BCH_ERR_fsck_fix)                                  \
+       if (_ret != -BCH_ERR_fsck_fix) {                                \
+               ret = _ret;                                             \
                goto fsck_err;                                          \
+       }                                                               \
+                                                                       \
        *saw_error = true;                                              \
 })
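
The hunks above replace the file-local enum btree_err_type with dedicated negative error codes (-BCH_ERR_btree_node_read_err_*) and thread a per-error identifier through to bch2_sb_error_count(); the reworked btree_err() macro pastes a short name such as btree_node_bad_seq onto the BCH_FSCK_ERR_ prefix at the call site. A small self-contained sketch of that token-pasting idiom, with an invented enum and helper rather than the real bcachefs definitions:

    #include <stdio.h>

    enum fsck_err_id {                       /* stand-in for enum bch_sb_error_id */
            FSCK_ERR_btree_node_bad_seq,
            FSCK_ERR_bset_bad_csum,
    };

    static void count_err(enum fsck_err_id id, const char *msg)
    {
            printf("error %d counted: %s\n", (int) id, msg);
    }

    /* Callers write report_err(btree_node_bad_seq, ...); the macro expands the
     * short name into the FSCK_ERR_* enumerator, the way btree_err() now builds
     * BCH_FSCK_ERR_##_err_type: */
    #define report_err(_id, _msg) count_err(FSCK_ERR_##_id, _msg)

    int main(void)
    {
            report_err(btree_node_bad_seq, "bad btree header: seq 0");
            return 0;
    }

This lets each distinct on-disk inconsistency be counted in the superblock without every call site naming the full enumerator.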
 
@@ -646,9 +620,6 @@ __cold
 void bch2_btree_node_drop_keys_outside_node(struct btree *b)
 {
        struct bset_tree *t;
-       struct bkey_s_c k;
-       struct bkey unpacked;
-       struct btree_node_iter iter;
 
        for_each_bset(b, t) {
                struct bset *i = bset(b, t);
@@ -684,6 +655,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
        bch2_bset_set_no_aux_tree(b, b->set);
        bch2_btree_build_aux_trees(b);
 
+       struct bkey_s_c k;
+       struct bkey unpacked;
+       struct btree_node_iter iter;
        for_each_btree_node_key_unpack(b, k, &iter, &unpacked) {
                BUG_ON(bpos_lt(k.k->p, b->data->min_key));
                BUG_ON(bpos_gt(k.k->p, b->data->max_key));
@@ -696,19 +670,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                         int write, bool have_retry, bool *saw_error)
 {
        unsigned version = le16_to_cpu(i->version);
-       const char *err;
        struct printbuf buf1 = PRINTBUF;
        struct printbuf buf2 = PRINTBUF;
        int ret = 0;
 
        btree_err_on(!bch2_version_compatible(version),
-                    BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
+                    -BCH_ERR_btree_node_read_err_incompatible,
+                    c, ca, b, i,
+                    btree_node_unsupported_version,
                     "unsupported bset version %u.%u",
                     BCH_VERSION_MAJOR(version),
                     BCH_VERSION_MINOR(version));
 
        if (btree_err_on(version < c->sb.version_min,
-                        BTREE_ERR_FIXABLE, c, NULL, b, i,
+                        -BCH_ERR_btree_node_read_err_fixable,
+                        c, NULL, b, i,
+                        btree_node_bset_older_than_sb_min,
                         "bset version %u older than superblock version_min %u",
                         version, c->sb.version_min)) {
                mutex_lock(&c->sb_lock);
@@ -719,7 +696,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
 
        if (btree_err_on(BCH_VERSION_MAJOR(version) >
                         BCH_VERSION_MAJOR(c->sb.version),
-                        BTREE_ERR_FIXABLE, c, NULL, b, i,
+                        -BCH_ERR_btree_node_read_err_fixable,
+                        c, NULL, b, i,
+                        btree_node_bset_newer_than_sb,
                         "bset version %u newer than superblock version %u",
                         version, c->sb.version)) {
                mutex_lock(&c->sb_lock);
@@ -729,11 +708,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
        }
 
        btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
-                    BTREE_ERR_INCOMPATIBLE, c, ca, b, i,
+                    -BCH_ERR_btree_node_read_err_incompatible,
+                    c, ca, b, i,
+                    btree_node_unsupported_version,
                     "BSET_SEPARATE_WHITEOUTS no longer supported");
 
        if (btree_err_on(offset + sectors > btree_sectors(c),
-                        BTREE_ERR_FIXABLE, c, ca, b, i,
+                        -BCH_ERR_btree_node_read_err_fixable,
+                        c, ca, b, i,
+                        bset_past_end_of_btree_node,
                         "bset past end of btree node")) {
                i->u64s = 0;
                ret = 0;
@@ -741,12 +724,15 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
        }
 
        btree_err_on(offset && !i->u64s,
-                    BTREE_ERR_FIXABLE, c, ca, b, i,
+                    -BCH_ERR_btree_node_read_err_fixable,
+                    c, ca, b, i,
+                    bset_empty,
                     "empty bset");
 
-       btree_err_on(BSET_OFFSET(i) &&
-                    BSET_OFFSET(i) != offset,
-                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
+       btree_err_on(BSET_OFFSET(i) && BSET_OFFSET(i) != offset,
+                    -BCH_ERR_btree_node_read_err_want_retry,
+                    c, ca, b, i,
+                    bset_wrong_sector_offset,
                     "bset at wrong sector offset");
 
        if (!offset) {
@@ -760,16 +746,22 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
 
                        /* XXX endianness */
                        btree_err_on(bp->seq != bn->keys.seq,
-                                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                                    -BCH_ERR_btree_node_read_err_must_retry,
+                                    c, ca, b, NULL,
+                                    bset_bad_seq,
                                     "incorrect sequence number (wrong btree node)");
                }
 
                btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
-                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_must_retry,
+                            c, ca, b, i,
+                            btree_node_bad_btree,
                             "incorrect btree id");
 
                btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
-                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_must_retry,
+                            c, ca, b, i,
+                            btree_node_bad_level,
                             "incorrect level");
 
                if (!write)
@@ -786,7 +778,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                        }
 
                        btree_err_on(!bpos_eq(b->data->min_key, bp->min_key),
-                                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                                    -BCH_ERR_btree_node_read_err_must_retry,
+                                    c, ca, b, NULL,
+                                    btree_node_bad_min_key,
                                     "incorrect min_key: got %s should be %s",
                                     (printbuf_reset(&buf1),
                                      bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf),
@@ -795,7 +789,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                }
 
                btree_err_on(!bpos_eq(bn->max_key, b->key.k.p),
-                            BTREE_ERR_MUST_RETRY, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_must_retry,
+                            c, ca, b, i,
+                            btree_node_bad_max_key,
                             "incorrect max key %s",
                             (printbuf_reset(&buf1),
                              bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf));
@@ -804,10 +800,14 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                        compat_btree_node(b->c.level, b->c.btree_id, version,
                                          BSET_BIG_ENDIAN(i), write, bn);
 
-               err = bch2_bkey_format_validate(&bn->format);
-               btree_err_on(err,
-                            BTREE_ERR_BAD_NODE, c, ca, b, i,
-                            "invalid bkey format: %s", err);
+               btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1),
+                            -BCH_ERR_btree_node_read_err_bad_node,
+                            c, ca, b, i,
+                            btree_node_bad_format,
+                            "invalid bkey format: %s\n  %s", buf1.buf,
+                            (printbuf_reset(&buf2),
+                             bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf));
+               printbuf_reset(&buf1);
 
                compat_bformat(b->c.level, b->c.btree_id, version,
                               BSET_BIG_ENDIAN(i), write,
@@ -826,7 +826,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
                            struct printbuf *err)
 {
        return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?:
-               (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?:
+               (!updated_range ? bch2_bkey_in_btree_node(c, b, k, err) : 0) ?:
                (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
 }
 
@@ -847,14 +847,18 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                struct bkey tmp;
 
                if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-                                BTREE_ERR_FIXABLE, c, NULL, b, i,
+                                -BCH_ERR_btree_node_read_err_fixable,
+                                c, NULL, b, i,
+                                btree_node_bkey_past_bset_end,
                                 "key extends past end of bset")) {
                        i->u64s = cpu_to_le16((u64 *) k - i->_data);
                        break;
                }
 
                if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
-                                BTREE_ERR_FIXABLE, c, NULL, b, i,
+                                -BCH_ERR_btree_node_read_err_fixable,
+                                c, NULL, b, i,
+                                btree_node_bkey_bad_format,
                                 "invalid bkey format %u", k->format)) {
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_p_next(k),
@@ -873,12 +877,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                printbuf_reset(&buf);
                if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) {
                        printbuf_reset(&buf);
-                       prt_printf(&buf, "invalid bkey:  ");
                        bset_key_invalid(c, b, u.s_c, updated_range, write, &buf);
                        prt_printf(&buf, "\n  ");
                        bch2_bkey_val_to_text(&buf, c, u.s_c);
 
-                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
+                       btree_err(-BCH_ERR_btree_node_read_err_fixable,
+                                 c, NULL, b, i,
+                                 btree_node_bad_bkey,
+                                 "invalid bkey: %s", buf.buf);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_p_next(k),
@@ -902,7 +908,10 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
 
                        bch2_dump_bset(c, b, i, 0);
 
-                       if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) {
+                       if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
+                                     c, NULL, b, i,
+                                     btree_node_bkey_out_of_order,
+                                     "%s", buf.buf)) {
                                i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                                memmove_u64s_down(k, bkey_p_next(k),
                                                  (u64 *) vstruct_end(i) - (u64 *) k);
@@ -931,7 +940,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
                BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        unsigned u64s;
-       unsigned blacklisted_written, nonblacklisted_written = 0;
        unsigned ptr_written = btree_ptr_sectors_written(&b->key);
        struct printbuf buf = PRINTBUF;
        int ret = 0, retry_read = 0, write = READ;
@@ -941,51 +949,65 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        b->written = 0;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOFS);
-       sort_iter_init(iter, b);
-       iter->size = (btree_blocks(c) + 1) * 2;
+       sort_iter_init(iter, b, (btree_blocks(c) + 1) * 2);
 
        if (bch2_meta_read_fault("btree"))
-               btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+               btree_err(-BCH_ERR_btree_node_read_err_must_retry,
+                         c, ca, b, NULL,
+                         btree_node_fault_injected,
                          "dynamic fault");
 
        btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
-                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                    -BCH_ERR_btree_node_read_err_must_retry,
+                    c, ca, b, NULL,
+                    btree_node_bad_magic,
                     "bad magic: want %llx, got %llx",
                     bset_magic(c), le64_to_cpu(b->data->magic));
 
-       btree_err_on(!b->data->keys.seq,
-                    BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
-                    "bad btree header: seq 0");
-
        if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
                struct bch_btree_ptr_v2 *bp =
                        &bkey_i_to_btree_ptr_v2(&b->key)->v;
 
                btree_err_on(b->data->keys.seq != bp->seq,
-                            BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
+                            -BCH_ERR_btree_node_read_err_must_retry,
+                            c, ca, b, NULL,
+                            btree_node_bad_seq,
                             "got wrong btree node (seq %llx want %llx)",
                             b->data->keys.seq, bp->seq);
+       } else {
+               btree_err_on(!b->data->keys.seq,
+                            -BCH_ERR_btree_node_read_err_must_retry,
+                            c, ca, b, NULL,
+                            btree_node_bad_seq,
+                            "bad btree header: seq 0");
        }
 
        while (b->written < (ptr_written ?: btree_sectors(c))) {
                unsigned sectors;
                struct nonce nonce;
-               struct bch_csum csum;
                bool first = !b->written;
+               bool csum_bad;
 
                if (!b->written) {
                        i = &b->data->keys;
 
                        btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
-                                    "unknown checksum type %llu",
-                                    BSET_CSUM_TYPE(i));
+                                    -BCH_ERR_btree_node_read_err_want_retry,
+                                    c, ca, b, i,
+                                    bset_unknown_csum,
+                                    "unknown checksum type %llu", BSET_CSUM_TYPE(i));
 
                        nonce = btree_nonce(i, b->written << 9);
-                       csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
 
-                       btree_err_on(bch2_crc_cmp(csum, b->data->csum),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
+                       csum_bad = bch2_crc_cmp(b->data->csum,
+                               csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
+                       if (csum_bad)
+                               bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+                       btree_err_on(csum_bad,
+                                    -BCH_ERR_btree_node_read_err_want_retry,
+                                    c, ca, b, i,
+                                    bset_bad_csum,
                                     "invalid checksum");
 
                        ret = bset_encrypt(c, i, b->written << 9);
@@ -995,7 +1017,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
                        btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
                                     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
-                                    BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL,
+                                    -BCH_ERR_btree_node_read_err_incompatible,
+                                    c, NULL, b, NULL,
+                                    btree_node_unsupported_version,
                                     "btree node does not have NEW_EXTENT_OVERWRITE set");
 
                        sectors = vstruct_sectors(b->data, c->block_bits);
@@ -1007,15 +1031,21 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                break;
 
                        btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
-                                    "unknown checksum type %llu",
-                                    BSET_CSUM_TYPE(i));
+                                    -BCH_ERR_btree_node_read_err_want_retry,
+                                    c, ca, b, i,
+                                    bset_unknown_csum,
+                                    "unknown checksum type %llu", BSET_CSUM_TYPE(i));
 
                        nonce = btree_nonce(i, b->written << 9);
-                       csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
-
-                       btree_err_on(bch2_crc_cmp(csum, bne->csum),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, i,
+                       csum_bad = bch2_crc_cmp(bne->csum,
+                               csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
+                       if (csum_bad)
+                               bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
+
+                       btree_err_on(csum_bad,
+                                    -BCH_ERR_btree_node_read_err_want_retry,
+                                    c, ca, b, i,
+                                    bset_bad_csum,
                                     "invalid checksum");
 
                        ret = bset_encrypt(c, i, b->written << 9);
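
Both checksum paths above now compute the mismatch into a csum_bad bool first, so the member device's checksum-error counter can be bumped via bch2_io_error(ca, BCH_MEMBER_ERROR_checksum) before the fsck machinery decides whether to retry another replica. A tiny sketch of that split, with stand-in types and helpers rather than the real ones:

    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    struct csum { unsigned long lo, hi; };

    static unsigned checksum_errors;                 /* stand-in for a per-member counter */

    static void account_csum_error(void)             /* stand-in for bch2_io_error()      */
    {
            checksum_errors++;
    }

    static bool check_csum(struct csum want, struct csum got)
    {
            bool csum_bad = memcmp(&want, &got, sizeof(want)) != 0;

            if (csum_bad)
                    account_csum_error();            /* count first ...                   */
            return csum_bad;                         /* ... then let the caller retry/fix */
    }

    int main(void)
    {
            struct csum a = { 1, 2 }, b = { 1, 3 };

            printf("bad=%d errors=%u\n", check_csum(a, b), checksum_errors);
            return 0;
    }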
@@ -1048,12 +1078,16 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                        true);
 
                btree_err_on(blacklisted && first,
-                            BTREE_ERR_FIXABLE, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_fixable,
+                            c, ca, b, i,
+                            bset_blacklisted_journal_seq,
                             "first btree node bset has blacklisted journal seq (%llu)",
                             le64_to_cpu(i->journal_seq));
 
                btree_err_on(blacklisted && ptr_written,
-                            BTREE_ERR_FIXABLE, c, ca, b, i,
+                            -BCH_ERR_btree_node_read_err_fixable,
+                            c, ca, b, i,
+                            first_bset_blacklisted_journal_seq,
                             "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
                             le64_to_cpu(i->journal_seq),
                             b->written, b->written + sectors, ptr_written);
@@ -1066,13 +1100,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                sort_iter_add(iter,
                              vstruct_idx(i, 0),
                              vstruct_last(i));
-
-               nonblacklisted_written = b->written;
        }
 
        if (ptr_written) {
                btree_err_on(b->written < ptr_written,
-                            BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+                            -BCH_ERR_btree_node_read_err_want_retry,
+                            c, ca, b, NULL,
+                            btree_node_data_missing,
                             "btree node data missing: expected %u sectors, found %u",
                             ptr_written, b->written);
        } else {
@@ -1083,20 +1117,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                     !bch2_journal_seq_is_blacklisted(c,
                                                                      le64_to_cpu(bne->keys.journal_seq),
                                                                      true),
-                                    BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
+                                    -BCH_ERR_btree_node_read_err_want_retry,
+                                    c, ca, b, NULL,
+                                    btree_node_bset_after_end,
                                     "found bset signature after last bset");
-
-               /*
-                * Blacklisted bsets are those that were written after the most recent
-                * (flush) journal write. Since there wasn't a flush, they may not have
-                * made it to all devices - which means we shouldn't write new bsets
-                * after them, as that could leave a gap and then reads from that device
-                * wouldn't find all the bsets in that btree node - which means it's
-                * important that we start writing new bsets after the most recent _non_
-                * blacklisted bset:
-                */
-               blacklisted_written = b->written;
-               b->written = nonblacklisted_written;
        }
 
        sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
@@ -1137,7 +1161,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                        prt_printf(&buf, "\n  ");
                        bch2_bkey_val_to_text(&buf, c, u.s_c);
 
-                       btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf);
+                       btree_err(-BCH_ERR_btree_node_read_err_fixable,
+                                 c, NULL, b, i,
+                                 btree_node_bad_bkey,
+                                 "%s", buf.buf);
 
                        btree_keys_account_key_drop(&b->nr, 0, k);
 
@@ -1164,9 +1191,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        btree_node_reset_sib_u64s(b);
 
        bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+               struct bch_dev *ca2 = bch_dev_bkey_exists(c, ptr->dev);
 
-               if (ca->mi.state != BCH_MEMBER_STATE_rw)
+               if (ca2->mi.state != BCH_MEMBER_STATE_rw)
                        set_btree_node_need_rewrite(b);
        }
 
@@ -1177,7 +1204,8 @@ out:
        printbuf_exit(&buf);
        return retry_read;
 fsck_err:
-       if (ret == BTREE_RETRY_READ)
+       if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+           ret == -BCH_ERR_btree_node_read_err_must_retry)
                retry_read = 1;
        else
                set_btree_node_read_error(b);
@@ -1216,8 +1244,9 @@ static void btree_node_read_work(struct work_struct *work)
                }
 start:
                printbuf_reset(&buf);
-               btree_pos_to_text(&buf, c, b);
-               bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
+               bch2_btree_pos_to_text(&buf, c, b);
+               bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+                                  "btree read error %s for %s",
                                   bch2_blk_status_to_str(bio->bi_status), buf.buf);
                if (rb->have_ioref)
                        percpu_ref_put(&ca->io_ref);
@@ -1247,19 +1276,17 @@ start:
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
                               rb->start_time);
        bio_put(&rb->bio);
-       printbuf_exit(&buf);
 
        if (saw_error && !btree_node_read_error(b)) {
-               struct printbuf buf = PRINTBUF;
-
+               printbuf_reset(&buf);
                bch2_bpos_to_text(&buf, b->key.k.p);
                bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
-                        __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf);
-               printbuf_exit(&buf);
+                        __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
 
                bch2_btree_node_rewrite_async(c, b);
        }
 
+       printbuf_exit(&buf);
        clear_btree_node_read_in_flight(b);
        wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
 }
@@ -1363,14 +1390,20 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
                }
 
                written2 = btree_node_sectors_written(c, ra->buf[i]);
-               if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+               if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable,
+                                c, NULL, b, NULL,
+                                btree_node_replicas_sectors_written_mismatch,
                                 "btree node sectors written mismatch: %u != %u",
                                 written, written2) ||
                    btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
-                                BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+                                -BCH_ERR_btree_node_read_err_fixable,
+                                c, NULL, b, NULL,
+                                btree_node_bset_after_end,
                                 "found bset signature after last bset") ||
                    btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9),
-                                BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+                                -BCH_ERR_btree_node_read_err_fixable,
+                                c, NULL, b, NULL,
+                                btree_node_replicas_data_mismatch,
                                 "btree node replicas content mismatch"))
                        dump_bset_maps = true;
 
@@ -1565,7 +1598,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
                struct printbuf buf = PRINTBUF;
 
                prt_str(&buf, "btree node read error: no device to read from\n at ");
-               btree_pos_to_text(&buf, c, b);
+               bch2_btree_pos_to_text(&buf, c, b);
                bch_err(c, "%s", buf.buf);
 
                if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
@@ -1669,8 +1702,7 @@ err:
 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
                        const struct bkey_i *k, unsigned level)
 {
-       return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level));
-
+       return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
 }
 
 void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
@@ -1732,15 +1764,13 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
 
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
-       struct btree_trans trans;
-
-       bch2_trans_init(&trans, c, 0, 0);
+       struct btree_trans *trans = bch2_trans_get(c);
 
-       btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+       btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
        __btree_node_write_done(c, b);
        six_unlock_read(&b->c.lock);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 }
 
 static void btree_node_write_work(struct work_struct *work)
@@ -1769,11 +1799,11 @@ static void btree_node_write_work(struct work_struct *work)
                }
        } else {
                ret = bch2_trans_do(c, NULL, NULL, 0,
-                       bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key,
+                       bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
                                        BCH_WATERMARK_reclaim|
-                                       BTREE_INSERT_JOURNAL_RECLAIM|
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_NOCHECK_RW,
+                                       BCH_TRANS_COMMIT_journal_reclaim|
+                                       BCH_TRANS_COMMIT_no_enospc|
+                                       BCH_TRANS_COMMIT_no_check_rw,
                                        !wbio->wbio.failed.nr));
                if (ret)
                        goto err;
@@ -1803,7 +1833,8 @@ static void btree_node_write_endio(struct bio *bio)
        if (wbio->have_ioref)
                bch2_latency_acct(ca, wbio->submit_time, WRITE);
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+                              "btree write error: %s",
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("btree")) {
                spin_lock_irqsave(&c->btree_write_error_lock, flags);
@@ -1874,7 +1905,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
        struct bset *i;
        struct btree_node *bn = NULL;
        struct btree_node_entry *bne = NULL;
-       struct sort_iter sort_iter;
+       struct sort_iter_stack sort_iter;
        struct nonce nonce;
        unsigned bytes_to_write, sectors_to_write, bytes, u64s;
        u64 seq = 0;
@@ -1947,7 +1978,7 @@ do_write:
 
        bch2_sort_whiteouts(c, b);
 
-       sort_iter_init(&sort_iter, b);
+       sort_iter_stack_init(&sort_iter, b);
 
        bytes = !b->written
                ? sizeof(struct btree_node)
@@ -1962,7 +1993,7 @@ do_write:
                        continue;
 
                bytes += le16_to_cpu(i->u64s) * sizeof(u64);
-               sort_iter_add(&sort_iter,
+               sort_iter_add(&sort_iter.iter,
                              btree_bkey_first(b, t),
                              btree_bkey_last(b, t));
                seq = max(seq, le64_to_cpu(i->journal_seq));
@@ -1991,14 +2022,14 @@ do_write:
        i->journal_seq  = cpu_to_le64(seq);
        i->u64s         = 0;
 
-       sort_iter_add(&sort_iter,
+       sort_iter_add(&sort_iter.iter,
                      unwritten_whiteouts_start(c, b),
                      unwritten_whiteouts_end(c, b));
        SET_BSET_SEPARATE_WHITEOUTS(i, false);
 
        b->whiteout_u64s = 0;
 
-       u64s = bch2_sort_keys(i->start, &sort_iter, false);
+       u64s = bch2_sort_keys(i->start, &sort_iter.iter, false);
        le16_add_cpu(&i->u64s, u64s);
 
        BUG_ON(!b->written && i->u64s != b->data->keys.u64s);
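
btree_node_sort() and __bch2_btree_node_write() above switch from a bare struct sort_iter to a struct sort_iter_stack, while the mempool-backed read path sizes its iterator through the new three-argument sort_iter_init(). The point of the stack variant is to bundle the iterator with its backing array so the capacity is set in one place. A self-contained sketch of that shape; the field names are guesses from the usage visible in this diff, not the real bkey_sort.h definitions:

    #include <stdio.h>

    struct sort_set { int lo, hi; };

    struct sort_iter {
            unsigned used, size;
            struct sort_set *data;
    };

    /* Iterator plus its storage in one stack object, so callers no longer
     * assign iter->size by hand as the old btree_node_sort() did: */
    struct sort_iter_stack {
            struct sort_iter iter;
            struct sort_set sets[4];
    };

    static void sort_iter_stack_init(struct sort_iter_stack *s)
    {
            s->iter.used = 0;
            s->iter.size = sizeof(s->sets) / sizeof(s->sets[0]);
            s->iter.data = s->sets;
    }

    static void sort_iter_add(struct sort_iter *iter, int lo, int hi)
    {
            if (iter->used < iter->size)
                    iter->data[iter->used++] = (struct sort_set) { lo, hi };
    }

    int main(void)
    {
            struct sort_iter_stack s;

            sort_iter_stack_init(&s);
            sort_iter_add(&s.iter, 0, 16);           /* note the &s.iter, as in the diff */
            printf("used=%u size=%u\n", s.iter.used, s.iter.size);
            return 0;
    }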
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 0cadf651e7cfde8122dc1632a47e80bd7140c427..7e03dd76fb380498a42bcdef91857727403a4d8a 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -7,7 +7,7 @@
 #include "btree_locking.h"
 #include "checksum.h"
 #include "extents.h"
-#include "io_types.h"
+#include "io_write_types.h"
 
 struct bch_fs;
 struct btree_write;
@@ -143,8 +143,8 @@ enum btree_write_flags {
        __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
        __BTREE_WRITE_ALREADY_STARTED,
 };
-#define BTREE_WRITE_ONLY_IF_NEED       (1U << __BTREE_WRITE_ONLY_IF_NEED )
-#define BTREE_WRITE_ALREADY_STARTED    (1U << __BTREE_WRITE_ALREADY_STARTED)
+#define BTREE_WRITE_ONLY_IF_NEED       BIT(__BTREE_WRITE_ONLY_IF_NEED)
+#define BTREE_WRITE_ALREADY_STARTED    BIT(__BTREE_WRITE_ALREADY_STARTED)
 
 void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
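
The flag definitions above move from open-coded (1U << n) to the kernel's BIT() macro from <linux/bits.h>. One subtlety: BIT(nr) expands to (UL(1) << (nr)), so the constants widen from unsigned int to unsigned long; for low bit positions like these the change is behaviour-neutral. Roughly, using an arbitrary example position (the real one is BTREE_WRITE_TYPE_BITS, whose value isn't shown in this excerpt):

    /* Equivalent shapes; MY_BIT mirrors how the kernel defines BIT(): */
    #define MY_BIT(nr)  (1UL << (nr))

    #define WRITE_ONLY_IF_NEED_OLD  (1U << 4)   /* old style: unsigned int  */
    #define WRITE_ONLY_IF_NEED_NEW  MY_BIT(4)   /* new style: unsigned long */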
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index dfb77b23d1261030994b97d0e48a06ca2e2299b2..3128695062d9345b8e94b442db77e93c25897e0e 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -5,6 +5,7 @@
 #include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_iter.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update.h"
@@ -12,9 +13,8 @@
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
-#include "recovery.h"
 #include "replicas.h"
-#include "subvolume.h"
+#include "snapshot.h"
 #include "trace.h"
 
 #include <linux/random.h>
@@ -257,7 +257,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
 
        BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
               (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
-              !btree_type_has_snapshots(iter->btree_id));
+              !btree_type_has_snapshot_field(iter->btree_id));
 
        if (iter->update_path)
                bch2_btree_path_verify(trans, iter->update_path);
@@ -362,7 +362,7 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
        bch2_bpos_to_text(&buf, pos);
 
        panic("not locked: %s %s%s\n",
-             bch2_btree_ids[id], buf.buf,
+             bch2_btree_id_str(id), buf.buf,
              key_cache ? " cached" : "");
 }
 
@@ -488,7 +488,6 @@ fixup_done:
        if (!bch2_btree_node_iter_end(node_iter) &&
            iter_current_key_modified &&
            b->c.level) {
-               struct bset_tree *t;
                struct bkey_packed *k, *k2, *p;
 
                k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -689,7 +688,7 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
                        if (t != BTREE_NODE_UNLOCKED) {
                                btree_node_unlock(trans, path, b->c.level);
                                six_lock_increment(&b->c.lock, (enum six_lock_type) t);
-                               mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t);
+                               mark_btree_node_locked(trans, path, b->c.level, t);
                        }
 
                        bch2_btree_path_level_init(trans, path, b);
@@ -764,7 +763,8 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
                        for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
                                path->l[i].b = NULL;
 
-                       mark_btree_node_locked(trans, path, path->level, lock_type);
+                       mark_btree_node_locked(trans, path, path->level,
+                                              (enum btree_node_locked_type) lock_type);
                        bch2_btree_path_level_init(trans, path, b);
                        return 0;
                }
@@ -936,7 +936,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
        if (btree_node_read_locked(path, level + 1))
                btree_node_unlock(trans, path, level + 1);
 
-       mark_btree_node_locked(trans, path, level, lock_type);
+       mark_btree_node_locked(trans, path, level,
+                              (enum btree_node_locked_type) lock_type);
        path->level = level;
        bch2_btree_path_level_init(trans, path, b);
 
@@ -1008,7 +1009,7 @@ retry_all:
        /*
         * We used to assert that all paths had been traversed here
         * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since
-        * path->Should_be_locked is not set yet, we we might have unlocked and
+        * path->should_be_locked is not set yet, we might have unlocked and
         * then failed to relock a path - that's fine.
         */
 err:
@@ -1108,6 +1109,9 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
        if (unlikely(ret))
                goto out;
 
+       if (unlikely(!trans->srcu_held))
+               bch2_trans_srcu_lock(trans);
+
        /*
         * Ensure we obey path->should_be_locked: if it's set, we can't unlock
         * and re-traverse the path without a transaction restart:
@@ -1210,8 +1214,6 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
                   struct btree_path *path, struct bpos new_pos,
                   bool intent, unsigned long ip, int cmp)
 {
-       unsigned level = path->level;
-
        bch2_trans_verify_not_in_restart(trans);
        EBUG_ON(!path->ref);
 
@@ -1227,7 +1229,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
                goto out;
        }
 
-       level = btree_path_up_until_good_node(trans, path, cmp);
+       unsigned level = btree_path_up_until_good_node(trans, path, cmp);
 
        if (btree_path_node(path, level)) {
                struct btree_path_level *l = &path->l[level];
@@ -1341,14 +1343,14 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p
        __bch2_path_free(trans, path);
 }
 
-void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count)
 {
        panic("trans->restart_count %u, should be %u, last restarted by %pS\n",
              trans->restart_count, restart_count,
              (void *) trans->last_begin_ip);
 }
 
-void bch2_trans_in_restart_error(struct btree_trans *trans)
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
 {
        panic("in transaction restart: %s, last restarted by %pS\n",
              bch2_err_str(trans->restarted),
@@ -1370,7 +1372,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
                struct bkey_s_c old = { &i->old_k, i->old_v };
 
                prt_printf(buf, "update: btree=%s cached=%u %pS",
-                      bch2_btree_ids[i->btree_id],
+                      bch2_btree_id_str(i->btree_id),
                       i->cached,
                       (void *) i->ip_allocated);
                prt_newline(buf);
@@ -1386,7 +1388,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
 
        trans_for_each_wb_update(trans, wb) {
                prt_printf(buf, "update: btree=%s wb=1 %pS",
-                      bch2_btree_ids[wb->btree],
+                      bch2_btree_id_str(wb->btree),
                       (void *) i->ip_allocated);
                prt_newline(buf);
 
@@ -1415,7 +1417,7 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
                   path->idx, path->ref, path->intent_ref,
                   path->preserve ? 'P' : ' ',
                   path->should_be_locked ? 'S' : ' ',
-                  bch2_btree_ids[path->btree_id],
+                  bch2_btree_id_str(path->btree_id),
                   path->level);
        bch2_bpos_to_text(out, path->pos);
 
@@ -1493,7 +1495,7 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
 static noinline void btree_path_overflow(struct btree_trans *trans)
 {
        bch2_dump_trans_paths_updates(trans);
-       panic("trans path oveflow\n");
+       panic("trans path overflow\n");
 }
 
 static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
@@ -1522,6 +1524,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
        path->ref               = 0;
        path->intent_ref        = 0;
        path->nodes_locked      = 0;
+       path->alloc_seq++;
 
        btree_path_list_add(trans, pos, path);
        trans->paths_sorted = false;
@@ -1597,7 +1600,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
 
        locks_want = min(locks_want, BTREE_MAX_DEPTH);
        if (locks_want > path->locks_want)
-               bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
+               bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
 
        return path;
 }
@@ -1794,23 +1797,15 @@ err:
 
 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 {
-       if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
-               struct bpos pos = iter->k.p;
-               bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
-                            ? bpos_eq(pos, SPOS_MAX)
-                            : bkey_eq(pos, SPOS_MAX));
-
-               if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
-                       pos = bkey_successor(iter, pos);
-               bch2_btree_iter_set_pos(iter, pos);
-               return ret;
-       } else {
-               if (!btree_path_node(iter->path, iter->path->level))
-                       return true;
+       struct bpos pos = iter->k.p;
+       bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+                    ? bpos_eq(pos, SPOS_MAX)
+                    : bkey_eq(pos, SPOS_MAX));
 
-               iter->advanced = true;
-               return false;
-       }
+       if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+               pos = bkey_successor(iter, pos);
+       bch2_btree_iter_set_pos(iter, pos);
+       return ret;
 }
 
 inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
@@ -2046,8 +2041,12 @@ out:
 }
 
 /**
- * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
- * current position
+ * bch2_btree_iter_peek_upto() - returns first key greater than or equal to
+ * iterator's current position
+ * @iter:      iterator to peek from
+ * @end:       search limit: returns keys less than or equal to @end
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end)
 {
@@ -2057,7 +2056,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
        struct bpos iter_pos;
        int ret;
 
-       EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
        EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
 
        if (iter->update_path) {
@@ -2184,102 +2182,11 @@ end:
 }
 
 /**
- * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal
- * to iterator's current position, returning keys from every level of the btree.
- * For keys at different levels of the btree that compare equal, the key from
- * the lower level (leaf) is returned first.
- */
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
-{
-       struct btree_trans *trans = iter->trans;
-       struct bkey_s_c k;
-       int ret;
-
-       EBUG_ON(iter->path->cached);
-       bch2_btree_iter_verify(iter);
-       BUG_ON(iter->path->level < iter->min_depth);
-       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
-       EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
-
-       while (1) {
-               iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
-                                       iter->flags & BTREE_ITER_INTENT,
-                                       btree_iter_ip_allocated(iter));
-
-               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
-               if (unlikely(ret)) {
-                       /* ensure that iter->k is consistent with iter->pos: */
-                       bch2_btree_iter_set_pos(iter, iter->pos);
-                       k = bkey_s_c_err(ret);
-                       goto out_no_locked;
-               }
-
-               /* Already at end? */
-               if (!btree_path_node(iter->path, iter->path->level)) {
-                       k = bkey_s_c_null;
-                       goto out_no_locked;
-               }
-
-               k = btree_path_level_peek_all(trans->c,
-                               &iter->path->l[iter->path->level], &iter->k);
-
-               /* Check if we should go up to the parent node: */
-               if (!k.k ||
-                   (iter->advanced &&
-                    bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
-                       iter->pos = path_l(iter->path)->b->key.k.p;
-                       btree_path_set_level_up(trans, iter->path);
-                       iter->advanced = false;
-                       continue;
-               }
-
-               /*
-                * Check if we should go back down to a leaf:
-                * If we're not in a leaf node, we only return the current key
-                * if it exactly matches iter->pos - otherwise we first have to
-                * go back to the leaf:
-                */
-               if (iter->path->level != iter->min_depth &&
-                   (iter->advanced ||
-                    !k.k ||
-                    !bpos_eq(iter->pos, k.k->p))) {
-                       btree_path_set_level_down(trans, iter->path, iter->min_depth);
-                       iter->pos = bpos_successor(iter->pos);
-                       iter->advanced = false;
-                       continue;
-               }
-
-               /* Check if we should go to the next key: */
-               if (iter->path->level == iter->min_depth &&
-                   iter->advanced &&
-                   k.k &&
-                   bpos_eq(iter->pos, k.k->p)) {
-                       iter->pos = bpos_successor(iter->pos);
-                       iter->advanced = false;
-                       continue;
-               }
-
-               if (iter->advanced &&
-                   iter->path->level == iter->min_depth &&
-                   !bpos_eq(k.k->p, iter->pos))
-                       iter->advanced = false;
-
-               BUG_ON(iter->advanced);
-               BUG_ON(!k.k);
-               break;
-       }
-
-       iter->pos = k.k->p;
-       btree_path_set_should_be_locked(iter->path);
-out_no_locked:
-       bch2_btree_iter_verify(iter);
-
-       return k;
-}
-
-/**
- * bch2_btree_iter_next: returns first key greater than iterator's current
+ * bch2_btree_iter_next() - returns first key greater than iterator's current
  * position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 {
@@ -2290,8 +2197,11 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
 }
 
 /**
- * bch2_btree_iter_peek_prev: returns first key less than or equal to
+ * bch2_btree_iter_peek_prev() - returns first key less than or equal to
  * iterator's current position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
@@ -2414,8 +2324,11 @@ out_no_locked:
 }
 
 /**
- * bch2_btree_iter_prev: returns first key less than iterator's current
+ * bch2_btree_iter_prev() - returns first key less than iterator's current
  * position
+ * @iter:      iterator to peek from
+ *
+ * Returns:    key if found, or an error extractable with bkey_err().
  */
 struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 {
@@ -2434,7 +2347,6 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
-       EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
        EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
 
        /* extents can't span inode numbers: */
@@ -2722,7 +2634,7 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
 
 void bch2_trans_iter_init_outlined(struct btree_trans *trans,
                          struct btree_iter *iter,
-                         unsigned btree_id, struct bpos pos,
+                         enum btree_id btree_id, struct bpos pos,
                          unsigned flags)
 {
        bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0,
@@ -2738,9 +2650,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
                               unsigned depth,
                               unsigned flags)
 {
-       flags |= BTREE_ITER_NOT_EXTENTS;
-       flags |= __BTREE_ITER_ALL_SNAPSHOTS;
-       flags |= BTREE_ITER_ALL_SNAPSHOTS;
+       flags |= BTREE_ITER_NOT_EXTENTS;
+       flags |= __BTREE_ITER_ALL_SNAPSHOTS;
+       flags |= BTREE_ITER_ALL_SNAPSHOTS;
 
        bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth,
                               __bch2_btree_iter_flags(trans, btree_id, flags),
@@ -2812,24 +2724,44 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        return p;
 }
 
-static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+static inline void check_srcu_held_too_long(struct btree_trans *trans)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_path *path;
+       WARN(trans->srcu_held && time_after(jiffies, trans->srcu_lock_time + HZ * 10),
+            "btree trans held srcu lock (delaying memory reclaim) for %lu seconds",
+            (jiffies - trans->srcu_lock_time) / HZ);
+}
 
-       trans_for_each_path(trans, path)
-               if (path->cached && !btree_node_locked(path, 0))
-                       path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+void bch2_trans_srcu_unlock(struct btree_trans *trans)
+{
+       if (trans->srcu_held) {
+               struct bch_fs *c = trans->c;
+               struct btree_path *path;
 
-       srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-       trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
-       trans->srcu_lock_time   = jiffies;
+               trans_for_each_path(trans, path)
+                       if (path->cached && !btree_node_locked(path, 0))
+                               path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+               check_srcu_held_too_long(trans);
+               srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+               trans->srcu_held = false;
+       }
+}
+
+void bch2_trans_srcu_lock(struct btree_trans *trans)
+{
+       if (!trans->srcu_held) {
+               trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
+               trans->srcu_lock_time   = jiffies;
+               trans->srcu_held = true;
+       }
 }
 
 /**
 * bch2_trans_begin() - reset a transaction after an interrupted attempt
  * @trans: transaction to reset
  *
+ * Returns:    current restart counter, to be used with trans_was_restarted()
+ *
 * While iterating over nodes or updating nodes, an attempt to lock a btree
 * node may return BCH_ERR_transaction_restart when the trylock fails. When
 * this occurs, bch2_trans_begin() should be called and the transaction retried.
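
A minimal sketch of that retry loop, with do_update() standing in for
whatever work the transaction actually does (the lockrestart_do() macro in
btree_iter.h packages the same pattern):

        int ret;

        do {
                bch2_trans_begin(trans);
                ret = do_update(trans);         /* hypothetical operation */
        } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
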
@@ -2875,8 +2807,9 @@ u32 bch2_trans_begin(struct btree_trans *trans)
        }
        trans->last_begin_time = now;
 
-       if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
-               bch2_trans_reset_srcu_lock(trans);
+       if (unlikely(trans->srcu_held &&
+                    time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+               bch2_trans_srcu_unlock(trans);
 
        trans->last_begin_ip = _RET_IP_;
        if (trans->restarted) {
@@ -2887,26 +2820,23 @@ u32 bch2_trans_begin(struct btree_trans *trans)
        return trans->restart_count;
 }
 
-static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
+static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
 {
-       size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
-       size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
-       void *p = NULL;
+       struct btree_trans *trans;
 
-       BUG_ON(trans->used_mempool);
+       if (IS_ENABLED(__KERNEL__)) {
+               trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+               if (trans)
+                       return trans;
+       }
 
-#ifdef __KERNEL__
-       p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
-#endif
-       if (!p)
-               p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
+       trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
        /*
-        * paths need to be zeroed, bch2_check_for_deadlock looks at paths in
-        * other threads
+        * paths need to be zeroed, bch2_check_for_deadlock looks at
+        * paths in other threads
         */
-
-       trans->paths            = p; p += paths_bytes;
-       trans->updates          = p; p += updates_bytes;
+       memset(&trans->paths, 0, sizeof(trans->paths));
+       return trans;
 }
 
 const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
@@ -2926,13 +2856,16 @@ unsigned bch2_trans_get_fn_idx(const char *fn)
        return i;
 }
 
-void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
+struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
        __acquires(&c->btree_trans_barrier)
 {
+       struct btree_trans *trans;
        struct btree_transaction_stats *s;
 
        bch2_assert_btree_nodes_not_locked();
 
+       trans = bch2_trans_alloc(c);
+
        memset(trans, 0, sizeof(*trans));
        trans->c                = c;
        trans->fn               = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
@@ -2944,8 +2877,6 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
                !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
        closure_init_stack(&trans->ref);
 
-       bch2_trans_alloc_paths(trans, c);
-
        s = btree_trans_stats(trans);
        if (s && s->max_mem) {
                unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
@@ -2965,8 +2896,9 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
                trans->wb_updates_size = s->wb_updates_size;
        }
 
-       trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+       trans->srcu_idx         = srcu_read_lock(&c->btree_trans_barrier);
        trans->srcu_lock_time   = jiffies;
+       trans->srcu_held        = true;
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
                struct btree_trans *pos;
@@ -2991,6 +2923,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
 list_add_done:
                seqmutex_unlock(&c->btree_trans_lock);
        }
+
+       return trans;
 }
 
 static void check_btree_paths_leaked(struct btree_trans *trans)
@@ -3008,14 +2942,14 @@ leaked:
        trans_for_each_path(trans, path)
                if (path->ref)
                        printk(KERN_ERR "  btree %s %pS\n",
-                              bch2_btree_ids[path->btree_id],
+                              bch2_btree_id_str(path->btree_id),
                               (void *) path->ip_allocated);
        /* Be noisy about this: */
        bch2_fatal_error(c);
 #endif
 }
 
-void bch2_trans_exit(struct btree_trans *trans)
+void bch2_trans_put(struct btree_trans *trans)
        __releases(&c->btree_trans_barrier)
 {
        struct btree_insert_entry *i;
@@ -3041,9 +2975,10 @@ void bch2_trans_exit(struct btree_trans *trans)
 
        check_btree_paths_leaked(trans);
 
-       srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
-
-       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+       if (trans->srcu_held) {
+               check_srcu_held_too_long(trans);
+               srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+       }
 
        kfree(trans->extra_journal_entries.data);
 
@@ -3061,18 +2996,11 @@ void bch2_trans_exit(struct btree_trans *trans)
        else
                kfree(trans->mem);
 
-#ifdef __KERNEL__
-       /*
-        * Userspace doesn't have a real percpu implementation:
-        */
-       trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
-#endif
-
-       if (trans->paths)
-               mempool_free(trans->paths, &c->btree_paths_pool);
-
-       trans->mem      = (void *) 0x1;
-       trans->paths    = (void *) 0x1;
+       /* Userspace doesn't have a real percpu implementation: */
+       if (IS_ENABLED(__KERNEL__))
+               trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
+       if (trans)
+               mempool_free(trans, &c->btree_trans_pool);
 }
 
 static void __maybe_unused
@@ -3090,7 +3018,7 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
 
        prt_tab(out);
        prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b',
-                  b->level, bch2_btree_ids[b->btree_id]);
+                  b->level, bch2_btree_id_str(b->btree_id));
        bch2_bpos_to_text(out, btree_node_pos(b));
 
        prt_tab(out);
@@ -3120,7 +3048,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
                       path->idx,
                       path->cached ? 'c' : 'b',
                       path->level,
-                      bch2_btree_ids[path->btree_id]);
+                      bch2_btree_id_str(path->btree_id));
                bch2_bpos_to_text(out, path->pos);
                prt_newline(out);
 
@@ -3150,6 +3078,17 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
        struct btree_transaction_stats *s;
+       struct btree_trans *trans;
+       int cpu;
+
+       trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
+       if (trans)
+               panic("%s leaked btree_trans\n", trans->fn);
+
+       if (c->btree_trans_bufs)
+               for_each_possible_cpu(cpu)
+                       kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
+       free_percpu(c->btree_trans_bufs);
 
        for (s = c->btree_transaction_stats;
             s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
@@ -3161,13 +3100,12 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
        if (c->btree_trans_barrier_initialized)
                cleanup_srcu_struct(&c->btree_trans_barrier);
        mempool_exit(&c->btree_trans_mem_pool);
-       mempool_exit(&c->btree_paths_pool);
+       mempool_exit(&c->btree_trans_pool);
 }
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
 {
        struct btree_transaction_stats *s;
-       unsigned nr = BTREE_ITER_MAX;
        int ret;
 
        for (s = c->btree_transaction_stats;
@@ -3180,9 +3118,12 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
        INIT_LIST_HEAD(&c->btree_trans_list);
        seqmutex_init(&c->btree_trans_lock);
 
-       ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
-                       sizeof(struct btree_path) * nr +
-                       sizeof(struct btree_insert_entry) * nr) ?:
+       c->btree_trans_bufs = alloc_percpu(struct btree_trans_buf);
+       if (!c->btree_trans_bufs)
+               return -ENOMEM;
+
+       ret   = mempool_init_kmalloc_pool(&c->btree_trans_pool, 1,
+                                         sizeof(struct btree_trans)) ?:
                mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
                                          BTREE_TRANS_MEM_MAX) ?:
                init_srcu_struct(&c->btree_trans_barrier);
index c472aa8c58a09b8181aab307cd2ced3508c17545..e5b989a8eb98ab330eeacadd6b8c24d72f440599 100644 (file)
@@ -221,6 +221,22 @@ struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpo
                                 unsigned, unsigned, unsigned, unsigned long);
 struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
 
+/*
+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
+ * different snapshot:
+ */
+static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
+{
+       struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
+
+       if (k.k && bpos_eq(path->pos, k.k->p))
+               return k;
+
+       bkey_init(u);
+       u->p = path->pos;
+       return (struct bkey_s_c) { u, NULL };
+}
+
 struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *,
                                        struct btree_iter *, struct bpos);
 
@@ -258,14 +274,17 @@ void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
 int bch2_trans_relock(struct btree_trans *);
 int bch2_trans_relock_notrace(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
+void bch2_trans_unlock_long(struct btree_trans *);
 bool bch2_trans_locked(struct btree_trans *);
 
-static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count)
+static inline int trans_was_restarted(struct btree_trans *trans, u32 restart_count)
 {
-       return restart_count != trans->restart_count;
+       return restart_count != trans->restart_count
+               ? -BCH_ERR_transaction_restart_nested
+               : 0;
 }
 
-void bch2_trans_restart_error(struct btree_trans *, u32);
+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32);
 
 static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
                                                   u32 restart_count)
@@ -274,7 +293,7 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans,
                bch2_trans_restart_error(trans, restart_count);
 }
 
-void bch2_trans_in_restart_error(struct btree_trans *);
+void __noreturn bch2_trans_in_restart_error(struct btree_trans *);
 
 static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans)
 {
@@ -329,8 +348,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *);
 struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
 struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
 
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
-
 static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 {
        return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
@@ -389,15 +406,12 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
                                               unsigned btree_id,
                                               unsigned flags)
 {
-       if (flags & BTREE_ITER_ALL_LEVELS)
-               flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
-
        if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
-           btree_node_type_is_extents(btree_id))
+           btree_id_is_extents(btree_id))
                flags |= BTREE_ITER_IS_EXTENTS;
 
        if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
-           !btree_type_has_snapshots(btree_id))
+           !btree_type_has_snapshot_field(btree_id))
                flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
 
        if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
@@ -447,7 +461,7 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
 }
 
 void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *,
-                         unsigned, struct bpos, unsigned);
+                         enum btree_id, struct bpos, unsigned);
 
 static inline void bch2_trans_iter_init(struct btree_trans *trans,
                          struct btree_iter *iter,
@@ -561,6 +575,9 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
        __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags,      \
                                  KEY_TYPE_##_type, sizeof(*_val), _val)
 
+void bch2_trans_srcu_unlock(struct btree_trans *);
+void bch2_trans_srcu_lock(struct btree_trans *);
+
 u32 bch2_trans_begin(struct btree_trans *);
 
 /*
@@ -584,8 +601,6 @@ u32 bch2_trans_begin(struct btree_trans *);
 static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
                                                             unsigned flags)
 {
-       BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
-
        return  flags & BTREE_ITER_SLOTS      ? bch2_btree_iter_peek_slot(iter) :
                                                bch2_btree_iter_peek_prev(iter);
 }
@@ -593,8 +608,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *
 static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
                                                        unsigned flags)
 {
-       return  flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
-               flags & BTREE_ITER_SLOTS      ? bch2_btree_iter_peek_slot(iter) :
+       return  flags & BTREE_ITER_SLOTS      ? bch2_btree_iter_peek_slot(iter) :
                                                bch2_btree_iter_peek(iter);
 }
 
@@ -656,17 +670,17 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 #define lockrestart_do(_trans, _do)                                    \
 ({                                                                     \
        u32 _restart_count;                                             \
-       int _ret;                                                       \
+       int _ret2;                                                      \
                                                                        \
        do {                                                            \
                _restart_count = bch2_trans_begin(_trans);              \
-               _ret = (_do);                                           \
-       } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));  \
+               _ret2 = (_do);                                          \
+       } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \
                                                                        \
-       if (!_ret)                                                      \
+       if (!_ret2)                                                     \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                                                                        \
-       _ret;                                                           \
+       _ret2;                                                          \
 })
 
 /*
@@ -681,26 +695,23 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
 #define nested_lockrestart_do(_trans, _do)                             \
 ({                                                                     \
        u32 _restart_count, _orig_restart_count;                        \
-       int _ret;                                                       \
+       int _ret2;                                                      \
                                                                        \
        _restart_count = _orig_restart_count = (_trans)->restart_count; \
                                                                        \
-       while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\
+       while (bch2_err_matches(_ret2 = (_do), BCH_ERR_transaction_restart))\
                _restart_count = bch2_trans_begin(_trans);              \
                                                                        \
-       if (!_ret)                                                      \
+       if (!_ret2)                                                     \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                                                                        \
-       if (!_ret && trans_was_restarted(_trans, _orig_restart_count))  \
-               _ret = -BCH_ERR_transaction_restart_nested;             \
-                                                                       \
-       _ret;                                                           \
+       _ret2 ?: trans_was_restarted(_trans, _restart_count);           \
 })
 
 #define for_each_btree_key2(_trans, _iter, _btree_id,                  \
                            _start, _flags, _k, _do)                    \
 ({                                                                     \
-       int _ret = 0;                                                   \
+       int _ret3 = 0;                                                  \
                                                                        \
        bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
                             (_start), (_flags));                       \
@@ -708,15 +719,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        while (1) {                                                     \
                u32 _restart_count = bch2_trans_begin(_trans);          \
                                                                        \
-               _ret = 0;                                               \
+               _ret3 = 0;                                              \
                (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags));   \
                if (!(_k).k)                                            \
                        break;                                          \
                                                                        \
-               _ret = bkey_err(_k) ?: (_do);                           \
-               if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+               _ret3 = bkey_err(_k) ?: (_do);                          \
+               if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
                        continue;                                       \
-               if (_ret)                                               \
+               if (_ret3)                                              \
                        break;                                          \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                if (!bch2_btree_iter_advance(&(_iter)))                 \
@@ -724,13 +735,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        }                                                               \
                                                                        \
        bch2_trans_iter_exit((_trans), &(_iter));                       \
-       _ret;                                                           \
+       _ret3;                                                          \
 })
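
For illustration, a typical invocation of the macro above, assuming a
hypothetical process_key() helper; transaction restarts are retried by the
macro itself, and errors come back through bkey_err() and _do:

        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        ret = for_each_btree_key2(trans, iter, BTREE_ID_extents,
                                  POS_MIN, BTREE_ITER_PREFETCH, k,
                                  process_key(trans, k));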
 
 #define for_each_btree_key2_upto(_trans, _iter, _btree_id,             \
                            _start, _end, _flags, _k, _do)              \
 ({                                                                     \
-       int _ret = 0;                                                   \
+       int _ret3 = 0;                                                  \
                                                                        \
        bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
                             (_start), (_flags));                       \
@@ -738,15 +749,15 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        while (1) {                                                     \
                u32 _restart_count = bch2_trans_begin(_trans);          \
                                                                        \
-               _ret = 0;                                               \
+               _ret3 = 0;                                              \
                (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
                if (!(_k).k)                                            \
                        break;                                          \
                                                                        \
-               _ret = bkey_err(_k) ?: (_do);                           \
-               if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+               _ret3 = bkey_err(_k) ?: (_do);                          \
+               if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
                        continue;                                       \
-               if (_ret)                                               \
+               if (_ret3)                                              \
                        break;                                          \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                if (!bch2_btree_iter_advance(&(_iter)))                 \
@@ -754,13 +765,13 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        }                                                               \
                                                                        \
        bch2_trans_iter_exit((_trans), &(_iter));                       \
-       _ret;                                                           \
+       _ret3;                                                          \
 })
 
 #define for_each_btree_key_reverse(_trans, _iter, _btree_id,           \
                                   _start, _flags, _k, _do)             \
 ({                                                                     \
-       int _ret = 0;                                                   \
+       int _ret3 = 0;                                                  \
                                                                        \
        bch2_trans_iter_init((_trans), &(_iter), (_btree_id),           \
                             (_start), (_flags));                       \
@@ -769,14 +780,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
                u32 _restart_count = bch2_trans_begin(_trans);          \
                (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
                if (!(_k).k) {                                          \
-                       _ret = 0;                                       \
+                       _ret3 = 0;                                      \
                        break;                                          \
                }                                                       \
                                                                        \
-               _ret = bkey_err(_k) ?: (_do);                           \
-               if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+               _ret3 = bkey_err(_k) ?: (_do);                          \
+               if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
                        continue;                                       \
-               if (_ret)                                               \
+               if (_ret3)                                              \
                        break;                                          \
                bch2_trans_verify_not_restarted(_trans, _restart_count);\
                if (!bch2_btree_iter_rewind(&(_iter)))                  \
@@ -784,7 +795,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
        }                                                               \
                                                                        \
        bch2_trans_iter_exit((_trans), &(_iter));                       \
-       _ret;                                                           \
+       _ret3;                                                          \
 })
 
 #define for_each_btree_key_commit(_trans, _iter, _btree_id,            \
@@ -900,21 +911,21 @@ void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
 void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
 void bch2_dump_trans_updates(struct btree_trans *);
 void bch2_dump_trans_paths_updates(struct btree_trans *);
-void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned);
-void bch2_trans_exit(struct btree_trans *);
+
+struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
+void bch2_trans_put(struct btree_trans *);
 
 extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
 unsigned bch2_trans_get_fn_idx(const char *);
 
-#define bch2_trans_init(_trans, _c, _nr_iters, _mem)                   \
-do {                                                                   \
+#define bch2_trans_get(_c)                                             \
+({                                                                     \
        static unsigned trans_fn_idx;                                   \
                                                                        \
        if (unlikely(!trans_fn_idx))                                    \
                trans_fn_idx = bch2_trans_get_fn_idx(__func__);         \
-                                                                       \
-       __bch2_trans_init(_trans, _c, trans_fn_idx);                    \
-} while (0)
+       __bch2_trans_get(_c, trans_fn_idx);                             \
+})
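
This replaces the old on-stack bch2_trans_init()/bch2_trans_exit() pairing
with heap-allocated transaction handles. A sketch of the new lifecycle,
with do_something() purely illustrative (compare the key cache flush path
earlier in this diff):

        struct btree_trans *trans = bch2_trans_get(c);
        int ret;

        ret = commit_do(trans, NULL, NULL, 0,
                        do_something(trans));   /* hypothetical operation */

        bch2_trans_put(trans);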
 
 void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *);
 
index f7c001d42391faa5151f1c807514db35392f3abe..8e80a5b687fe04e685f61b022333befa218c0c9c 100644 (file)
@@ -90,10 +90,13 @@ static void bkey_cached_free(struct btree_key_cache *bc,
        ck->btree_trans_barrier_seq =
                start_poll_synchronize_srcu(&c->btree_trans_barrier);
 
-       if (ck->c.lock.readers)
+       if (ck->c.lock.readers) {
                list_move_tail(&ck->list, &bc->freed_pcpu);
-       else
+               bc->nr_freed_pcpu++;
+       } else {
                list_move_tail(&ck->list, &bc->freed_nonpcpu);
+               bc->nr_freed_nonpcpu++;
+       }
        atomic_long_inc(&bc->nr_freed);
 
        kfree(ck->k);
@@ -110,6 +113,8 @@ static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc,
 {
        struct bkey_cached *pos;
 
+       bc->nr_freed_nonpcpu++;
+
        list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) {
                if (ULONG_CMP_GE(ck->btree_trans_barrier_seq,
                                 pos->btree_trans_barrier_seq)) {
@@ -159,6 +164,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
 #else
                mutex_lock(&bc->lock);
                list_move_tail(&ck->list, &bc->freed_nonpcpu);
+               bc->nr_freed_nonpcpu++;
                mutex_unlock(&bc->lock);
 #endif
        } else {
@@ -218,6 +224,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                               f->nr < ARRAY_SIZE(f->objs) / 2) {
                                ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
                                list_del_init(&ck->list);
+                               bc->nr_freed_nonpcpu--;
                                f->objs[f->nr++] = ck;
                        }
 
@@ -230,6 +237,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
                if (!list_empty(&bc->freed_nonpcpu)) {
                        ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list);
                        list_del_init(&ck->list);
+                       bc->nr_freed_nonpcpu--;
                }
                mutex_unlock(&bc->lock);
 #endif
@@ -243,8 +251,6 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
        }
 
        if (ck) {
-               int ret;
-
                ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_);
                if (unlikely(ret)) {
                        bkey_cached_move_to_freelist(bc, ck);
@@ -253,7 +259,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 
                path->l[0].b = (void *) ck;
                path->l[0].lock_seq = six_lock_seq(&ck->c.lock);
-               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
 
                ret = bch2_btree_node_lock_write(trans, path, &ck->c);
                if (unlikely(ret)) {
@@ -327,11 +333,11 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path)
                ck = bkey_cached_reuse(bc);
                if (unlikely(!ck)) {
                        bch_err(c, "error allocating memory for key cache item, btree %s",
-                               bch2_btree_ids[path->btree_id]);
+                               bch2_btree_id_str(path->btree_id));
                        return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create);
                }
 
-               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
        }
 
        ck->c.level             = 0;
@@ -410,7 +416,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
                        new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
                        if (!new_k) {
                                bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
-                                       bch2_btree_ids[ck->key.btree_id], new_u64s);
+                                       bch2_btree_id_str(ck->key.btree_id), new_u64s);
                                ret = -BCH_ERR_ENOMEM_btree_key_cache_fill;
                                goto err;
                        }
@@ -479,7 +485,7 @@ retry:
                if (!ck)
                        goto retry;
 
-               mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, 0, BTREE_NODE_INTENT_LOCKED);
                path->locks_want = 1;
        } else {
                enum six_lock_type lock_want = __btree_lock_want(path, 0);
@@ -497,7 +503,8 @@ retry:
                        goto retry;
                }
 
-               mark_btree_node_locked(trans, path, 0, lock_want);
+               mark_btree_node_locked(trans, path, 0,
+                                      (enum btree_node_locked_type) lock_want);
        }
 
        path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
@@ -511,7 +518,7 @@ fill:
                 * path->uptodate yet:
                 */
                if (!path->locks_want &&
-                   !__bch2_btree_path_upgrade(trans, path, 1)) {
+                   !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
                        trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
                        ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
                        goto err;
@@ -579,7 +586,8 @@ retry:
                        goto retry;
                }
 
-               mark_btree_node_locked(trans, path, 0, lock_want);
+               mark_btree_node_locked(trans, path, 0,
+                                      (enum btree_node_locked_type) lock_want);
        }
 
        path->l[0].lock_seq     = six_lock_seq(&ck->c.lock);
@@ -649,8 +657,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
                                  BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                                 BTREE_INSERT_NOCHECK_RW|
-                                 BTREE_INSERT_NOFAIL|
+                                 BCH_TRANS_COMMIT_no_check_rw|
+                                 BCH_TRANS_COMMIT_no_enospc|
                                  (ck->journal.seq == journal_last_seq(j)
                                   ? BCH_WATERMARK_reclaim
                                   : 0)|
@@ -665,7 +673,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                goto out;
 
        bch2_journal_pin_drop(j, &ck->journal);
-       bch2_journal_preres_put(j, &ck->res);
 
        BUG_ON(!btree_node_locked(c_iter.path, 0));
 
@@ -705,13 +712,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
        struct bkey_cached *ck =
                container_of(pin, struct bkey_cached, journal);
        struct bkey_cached_key key;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read);
+       btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read);
        key = ck->key;
 
        if (ck->journal.seq != seq ||
@@ -728,13 +733,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
        }
        six_unlock_read(&ck->c.lock);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
-               btree_key_cache_flush_pos(&trans, key, seq,
-                               BTREE_INSERT_JOURNAL_RECLAIM, false));
+       ret = commit_do(trans, NULL, NULL, 0,
+               btree_key_cache_flush_pos(trans, key, seq,
+                               BCH_TRANS_COMMIT_journal_reclaim, false));
 unlock:
        srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -765,18 +770,6 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
 
        BUG_ON(insert->k.u64s > ck->u64s);
 
-       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-               int difference;
-
-               BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s);
-
-               difference = jset_u64s(insert->k.u64s) - ck->res.u64s;
-               if (difference > 0) {
-                       trans->journal_preres.u64s      -= difference;
-                       ck->res.u64s                    += difference;
-               }
-       }
-
        bkey_copy(ck->k, insert);
        ck->valid = true;
 
@@ -854,6 +847,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
         * Newest freed entries are at the end of the list - once we hit one
         * that's too new to be freed, we can bail out:
         */
+       scanned += bc->nr_freed_nonpcpu;
+
        list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
@@ -863,13 +858,15 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
-               scanned++;
                freed++;
+               bc->nr_freed_nonpcpu--;
        }
 
        if (scanned >= nr)
                goto out;
 
+       scanned += bc->nr_freed_pcpu;
+
        list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) {
                if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
                                                 ck->btree_trans_barrier_seq))
@@ -879,8 +876,8 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                six_lock_exit(&ck->c.lock);
                kmem_cache_free(bch2_key_cache, ck);
                atomic_long_dec(&bc->nr_freed);
-               scanned++;
                freed++;
+               bc->nr_freed_pcpu--;
        }
 
        if (scanned >= nr)
@@ -987,6 +984,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
        }
 #endif
 
+       BUG_ON(list_count_nodes(&bc->freed_pcpu) != bc->nr_freed_pcpu);
+       BUG_ON(list_count_nodes(&bc->freed_nonpcpu) != bc->nr_freed_nonpcpu);
+
        list_splice(&bc->freed_pcpu,    &items);
        list_splice(&bc->freed_nonpcpu, &items);
 
@@ -996,7 +996,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
                cond_resched();
 
                bch2_journal_pin_drop(&c->journal, &ck->journal);
-               bch2_journal_preres_put(&c->journal, &ck->res);
 
                list_del(&ck->list);
                kfree(ck->k);
@@ -1058,14 +1057,14 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
        bc->shrink.count_objects        = bch2_btree_key_cache_count;
        bc->shrink.scan_objects         = bch2_btree_key_cache_scan;
        bc->shrink.to_text              = bch2_btree_key_cache_shrinker_to_text;
-       if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name))
+       if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name))
                return -BCH_ERR_ENOMEM_fs_btree_cache_init;
        return 0;
 }
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
-       prt_printf(out, "nr_freed:\t%zu",       atomic_long_read(&c->nr_freed));
+       prt_printf(out, "nr_freed:\t%lu",       atomic_long_read(&c->nr_freed));
        prt_newline(out);
        prt_printf(out, "nr_keys:\t%lu",        atomic_long_read(&c->nr_keys));
        prt_newline(out);
index 0b0f9d607798842ca9c9877a95992eaf1ca9c619..59c57c585a4c37716ecc50fbfdc0968eb62655d7 100644 (file)
@@ -430,7 +430,8 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
 
 static inline bool btree_path_get_locks(struct btree_trans *trans,
                                        struct btree_path *path,
-                                       bool upgrade)
+                                       bool upgrade,
+                                       struct get_locks_fail *f)
 {
        unsigned l = path->level;
        int fail_idx = -1;
@@ -441,8 +442,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
 
                if (!(upgrade
                      ? bch2_btree_node_upgrade(trans, path, l)
-                     : bch2_btree_node_relock(trans, path, l)))
-                       fail_idx = l;
+                     : bch2_btree_node_relock(trans, path, l))) {
+                       fail_idx        = l;
+
+                       if (f) {
+                               f->l    = l;
+                               f->b    = path->l[l].b;
+                       }
+               }
 
                l++;
        } while (l < path->locks_want);
@@ -583,7 +590,9 @@ __flatten
 bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
                        struct btree_path *path, unsigned long trace_ip)
 {
-       return btree_path_get_locks(trans, path, false);
+       struct get_locks_fail f;
+
+       return btree_path_get_locks(trans, path, false, &f);
 }
 
 int __bch2_btree_path_relock(struct btree_trans *trans,
@@ -599,22 +608,24 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
 
 bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
                               struct btree_path *path,
-                              unsigned new_locks_want)
+                              unsigned new_locks_want,
+                              struct get_locks_fail *f)
 {
        EBUG_ON(path->locks_want >= new_locks_want);
 
        path->locks_want = new_locks_want;
 
-       return btree_path_get_locks(trans, path, true);
+       return btree_path_get_locks(trans, path, true, f);
 }
 
 bool __bch2_btree_path_upgrade(struct btree_trans *trans,
                               struct btree_path *path,
-                              unsigned new_locks_want)
+                              unsigned new_locks_want,
+                              struct get_locks_fail *f)
 {
        struct btree_path *linked;
 
-       if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+       if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
                return true;
 
        /*
@@ -643,7 +654,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
                            linked->btree_id == path->btree_id &&
                            linked->locks_want < new_locks_want) {
                                linked->locks_want = new_locks_want;
-                               btree_path_get_locks(trans, linked, true);
+                               btree_path_get_locks(trans, linked, true, NULL);
                        }
 
        return false;
@@ -653,7 +664,10 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
                                 struct btree_path *path,
                                 unsigned new_locks_want)
 {
-       unsigned l;
+       unsigned l, old_locks_want = path->locks_want;
+
+       if (trans->restarted)
+               return;
 
        EBUG_ON(path->locks_want < new_locks_want);
 
@@ -673,6 +687,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
        }
 
        bch2_btree_path_verify_locks(path);
+
+       path->downgrade_seq++;
+       trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
 }
 
 /* Btree transaction locking: */
@@ -681,6 +698,9 @@ void bch2_trans_downgrade(struct btree_trans *trans)
 {
        struct btree_path *path;
 
+       if (trans->restarted)
+               return;
+
        trans_for_each_path(trans, path)
                bch2_btree_path_downgrade(trans, path);
 }
@@ -739,6 +759,12 @@ void bch2_trans_unlock(struct btree_trans *trans)
                bch2_assert_btree_nodes_not_locked();
 }
 
+void bch2_trans_unlock_long(struct btree_trans *trans)
+{
+       bch2_trans_unlock(trans);
+       bch2_trans_srcu_unlock(trans);
+}
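
Given the check_srcu_held_too_long() warning that holding the SRCU read
lock delays memory reclaim, a caller about to block for a long time might
use this helper as sketched below; the waitqueue and condition are
illustrative, and the next path traversal re-takes SRCU via
bch2_trans_srcu_lock():

        bch2_trans_unlock_long(trans);
        wait_event(wq, condition);              /* illustrative long block */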
+
 bool bch2_trans_locked(struct btree_trans *trans)
 {
        struct btree_path *path;
index ce3c7d9f438e2a011ad43489f8f46c561c3ff3ca..11b0a2c8cd691b21afccdcc38486aa060351f62a 100644 (file)
@@ -10,9 +10,8 @@
  * updating the iterator state
  */
 
-#include <linux/six.h>
-
 #include "btree_iter.h"
+#include "six.h"
 
 void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags);
 
@@ -92,7 +91,7 @@ static inline void mark_btree_node_unlocked(struct btree_path *path,
 static inline void mark_btree_node_locked(struct btree_trans *trans,
                                          struct btree_path *path,
                                          unsigned level,
-                                         enum six_lock_type type)
+                                         enum btree_node_locked_type type)
 {
        mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type);
 #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
@@ -356,26 +355,36 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
 
 /* upgrade */
 
+
+struct get_locks_fail {
+       unsigned        l;
+       struct btree    *b;
+};
+
 bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
-                              struct btree_path *, unsigned);
+                              struct btree_path *, unsigned,
+                              struct get_locks_fail *);
+
 bool __bch2_btree_path_upgrade(struct btree_trans *,
-                              struct btree_path *, unsigned);
+                              struct btree_path *, unsigned,
+                              struct get_locks_fail *);
 
 static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
                                          struct btree_path *path,
                                          unsigned new_locks_want)
 {
+       struct get_locks_fail f;
        unsigned old_locks_want = path->locks_want;
 
        new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
 
        if (path->locks_want < new_locks_want
-           ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+           ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
            : path->uptodate == BTREE_ITER_UPTODATE)
                return 0;
 
        trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
-                       old_locks_want, new_locks_want);
+                       old_locks_want, new_locks_want, &f);
        return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
 }
 
index 6b6333df88f2efb701ad409058cb4b164a2acb21..e58575ad104542bf1d5f5f111a5bf1ee10255ba0 100644 (file)
@@ -4,14 +4,14 @@
 
 #include <linux/list.h>
 #include <linux/rhashtable.h>
-#include <linux/six.h>
 
-//#include "bkey_methods.h"
+#include "btree_key_cache_types.h"
 #include "buckets_types.h"
 #include "darray.h"
 #include "errcode.h"
 #include "journal_types.h"
 #include "replicas_types.h"
+#include "six.h"
 
 struct open_bucket;
 struct btree_update;
@@ -194,34 +194,33 @@ struct btree_node_iter {
 /*
  * Iterate over all possible positions, synthesizing deleted keys for holes:
  */
-static const u16 BTREE_ITER_SLOTS              = 1 << 0;
-static const u16 BTREE_ITER_ALL_LEVELS         = 1 << 1;
+static const __maybe_unused u16 BTREE_ITER_SLOTS               = 1 << 0;
 /*
  * Indicates that intent locks should be taken on leaf nodes, because we expect
  * to be doing updates:
  */
-static const u16 BTREE_ITER_INTENT             = 1 << 2;
+static const __maybe_unused u16 BTREE_ITER_INTENT              = 1 << 1;
 /*
  * Causes the btree iterator code to prefetch additional btree nodes from disk:
  */
-static const u16 BTREE_ITER_PREFETCH           = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_PREFETCH            = 1 << 2;
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
  */
-static const u16 BTREE_ITER_IS_EXTENTS         = 1 << 4;
-static const u16 BTREE_ITER_NOT_EXTENTS                = 1 << 5;
-static const u16 BTREE_ITER_CACHED             = 1 << 6;
-static const u16 BTREE_ITER_WITH_KEY_CACHE     = 1 << 7;
-static const u16 BTREE_ITER_WITH_UPDATES       = 1 << 8;
-static const u16 BTREE_ITER_WITH_JOURNAL       = 1 << 9;
-static const u16 __BTREE_ITER_ALL_SNAPSHOTS    = 1 << 10;
-static const u16 BTREE_ITER_ALL_SNAPSHOTS      = 1 << 11;
-static const u16 BTREE_ITER_FILTER_SNAPSHOTS   = 1 << 12;
-static const u16 BTREE_ITER_NOPRESERVE         = 1 << 13;
-static const u16 BTREE_ITER_CACHED_NOFILL      = 1 << 14;
-static const u16 BTREE_ITER_KEY_CACHE_FILL     = 1 << 15;
-#define __BTREE_ITER_FLAGS_END                        16
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS          = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS         = 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_CACHED              = 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE      = 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES                = 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL                = 1 << 8;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS     = 1 << 9;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS       = 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS    = 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE          = 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL       = 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL      = 1 << 14;
+#define __BTREE_ITER_FLAGS_END                                        15
 
 enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,
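Two things happen in the flag block above: BTREE_ITER_ALL_LEVELS is dropped, so every later bit shifts down by one and __BTREE_ITER_FLAGS_END falls from 16 to 15; and each definition gains __maybe_unused, because a header-scope static const triggers -Wunused-const-variable in any translation unit that includes the header without referencing that particular flag. A compilable toy illustration (the kernel's __maybe_unused expands to the unused attribute):

	#define __maybe_unused __attribute__((unused))

	/* As in a shared header: most .c files use only a few of these,
	 * and the attribute keeps the unused copies warning-free. */
	static const __maybe_unused unsigned short TOY_ITER_SLOTS  = 1 << 0;
	static const __maybe_unused unsigned short TOY_ITER_INTENT = 1 << 1;

	int main(void) { return TOY_ITER_SLOTS; }
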
@@ -238,6 +237,8 @@ struct btree_path {
        u8                      sorted_idx;
        u8                      ref;
        u8                      intent_ref;
+       u32                     alloc_seq;
+       u32                     downgrade_seq;
 
        /* btree_iter_copy starts here: */
        struct bpos             pos;
@@ -320,31 +321,6 @@ struct btree_iter {
 #endif
 };
 
-struct btree_key_cache_freelist {
-       struct bkey_cached      *objs[16];
-       unsigned                nr;
-};
-
-struct btree_key_cache {
-       struct mutex            lock;
-       struct rhashtable       table;
-       bool                    table_init_done;
-       struct list_head        freed_pcpu;
-       struct list_head        freed_nonpcpu;
-       struct shrinker         shrink;
-       unsigned                shrink_iter;
-       struct btree_key_cache_freelist __percpu *pcpu_freed;
-
-       atomic_long_t           nr_freed;
-       atomic_long_t           nr_keys;
-       atomic_long_t           nr_dirty;
-};
-
-struct bkey_cached_key {
-       u32                     btree_id;
-       struct bpos             pos;
-} __packed __aligned(4);
-
 #define BKEY_CACHED_ACCESSED           0
 #define BKEY_CACHED_DIRTY              1
 
@@ -360,7 +336,6 @@ struct bkey_cached {
        struct rhash_head       hash;
        struct list_head        list;
 
-       struct journal_preres   res;
        struct journal_entry_pin journal;
        u64                     seq;
 
@@ -390,7 +365,6 @@ struct btree_insert_entry {
        u8                      old_btree_u64s;
        struct bkey_i           *k;
        struct btree_path       *path;
-       u64                     seq;
        /* key being overwritten: */
        struct bkey             old_k;
        const struct bch_val    *old_v;
@@ -430,6 +404,7 @@ struct btree_trans {
        u8                      nr_updates;
        u8                      nr_wb_updates;
        u8                      wb_updates_size;
+       bool                    srcu_held:1;
        bool                    used_mempool:1;
        bool                    in_traverse_all:1;
        bool                    paths_sorted:1;
@@ -438,6 +413,7 @@ struct btree_trans {
        bool                    journal_replay_not_finished:1;
        bool                    is_initial_gc:1;
        bool                    notrace_relock_fail:1;
+       bool                    write_locked:1;
        enum bch_errcode        restarted:16;
        u32                     restart_count;
        unsigned long           last_begin_ip;
@@ -459,8 +435,8 @@ struct btree_trans {
        void                    *mem;
 
        u8                      sorted[BTREE_ITER_MAX + 8];
-       struct btree_path       *paths;
-       struct btree_insert_entry *updates;
+       struct btree_path       paths[BTREE_ITER_MAX];
+       struct btree_insert_entry updates[BTREE_ITER_MAX];
        struct btree_write_buffered_key *wb_updates;
 
        /* update path: */
@@ -469,11 +445,9 @@ struct btree_trans {
        struct journal_entry_pin *journal_pin;
 
        struct journal_res      journal_res;
-       struct journal_preres   journal_preres;
        u64                     *journal_seq;
        struct disk_reservation *disk_res;
        unsigned                journal_u64s;
-       unsigned                journal_preres_u64s;
        struct replicas_delta_list *fs_usage_deltas;
 };
 
@@ -643,16 +617,17 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i)
 }
 
 enum btree_node_type {
-#define x(kwd, val, ...) BKEY_TYPE_##kwd = val,
+       BKEY_TYPE_btree,
+#define x(kwd, val, ...) BKEY_TYPE_##kwd = val + 1,
        BCH_BTREE_IDS()
 #undef x
-       BKEY_TYPE_btree,
+       BKEY_TYPE_NR
 };
 
 /* Type of a key in btree @id at level @level: */
 static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
 {
-       return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
+       return level ? BKEY_TYPE_btree : (unsigned) id + 1;
 }
 
 /* Type of keys @b contains: */
@@ -661,19 +636,21 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
        return __btree_node_type(b->c.level, b->c.btree_id);
 }
 
+const char *bch2_btree_node_type_str(enum btree_node_type);
+
 #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
-       (BIT(BKEY_TYPE_extents)|                        \
-        BIT(BKEY_TYPE_alloc)|                          \
-        BIT(BKEY_TYPE_inodes)|                         \
-        BIT(BKEY_TYPE_stripes)|                        \
-        BIT(BKEY_TYPE_reflink)|                        \
-        BIT(BKEY_TYPE_btree))
+       (BIT_ULL(BKEY_TYPE_extents)|                    \
+        BIT_ULL(BKEY_TYPE_alloc)|                      \
+        BIT_ULL(BKEY_TYPE_inodes)|                     \
+        BIT_ULL(BKEY_TYPE_stripes)|                    \
+        BIT_ULL(BKEY_TYPE_reflink)|                    \
+        BIT_ULL(BKEY_TYPE_btree))
 
 #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS               \
-       (BIT(BKEY_TYPE_alloc)|                          \
-        BIT(BKEY_TYPE_inodes)|                         \
-        BIT(BKEY_TYPE_stripes)|                        \
-        BIT(BKEY_TYPE_snapshots))
+       (BIT_ULL(BKEY_TYPE_alloc)|                      \
+        BIT_ULL(BKEY_TYPE_inodes)|                     \
+        BIT_ULL(BKEY_TYPE_stripes)|                    \
+        BIT_ULL(BKEY_TYPE_snapshots))
 
 #define BTREE_NODE_TYPE_HAS_TRIGGERS                   \
        (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
@@ -681,13 +658,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
 
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
-       return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
+       return BTREE_NODE_TYPE_HAS_TRIGGERS & BIT_ULL(type);
 }
 
 static inline bool btree_node_type_is_extents(enum btree_node_type type)
 {
        const unsigned mask = 0
-#define x(name, nr, flags, ...)        |((!!((flags) & BTREE_ID_EXTENTS)) << nr)
+#define x(name, nr, flags, ...)        |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1))
        BCH_BTREE_IDS()
 #undef x
        ;
@@ -697,7 +674,7 @@ static inline bool btree_node_type_is_extents(enum btree_node_type type)
 
 static inline bool btree_id_is_extents(enum btree_id btree)
 {
-       return btree_node_type_is_extents((enum btree_node_type) btree);
+       return btree_node_type_is_extents(__btree_node_type(0, btree));
 }
 
 static inline bool btree_type_has_snapshots(enum btree_id id)
@@ -711,6 +688,17 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
        return (1U << id) & mask;
 }
 
+static inline bool btree_type_has_snapshot_field(enum btree_id id)
+{
+       const unsigned mask = 0
+#define x(name, nr, flags, ...)        |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr)
+       BCH_BTREE_IDS()
+#undef x
+       ;
+
+       return (1U << id) & mask;
+}
+
 static inline bool btree_type_has_ptrs(enum btree_id id)
 {
        const unsigned mask = 0
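The helpers above, including the new btree_type_has_snapshot_field(), build their masks at compile time by expanding one |-term per BCH_BTREE_IDS() entry; the trigger masks also switch from BIT() to BIT_ULL(), presumably so they stay 64-bit regardless of how many btree IDs the shifted btree_node_type enum eventually covers. A standalone sketch of the x-macro mask idiom with a toy ID list (not the real BCH_BTREE_IDS()):

	#include <stdbool.h>
	#include <stdio.h>

	#define TOY_FLAG_SNAPSHOTS (1 << 0)

	#define TOY_BTREE_IDS()				\
		x(extents,   0, TOY_FLAG_SNAPSHOTS)	\
		x(inodes,    1, TOY_FLAG_SNAPSHOTS)	\
		x(alloc,     2, 0)

	static bool toy_type_has_snapshots(unsigned id)
	{
		const unsigned mask = 0
	#define x(name, nr, flags)	|((!!((flags) & TOY_FLAG_SNAPSHOTS)) << (nr))
		TOY_BTREE_IDS()
	#undef x
		;

		return (1U << id) & mask;
	}

	int main(void)
	{
		for (unsigned id = 0; id < 3; id++)
			printf("id %u: %d\n", id, toy_type_has_snapshots(id));
		return 0;
	}
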
index 2281140a288cad6b98dede73a95e58443d2eb2f3..14a2315aa88e4267775c910f3119728f3f5579dc 100644 (file)
@@ -4,7 +4,6 @@
 
 #include "btree_iter.h"
 #include "journal.h"
-#include "journal.h"
 
 struct bch_fs;
 struct btree;
@@ -22,50 +21,42 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
                                struct bkey_i *, u64);
 
-enum btree_insert_flags {
+#define BCH_TRANS_COMMIT_FLAGS()                                                       \
+       x(no_enospc,    "don't check for enospc")                                       \
+       x(no_check_rw,  "don't attempt to take a ref on c->writes")                     \
+       x(lazy_rw,      "go read-write if we haven't yet - only for use in recovery")   \
+       x(no_journal_res, "don't take a journal reservation, instead "                  \
+                       "pin journal entry referred to by trans->journal_res.seq")      \
+       x(journal_reclaim, "operation required for journal reclaim; may return error "  \
+                       "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+
+enum __bch_trans_commit_flags {
        /* First bits for bch_watermark: */
-       __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
-       __BTREE_INSERT_NOCHECK_RW,
-       __BTREE_INSERT_LAZY_RW,
-       __BTREE_INSERT_JOURNAL_REPLAY,
-       __BTREE_INSERT_JOURNAL_RECLAIM,
-       __BTREE_INSERT_NOWAIT,
-       __BTREE_INSERT_GC_LOCK_HELD,
-       __BCH_HASH_SET_MUST_CREATE,
-       __BCH_HASH_SET_MUST_REPLACE,
+       __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
+#define x(n, ...)      __BCH_TRANS_COMMIT_##n,
+       BCH_TRANS_COMMIT_FLAGS()
+#undef x
 };
 
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL            BIT(__BTREE_INSERT_NOFAIL)
-
-#define BTREE_INSERT_NOCHECK_RW                BIT(__BTREE_INSERT_NOCHECK_RW)
-#define BTREE_INSERT_LAZY_RW           BIT(__BTREE_INSERT_LAZY_RW)
-
-/* Insert is for journal replay - don't get journal reservations: */
-#define BTREE_INSERT_JOURNAL_REPLAY    BIT(__BTREE_INSERT_JOURNAL_REPLAY)
-
-/* Insert is being called from journal reclaim path: */
-#define BTREE_INSERT_JOURNAL_RECLAIM   BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
-
-/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT            BIT(__BTREE_INSERT_NOWAIT)
-#define BTREE_INSERT_GC_LOCK_HELD      BIT(__BTREE_INSERT_GC_LOCK_HELD)
-
-#define BCH_HASH_SET_MUST_CREATE       BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE      BIT(__BCH_HASH_SET_MUST_REPLACE)
+enum bch_trans_commit_flags {
+#define x(n, ...)      BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
+       BCH_TRANS_COMMIT_FLAGS()
+#undef x
+};
 
 int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
                                unsigned, unsigned);
 int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
 int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
+int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
 
 int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
                                struct bkey_i *, enum btree_update_flags);
 
-int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *,
+int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *,
                        enum btree_update_flags);
 int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
-                    struct disk_reservation *, u64 *, int flags);
+                    struct disk_reservation *, int flags);
 
 int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
                                  struct bpos, struct bpos, unsigned, u64 *);
@@ -114,8 +105,8 @@ void bch2_trans_commit_hook(struct btree_trans *,
                            struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *, unsigned);
 
-int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
-int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...);
+__printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...);
 
 /**
  * bch2_trans_commit - insert keys at given iterator positions
@@ -145,30 +136,17 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
        nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
                                        (_journal_seq), (_flags)))
 
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)                \
-({                                                                     \
-       struct btree_trans trans;                                       \
-       int _ret;                                                       \
-                                                                       \
-       bch2_trans_init(&trans, (_c), 0, 0);                            \
-       _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \
-       bch2_trans_exit(&trans);                                        \
-                                                                       \
-       _ret;                                                           \
-})
-
 #define bch2_trans_run(_c, _do)                                                \
 ({                                                                     \
-       struct btree_trans trans;                                       \
-       int _ret;                                                       \
-                                                                       \
-       bch2_trans_init(&trans, (_c), 0, 0);                            \
-       _ret = (_do);                                                   \
-       bch2_trans_exit(&trans);                                        \
-                                                                       \
+       struct btree_trans *trans = bch2_trans_get(_c);                 \
+       int _ret = (_do);                                               \
+       bch2_trans_put(trans);                                          \
        _ret;                                                           \
 })
 
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)                \
+       bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
+
 #define trans_for_each_update(_trans, _i)                              \
        for ((_i) = (_trans)->updates;                                  \
             (_i) < (_trans)->updates + (_trans)->nr_updates;           \
@@ -268,10 +246,10 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr
 {
        struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter,
                                btree_id, pos, flags|BTREE_ITER_INTENT, type);
-       struct bkey_i *ret = unlikely(IS_ERR(k.k))
+       struct bkey_i *ret = IS_ERR(k.k)
                ? ERR_CAST(k.k)
                : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes);
-       if (unlikely(IS_ERR(ret)))
+       if (IS_ERR(ret))
                bch2_trans_iter_exit(trans, iter);
        return ret;
 }
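The BTREE_INSERT_* constants above are replaced by a single BCH_TRANS_COMMIT_FLAGS() list that is expanded twice: once to generate the bit indices (starting after the watermark bits) and once to generate the BIT() masks, so a flag's name, position, and description can never drift apart. A compilable toy version of the same two-pass expansion (toy names, indices starting at 0 rather than BCH_WATERMARK_BITS):

	#include <stdio.h>

	#define BIT(n) (1UL << (n))

	#define TOY_COMMIT_FLAGS()				\
		x(no_enospc,   "don't check for enospc")	\
		x(no_check_rw, "don't take a writes ref")	\
		x(lazy_rw,     "go read-write if needed")

	enum __toy_commit_flags {
	#define x(n, ...)	__TOY_COMMIT_##n,
		TOY_COMMIT_FLAGS()
	#undef x
	};

	enum toy_commit_flags {
	#define x(n, ...)	TOY_COMMIT_##n = BIT(__TOY_COMMIT_##n),
		TOY_COMMIT_FLAGS()
	#undef x
	};

	int main(void)
	{
		printf("no_enospc=0x%lx lazy_rw=0x%lx\n",
		       (unsigned long) TOY_COMMIT_no_enospc,
		       (unsigned long) TOY_COMMIT_lazy_rw);
		return 0;
	}
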
index f42ef46c59df6f423b524d4d0056e89af72f041b..18e5a75142e9a5e95b68661965342e8b923e140e 100644 (file)
@@ -5,6 +5,7 @@
 #include "bkey_methods.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
+#include "btree_journal_iter.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -17,7 +18,6 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
-#include "recovery.h"
 #include "replicas.h"
 #include "super-io.h"
 #include "trace.h"
@@ -143,10 +143,15 @@ static size_t btree_node_u64s_with_format(struct btree *b,
 }
 
 /**
- * btree_node_format_fits - check if we could rewrite node with a new format
+ * bch2_btree_node_format_fits - check if we could rewrite node with a new format
  *
- * This assumes all keys can pack with the new format -- it just checks if
- * the re-packed keys would fit inside the node itself.
+ * @c:         filesystem handle
+ * @b:         btree node to rewrite
+ * @new_f:     bkey format to translate keys to
+ *
+ * Returns: true if all re-packed keys will be able to fit in a new node.
+ *
+ * Assumes all keys will successfully pack with the new format.
  */
 bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
                                 struct bkey_format *new_f)
@@ -244,7 +249,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
        struct write_point *wp;
        struct btree *b;
        BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
-       struct open_buckets ob = { .nr = 0 };
+       struct open_buckets obs = { .nr = 0 };
        struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
        enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
        unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
@@ -257,7 +262,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
                struct btree_alloc *a =
                        &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
 
-               ob = a->ob;
+               obs = a->ob;
                bkey_copy(&tmp.k, &a->k);
                mutex_unlock(&c->btree_reserve_cache_lock);
                goto mem_alloc;
@@ -292,7 +297,7 @@ retry:
        bkey_btree_ptr_v2_init(&tmp.k);
        bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
 
-       bch2_open_bucket_get(c, wp, &ob);
+       bch2_open_bucket_get(c, wp, &obs);
        bch2_alloc_sectors_done(c, wp);
 mem_alloc:
        b = bch2_btree_node_mem_alloc(trans, interior_node);
@@ -304,7 +309,7 @@ mem_alloc:
        BUG_ON(b->ob.nr);
 
        bkey_copy(&b->key, &tmp.k);
-       b->ob = ob;
+       b->ob = obs;
 
        return b;
 }
@@ -470,9 +475,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
        /*
         * Protects reaping from the btree node cache and using the btree node
         * open bucket reserve:
-        *
-        * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
-        * blocking on this lock:
         */
        ret = bch2_btree_cache_cannibalize_lock(c, cl);
        if (ret)
@@ -482,9 +484,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
                struct prealloc_nodes *p = as->prealloc_nodes + interior;
 
                while (p->nr < nr_nodes[interior]) {
-                       b = __bch2_btree_node_alloc(trans, &as->disk_res,
-                                       flags & BTREE_INSERT_NOWAIT ? NULL : cl,
-                                       interior, flags);
+                       b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
+                                                   interior, flags);
                        if (IS_ERR(b)) {
                                ret = PTR_ERR(b);
                                goto err;
@@ -508,8 +509,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
                up_read(&c->gc_lock);
        as->took_gc_lock = false;
 
-       bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
        bch2_journal_pin_drop(&c->journal, &as->journal);
        bch2_journal_pin_flush(&c->journal, &as->journal);
        bch2_disk_reservation_put(c, &as->disk_res);
@@ -592,12 +591,11 @@ static void btree_update_nodes_written(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
        struct btree *b;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        u64 journal_seq = 0;
        unsigned i;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 512);
        /*
         * If we're already in an error state, it might be because a btree node
         * was never written, and we might be trying to free that same btree
@@ -618,7 +616,7 @@ static void btree_update_nodes_written(struct btree_update *as)
 
                b = as->old_nodes[i];
 
-               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
                seq = b->data ? b->data->keys.seq : 0;
                six_unlock_read(&b->c.lock);
 
@@ -640,13 +638,13 @@ static void btree_update_nodes_written(struct btree_update *as)
         * journal reclaim does btree updates when flushing bkey_cached entries,
         * which may require allocations as well.
         */
-       ret = commit_do(&trans, &as->disk_res, &journal_seq,
+       ret = commit_do(trans, &as->disk_res, &journal_seq,
                        BCH_WATERMARK_reclaim|
-                       BTREE_INSERT_NOFAIL|
-                       BTREE_INSERT_NOCHECK_RW|
-                       BTREE_INSERT_JOURNAL_RECLAIM,
-                       btree_update_nodes_written_trans(&trans, as));
-       bch2_trans_unlock(&trans);
+                       BCH_TRANS_COMMIT_no_enospc|
+                       BCH_TRANS_COMMIT_no_check_rw|
+                       BCH_TRANS_COMMIT_journal_reclaim,
+                       btree_update_nodes_written_trans(trans, as));
+       bch2_trans_unlock(trans);
 
        bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
                             "%s(): error %s", __func__, bch2_err_str(ret));
@@ -655,7 +653,7 @@ err:
                struct btree_path *path;
 
                b = as->b;
-               path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p);
+               path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
                /*
                 * @b is the node we did the final insert into:
                 *
@@ -678,13 +676,13 @@ err:
                 * we may rarely end up with a locked path besides the one we
                 * have here:
                 */
-               bch2_trans_unlock(&trans);
-               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent);
-               mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent);
+               bch2_trans_unlock(trans);
+               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path, b->c.level, BTREE_NODE_INTENT_LOCKED);
                path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock);
                path->l[b->c.level].b = b;
 
-               bch2_btree_node_lock_write_nofail(&trans, path, &b->c);
+               bch2_btree_node_lock_write_nofail(trans, path, &b->c);
 
                mutex_lock(&c->btree_interior_update_lock);
 
@@ -697,15 +695,15 @@ err:
                 * btree_interior_update_lock:
                 */
                if (as->b == b) {
-                       struct bset *i = btree_bset_last(b);
-
                        BUG_ON(!b->c.level);
                        BUG_ON(!btree_node_dirty(b));
 
                        if (!ret) {
-                               i->journal_seq = cpu_to_le64(
+                               struct bset *last = btree_bset_last(b);
+
+                               last->journal_seq = cpu_to_le64(
                                                             max(journal_seq,
-                                                                le64_to_cpu(i->journal_seq)));
+                                                                le64_to_cpu(last->journal_seq)));
 
                                bch2_btree_add_journal_pin(c, b, journal_seq);
                        } else {
@@ -724,14 +722,12 @@ err:
                six_unlock_write(&b->c.lock);
 
                btree_node_write_if_need(c, b, SIX_LOCK_intent);
-               btree_node_unlock(&trans, path, b->c.level);
-               bch2_path_put(&trans, path, true);
+               btree_node_unlock(trans, path, b->c.level);
+               bch2_path_put(trans, path, true);
        }
 
        bch2_journal_pin_drop(&c->journal, &as->journal);
 
-       bch2_journal_preres_put(&c->journal, &as->journal_preres);
-
        mutex_lock(&c->btree_interior_update_lock);
        for (i = 0; i < as->nr_new_nodes; i++) {
                b = as->new_nodes[i];
@@ -745,7 +741,7 @@ err:
        for (i = 0; i < as->nr_new_nodes; i++) {
                b = as->new_nodes[i];
 
-               btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
+               btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
                btree_node_write_if_need(c, b, SIX_LOCK_read);
                six_unlock_read(&b->c.lock);
        }
@@ -753,8 +749,8 @@ err:
        for (i = 0; i < as->nr_open_buckets; i++)
                bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
 
-       bch2_btree_update_free(as, &trans);
-       bch2_trans_exit(&trans);
+       bch2_btree_update_free(as, trans);
+       bch2_trans_put(trans);
 }
 
 static void btree_interior_update_work(struct work_struct *work)
@@ -814,6 +810,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
+static int bch2_update_reparent_journal_pin_flush(struct journal *j,
+                               struct journal_entry_pin *_pin, u64 seq)
+{
+       return 0;
+}
+
 static void btree_update_reparent(struct btree_update *as,
                                  struct btree_update *child)
 {
@@ -824,7 +826,8 @@ static void btree_update_reparent(struct btree_update *as,
        child->b = NULL;
        child->mode = BTREE_INTERIOR_UPDATING_AS;
 
-       bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
+                             bch2_update_reparent_journal_pin_flush);
 }
 
 static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@@ -933,6 +936,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
                        b->ob.v[--b->ob.nr];
 }
 
+static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
+                               struct journal_entry_pin *_pin, u64 seq)
+{
+       return 0;
+}
+
 /*
  * @b is being split/rewritten: it may have pointers to not-yet-written btree
  * nodes and thus outstanding btree_updates - redirect @b's
@@ -984,11 +993,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
         * when the new nodes are persistent and reachable on disk:
         */
        w = btree_current_write(b);
-       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+                             bch2_btree_update_will_free_node_journal_pin_flush);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
        w = btree_prev_write(b);
-       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+                             bch2_btree_update_will_free_node_journal_pin_flush);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
        mutex_unlock(&c->btree_interior_update_lock);
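The bch2_journal_pin_copy() calls here, and in btree_update_reparent() above, stop passing NULL and instead install explicit no-op flush callbacks, presumably so every journal pin carries a callable flush hook and the journal code need not special-case NULL. The shape of that pattern as a standalone toy:

	#include <stdio.h>

	struct toy_pin;
	typedef int (*toy_flush_fn)(struct toy_pin *, unsigned long seq);

	struct toy_pin { toy_flush_fn flush; };

	static int toy_noop_flush(struct toy_pin *pin, unsigned long seq)
	{
		return 0;	/* nothing to write back for this pin */
	}

	/* With NULL pins disallowed, the caller can invoke unconditionally: */
	static int toy_flush_pin(struct toy_pin *pin, unsigned long seq)
	{
		return pin->flush(pin, seq);
	}

	int main(void)
	{
		struct toy_pin pin = { .flush = toy_noop_flush };

		printf("flush -> %d\n", toy_flush_pin(&pin, 0));
		return 0;
	}
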
@@ -1038,12 +1049,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        struct bch_fs *c = trans->c;
        struct btree_update *as;
        u64 start_time = local_clock();
-       int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+       int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
                ? BCH_DISK_RESERVATION_NOFAIL : 0;
        unsigned nr_nodes[2] = { 0, 0 };
        unsigned update_level = level;
        enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-       unsigned journal_flags = 0;
        int ret = 0;
        u32 restart_count = trans->restart_count;
 
@@ -1057,10 +1067,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        flags &= ~BCH_WATERMARK_MASK;
        flags |= watermark;
 
-       if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
-               journal_flags |= JOURNAL_RES_GET_NONBLOCK;
-       journal_flags |= watermark;
-
        while (1) {
                nr_nodes[!!update_level] += 1 + split;
                update_level++;
@@ -1083,9 +1089,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
                split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
        }
 
-       if (flags & BTREE_INSERT_GC_LOCK_HELD)
-               lockdep_assert_held(&c->gc_lock);
-       else if (!down_read_trylock(&c->gc_lock)) {
+       if (!down_read_trylock(&c->gc_lock)) {
                ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
                if (ret) {
                        up_read(&c->gc_lock);
@@ -1099,7 +1103,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        as->c           = c;
        as->start_time  = start_time;
        as->mode        = BTREE_INTERIOR_NO_UPDATE;
-       as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+       as->took_gc_lock = true;
        as->btree_id    = path->btree_id;
        as->update_level = update_level;
        INIT_LIST_HEAD(&as->list);
@@ -1125,27 +1129,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
        if (ret)
                goto err;
 
-       ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
-                                     BTREE_UPDATE_JOURNAL_RES,
-                                     journal_flags|JOURNAL_RES_GET_NONBLOCK);
-       if (ret) {
-               if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
-                       ret = -BCH_ERR_journal_reclaim_would_deadlock;
-                       goto err;
-               }
-
-               ret = drop_locks_do(trans,
-                       bch2_journal_preres_get(&c->journal, &as->journal_preres,
-                                             BTREE_UPDATE_JOURNAL_RES,
-                                             journal_flags));
-               if (ret == -BCH_ERR_journal_preres_get_blocked) {
-                       trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags);
-                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get);
-               }
-               if (ret)
-                       goto err;
-       }
-
        ret = bch2_disk_reservation_get(c, &as->disk_res,
                        (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
                        c->opts.metadata_replicas,
@@ -1163,7 +1146,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
                 * flag
                 */
                if (bch2_err_matches(ret, ENOSPC) &&
-                   (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+                   (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
                    watermark != BCH_WATERMARK_reclaim) {
                        ret = -BCH_ERR_journal_reclaim_would_deadlock;
                        goto err;
@@ -1216,18 +1199,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
        bch2_recalc_btree_reserve(c);
 }
 
-/**
- * bch_btree_set_root - update the root in memory and on disk
- *
- * To ensure forward progress, the current task must not be holding any
- * btree node write locks. However, you must hold an intent lock on the
- * old root.
- *
- * Note: This allocates a journal entry but doesn't add any keys to
- * it.  All the btree roots are part of every journal write, so there
- * is nothing new to be done.  This just guarantees that there is a
- * journal write.
- */
 static void bch2_btree_set_root(struct btree_update *as,
                                struct btree_trans *trans,
                                struct btree_path *path,
@@ -1282,14 +1253,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
 
        if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
                              btree_node_type(b), WRITE, &buf) ?:
-           bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
+           bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf)) {
                printbuf_reset(&buf);
                prt_printf(&buf, "inserting invalid bkey\n  ");
                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
                prt_printf(&buf, "\n  ");
                bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
                                  btree_node_type(b), WRITE, &buf);
-               bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);
+               bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), &buf);
 
                bch2_fs_inconsistent(c, "%s", buf.buf);
                dump_stack();
@@ -1341,12 +1312,12 @@ __bch2_btree_insert_keys_interior(struct btree_update *as,
                ;
 
        while (!bch2_keylist_empty(keys)) {
-               struct bkey_i *k = bch2_keylist_front(keys);
+               insert = bch2_keylist_front(keys);
 
-               if (bpos_gt(k->k.p, b->key.k.p))
+               if (bpos_gt(insert->k.p, b->key.k.p))
                        break;
 
-               bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k);
+               bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert);
                bch2_keylist_pop_front(keys);
        }
 }
@@ -1513,12 +1484,12 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
                path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
                six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
                bch2_btree_path_level_init(trans, path1, n1);
 
                path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
                six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
                bch2_btree_path_level_init(trans, path2, n2);
 
                /*
@@ -1539,7 +1510,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
                        path2->locks_want++;
                        BUG_ON(btree_node_locked(path2, n3->c.level));
                        six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
-                       mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent);
+                       mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
                        bch2_btree_path_level_init(trans, path2, n3);
 
                        n3->sib_u64s[0] = U16_MAX;
@@ -1563,7 +1534,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
                path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
                six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
-               mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent);
+               mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
                bch2_btree_path_level_init(trans, path1, n1);
 
                if (parent)
@@ -1661,12 +1632,16 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
 }
 
 /**
- * bch_btree_insert_node - insert bkeys into a given btree node
+ * bch2_btree_insert_node - insert bkeys into a given btree node
  *
- * @iter:              btree iterator
+ * @as:                        btree_update object
+ * @trans:             btree_trans object
+ * @path:              path that points to current node
+ * @b:                 node to insert keys into
  * @keys:              list of keys to insert
- * @hook:              insert callback
- * @persistent:                if not null, @persistent will wait on journal write
+ * @flags:             transaction commit flags
+ *
+ * Returns: 0 on success, typically transaction restart error on failure
  *
  * Inserts as many keys as it can into a given btree node, splitting it if full.
  * If a split occurred, this function will return early. This can only happen
@@ -1859,7 +1834,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
        parent = btree_node_parent(path, b);
        as = bch2_btree_update_start(trans, path, level, false,
-                                    BTREE_INSERT_NOFAIL|flags);
+                                    BCH_TRANS_COMMIT_no_enospc|flags);
        ret = PTR_ERR_OR_ZERO(as);
        if (ret)
                goto err;
@@ -1890,7 +1865,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
        new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
        six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-       mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+       mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
        bch2_btree_path_level_init(trans, new_path, n);
 
        bkey_init(&delete.k);
@@ -1934,9 +1909,6 @@ err_free_update:
        goto out;
 }
 
-/**
- * bch_btree_node_rewrite - Rewrite/move a btree node
- */
 int bch2_btree_node_rewrite(struct btree_trans *trans,
                            struct btree_iter *iter,
                            struct btree *b,
@@ -1948,7 +1920,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
        struct btree_update *as;
        int ret;
 
-       flags |= BTREE_INSERT_NOFAIL;
+       flags |= BCH_TRANS_COMMIT_no_enospc;
 
        parent = btree_node_parent(iter->path, b);
        as = bch2_btree_update_start(trans, iter->path, b->c.level,
@@ -1967,7 +1939,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 
        new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
        six_lock_increment(&n->c.lock, SIX_LOCK_intent);
-       mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent);
+       mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
        bch2_btree_path_level_init(trans, new_path, n);
 
        trace_and_count(c, btree_node_rewrite, c, b);
@@ -1994,7 +1966,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 out:
        if (new_path)
                bch2_path_put(trans, new_path, true);
-       bch2_btree_path_downgrade(trans, iter->path);
+       bch2_trans_downgrade(trans);
        return ret;
 err:
        bch2_btree_node_free_never_used(as, trans, n);
@@ -2055,9 +2027,9 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
        int ret;
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-                     async_btree_node_rewrite_trans(&trans, a));
+                     async_btree_node_rewrite_trans(trans, a));
        if (ret)
-               bch_err(c, "%s: error %s", __func__, bch2_err_str(ret));
+               bch_err_fn(c, ret);
        bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
        kfree(a);
 }
@@ -2096,8 +2068,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 
                ret = bch2_fs_read_write_early(c);
                if (ret) {
-                       bch_err(c, "%s: error going read-write: %s",
-                               __func__, bch2_err_str(ret));
+                       bch_err_msg(c, ret, "going read-write");
                        kfree(a);
                        return;
                }
@@ -2372,7 +2343,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
 
 void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 {
-       bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id));
+       bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
 }
 
 void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2385,7 +2356,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
                       as,
                       as->mode,
                       as->nodes_written,
-                      atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
+                      closure_nr_remaining(&as->cl),
                       as->journal.seq);
        mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -2419,30 +2390,24 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry
 
        r->level = entry->level;
        r->alive = true;
-       bkey_copy(&r->key, &entry->start[0]);
+       bkey_copy(&r->key, (struct bkey_i *) entry->start);
 
        mutex_unlock(&c->btree_root_lock);
 }
 
 struct jset_entry *
 bch2_btree_roots_to_journal_entries(struct bch_fs *c,
-                                   struct jset_entry *start,
-                                   struct jset_entry *end)
+                                   struct jset_entry *end,
+                                   unsigned long skip)
 {
-       struct jset_entry *entry;
-       unsigned long have = 0;
        unsigned i;
 
-       for (entry = start; entry < end; entry = vstruct_next(entry))
-               if (entry->type == BCH_JSET_ENTRY_btree_root)
-                       __set_bit(entry->btree_id, &have);
-
        mutex_lock(&c->btree_root_lock);
 
        for (i = 0; i < btree_id_nr_alive(c); i++) {
                struct btree_root *r = bch2_btree_id_root(c, i);
 
-               if (r->alive && !test_bit(i, &have)) {
+               if (r->alive && !test_bit(i, &skip)) {
                        journal_entry_set(end, BCH_JSET_ENTRY_btree_root,
                                          i, r->level, &r->key, r->key.k.u64s);
                        end = vstruct_next(end);
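The mechanical change running through this whole file is the transaction-lifetime conversion: an on-stack struct btree_trans with bch2_trans_init()/bch2_trans_exit() becomes a pointer obtained from bch2_trans_get() and released with bch2_trans_put(), matching the fixed-size paths[]/updates[] arrays now embedded in struct btree_trans. A before/after sketch using the signatures visible in this diff; some_update_fn is a placeholder:

	/* before: on-stack transaction */
	struct btree_trans trans;
	bch2_trans_init(&trans, c, 0, 0);
	ret = some_update_fn(&trans);
	bch2_trans_exit(&trans);

	/* after: handle from bch2_trans_get(), released with bch2_trans_put() */
	struct btree_trans *trans = bch2_trans_get(c);
	ret = some_update_fn(trans);
	bch2_trans_put(trans);
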
index 5e0a467fe9056acf25ce8c3d65a1d5ca6adfc216..031076e75fa1322a82a202e150a8eca9a75c063e 100644 (file)
@@ -55,7 +55,6 @@ struct btree_update {
        unsigned                        update_level;
 
        struct disk_reservation         disk_res;
-       struct journal_preres           journal_preres;
 
        /*
         * BTREE_INTERIOR_UPDATING_NODE:
@@ -271,7 +270,7 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
        struct btree_node_entry *bne = max(write_block(b),
                        (void *) btree_bkey_last(b, bset_tree_last(b)));
        ssize_t remaining_space =
-               __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
+               __bch_btree_u64s_remaining(c, b, bne->keys.start);
 
        if (unlikely(bset_written(b, bset(b, t)))) {
                if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@@ -303,7 +302,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
        k.needs_whiteout = true;
 
        b->whiteout_u64s += k.u64s;
-       bkey_copy(unwritten_whiteouts_start(c, b), &k);
+       bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
 }
 
 /*
@@ -325,7 +324,7 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *);
 
 void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *);
 struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
-                                       struct jset_entry *, struct jset_entry *);
+                                       struct jset_entry *, unsigned long);
 
 void bch2_do_pending_node_rewrites(struct bch_fs *);
 void bch2_free_pending_node_rewrites(struct bch_fs *);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
deleted file mode 100644 (file)
index 369e37a..0000000
+++ /dev/null
@@ -1,2107 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include "bcachefs.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
-#include "btree_gc.h"
-#include "btree_io.h"
-#include "btree_iter.h"
-#include "btree_key_cache.h"
-#include "btree_locking.h"
-#include "btree_write_buffer.h"
-#include "buckets.h"
-#include "debug.h"
-#include "errcode.h"
-#include "error.h"
-#include "extent_update.h"
-#include "journal.h"
-#include "journal_reclaim.h"
-#include "keylist.h"
-#include "recovery.h"
-#include "subvolume.h"
-#include "replicas.h"
-#include "trace.h"
-
-#include <linux/prefetch.h>
-#include <linux/sort.h>
-
-/*
- * bch2_btree_path_peek_slot() for a cached iterator might return a key in a
- * different snapshot:
- */
-static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u)
-{
-       struct bkey_s_c k = bch2_btree_path_peek_slot(path, u);
-
-       if (k.k && bpos_eq(path->pos, k.k->p))
-               return k;
-
-       bkey_init(u);
-       u->p = path->pos;
-       return (struct bkey_s_c) { u, NULL };
-}
-
-static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct bch_fs *c = trans->c;
-       struct bkey u;
-       struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
-
-       if (unlikely(trans->journal_replay_not_finished)) {
-               struct bkey_i *j_k =
-                       bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p);
-
-               if (j_k)
-                       k = bkey_i_to_s_c(j_k);
-       }
-
-       u = *k.k;
-       u.needs_whiteout = i->old_k.needs_whiteout;
-
-       BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey)));
-       BUG_ON(i->old_v != k.v);
-#endif
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
-                         struct bkey_i *, enum btree_update_flags,
-                         unsigned long ip);
-
-static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
-                                        const struct btree_insert_entry *r)
-{
-       return   cmp_int(l->btree_id,   r->btree_id) ?:
-                cmp_int(l->cached,     r->cached) ?:
-                -cmp_int(l->level,     r->level) ?:
-                bpos_cmp(l->k->k.p,    r->k->k.p);
-}
-
-static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
-{
-       return i->path->l + i->level;
-}
-
-static inline bool same_leaf_as_prev(struct btree_trans *trans,
-                                    struct btree_insert_entry *i)
-{
-       return i != trans->updates &&
-               insert_l(&i[0])->b == insert_l(&i[-1])->b;
-}
-
-static inline bool same_leaf_as_next(struct btree_trans *trans,
-                                    struct btree_insert_entry *i)
-{
-       return i + 1 < trans->updates + trans->nr_updates &&
-               insert_l(&i[0])->b == insert_l(&i[1])->b;
-}
-
-inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
-                                          struct btree_path *path,
-                                          struct btree *b)
-{
-       struct bch_fs *c = trans->c;
-
-       if (unlikely(btree_node_just_written(b)) &&
-           bch2_btree_post_write_cleanup(c, b))
-               bch2_trans_node_reinit_iter(trans, b);
-
-       /*
-        * If the last bset has been written, or if it's gotten too big - start
-        * a new bset to insert into:
-        */
-       if (want_new_bset(c, b))
-               bch2_btree_init_next(trans, b);
-}
-
-/* Inserting into a given leaf node (last stage of insert): */
-
-/* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_trans *trans,
-                               struct btree_path *path,
-                               struct btree *b,
-                               struct btree_node_iter *node_iter,
-                               struct bkey_i *insert)
-{
-       struct bkey_packed *k;
-       unsigned clobber_u64s = 0, new_u64s = 0;
-
-       EBUG_ON(btree_node_just_written(b));
-       EBUG_ON(bset_written(b, btree_bset_last(b)));
-       EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
-       EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
-       EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
-       EBUG_ON(insert->k.u64s >
-               bch_btree_keys_u64s_remaining(trans->c, b));
-
-       k = bch2_btree_node_iter_peek_all(node_iter, b);
-       if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
-               k = NULL;
-
-       /* @k is the key being overwritten/deleted, if any: */
-       EBUG_ON(k && bkey_deleted(k));
-
-       /* Deleting, but not found? nothing to do: */
-       if (bkey_deleted(&insert->k) && !k)
-               return false;
-
-       if (bkey_deleted(&insert->k)) {
-               /* Deleting: */
-               btree_account_key_drop(b, k);
-               k->type = KEY_TYPE_deleted;
-
-               if (k->needs_whiteout)
-                       push_whiteout(trans->c, b, insert->k.p);
-               k->needs_whiteout = false;
-
-               if (k >= btree_bset_last(b)->start) {
-                       clobber_u64s = k->u64s;
-                       bch2_bset_delete(b, k, clobber_u64s);
-                       goto fix_iter;
-               } else {
-                       bch2_btree_path_fix_key_modified(trans, b, k);
-               }
-
-               return true;
-       }
-
-       if (k) {
-               /* Overwriting: */
-               btree_account_key_drop(b, k);
-               k->type = KEY_TYPE_deleted;
-
-               insert->k.needs_whiteout = k->needs_whiteout;
-               k->needs_whiteout = false;
-
-               if (k >= btree_bset_last(b)->start) {
-                       clobber_u64s = k->u64s;
-                       goto overwrite;
-               } else {
-                       bch2_btree_path_fix_key_modified(trans, b, k);
-               }
-       }
-
-       k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
-overwrite:
-       bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
-       new_u64s = k->u64s;
-fix_iter:
-       if (clobber_u64s != new_u64s)
-               bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
-                                        clobber_u64s, new_u64s);
-       return true;
-}
-
-static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
-                              unsigned i, u64 seq)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct btree_write *w = container_of(pin, struct btree_write, journal);
-       struct btree *b = container_of(w, struct btree, writes[i]);
-       struct btree_trans trans;
-       unsigned long old, new, v;
-       unsigned idx = w - b->writes;
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read);
-       v = READ_ONCE(b->flags);
-
-       do {
-               old = new = v;
-
-               if (!(old & (1 << BTREE_NODE_dirty)) ||
-                   !!(old & (1 << BTREE_NODE_write_idx)) != idx ||
-                   w->journal.seq != seq)
-                       break;
-
-               new &= ~BTREE_WRITE_TYPE_MASK;
-               new |= BTREE_WRITE_journal_reclaim;
-               new |= 1 << BTREE_NODE_need_write;
-       } while ((v = cmpxchg(&b->flags, old, new)) != old);
-
-       btree_node_write_if_need(c, b, SIX_LOCK_read);
-       six_unlock_read(&b->c.lock);
-
-       bch2_trans_exit(&trans);
-       return 0;
-}
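__btree_node_flush() above uses the classic lockless read-modify-cmpxchg retry loop on b->flags. The same shape with C11 atomics standing in for the kernel's cmpxchg(), as a standalone sketch:

	#include <stdatomic.h>
	#include <stdio.h>

	#define TOY_DIRTY	(1UL << 0)
	#define TOY_NEED_WRITE	(1UL << 1)

	static _Atomic unsigned long flags = TOY_DIRTY;

	int main(void)
	{
		unsigned long old = atomic_load(&flags), new;

		do {
			if (!(old & TOY_DIRTY))
				break;			/* raced: nothing to do */
			new = old | TOY_NEED_WRITE;
		} while (!atomic_compare_exchange_weak(&flags, &old, new));
		/* on failure, compare_exchange reloads 'old' and we retry */

		printf("flags=0x%lx\n", (unsigned long) atomic_load(&flags));
		return 0;
	}
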
-
-int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
-       return __btree_node_flush(j, pin, 0, seq);
-}
-
-int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
-{
-       return __btree_node_flush(j, pin, 1, seq);
-}
-
-inline void bch2_btree_add_journal_pin(struct bch_fs *c,
-                                      struct btree *b, u64 seq)
-{
-       struct btree_write *w = btree_current_write(b);
-
-       bch2_journal_pin_add(&c->journal, seq, &w->journal,
-                            btree_node_write_idx(b) == 0
-                            ? bch2_btree_node_flush0
-                            : bch2_btree_node_flush1);
-}
-
-/**
- * btree_insert_key - insert a key one key into a leaf node
- */
-inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
-                                      struct btree_path *path,
-                                      struct bkey_i *insert,
-                                      u64 journal_seq)
-{
-       struct bch_fs *c = trans->c;
-       struct btree *b = path_l(path)->b;
-       struct bset_tree *t = bset_tree_last(b);
-       struct bset *i = bset(b, t);
-       int old_u64s = bset_u64s(t);
-       int old_live_u64s = b->nr.live_u64s;
-       int live_u64s_added, u64s_added;
-
-       if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
-                                       &path_l(path)->iter, insert)))
-               return;
-
-       i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
-
-       bch2_btree_add_journal_pin(c, b, journal_seq);
-
-       if (unlikely(!btree_node_dirty(b))) {
-               EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
-               set_btree_node_dirty_acct(c, b);
-       }
-
-       live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
-       u64s_added = (int) bset_u64s(t) - old_u64s;
-
-       if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
-               b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
-       if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
-               b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
-
-       if (u64s_added > live_u64s_added &&
-           bch2_maybe_compact_whiteouts(c, b))
-               bch2_trans_node_reinit_iter(trans, b);
-}
-
-/* Cached btree updates: */
-
-/* Normal update interface: */
-
-static inline void btree_insert_entry_checks(struct btree_trans *trans,
-                                            struct btree_insert_entry *i)
-{
-       BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
-       BUG_ON(i->cached        != i->path->cached);
-       BUG_ON(i->level         != i->path->level);
-       BUG_ON(i->btree_id      != i->path->btree_id);
-       EBUG_ON(!i->level &&
-               !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
-               test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
-               i->k->k.p.snapshot &&
-               bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
-}
-
-static noinline int
-bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags,
-                                  unsigned long trace_ip)
-{
-       return drop_locks_do(trans,
-               bch2_journal_preres_get(&trans->c->journal,
-                       &trans->journal_preres,
-                       trans->journal_preres_u64s,
-                       (flags & BCH_WATERMARK_MASK)));
-}
-
-static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
-                                                     unsigned flags)
-{
-       return bch2_journal_res_get(&trans->c->journal, &trans->journal_res,
-                                   trans->journal_u64s, flags);
-}
-
-#define JSET_ENTRY_LOG_U64s            4
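-
-/*
- * Transaction names are logged to the journal as BCH_JSET_ENTRY_log entries
- * for debugging; four u64s gives 32 bytes for the (truncated) trans->fn
- * string:
- */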
-
-static noinline void journal_transaction_name(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct journal *j = &c->journal;
-       struct jset_entry *entry =
-               bch2_journal_add_entry(j, &trans->journal_res,
-                                      BCH_JSET_ENTRY_log, 0, 0,
-                                      JSET_ENTRY_LOG_U64s);
-       struct jset_entry_log *l =
-               container_of(entry, struct jset_entry_log, entry);
-
-       strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
-}
-
-static inline int btree_key_can_insert(struct btree_trans *trans,
-                                      struct btree *b, unsigned u64s)
-{
-       struct bch_fs *c = trans->c;
-
-       if (!bch2_btree_node_insert_fits(c, b, u64s))
-               return -BCH_ERR_btree_insert_btree_node_full;
-
-       return 0;
-}
-
-static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags,
-                                      struct btree_path *path, unsigned u64s)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_cached *ck = (void *) path->l[0].b;
-       struct btree_insert_entry *i;
-       unsigned new_u64s;
-       struct bkey_i *new_k;
-
-       EBUG_ON(path->level);
-
-       if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
-           bch2_btree_key_cache_must_wait(c) &&
-           !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
-               return -BCH_ERR_btree_insert_need_journal_reclaim;
-
-       /*
-        * bch2_varint_decode can read past the end of the buffer by at most 7
-        * bytes (it won't be used):
-        */
-       u64s += 1;
-
-       if (u64s <= ck->u64s)
-               return 0;
-
-       new_u64s        = roundup_pow_of_two(u64s);
-       new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
-       if (!new_k) {
-               bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
-                       bch2_btree_ids[path->btree_id], new_u64s);
-               return -BCH_ERR_ENOMEM_btree_key_cache_insert;
-       }
-
-       trans_for_each_update(trans, i)
-               if (i->old_v == &ck->k->v)
-                       i->old_v = &new_k->v;
-
-       ck->u64s        = new_u64s;
-       ck->k           = new_k;
-       return 0;
-}
-
-/* Triggers: */
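-/*
- * Two flavors of triggers: transactional triggers (run_one_trans_trigger())
- * run before write locks are taken and may themselves queue further btree
- * updates; mem/atomic triggers (run_one_mem_trigger()) update in-memory
- * accounting and run at commit time, with the relevant nodes write locked:
- */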
-
-static int run_one_mem_trigger(struct btree_trans *trans,
-                              struct btree_insert_entry *i,
-                              unsigned flags)
-{
-       struct bkey_s_c old = { &i->old_k, i->old_v };
-       struct bkey_i *new = i->k;
-       const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
-       const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-       int ret;
-
-       verify_update_old_key(trans, i);
-
-       if (unlikely(flags & BTREE_TRIGGER_NORUN))
-               return 0;
-
-       if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id))
-               return 0;
-
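-       /*
-        * Key types flagged BTREE_TRIGGER_WANTS_OLD_AND_NEW take the old and
-        * new key in a single trigger call; otherwise we make two calls,
-        * substituting a deleted key for the missing half:
-        */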
-       if (old_ops->atomic_trigger == new_ops->atomic_trigger &&
-           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
-               ret   = bch2_mark_key(trans, i->btree_id, i->level,
-                               old, bkey_i_to_s_c(new),
-                               BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
-       } else {
-               struct bkey             _deleted = KEY(0, 0, 0);
-               struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
-
-               _deleted.p = i->path->pos;
-
-               ret   = bch2_mark_key(trans, i->btree_id, i->level,
-                               deleted, bkey_i_to_s_c(new),
-                               BTREE_TRIGGER_INSERT|flags) ?:
-                       bch2_mark_key(trans, i->btree_id, i->level,
-                               old, deleted,
-                               BTREE_TRIGGER_OVERWRITE|flags);
-       }
-
-       return ret;
-}
-
-static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
-                                bool overwrite)
-{
-       /*
-        * Transactional triggers create new btree_insert_entries, so we can't
-        * pass them a pointer to a btree_insert_entry, that memory is going to
-        * move:
-        */
-       struct bkey old_k = i->old_k;
-       struct bkey_s_c old = { &old_k, i->old_v };
-       const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
-       const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
-
-       verify_update_old_key(trans, i);
-
-       if ((i->flags & BTREE_TRIGGER_NORUN) ||
-           !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
-               return 0;
-
-       if (!i->insert_trigger_run &&
-           !i->overwrite_trigger_run &&
-           old_ops->trans_trigger == new_ops->trans_trigger &&
-           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
-               i->overwrite_trigger_run = true;
-               i->insert_trigger_run = true;
-               return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
-                                          BTREE_TRIGGER_INSERT|
-                                          BTREE_TRIGGER_OVERWRITE|
-                                          i->flags) ?: 1;
-       } else if (overwrite && !i->overwrite_trigger_run) {
-               i->overwrite_trigger_run = true;
-               return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
-       } else if (!overwrite && !i->insert_trigger_run) {
-               i->insert_trigger_run = true;
-               return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
-       } else {
-               return 0;
-       }
-}
-
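-/*
- * run_one_trans_trigger() returns 1 if it actually ran a trigger - the loop
- * in run_btree_triggers() keeps iterating until no trigger fires, since
- * running a trigger may append further updates:
- */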
-static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
-                             struct btree_insert_entry *btree_id_start)
-{
-       struct btree_insert_entry *i;
-       bool trans_trigger_run;
-       int ret, overwrite;
-
-       for (overwrite = 1; overwrite >= 0; --overwrite) {
-
-               /*
-                * Running triggers will append more updates to the list of updates as
-                * we're walking it:
-                */
-               do {
-                       trans_trigger_run = false;
-
-                       for (i = btree_id_start;
-                            i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
-                            i++) {
-                               if (i->btree_id != btree_id)
-                                       continue;
-
-                               ret = run_one_trans_trigger(trans, i, overwrite);
-                               if (ret < 0)
-                                       return ret;
-                               if (ret)
-                                       trans_trigger_run = true;
-                       }
-               } while (trans_trigger_run);
-       }
-
-       return 0;
-}
-
-static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
-{
-       struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
-       unsigned btree_id = 0;
-       int ret = 0;
-
-       /*
-        * For a given btree, this algorithm runs insert triggers before
-        * overwrite triggers: this is so that when extents are being moved
-        * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before
-        * they are re-added.
-        */
-       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
-               if (btree_id == BTREE_ID_alloc)
-                       continue;
-
-               while (btree_id_start < trans->updates + trans->nr_updates &&
-                      btree_id_start->btree_id < btree_id)
-                       btree_id_start++;
-
-               ret = run_btree_triggers(trans, btree_id, btree_id_start);
-               if (ret)
-                       return ret;
-       }
-
-       trans_for_each_update(trans, i) {
-               if (i->btree_id > BTREE_ID_alloc)
-                       break;
-               if (i->btree_id == BTREE_ID_alloc) {
-                       ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
-                       if (ret)
-                               return ret;
-                       break;
-               }
-       }
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans_for_each_update(trans, i)
-               BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
-                      (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
-                      (!i->insert_trigger_run || !i->overwrite_trigger_run));
-#endif
-       return 0;
-}
-
-static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i;
-       int ret = 0;
-
-       trans_for_each_update(trans, i) {
-               /*
-                * XXX: synchronization of cached update triggers with gc
-                * XXX: synchronization of interior node updates with gc
-                */
-               BUG_ON(i->cached || i->level);
-
-               if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
-                       ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
-                       if (ret)
-                               break;
-               }
-       }
-
-       return ret;
-}
-
-static inline int
-bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
-                              struct btree_insert_entry **stopped_at,
-                              unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i;
-       struct btree_write_buffered_key *wb;
-       struct btree_trans_commit_hook *h;
-       unsigned u64s = 0;
-       bool marking = false;
-       int ret;
-
-       if (race_fault()) {
-               trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
-               return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject);
-       }
-
-       /*
-        * Check if the insert will fit in the leaf node with the write lock
-        * held, otherwise another thread could write the node changing the
-        * amount of space available:
-        */
-
-       prefetch(&trans->c->journal.flags);
-
-       trans_for_each_update(trans, i) {
-               /* Multiple inserts might go to same leaf: */
-               if (!same_leaf_as_prev(trans, i))
-                       u64s = 0;
-
-               u64s += i->k->k.u64s;
-               ret = !i->cached
-                       ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
-                       : btree_key_can_insert_cached(trans, flags, i->path, u64s);
-               if (ret) {
-                       *stopped_at = i;
-                       return ret;
-               }
-
-               if (btree_node_type_needs_gc(i->bkey_type))
-                       marking = true;
-       }
-
-       if (trans->nr_wb_updates &&
-           trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
-               return -BCH_ERR_btree_insert_need_flush_buffer;
-
-       /*
-        * Don't get journal reservation until after we know insert will
-        * succeed:
-        */
-       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-               ret = bch2_trans_journal_res_get(trans,
-                               (flags & BCH_WATERMARK_MASK)|
-                               JOURNAL_RES_GET_NONBLOCK);
-               if (ret)
-                       return ret;
-
-               if (unlikely(trans->journal_transaction_names))
-                       journal_transaction_name(trans);
-       } else {
-               trans->journal_res.seq = c->journal.replay_journal_seq;
-       }
-
-       /*
-        * Not allowed to fail after we've gotten our journal reservation - we
-        * have to use it:
-        */
-
-       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
-           !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
-               if (bch2_journal_seq_verify)
-                       trans_for_each_update(trans, i)
-                               i->k->k.version.lo = trans->journal_res.seq;
-               else if (bch2_inject_invalid_keys)
-                       trans_for_each_update(trans, i)
-                               i->k->k.version = MAX_VERSION;
-       }
-
-       if (trans->fs_usage_deltas &&
-           bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
-               return -BCH_ERR_btree_insert_need_mark_replicas;
-
-       if (trans->nr_wb_updates) {
-               EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
-
-               ret = bch2_btree_insert_keys_write_buffer(trans);
-               if (ret)
-                       goto revert_fs_usage;
-       }
-
-       h = trans->hooks;
-       while (h) {
-               ret = h->fn(trans, h);
-               if (ret)
-                       goto revert_fs_usage;
-               h = h->next;
-       }
-
-       trans_for_each_update(trans, i)
-               if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
-                       ret = run_one_mem_trigger(trans, i, i->flags);
-                       if (ret)
-                               goto fatal_err;
-               }
-
-       if (unlikely(c->gc_pos.phase)) {
-               ret = bch2_trans_commit_run_gc_triggers(trans);
-               if (ret)
-                       goto fatal_err;
-       }
-
-       if (unlikely(trans->extra_journal_entries.nr)) {
-               memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
-                                 trans->extra_journal_entries.data,
-                                 trans->extra_journal_entries.nr);
-
-               trans->journal_res.offset       += trans->extra_journal_entries.nr;
-               trans->journal_res.u64s         -= trans->extra_journal_entries.nr;
-       }
-
-       if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-               struct journal *j = &c->journal;
-               struct jset_entry *entry;
-
-               trans_for_each_update(trans, i) {
-                       if (i->key_cache_already_flushed)
-                               continue;
-
-                       if (i->flags & BTREE_UPDATE_NOJOURNAL)
-                               continue;
-
-                       verify_update_old_key(trans, i);
-
-                       if (trans->journal_transaction_names) {
-                               entry = bch2_journal_add_entry(j, &trans->journal_res,
-                                                      BCH_JSET_ENTRY_overwrite,
-                                                      i->btree_id, i->level,
-                                                      i->old_k.u64s);
-                               bkey_reassemble(&entry->start[0],
-                                               (struct bkey_s_c) { &i->old_k, i->old_v });
-                       }
-
-                       entry = bch2_journal_add_entry(j, &trans->journal_res,
-                                              BCH_JSET_ENTRY_btree_keys,
-                                              i->btree_id, i->level,
-                                              i->k->k.u64s);
-                       bkey_copy(&entry->start[0], i->k);
-               }
-
-               trans_for_each_wb_update(trans, wb) {
-                       entry = bch2_journal_add_entry(j, &trans->journal_res,
-                                              BCH_JSET_ENTRY_btree_keys,
-                                              wb->btree, 0,
-                                              wb->k.k.u64s);
-                       bkey_copy(&entry->start[0], &wb->k);
-               }
-
-               if (trans->journal_seq)
-                       *trans->journal_seq = trans->journal_res.seq;
-       }
-
-       trans_for_each_update(trans, i) {
-               i->k->k.needs_whiteout = false;
-
-               if (!i->cached) {
-                       u64 seq = trans->journal_res.seq;
-
-                       if (i->flags & BTREE_UPDATE_PREJOURNAL)
-                               seq = i->seq;
-
-                       bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
-               } else if (!i->key_cache_already_flushed)
-                       bch2_btree_insert_key_cached(trans, flags, i);
-               else {
-                       bch2_btree_key_cache_drop(trans, i->path);
-                       btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
-               }
-       }
-
-       return 0;
-fatal_err:
-       bch2_fatal_error(c);
-revert_fs_usage:
-       if (trans->fs_usage_deltas)
-               bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
-       return ret;
-}
-
-static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i)
-{
-       while (--i >= trans->updates) {
-               if (same_leaf_as_prev(trans, i))
-                       continue;
-
-               bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
-       }
-
-       trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
-       return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write);
-}
-
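-/*
- * trans->updates is kept sorted (see bch2_trans_update_by_path()), so write
- * locks are always taken in a consistent order; if a lock attempt fails we
- * unwind whatever we took and restart the transaction:
- */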
-static inline int trans_lock_write(struct btree_trans *trans)
-{
-       struct btree_insert_entry *i;
-
-       trans_for_each_update(trans, i) {
-               if (same_leaf_as_prev(trans, i))
-                       continue;
-
-               if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
-                       return trans_lock_write_fail(trans, i);
-
-               if (!i->cached)
-                       bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
-       }
-
-       return 0;
-}
-
-static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
-{
-       struct btree_insert_entry *i;
-       struct btree_write_buffered_key *wb;
-
-       trans_for_each_update(trans, i)
-               bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-
-       trans_for_each_wb_update(trans, wb)
-               bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
-}
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags,
-                                                  struct btree_insert_entry *i,
-                                                  struct printbuf *err)
-{
-       struct bch_fs *c = trans->c;
-       int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
-
-       printbuf_reset(err);
-       prt_printf(err, "invalid bkey on insert from %s -> %ps",
-                  trans->fn, (void *) i->ip_allocated);
-       prt_newline(err);
-       printbuf_indent_add(err, 2);
-
-       bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k));
-       prt_newline(err);
-
-       bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
-                         i->bkey_type, rw, err);
-       bch2_print_string_as_lines(KERN_ERR, err->buf);
-
-       bch2_inconsistent_error(c);
-       bch2_dump_trans_updates(trans);
-       printbuf_exit(err);
-
-       return -EINVAL;
-}
-#endif
-
-/*
- * Get journal reservation, take write locks, and attempt to do btree update(s):
- */
-static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
-                                      struct btree_insert_entry **stopped_at,
-                                      unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i;
-       int ret = 0, u64s_delta = 0;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans_for_each_update(trans, i) {
-               struct printbuf buf = PRINTBUF;
-               enum bkey_invalid_flags invalid_flags = 0;
-
-               if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
-                       invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
-
-               if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
-                                              i->bkey_type, invalid_flags, &buf)))
-                       ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
-               btree_insert_entry_checks(trans, i);
-               printbuf_exit(&buf);
-
-               if (ret)
-                       return ret;
-       }
-#endif
-
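-       /*
-        * u64s_delta is the net change in leaf node size from this commit; if
-        * a leaf isn't growing, check whether it should be merged with a
-        * sibling before we take write locks:
-        */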
-       trans_for_each_update(trans, i) {
-               if (i->cached)
-                       continue;
-
-               u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
-               u64s_delta -= i->old_btree_u64s;
-
-               if (!same_leaf_as_next(trans, i)) {
-                       if (u64s_delta <= 0) {
-                               ret = bch2_foreground_maybe_merge(trans, i->path,
-                                                       i->level, flags);
-                               if (unlikely(ret))
-                                       return ret;
-                       }
-
-                       u64s_delta = 0;
-               }
-       }
-
-       ret = bch2_journal_preres_get(&c->journal,
-                       &trans->journal_preres, trans->journal_preres_u64s,
-                       (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK);
-       if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked))
-               ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip);
-       if (unlikely(ret))
-               return ret;
-
-       ret = trans_lock_write(trans);
-       if (unlikely(ret))
-               return ret;
-
-       ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip);
-
-       if (!ret && unlikely(trans->journal_replay_not_finished))
-               bch2_drop_overwrites_from_journal(trans);
-
-       trans_for_each_update(trans, i)
-               if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_unlock_write_inlined(trans, i->path,
-                                                       insert_l(i)->b);
-
-       if (!ret && trans->journal_pin)
-               bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
-                                    trans->journal_pin, NULL);
-
-       /*
-        * Drop journal reservation after dropping write locks, since dropping
-        * the journal reservation may kick off a journal write:
-        */
-       bch2_journal_res_put(&c->journal, &trans->journal_res);
-
-       if (unlikely(ret))
-               return ret;
-
-       bch2_trans_downgrade(trans);
-
-       return 0;
-}
-
-static int journal_reclaim_wait_done(struct bch_fs *c)
-{
-       int ret = bch2_journal_error(&c->journal) ?:
-               !bch2_btree_key_cache_must_wait(c);
-
-       if (!ret)
-               journal_reclaim_kick(&c->journal);
-       return ret;
-}
-
-static noinline
-int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
-                           struct btree_insert_entry *i,
-                           int ret, unsigned long trace_ip)
-{
-       struct bch_fs *c = trans->c;
-
-       switch (ret) {
-       case -BCH_ERR_btree_insert_btree_node_full:
-               ret = bch2_btree_split_leaf(trans, i->path, flags);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
-               break;
-       case -BCH_ERR_btree_insert_need_mark_replicas:
-               ret = drop_locks_do(trans,
-                       bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas));
-               break;
-       case -BCH_ERR_journal_res_get_blocked:
-               /*
-                * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
-                * flag
-                */
-               if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
-                   (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
-                       ret = -BCH_ERR_journal_reclaim_would_deadlock;
-                       break;
-               }
-
-               ret = drop_locks_do(trans,
-                       bch2_trans_journal_res_get(trans,
-                                       (flags & BCH_WATERMARK_MASK)|
-                                       JOURNAL_RES_GET_CHECK));
-               break;
-       case -BCH_ERR_btree_insert_need_journal_reclaim:
-               bch2_trans_unlock(trans);
-
-               trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip);
-
-               wait_event_freezable(c->journal.reclaim_wait,
-                                    (ret = journal_reclaim_wait_done(c)));
-               if (ret < 0)
-                       break;
-
-               ret = bch2_trans_relock(trans);
-               break;
-       case -BCH_ERR_btree_insert_need_flush_buffer: {
-               struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-               ret = 0;
-
-               if (wb->state.nr > wb->size * 3 / 4) {
-                       bch2_trans_unlock(trans);
-                       mutex_lock(&wb->flush_lock);
-
-                       if (wb->state.nr > wb->size * 3 / 4) {
-                               bch2_trans_begin(trans);
-                               ret = __bch2_btree_write_buffer_flush(trans,
-                                               flags|BTREE_INSERT_NOCHECK_RW, true);
-                               if (!ret) {
-                                       trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
-                                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
-                               }
-                       } else {
-                               mutex_unlock(&wb->flush_lock);
-                               ret = bch2_trans_relock(trans);
-                       }
-               }
-               break;
-       }
-       default:
-               BUG_ON(ret >= 0);
-               break;
-       }
-
-       BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
-
-       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
-                               !(flags & BTREE_INSERT_NOWAIT) &&
-                               (flags & BTREE_INSERT_NOFAIL), c,
-               "%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
-
-       return ret;
-}
-
-static noinline int
-bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       int ret;
-
-       if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
-           test_bit(BCH_FS_STARTED, &c->flags))
-               return -BCH_ERR_erofs_trans_commit;
-
-       ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
-       if (ret)
-               return ret;
-
-       bch2_write_ref_get(c, BCH_WRITE_REF_trans);
-       return 0;
-}
-
-/*
- * This is for updates done in the early part of fsck - btree_gc - before we've
- * gone RW. we only add the new key to the list of keys for journal replay to
- * do.
- */
-static noinline int
-do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i;
-       int ret = 0;
-
-       trans_for_each_update(trans, i) {
-               ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
-               if (ret)
-                       break;
-       }
-
-       return ret;
-}
-
-int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i = NULL;
-       struct btree_write_buffered_key *wb;
-       unsigned u64s;
-       int ret = 0;
-
-       if (!trans->nr_updates &&
-           !trans->nr_wb_updates &&
-           !trans->extra_journal_entries.nr)
-               goto out_reset;
-
-       if (flags & BTREE_INSERT_GC_LOCK_HELD)
-               lockdep_assert_held(&c->gc_lock);
-
-       ret = bch2_trans_commit_run_triggers(trans);
-       if (ret)
-               goto out_reset;
-
-       if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
-               ret = do_bch2_trans_commit_to_journal_replay(trans);
-               goto out_reset;
-       }
-
-       if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
-           unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
-               ret = bch2_trans_commit_get_rw_cold(trans, flags);
-               if (ret)
-                       goto out_reset;
-       }
-
-       if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
-           mutex_trylock(&c->btree_write_buffer.flush_lock)) {
-               bch2_trans_begin(trans);
-               bch2_trans_unlock(trans);
-
-               ret = __bch2_btree_write_buffer_flush(trans,
-                                       flags|BTREE_INSERT_NOCHECK_RW, true);
-               if (!ret) {
-                       trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
-                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
-               }
-               goto out;
-       }
-
-       EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
-
-       memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
-
-       trans->journal_u64s             = trans->extra_journal_entries.nr;
-       trans->journal_preres_u64s      = 0;
-
-       trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-
-       if (trans->journal_transaction_names)
-               trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
-
-       trans_for_each_update(trans, i) {
-               EBUG_ON(!i->path->should_be_locked);
-
-               ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
-               if (unlikely(ret))
-                       goto out;
-
-               EBUG_ON(!btree_node_intent_locked(i->path, i->level));
-
-               if (i->key_cache_already_flushed)
-                       continue;
-
-               /* we're going to journal the key being updated: */
-               u64s = jset_u64s(i->k->k.u64s);
-               if (i->cached &&
-                   likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
-                       trans->journal_preres_u64s += u64s;
-
-               if (i->flags & BTREE_UPDATE_NOJOURNAL)
-                       continue;
-
-               trans->journal_u64s += u64s;
-
-               /* and we're also going to log the overwrite: */
-               if (trans->journal_transaction_names)
-                       trans->journal_u64s += jset_u64s(i->old_k.u64s);
-       }
-
-       trans_for_each_wb_update(trans, wb)
-               trans->journal_u64s += jset_u64s(wb->k.k.u64s);
-
-       if (trans->extra_journal_res) {
-               ret = bch2_disk_reservation_add(c, trans->disk_res,
-                               trans->extra_journal_res,
-                               (flags & BTREE_INSERT_NOFAIL)
-                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
-               if (ret)
-                       goto err;
-       }
-retry:
-       bch2_trans_verify_not_in_restart(trans);
-       memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-
-       ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
-
-       /* make sure we didn't drop or screw up locks: */
-       bch2_trans_verify_locks(trans);
-
-       if (ret)
-               goto err;
-
-       trace_and_count(c, transaction_commit, trans, _RET_IP_);
-out:
-       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-
-       if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
-               bch2_write_ref_put(c, BCH_WRITE_REF_trans);
-out_reset:
-       bch2_trans_reset_updates(trans);
-
-       return ret;
-err:
-       ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
-       if (ret)
-               goto out;
-
-       goto retry;
-}
-
-static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans,
-                                         enum btree_id id,
-                                         struct bpos pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, id, pos,
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       while (1) {
-               k = bch2_btree_iter_prev(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               if (!k.k)
-                       break;
-
-               if (!bkey_eq(pos, k.k->p))
-                       break;
-
-               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
-                       ret = 1;
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-static inline int check_pos_snapshot_overwritten(struct btree_trans *trans,
-                                         enum btree_id id,
-                                         struct bpos pos)
-{
-       if (!btree_type_has_snapshots(id) ||
-           bch2_snapshot_is_leaf(trans->c, pos.snapshot))
-               return 0;
-
-       return __check_pos_snapshot_overwritten(trans, id, pos);
-}
-
-static noinline int extent_front_merge(struct btree_trans *trans,
-                                      struct btree_iter *iter,
-                                      struct bkey_s_c k,
-                                      struct bkey_i **insert,
-                                      enum btree_update_flags flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_i *update;
-       int ret;
-
-       update = bch2_bkey_make_mut_noupdate(trans, k);
-       ret = PTR_ERR_OR_ZERO(update);
-       if (ret)
-               return ret;
-
-       if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert)))
-               return 0;
-
-       ret =   check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?:
-               check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p);
-       if (ret < 0)
-               return ret;
-       if (ret)
-               return 0;
-
-       ret = bch2_btree_delete_at(trans, iter, flags);
-       if (ret)
-               return ret;
-
-       *insert = update;
-       return 0;
-}
-
-static noinline int extent_back_merge(struct btree_trans *trans,
-                                     struct btree_iter *iter,
-                                     struct bkey_i *insert,
-                                     struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       int ret;
-
-       ret =   check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?:
-               check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p);
-       if (ret < 0)
-               return ret;
-       if (ret)
-               return 0;
-
-       bch2_bkey_merge(c, bkey_i_to_s(insert), k);
-       return 0;
-}
-
-/*
- * When deleting, check if we need to emit a whiteout (because we're overwriting
- * something in an ancestor snapshot)
- */
-static int need_whiteout_for_snapshot(struct btree_trans *trans,
-                                     enum btree_id btree_id, struct bpos pos)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u32 snapshot = pos.snapshot;
-       int ret;
-
-       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
-               return 0;
-
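-       /*
-        * Ancestor snapshots have strictly greater snapshot ids, so scanning
-        * forwards from pos.snapshot + 1 visits only potential ancestors at
-        * this position:
-        */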
-       pos.snapshot++;
-
-       for_each_btree_key_norestart(trans, iter, btree_id, pos,
-                          BTREE_ITER_ALL_SNAPSHOTS|
-                          BTREE_ITER_NOPRESERVE, k, ret) {
-               if (!bkey_eq(k.k->p, pos))
-                       break;
-
-               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
-                                             k.k->p.snapshot)) {
-                       ret = !bkey_whiteout(k.k);
-                       break;
-               }
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
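-/*
- * Scan the versions of the key at @old_pos belonging to descendant snapshots
- * of @old_pos.snapshot; for each one, insert a whiteout at @new_pos in that
- * snapshot unless that snapshot already sees a key there (skipping snapshots
- * already covered by an earlier whiteout):
- */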
-int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
-                                  enum btree_id id,
-                                  struct bpos old_pos,
-                                  struct bpos new_pos)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter old_iter, new_iter = { NULL };
-       struct bkey_s_c old_k, new_k;
-       snapshot_id_list s;
-       struct bkey_i *update;
-       int ret;
-
-       if (!bch2_snapshot_has_children(c, old_pos.snapshot))
-               return 0;
-
-       darray_init(&s);
-
-       bch2_trans_iter_init(trans, &old_iter, id, old_pos,
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       while ((old_k = bch2_btree_iter_prev(&old_iter)).k &&
-              !(ret = bkey_err(old_k)) &&
-              bkey_eq(old_pos, old_k.k->p)) {
-               struct bpos whiteout_pos =
-                       SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
-
-               if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
-                   snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
-                       continue;
-
-               new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
-                                          BTREE_ITER_NOT_EXTENTS|
-                                          BTREE_ITER_INTENT);
-               ret = bkey_err(new_k);
-               if (ret)
-                       break;
-
-               if (new_k.k->type == KEY_TYPE_deleted) {
-                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
-                       ret = PTR_ERR_OR_ZERO(update);
-                       if (ret)
-                               break;
-
-                       bkey_init(&update->k);
-                       update->k.p             = whiteout_pos;
-                       update->k.type          = KEY_TYPE_whiteout;
-
-                       ret = bch2_trans_update(trans, &new_iter, update,
-                                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-               }
-               bch2_trans_iter_exit(trans, &new_iter);
-
-               ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &new_iter);
-       bch2_trans_iter_exit(trans, &old_iter);
-       darray_exit(&s);
-
-       return ret;
-}
-
-int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
-                                      struct btree_iter *iter,
-                                      enum btree_update_flags flags,
-                                      struct bkey_s_c old,
-                                      struct bkey_s_c new)
-{
-       enum btree_id btree_id = iter->btree_id;
-       struct bkey_i *update;
-       struct bpos new_start = bkey_start_pos(new.k);
-       bool front_split = bkey_lt(bkey_start_pos(old.k), new_start);
-       bool back_split  = bkey_gt(old.k->p, new.k->p);
-       int ret = 0, compressed_sectors;
-
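-       /*
-        * Overlap cases, old = existing extent, new = extent being inserted:
-        *
-        *   front split:        |-------- old --------|
-        *                               |---- new ----|
-        *
-        *   back split:         |-------- old --------|
-        *                       |---- new ----|
-        */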
-       /*
-        * If we're going to be splitting a compressed extent, note it
-        * so that __bch2_trans_commit() can increase our disk
-        * reservation:
-        */
-       if (((front_split && back_split) ||
-            ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) &&
-           (compressed_sectors = bch2_bkey_sectors_compressed(old)))
-               trans->extra_journal_res += compressed_sectors;
-
-       if (front_split) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_back(new_start, update);
-
-               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
-                                       old.k->p, update->k.p) ?:
-                       bch2_btree_insert_nonextent(trans, btree_id, update,
-                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
-               if (ret)
-                       return ret;
-       }
-
-       /* If we're overwriting in a different snapshot - middle split: */
-       if (old.k->p.snapshot != new.k->p.snapshot &&
-           (front_split || back_split)) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_front(new_start, update);
-               bch2_cut_back(new.k->p, update);
-
-               ret =   bch2_insert_snapshot_whiteouts(trans, btree_id,
-                                       old.k->p, update->k.p) ?:
-                       bch2_btree_insert_nonextent(trans, btree_id, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
-               if (ret)
-                       return ret;
-       }
-
-       if (bkey_le(old.k->p, new.k->p)) {
-               update = bch2_trans_kmalloc(trans, sizeof(*update));
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bkey_init(&update->k);
-               update->k.p = old.k->p;
-               update->k.p.snapshot = new.k->p.snapshot;
-
-               if (new.k->p.snapshot != old.k->p.snapshot) {
-                       update->k.type = KEY_TYPE_whiteout;
-               } else if (btree_type_has_snapshots(btree_id)) {
-                       ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p);
-                       if (ret < 0)
-                               return ret;
-                       if (ret)
-                               update->k.type = KEY_TYPE_whiteout;
-               }
-
-               ret = bch2_btree_insert_nonextent(trans, btree_id, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags);
-               if (ret)
-                       return ret;
-       }
-
-       if (back_split) {
-               update = bch2_bkey_make_mut_noupdate(trans, old);
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       return ret;
-
-               bch2_cut_front(new.k->p, update);
-
-               ret = bch2_trans_update_by_path(trans, iter->path, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
-                                         flags, _RET_IP_);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
-static int bch2_trans_update_extent(struct btree_trans *trans,
-                                   struct btree_iter *orig_iter,
-                                   struct bkey_i *insert,
-                                   enum btree_update_flags flags)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       enum btree_id btree_id = orig_iter->btree_id;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
-                            BTREE_ITER_INTENT|
-                            BTREE_ITER_WITH_UPDATES|
-                            BTREE_ITER_NOT_EXTENTS);
-       k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
-       if ((ret = bkey_err(k)))
-               goto err;
-       if (!k.k)
-               goto out;
-
-       if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) {
-               if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
-                       ret = extent_front_merge(trans, &iter, k, &insert, flags);
-                       if (ret)
-                               goto err;
-               }
-
-               goto next;
-       }
-
-       while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) {
-               bool done = bkey_lt(insert->k.p, k.k->p);
-
-               ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
-               if (ret)
-                       goto err;
-
-               if (done)
-                       goto out;
-next:
-               bch2_btree_iter_advance(&iter);
-               k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX));
-               if ((ret = bkey_err(k)))
-                       goto err;
-               if (!k.k)
-                       goto out;
-       }
-
-       if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
-               ret = extent_back_merge(trans, &iter, insert, k);
-               if (ret)
-                       goto err;
-       }
-out:
-       if (!bkey_deleted(&insert->k))
-               ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-
-       return ret;
-}
-
-static noinline int flush_new_cached_update(struct btree_trans *trans,
-                                           struct btree_path *path,
-                                           struct btree_insert_entry *i,
-                                           enum btree_update_flags flags,
-                                           unsigned long ip)
-{
-       struct btree_path *btree_path;
-       struct bkey k;
-       int ret;
-
-       btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
-                                  BTREE_ITER_INTENT, _THIS_IP_);
-       ret = bch2_btree_path_traverse(trans, btree_path, 0);
-       if (ret)
-               goto out;
-
-       /*
-        * The old key in the insert entry might actually refer to an existing
-        * key in the btree that has been deleted from cache and not yet
-        * flushed. Check for this and skip the flush so we don't run triggers
-        * against a stale key.
-        */
-       bch2_btree_path_peek_slot_exact(btree_path, &k);
-       if (!bkey_deleted(&k))
-               goto out;
-
-       i->key_cache_already_flushed = true;
-       i->flags |= BTREE_TRIGGER_NORUN;
-
-       btree_path_set_should_be_locked(btree_path);
-       ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
-out:
-       bch2_path_put(trans, btree_path, true);
-       return ret;
-}
-
-static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
-                         struct bkey_i *k, enum btree_update_flags flags,
-                         unsigned long ip)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_insert_entry *i, n;
-       u64 seq = 0;
-       int cmp;
-
-       EBUG_ON(!path->should_be_locked);
-       EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
-       EBUG_ON(!bpos_eq(k->k.p, path->pos));
-
-       /*
-        * The transaction journal res hasn't been allocated at this point.
-        * That occurs at commit time. Reuse the seq field to pass in the seq
-        * of a prejournaled key.
-        */
-       if (flags & BTREE_UPDATE_PREJOURNAL)
-               seq = trans->journal_res.seq;
-
-       n = (struct btree_insert_entry) {
-               .flags          = flags,
-               .bkey_type      = __btree_node_type(path->level, path->btree_id),
-               .btree_id       = path->btree_id,
-               .level          = path->level,
-               .cached         = path->cached,
-               .path           = path,
-               .k              = k,
-               .seq            = seq,
-               .ip_allocated   = ip,
-       };
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans_for_each_update(trans, i)
-               BUG_ON(i != trans->updates &&
-                      btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
-
-       /*
-        * Pending updates are kept sorted: first, find position of new update,
-        * then delete/trim any updates the new update overwrites:
-        */
-       trans_for_each_update(trans, i) {
-               cmp = btree_insert_entry_cmp(&n, i);
-               if (cmp <= 0)
-                       break;
-       }
-
-       if (!cmp && i < trans->updates + trans->nr_updates) {
-               EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
-
-               bch2_path_put(trans, i->path, true);
-               i->flags        = n.flags;
-               i->cached       = n.cached;
-               i->k            = n.k;
-               i->path         = n.path;
-               i->seq          = n.seq;
-               i->ip_allocated = n.ip_allocated;
-       } else {
-               array_insert_item(trans->updates, trans->nr_updates,
-                                 i - trans->updates, n);
-
-               i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v;
-               i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0;
-
-               if (unlikely(trans->journal_replay_not_finished)) {
-                       struct bkey_i *j_k =
-                               bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p);
-
-                       if (j_k) {
-                               i->old_k = j_k->k;
-                               i->old_v = &j_k->v;
-                       }
-               }
-       }
-
-       __btree_path_get(i->path, true);
-
-       /*
-        * If a key is present in the key cache, it must also exist in the
-        * btree - this is necessary for cache coherency. When iterating over
-        * a btree that's cached in the key cache, the btree iter code checks
-        * the key cache - but the key has to exist in the btree for that to
-        * work:
-        */
-       if (path->cached && bkey_deleted(&i->old_k))
-               return flush_new_cached_update(trans, path, i, flags, ip);
-
-       return 0;
-}
-
-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
-                                  struct bkey_i *k, enum btree_update_flags flags)
-{
-       struct btree_path *path = iter->update_path ?: iter->path;
-       struct bkey_cached *ck;
-       int ret;
-
-       if (iter->flags & BTREE_ITER_IS_EXTENTS)
-               return bch2_trans_update_extent(trans, iter, k, flags);
-
-       if (bkey_deleted(&k->k) &&
-           !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
-           (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
-               ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
-               if (unlikely(ret < 0))
-                       return ret;
-
-               if (ret)
-                       k->k.type = KEY_TYPE_whiteout;
-       }
-
-       /*
-        * Ensure that updates to cached btrees go to the key cache:
-        */
-       if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
-           !path->cached &&
-           !path->level &&
-           btree_id_cached(trans->c, path->btree_id)) {
-               if (!iter->key_cache_path ||
-                   !iter->key_cache_path->should_be_locked ||
-                   !bpos_eq(iter->key_cache_path->pos, k->k.p)) {
-                       if (!iter->key_cache_path)
-                               iter->key_cache_path =
-                                       bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
-                                                     BTREE_ITER_INTENT|
-                                                     BTREE_ITER_CACHED, _THIS_IP_);
-
-                       iter->key_cache_path =
-                               bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
-                                                       iter->flags & BTREE_ITER_INTENT,
-                                                       _THIS_IP_);
-
-                       ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
-                                                      BTREE_ITER_CACHED);
-                       if (unlikely(ret))
-                               return ret;
-
-                       ck = (void *) iter->key_cache_path->l[0].b;
-
-                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-                               trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
-                               return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
-                       }
-
-                       btree_path_set_should_be_locked(iter->key_cache_path);
-               }
-
-               path = iter->key_cache_path;
-       }
-
-       return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
-}
-
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
-                                      struct btree_iter *iter, struct bkey_i *k,
-                                      enum btree_update_flags flags)
-{
-       trans->journal_res.seq = seq;
-       return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
-                                                BTREE_UPDATE_PREJOURNAL);
-}
-
-int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
-                                           enum btree_id btree,
-                                           struct bkey_i *k)
-{
-       struct btree_write_buffered_key *i;
-       int ret;
-
-       EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
-       EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
-       trans_for_each_wb_update(trans, i) {
-               if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
-                       bkey_copy(&i->k, k);
-                       return 0;
-               }
-       }
-
-       if (!trans->wb_updates ||
-           trans->nr_wb_updates == trans->wb_updates_size) {
-               struct btree_write_buffered_key *u;
-
-               if (trans->nr_wb_updates == trans->wb_updates_size) {
-                       struct btree_transaction_stats *s = btree_trans_stats(trans);
-
-                       BUG_ON(trans->wb_updates_size > U8_MAX / 2);
-                       trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
-                       if (s)
-                               s->wb_updates_size = trans->wb_updates_size;
-               }
-
-               u = bch2_trans_kmalloc_nomemzero(trans,
-                                       trans->wb_updates_size *
-                                       sizeof(struct btree_write_buffered_key));
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       return ret;
-
-               if (trans->nr_wb_updates)
-                       memcpy(u, trans->wb_updates, trans->nr_wb_updates *
-                              sizeof(struct btree_write_buffered_key));
-               trans->wb_updates = u;
-       }
-
-       trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
-               .btree  = btree,
-       };
-
-       bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
-       trans->nr_wb_updates++;
-
-       return 0;
-}
-
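-/*
- * Position @iter at the first empty slot past the last key in @btree,
- * returning -BCH_ERR_ENOSPC_btree_slot if that slot is past @end:
- */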
-int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
-                            enum btree_id btree, struct bpos end)
-{
-       struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT);
-       k = bch2_btree_iter_prev(iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       bch2_btree_iter_advance(iter);
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       BUG_ON(k.k->type != KEY_TYPE_deleted);
-
-       if (bkey_gt(k.k->p, end)) {
-               ret = -BCH_ERR_ENOSPC_btree_slot;
-               goto err;
-       }
-
-       return 0;
-err:
-       bch2_trans_iter_exit(trans, iter);
-       return ret;
-}
-
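-/*
- * Commit hooks run in bch2_trans_commit_write_locked(), with btree write
- * locks held and after the journal reservation has been taken; a nonzero
- * return fails the commit attempt:
- */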
-void bch2_trans_commit_hook(struct btree_trans *trans,
-                           struct btree_trans_commit_hook *h)
-{
-       h->next = trans->hooks;
-       trans->hooks = h;
-}
-
-int bch2_btree_insert_nonextent(struct btree_trans *trans,
-                               enum btree_id btree, struct bkey_i *k,
-                               enum btree_update_flags flags)
-{
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, btree, k->k.p,
-                            BTREE_ITER_NOT_EXTENTS|
-                            BTREE_ITER_INTENT);
-       ret   = bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(trans, &iter, k, flags);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id,
-                       struct bkey_i *k, enum btree_update_flags flags)
-{
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
-                            BTREE_ITER_CACHED|
-                            BTREE_ITER_INTENT);
-       ret   = bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(trans, &iter, k, flags);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/**
- * bch2_btree_insert - insert a key into a given btree
- * @c:                 pointer to struct bch_fs
- * @id:                        btree to insert into
- * @k:                 key to insert
- * @disk_res:          disk reservation, or NULL
- * @journal_seq:       if non-NULL, filled in with the commit's journal sequence number
- * @flags:             transaction commit flags
- */
-int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
-                     struct bkey_i *k,
-                     struct disk_reservation *disk_res,
-                     u64 *journal_seq, int flags)
-{
-       return bch2_trans_do(c, disk_res, journal_seq, flags,
-                            __bch2_btree_insert(&trans, id, k, 0));
-}
-
-int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter,
-                               unsigned len, unsigned update_flags)
-{
-       struct bkey_i *k;
-
-       k = bch2_trans_kmalloc(trans, sizeof(*k));
-       if (IS_ERR(k))
-               return PTR_ERR(k);
-
-       bkey_init(&k->k);
-       k->k.p = iter->pos;
-       bch2_key_resize(&k->k, len);
-       return bch2_trans_update(trans, iter, k, update_flags);
-}
-
-int bch2_btree_delete_at(struct btree_trans *trans,
-                        struct btree_iter *iter, unsigned update_flags)
-{
-       return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
-}
-
-int bch2_btree_delete_at_buffered(struct btree_trans *trans,
-                                 enum btree_id btree, struct bpos pos)
-{
-       struct bkey_i *k;
-
-       k = bch2_trans_kmalloc(trans, sizeof(*k));
-       if (IS_ERR(k))
-               return PTR_ERR(k);
-
-       bkey_init(&k->k);
-       k->k.p = pos;
-       return bch2_trans_update_buffered(trans, btree, k);
-}
-
-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
-                                 struct bpos start, struct bpos end,
-                                 unsigned update_flags,
-                                 u64 *journal_seq)
-{
-       u32 restart_count = trans->restart_count;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT);
-       while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(trans->c, 0);
-               struct bkey_i delete;
-
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               bkey_init(&delete.k);
-
-               /*
-                * This could probably be more efficient for extents:
-                */
-
-               /*
-                * For extents, iter.pos won't necessarily be the same as
-                * bkey_start_pos(k.k) (for non extents they always will be the
-                * same). It's important that we delete starting from iter.pos
-                * because the range we want to delete could start in the middle
-                * of k.
-                *
-                * (bch2_btree_iter_peek() does guarantee that iter.pos >=
-                * bkey_start_pos(k.k)).
-                */
-               delete.k.p = iter.pos;
-
-               if (iter.flags & BTREE_ITER_IS_EXTENTS)
-                       bch2_key_resize(&delete.k,
-                                       bpos_min(end, k.k->p).offset -
-                                       iter.pos.offset);
-
-               ret   = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
-                       bch2_trans_commit(trans, &disk_res, journal_seq,
-                                         BTREE_INSERT_NOFAIL);
-               bch2_disk_reservation_put(trans->c, &disk_res);
-err:
-               /*
-                * the bch2_trans_begin() call is in a weird place because we
-                * need to call it after every transaction commit, to avoid path
-                * overflow, but don't want to call it if the delete operation
-                * is a no-op and we have no work to do:
-                */
-               bch2_trans_begin(trans);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       ret = 0;
-               if (ret)
-                       break;
-       }
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (!ret && trans_was_restarted(trans, restart_count))
-               ret = -BCH_ERR_transaction_restart_nested;
-       return ret;
-}
-
-/*
- * bch2_btree_delete_range - delete everything within a given range
- *
- * Range is a half open interval - [start, end)
- */
-int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
-                           struct bpos start, struct bpos end,
-                           unsigned update_flags,
-                           u64 *journal_seq)
-{
-       int ret = bch2_trans_run(c,
-                       bch2_btree_delete_range_trans(&trans, id, start, end,
-                                                     update_flags, journal_seq));
-       if (ret == -BCH_ERR_transaction_restart_nested)
-               ret = 0;
-       return ret;
-}
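
Because the interval is half open, end itself is never deleted. A hedged usage sketch (the inode number is arbitrary):

	/* drop every extent belonging to inode 42; POS(43, 0) is exclusive,
	 * so inode 43 is left untouched: */
	int ret = bch2_btree_delete_range(c, BTREE_ID_extents,
					  POS(42, 0), POS(43, 0),
					  0, NULL);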
-
-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
-                      struct bpos pos, bool set)
-{
-       struct bkey_i *k;
-       int ret = 0;
-
-       k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
-       ret = PTR_ERR_OR_ZERO(k);
-       if (unlikely(ret))
-               return ret;
-
-       bkey_init(&k->k);
-       k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
-       k->k.p = pos;
-
-       return bch2_trans_update_buffered(trans, btree, k);
-}
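
bch2_btree_bit_mod() treats a whole btree as a bitmap: setting writes a zero-size KEY_TYPE_set key at pos through the write buffer, clearing writes a whiteout. This is how the new rebalance_work index is maintained; the call added to bch2_trans_mark_extent() further down in this patch is, in essence:

	int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
		  (int) bch2_bkey_needs_rebalance(c, old);

	if (mod)	/* set or clear the bit at this extent's position */
		ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work,
					 new->k.p, mod > 0);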
-
-static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
-{
-       struct printbuf buf = PRINTBUF;
-       struct jset_entry_log *l;
-       unsigned u64s;
-       int ret;
-
-       prt_vprintf(&buf, fmt, args);
-       ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
-       if (ret)
-               goto err;
-
-       u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
-       ret = darray_make_room(entries, jset_u64s(u64s));
-       if (ret)
-               goto err;
-
-       l = (void *) &darray_top(*entries);
-       l->entry.u64s           = cpu_to_le16(u64s);
-       l->entry.btree_id       = 0;
-       l->entry.level          = 1;
-       l->entry.type           = BCH_JSET_ENTRY_log;
-       l->entry.pad[0]         = 0;
-       l->entry.pad[1]         = 0;
-       l->entry.pad[2]         = 0;
-       memcpy(l->d, buf.buf, buf.pos);
-       while (buf.pos & 7)
-               l->d[buf.pos++] = '\0';
-
-       entries->nr += jset_u64s(u64s);
-err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-static int
-__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
-                 va_list args)
-{
-       int ret;
-
-       if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
-               ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
-       } else {
-               ret = bch2_trans_do(c, NULL, NULL,
-                       BTREE_INSERT_LAZY_RW|commit_flags,
-                       __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args));
-       }
-
-       return ret;
-}
-
-int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
-       va_list args;
-       int ret;
-
-       va_start(args, fmt);
-       ret = __bch2_fs_log_msg(c, 0, fmt, args);
-       va_end(args);
-       return ret;
-}
-
-/*
- * Use for logging messages during recovery to enable reserved space and avoid
- * blocking.
- */
-int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...)
-{
-       va_list args;
-       int ret;
-
-       va_start(args, fmt);
-       ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args);
-       va_end(args);
-       return ret;
-}
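
These helpers record a printf-style message as a BCH_JSET_ENTRY_log entry in the journal, where it can later be read back with `bcachefs list_journal`. A typical, illustrative call (the file itself is removed in this release; upstream split its contents into btree_update.c and btree_trans_commit.c):

	bch2_fs_log_msg(c, "%s(): dropping alloc info", __func__);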
index 5f96db539fd762f85e8433cd413977864e066e16..a6bf6ed37ced60cfee4bb61c15c47c06d5ace9c7 100644 (file)
@@ -11,6 +11,9 @@
 
 #include <linux/sort.h>
 
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+                               struct journal_entry_pin *, u64);
+
 static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
 {
        const struct btree_write_buffered_key *l = _l;
@@ -45,6 +48,13 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
        if (ret)
                return ret;
 
+       /*
+        * We can't clone a path that has write locks: unshare it now, before
+        * set_pos and traverse():
+        */
+       if (iter->path->ref > 1)
+               iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
        path = iter->path;
 
        if (!*write_locked) {
@@ -64,23 +74,18 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
 
        bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
        (*fast)++;
-
-       if (path->ref > 1) {
-               /*
-                * We can't clone a path that has write locks: if the path is
-                * shared, unlock before set_pos(), traverse():
-                */
-               bch2_btree_node_unlock_write(trans, path, path->l[0].b);
-               *write_locked = false;
-       }
        return 0;
 trans_commit:
-       return  bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?:
+       trans->journal_res.seq = wb->journal_seq;
+
+       return  bch2_trans_update(trans, iter, &wb->k,
+                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  commit_flags|
-                                 BTREE_INSERT_NOCHECK_RW|
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_JOURNAL_RECLAIM);
+                                 BCH_TRANS_COMMIT_no_check_rw|
+                                 BCH_TRANS_COMMIT_no_enospc|
+                                 BCH_TRANS_COMMIT_no_journal_res|
+                                 BCH_TRANS_COMMIT_journal_reclaim);
 }
 
 static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
@@ -123,8 +128,11 @@ btree_write_buffered_insert(struct btree_trans *trans,
        bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
                             BTREE_ITER_CACHED|BTREE_ITER_INTENT);
 
+       trans->journal_res.seq = wb->journal_seq;
+
        ret   = bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0);
+               bch2_trans_update(trans, &iter, &wb->k,
+                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -148,7 +156,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
        if (!locked && !mutex_trylock(&wb->flush_lock))
                return 0;
 
-       bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
+       bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
+                             bch2_btree_write_buffer_journal_flush);
        bch2_journal_pin_drop(j, &wb->journal_pin);
 
        s = btree_write_buffer_switch(wb);
@@ -166,7 +175,7 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
         * However, since we're not flushing in the order they appear in the
         * journal we won't be able to drop our journal pin until everything is
         * flushed - which means this could deadlock the journal if we weren't
-        * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+        * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
         * if it would block taking a journal reservation.
         *
         * If that happens, simply skip the key so we can optimistically insert
@@ -193,7 +202,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
 
                if (!iter.path || iter.path->btree_id != i->btree) {
                        bch2_trans_iter_exit(trans, &iter);
-                       bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
+                       bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+                                            BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
                }
 
                bch2_btree_iter_set_pos(&iter, i->k.k.p);
@@ -249,21 +259,14 @@ slowpath:
                if (!i->journal_seq)
                        continue;
 
-               if (i->journal_seq > pin.seq) {
-                       struct journal_entry_pin pin2;
-
-                       memset(&pin2, 0, sizeof(pin2));
-
-                       bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
-                       bch2_journal_pin_drop(j, &pin);
-                       bch2_journal_pin_copy(j, &pin, &pin2, NULL);
-                       bch2_journal_pin_drop(j, &pin2);
-               }
+               bch2_journal_pin_update(j, i->journal_seq, &pin,
+                             bch2_btree_write_buffer_journal_flush);
 
                ret = commit_do(trans, NULL, NULL,
                                commit_flags|
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_JOURNAL_RECLAIM,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               BCH_TRANS_COMMIT_no_journal_res|
+                               BCH_TRANS_COMMIT_journal_reclaim,
                                btree_write_buffered_insert(trans, i));
                if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
                        break;
@@ -293,7 +296,7 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
        mutex_lock(&wb->flush_lock);
 
        return bch2_trans_run(c,
-                       __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
+                       __bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
 }
 
 static inline u64 btree_write_buffer_ref(int idx)
index 7bb7f0caee451717c917164db891a07fab3bc570..58d8c6ffd955429d9f13207ddf04c1f687a68b2e 100644 (file)
@@ -367,12 +367,11 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
        struct printbuf buf = PRINTBUF;
 
        percpu_down_read(&c->mark_lock);
-       buf.atomic++;
 
        idx = bch2_replicas_entry_idx(c, r);
        if (idx < 0 &&
-           fsck_err(c, "no replicas entry\n"
-                    "  while marking %s",
+           fsck_err(c, ptr_to_missing_replicas_entry,
+                    "no replicas entry\n  while marking %s",
                     (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
                percpu_up_read(&c->mark_lock);
                ret = bch2_mark_replicas(c, r);
@@ -474,8 +473,9 @@ static inline int update_replicas_list(struct btree_trans *trans,
        d = trans->fs_usage_deltas;
        n = (void *) d->d + d->used;
        n->delta = sectors;
-       memcpy((void *) n + offsetof(struct replicas_delta, r),
-              r, replicas_entry_bytes(r));
+       unsafe_memcpy((void *) n + offsetof(struct replicas_delta, r),
+                     r, replicas_entry_bytes(r),
+                     "flexible array member embedded in struct with padding");
        bch2_replicas_entry_sort(&n->r);
        d->used += b;
        return 0;
@@ -680,7 +680,7 @@ static int check_bucket_ref(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
-       u16 bucket_sectors = !ptr->cached
+       u32 bucket_sectors = !ptr->cached
                ? dirty_sectors
                : cached_sectors;
        struct printbuf buf = PRINTBUF;
@@ -695,6 +695,7 @@ static int check_bucket_ref(struct btree_trans *trans,
 
        if (gen_after(ptr->gen, b_gen)) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                             BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen,
                        "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
@@ -707,6 +708,7 @@ static int check_bucket_ref(struct btree_trans *trans,
 
        if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                             BCH_FSCK_ERR_ptr_too_stale,
                        "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
@@ -720,6 +722,7 @@ static int check_bucket_ref(struct btree_trans *trans,
 
        if (b_gen != ptr->gen && !ptr->cached) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                             BCH_FSCK_ERR_stale_dirty_ptr,
                        "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
@@ -741,6 +744,7 @@ static int check_bucket_ref(struct btree_trans *trans,
            ptr_data_type &&
            bucket_data_type != ptr_data_type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                             BCH_FSCK_ERR_ptr_bucket_data_type_mismatch,
                        "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
@@ -752,9 +756,10 @@ static int check_bucket_ref(struct btree_trans *trans,
                goto err;
        }
 
-       if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
+       if ((u64) bucket_sectors + sectors > U32_MAX) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                       "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
+                             BCH_FSCK_ERR_bucket_sector_count_overflow,
+                       "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
                        "while marking %s",
                        ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
@@ -795,7 +800,6 @@ static int mark_stripe_bucket(struct btree_trans *trans,
        /* XXX doesn't handle deletion */
 
        percpu_down_read(&c->mark_lock);
-       buf.atomic++;
        g = PTR_GC_BUCKET(ca, ptr);
 
        if (g->dirty_sectors ||
@@ -936,14 +940,12 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
        return 0;
 }
 
-int bch2_mark_extent(struct btree_trans *trans,
-                    enum btree_id btree_id, unsigned level,
-                    struct bkey_s_c old, struct bkey_s_c new,
-                    unsigned flags)
+static int __mark_extent(struct btree_trans *trans,
+                        enum btree_id btree_id, unsigned level,
+                        struct bkey_s_c k, unsigned flags)
 {
        u64 journal_seq = trans->journal_res.seq;
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
@@ -1019,6 +1021,14 @@ int bch2_mark_extent(struct btree_trans *trans,
        return 0;
 }
 
+int bch2_mark_extent(struct btree_trans *trans,
+                    enum btree_id btree_id, unsigned level,
+                    struct bkey_s_c old, struct bkey_s_c new,
+                    unsigned flags)
+{
+       return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
+}
+
 int bch2_mark_stripe(struct btree_trans *trans,
                     enum btree_id btree_id, unsigned level,
                     struct bkey_s_c old, struct bkey_s_c new,
@@ -1125,13 +1135,11 @@ int bch2_mark_stripe(struct btree_trans *trans,
        return 0;
 }
 
-int bch2_mark_reservation(struct btree_trans *trans,
-                         enum btree_id btree_id, unsigned level,
-                         struct bkey_s_c old, struct bkey_s_c new,
-                         unsigned flags)
+static int __mark_reservation(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c k, unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bch_fs_usage *fs_usage;
        unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
        s64 sectors = (s64) k.k->size;
@@ -1158,6 +1166,14 @@ int bch2_mark_reservation(struct btree_trans *trans,
        return 0;
 }
 
+int bch2_mark_reservation(struct btree_trans *trans,
+                         enum btree_id btree_id, unsigned level,
+                         struct bkey_s_c old, struct bkey_s_c new,
+                         unsigned flags)
+{
+       return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
 static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
                                 struct bkey_s_c_reflink_p p,
                                 u64 start, u64 end,
@@ -1184,7 +1200,8 @@ static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
        *idx = r->offset;
        return 0;
 not_found:
-       if (fsck_err(c, "pointer to missing indirect extent\n"
+       if (fsck_err(c, reflink_p_to_missing_reflink_v,
+                    "pointer to missing indirect extent\n"
                     "  %s\n"
                     "  missing range %llu-%llu",
                     (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
@@ -1201,7 +1218,7 @@ not_found:
                new->k.p                = bkey_start_pos(p.k);
                new->k.p.offset += *idx - start;
                bch2_key_resize(&new->k, next_idx - *idx);
-               ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i,
+               ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
                                          BTREE_TRIGGER_NORUN);
        }
 
@@ -1212,13 +1229,11 @@ fsck_err:
        return ret;
 }
 
-int bch2_mark_reflink_p(struct btree_trans *trans,
-                       enum btree_id btree_id, unsigned level,
-                       struct bkey_s_c old, struct bkey_s_c new,
-                       unsigned flags)
+static int __mark_reflink_p(struct btree_trans *trans,
+                           enum btree_id btree_id, unsigned level,
+                           struct bkey_s_c k, unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
        struct reflink_gc *ref;
        size_t l, r, m;
@@ -1252,6 +1267,14 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
        return ret;
 }
 
+int bch2_mark_reflink_p(struct btree_trans *trans,
+                       enum btree_id btree_id, unsigned level,
+                       struct bkey_s_c old, struct bkey_s_c new,
+                       unsigned flags)
+{
+       return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
 void bch2_trans_fs_usage_revert(struct btree_trans *trans,
                                struct replicas_delta_list *deltas)
 {
@@ -1299,8 +1322,8 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        static int warned_disk_usage = 0;
        bool warn = false;
-       unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
-       struct replicas_delta *d = deltas->d, *d2;
+       u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+       struct replicas_delta *d, *d2;
        struct replicas_delta *top = (void *) deltas->d + deltas->used;
        struct bch_fs_usage *dst;
        s64 added = 0, should_not_have_added;
@@ -1358,7 +1381,7 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
 
        if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
                bch2_trans_inconsistent(trans,
-                                       "disk usage increased %lli more than %u sectors reserved)",
+                                       "disk usage increased %lli more than %llu sectors reserved)",
                                        should_not_have_added, disk_res_sectors);
        return 0;
 need_mark:
@@ -1453,15 +1476,11 @@ err:
        return ret;
 }
 
-int bch2_trans_mark_extent(struct btree_trans *trans,
-                          enum btree_id btree_id, unsigned level,
-                          struct bkey_s_c old, struct bkey_i *new,
-                          unsigned flags)
+static int __trans_mark_extent(struct btree_trans *trans,
+                              enum btree_id btree_id, unsigned level,
+                              struct bkey_s_c k, unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
-               ? old
-               : bkey_i_to_s_c(new);
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
@@ -1518,6 +1537,24 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
        return ret;
 }
 
+int bch2_trans_mark_extent(struct btree_trans *trans,
+                          enum btree_id btree_id, unsigned level,
+                          struct bkey_s_c old, struct bkey_i *new,
+                          unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
+                 (int) bch2_bkey_needs_rebalance(c, old);
+
+       if (mod) {
+               int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+               if (ret)
+                       return ret;
+       }
+
+       return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
+}
+
 static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
                                         struct bkey_s_c_stripe s,
                                         unsigned idx, bool deleting)
@@ -1671,15 +1708,10 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
        return ret;
 }
 
-int bch2_trans_mark_reservation(struct btree_trans *trans,
-                               enum btree_id btree_id, unsigned level,
-                               struct bkey_s_c old,
-                               struct bkey_i *new,
-                               unsigned flags)
+static int __trans_mark_reservation(struct btree_trans *trans,
+                                   enum btree_id btree_id, unsigned level,
+                                   struct bkey_s_c k, unsigned flags)
 {
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
-               ? old
-               : bkey_i_to_s_c(new);
        unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
        s64 sectors = (s64) k.k->size;
        struct replicas_delta_list *d;
@@ -1701,7 +1733,16 @@ int bch2_trans_mark_reservation(struct btree_trans *trans,
        return 0;
 }
 
-static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+                               enum btree_id btree_id, unsigned level,
+                               struct bkey_s_c old,
+                               struct bkey_i *new,
+                               unsigned flags)
+{
+       return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static int trans_mark_reflink_p_segment(struct btree_trans *trans,
                        struct bkey_s_c_reflink_p p,
                        u64 *idx, unsigned flags)
 {
@@ -1768,35 +1809,38 @@ err:
        return ret;
 }
 
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
-                             enum btree_id btree_id, unsigned level,
-                             struct bkey_s_c old,
-                             struct bkey_i *new,
-                             unsigned flags)
+static int __trans_mark_reflink_p(struct btree_trans *trans,
+                               enum btree_id btree_id, unsigned level,
+                               struct bkey_s_c k, unsigned flags)
 {
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
-               ? old
-               : bkey_i_to_s_c(new);
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
        u64 idx, end_idx;
        int ret = 0;
 
-       if (flags & BTREE_TRIGGER_INSERT) {
-               struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
-
-               v->front_pad = v->back_pad = 0;
-       }
-
        idx     = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
        end_idx = le64_to_cpu(p.v->idx) + p.k->size +
                le32_to_cpu(p.v->back_pad);
 
        while (idx < end_idx && !ret)
-               ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
-
+               ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
        return ret;
 }
 
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c old,
+                             struct bkey_i *new,
+                             unsigned flags)
+{
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
+
+               v->front_pad = v->back_pad = 0;
+       }
+
+       return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                                    struct bch_dev *ca, size_t b,
                                    enum bch_data_type type,
@@ -1819,6 +1863,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
 
        if (a->v.data_type && type && a->v.data_type != type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+                             BCH_FSCK_ERR_bucket_metadata_type_mismatch,
                        "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
                        iter.pos.inode, iter.pos.offset, a->v.gen,
@@ -1826,16 +1871,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                        bch2_data_types[type],
                        bch2_data_types[type]);
                ret = -EIO;
-               goto out;
+               goto err;
        }
 
-       a->v.data_type          = type;
-       a->v.dirty_sectors      = sectors;
-
-       ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
-       if (ret)
-               goto out;
-out:
+       if (a->v.data_type      != type ||
+           a->v.dirty_sectors  != sectors) {
+               a->v.data_type          = type;
+               a->v.dirty_sectors      = sectors;
+               ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+       }
+err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -1923,12 +1968,29 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
 
 int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
 {
-       int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca));
+       int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
+
        if (ret)
                bch_err_fn(c, ret);
        return ret;
 }
 
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_online_member(ca, c, i) {
+               int ret = bch2_trans_mark_dev_sb(c, ca);
+               if (ret) {
+                       percpu_ref_put(&ca->ref);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
 /* Disk reservations: */
 
 #define SECTORS_CACHE  1024
index a418f664896de662c4d5653fde5b1123e036219f..21f6cb356921f1e3b1f9df59fbdae7309f3931fa 100644 (file)
 
 #include "buckets_types.h"
 #include "extents.h"
-#include "super.h"
+#include "sb-members.h"
+
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
+{
+       return div_u64(s, ca->mi.bucket_size);
+}
+
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
+{
+       return ((sector_t) b) * ca->mi.bucket_size;
+}
+
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
+{
+       u32 remainder;
+
+       div_u64_rem(s, ca->mi.bucket_size, &remainder);
+       return remainder;
+}
+
+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
+                                                u32 *offset)
+{
+       return div_u64_rem(s, ca->mi.bucket_size, offset);
+}
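
These new helpers are plain division and remainder on the device's bucket size. A worked example, assuming 512-byte sectors and ca->mi.bucket_size == 1024:

	u32 off;

	sector_to_bucket(ca, 5000);			/* 5000 / 1024 -> bucket 4    */
	bucket_to_sector(ca, 4);			/* 4 * 1024    -> sector 4096 */
	bucket_remainder(ca, 5000);			/* 5000 % 1024 -> 904         */
	sector_to_bucket_and_offset(ca, 5000, &off);	/* -> 4, off == 904           */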
 
 #define for_each_bucket(_b, _buckets)                          \
        for (_b = (_buckets)->b + (_buckets)->first_bucket;     \
             _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
 
+/*
+ * Ugly hack alert:
+ *
+ * We need to cram a spinlock in a single byte, because that's what we have left
+ * in struct bucket, and we care about the size of these - during fsck, we need
+ * in memory state for every single bucket on every device.
+ *
+ * We used to do
+ *   while (xchg(&b->lock, 1)) cpu_relax();
+ * but, it turns out not all architectures support xchg on a single byte.
+ *
+ * So now we use bit_spin_lock(), with fun games since we can't burn a whole
+ * ulong for this - we just need to make sure the lock bit always ends up in the
+ * first byte.
+ */
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define BUCKET_LOCK_BITNR      0
+#else
+#define BUCKET_LOCK_BITNR      (BITS_PER_LONG - 1)
+#endif
+
+union ulong_byte_assert {
+       ulong   ulong;
+       u8      byte;
+};
+
 static inline void bucket_unlock(struct bucket *b)
 {
-       smp_store_release(&b->lock, 0);
+       BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte);
+
+       clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock);
+       wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR);
 }
 
 static inline void bucket_lock(struct bucket *b)
 {
-       while (xchg(&b->lock, 1))
-               cpu_relax();
+       wait_on_bit_lock((void *) &b->lock, BUCKET_LOCK_BITNR,
+                        TASK_UNINTERRUPTIBLE);
 }
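
The pair behaves like a sleeping one-bit mutex living in the first byte of struct bucket. A minimal usage sketch (the field update is illustrative):

	struct bucket *g = gc_bucket(ca, b);

	bucket_lock(g);
	g->dirty_sectors += sectors;	/* state protected by the bit lock  */
	bucket_unlock(g);		/* clears the bit and wakes waiters */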
 
 static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
@@ -156,7 +210,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
 
        switch (watermark) {
        case BCH_WATERMARK_NR:
-               unreachable();
+               BUG();
        case BCH_WATERMARK_stripe:
                reserved += ca->mi.nbuckets >> 6;
                fallthrough;
@@ -285,12 +339,48 @@ int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct
 int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
 int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
 
+#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+({                                                                                             \
+       int ret = 0;                                                                            \
+                                                                                               \
+       if (_old.k->type)                                                                       \
+               ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT);     \
+       if (!ret && _new.k->type)                                                               \
+               ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE);  \
+       ret;                                                                                    \
+})
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)  \
+       mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
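
These macros turn one (old, new) trigger invocation into two single-key calls, so the per-type helpers only ever see one key. Expanded for bch2_mark_extent() from earlier in this patch, mem_trigger_run_overwrite_then_insert() is roughly:

	int ret = 0;

	if (old.k->type)		/* overwrite half, on the old key */
		ret = __mark_extent(trans, btree_id, level, old,
				    flags & ~BTREE_TRIGGER_INSERT);
	if (!ret && new.k->type)	/* insert half, on the new key */
		ret = __mark_extent(trans, btree_id, level, new,
				    flags & ~BTREE_TRIGGER_OVERWRITE);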
+
 void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
 int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
 
 int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
                                    size_t, enum bch_data_type, unsigned);
 int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_dev_sbs(struct bch_fs *);
+
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+       u64 b_offset    = bucket_to_sector(ca, b);
+       u64 b_end       = bucket_to_sector(ca, b + 1);
+       unsigned i;
+
+       if (!b)
+               return true;
+
+       for (i = 0; i < layout->nr_superblocks; i++) {
+               u64 offset = le64_to_cpu(layout->sb_offset[i]);
+               u64 end = offset + (1 << layout->sb_max_size_bits);
+
+               if (!(offset >= b_end || end <= b_offset))
+                       return true;
+       }
+
+       return false;
+}
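
The loop body is the usual half-open interval test: bucket [b_offset, b_end) intersects superblock range [offset, end) exactly when neither ends before the other begins, i.e. the check above is equivalent to:

	bool overlaps = offset < b_end && end > b_offset;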
 
 /* disk reservations: */
 
index 81ab685cdef9f35dbed4a55fa2044aad8dcbaf59..ec1b636ef78d075d1c2b6a9dd2b610a5ba8f274c 100644 (file)
@@ -133,7 +133,7 @@ retry_rehash:
        b->t = n;
        kvfree(t);
 
-       pr_debug("took %zu rehashes, table at %zu/%zu elements",
+       pr_debug("took %zu rehashes, table at %zu/%lu elements",
                 nr_rehashes, nr_elements, 1UL << b->t->bits);
 out:
        mutex_unlock(&b->lock);
index fb603df099a5b43d00b6a2bfb783a28403442a97..4bb88aefed121f275582df94e3cea9dcdec7c58c 100644 (file)
@@ -86,10 +86,9 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
                devs[i] = strndup_user((const char __user *)(unsigned long)
                                       user_devs[i],
                                       PATH_MAX);
-               if (!devs[i]) {
-                       ret = -ENOMEM;
+       ret = PTR_ERR_OR_ZERO(devs[i]);
+               if (ret)
                        goto err;
-               }
        }
 
        c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
@@ -117,8 +116,9 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
                return -EINVAL;
 
        path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       if (!path)
-               return -ENOMEM;
+       ret = PTR_ERR_OR_ZERO(path);
+       if (ret)
+               return ret;
 
        err = bch2_fs_open_incremental(path);
        kfree(path);
@@ -149,9 +149,10 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
 static long bch2_ioctl_query_uuid(struct bch_fs *c,
                        struct bch_ioctl_query_uuid __user *user_arg)
 {
-       return copy_to_user(&user_arg->uuid,
-                           &c->sb.user_uuid,
-                           sizeof(c->sb.user_uuid));
+       if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
+                        sizeof(c->sb.user_uuid)))
+               return -EFAULT;
+       return 0;
 }
 
 #if 0
@@ -188,8 +189,9 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
                return -EINVAL;
 
        path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       if (!path)
-               return -ENOMEM;
+       ret = PTR_ERR_OR_ZERO(path);
+       if (ret)
+               return ret;
 
        ret = bch2_dev_add(c, path);
        kfree(path);
@@ -230,8 +232,9 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
                return -EINVAL;
 
        path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
-       if (!path)
-               return -ENOMEM;
+       ret = PTR_ERR_OR_ZERO(path);
+       if (ret)
+               return ret;
 
        ret = bch2_dev_online(c, path);
        kfree(path);
@@ -329,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
        struct bch_ioctl_data_event e = {
                .type                   = BCH_DATA_EVENT_PROGRESS,
                .p.data_type            = ctx->stats.data_type,
-               .p.btree_id             = ctx->stats.btree_id,
-               .p.pos                  = ctx->stats.pos,
+               .p.btree_id             = ctx->stats.pos.btree,
+               .p.pos                  = ctx->stats.pos.pos,
                .p.sectors_done         = atomic64_read(&ctx->stats.sectors_seen),
                .p.sectors_total        = bch2_fs_usage_read_short(c).used,
        };
@@ -338,7 +341,10 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
        if (len < sizeof(e))
                return -EINVAL;
 
-       return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
+       if (copy_to_user(buf, &e, sizeof(e)))
+               return -EFAULT;
+
+       return sizeof(e);
 }
 
 static const struct file_operations bcachefs_data_ops = {
@@ -417,7 +423,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
        if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
                return -EFAULT;
 
-       arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL);
+       arg = kzalloc(size_add(sizeof(*arg), replica_entries_bytes), GFP_KERNEL);
        if (!arg)
                return -ENOMEM;
 
@@ -466,9 +472,11 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
        percpu_up_read(&c->mark_lock);
        kfree(src);
 
-       if (!ret)
-               ret = copy_to_user(user_arg, arg,
-                       sizeof(*arg) + arg->replica_entries_bytes);
+       if (ret)
+               goto err;
+       if (copy_to_user(user_arg, arg,
+                        sizeof(*arg) + arg->replica_entries_bytes))
+               ret = -EFAULT;
 err:
        kfree(arg);
        return ret;
@@ -513,7 +521,10 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
 
        percpu_ref_put(&ca->ref);
 
-       return copy_to_user(user_arg, &arg, sizeof(arg));
+       if (copy_to_user(user_arg, &arg, sizeof(arg)))
+               return -EFAULT;
+
+       return 0;
 }
 
 static long bch2_ioctl_read_super(struct bch_fs *c,
@@ -550,8 +561,9 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
                goto err;
        }
 
-       ret = copy_to_user((void __user *)(unsigned long)arg.sb,
-                          sb, vstruct_bytes(sb));
+       if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
+                        vstruct_bytes(sb)))
+               ret = -EFAULT;
 err:
        if (!IS_ERR_OR_NULL(ca))
                percpu_ref_put(&ca->ref);
@@ -617,6 +629,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
            arg.pad)
                return -EINVAL;
 
+       if (arg.nbuckets > U32_MAX)
+               return -EINVAL;
+
        ca = bch2_device_lookup(c, arg.dev, arg.flags);
        if (IS_ERR(ca))
                return PTR_ERR(ca);
index 3a4890d39ff98990b6a6cd46fd8bc3f67eacf8a8..0f563ca53c36e7f6724c2697a81efa3e85ab6d0e 100644 (file)
@@ -17,7 +17,7 @@ int __init bch2_chardev_init(void);
 static inline long bch2_fs_ioctl(struct bch_fs *c,
                                unsigned cmd, void __user * arg)
 {
-       return -ENOSYS;
+       return -ENOTTY;
 }
 
 static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
index a08997a5bb67566f27abbcb5e7a85b61d8c48b19..3c761ad6b1c8ef1fef8781dc10f393ccfee9997e 100644 (file)
@@ -139,7 +139,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm,
 
                for (i = 0; i < pages; i++) {
                        unsigned offset = offset_in_page(buf);
-                       unsigned pg_len = min(len, PAGE_SIZE - offset);
+                       unsigned pg_len = min_t(size_t, len, PAGE_SIZE - offset);
 
                        sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset);
                        buf += pg_len;
@@ -159,15 +159,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
                crypto_alloc_sync_skcipher("chacha20", 0, 0);
        int ret;
 
-       if (!chacha20) {
-               pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
-               return PTR_ERR(chacha20);
+       ret = PTR_ERR_OR_ZERO(chacha20);
+       if (ret) {
+               pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret));
+               return ret;
        }
 
        ret = crypto_skcipher_setkey(&chacha20->base,
                                     (void *) key, sizeof(*key));
        if (ret) {
-               pr_err("crypto_skcipher_setkey() error: %i", ret);
+               pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret));
                goto err;
        }
 
@@ -265,9 +266,10 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
 
 #ifdef CONFIG_HIGHMEM
                __bio_for_each_segment(bv, bio, *iter, *iter) {
-                       void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+                       void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
+
                        bch2_checksum_update(&state, p, bv.bv_len);
-                       kunmap_atomic(p);
+                       kunmap_local(p);
                }
 #else
                __bio_for_each_bvec(bv, bio, *iter, *iter)
@@ -287,10 +289,10 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
 
 #ifdef CONFIG_HIGHMEM
                __bio_for_each_segment(bv, bio, *iter, *iter) {
-                       void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
+                       void *p = kmap_local_page(bv.bv_page) + bv.bv_offset;
 
                        crypto_shash_update(desc, p, bv.bv_len);
-                       kunmap_atomic(p);
+                       kunmap_local(p);
                }
 #else
                __bio_for_each_bvec(bv, bio, *iter, *iter)
@@ -360,18 +362,18 @@ struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
 
        state.type = type;
        bch2_checksum_init(&state);
-       state.seed = a.lo;
+       state.seed = le64_to_cpu(a.lo);
 
        BUG_ON(!bch2_checksum_mergeable(type));
 
        while (b_len) {
-               unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
+               unsigned page_len = min_t(unsigned, b_len, PAGE_SIZE);
 
                bch2_checksum_update(&state,
-                               page_address(ZERO_PAGE(0)), b);
-               b_len -= b;
+                               page_address(ZERO_PAGE(0)), page_len);
+               b_len -= page_len;
        }
-       a.lo = bch2_checksum_final(&state);
+       a.lo = cpu_to_le64(bch2_checksum_final(&state));
        a.lo ^= b.lo;
        a.hi ^= b.hi;
        return a;
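
The merge works because these checksums are linear: feeding b_len zero bytes extends a over b's range, after which xor combines the two. Only the crc-style types report bch2_checksum_mergeable(); a hedged sketch:

	/* checksum of buffer A followed by buffer B, without re-reading A: */
	struct bch_csum whole = bch2_checksum_merge(BCH_CSUM_crc64,
						    csum_a, csum_b, b_len);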
@@ -394,9 +396,9 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
                unsigned                        csum_type;
                struct bch_csum                 csum;
        } splits[3] = {
-               { crc_a, len_a, new_csum_type },
-               { crc_b, len_b, new_csum_type },
-               { NULL,  bio_sectors(bio) - len_a - len_b, new_csum_type },
+               { crc_a, len_a, new_csum_type, { 0 } },
+               { crc_b, len_b, new_csum_type, { 0 } },
+               { NULL,  bio_sectors(bio) - len_a - len_b, new_csum_type, { 0 } },
        }, *i;
        bool mergeable = crc_old.csum_type == new_csum_type &&
                bch2_checksum_mergeable(new_csum_type);
@@ -426,9 +428,10 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
                merged = bch2_checksum_bio(c, crc_old.csum_type,
                                extent_nonce(version, crc_old), bio);
 
-       if (bch2_crc_cmp(merged, crc_old.csum)) {
-               bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n"
+       if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
+               bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
                        "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
+                       __func__,
                        crc_old.csum.hi,
                        crc_old.csum.lo,
                        merged.hi,
@@ -458,6 +461,48 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
        return 0;
 }
 
+/* BCH_SB_FIELD_crypt: */
+
+static int bch2_sb_crypt_validate(struct bch_sb *sb,
+                                 struct bch_sb_field *f,
+                                 struct printbuf *err)
+{
+       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+       if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+               prt_printf(err, "wrong size (got %zu should be %zu)",
+                      vstruct_bytes(&crypt->field), sizeof(*crypt));
+               return -BCH_ERR_invalid_sb_crypt;
+       }
+
+       if (BCH_CRYPT_KDF_TYPE(crypt)) {
+               prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+               return -BCH_ERR_invalid_sb_crypt;
+       }
+
+       return 0;
+}
+
+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
+                                 struct bch_sb_field *f)
+{
+       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
+
+       prt_printf(out, "KDF:               %llu", BCH_CRYPT_KDF_TYPE(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt n:          %llu", BCH_KDF_SCRYPT_N(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt r:          %llu", BCH_KDF_SCRYPT_R(crypt));
+       prt_newline(out);
+       prt_printf(out, "scrypt p:          %llu", BCH_KDF_SCRYPT_P(crypt));
+       prt_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+       .validate       = bch2_sb_crypt_validate,
+       .to_text        = bch2_sb_crypt_to_text,
+};
+
 #ifdef __KERNEL__
 static int __bch2_request_key(char *key_description, struct bch_key *key)
 {
@@ -489,16 +534,31 @@ static int __bch2_request_key(char *key_description, struct bch_key *key)
 {
        key_serial_t key_id;
 
+       key_id = request_key("user", key_description, NULL,
+                            KEY_SPEC_SESSION_KEYRING);
+       if (key_id >= 0)
+               goto got_key;
+
        key_id = request_key("user", key_description, NULL,
                             KEY_SPEC_USER_KEYRING);
-       if (key_id < 0)
-               return -errno;
+       if (key_id >= 0)
+               goto got_key;
+
+       key_id = request_key("user", key_description, NULL,
+                            KEY_SPEC_USER_SESSION_KEYRING);
+       if (key_id >= 0)
+               goto got_key;
+
+       return -errno;
+got_key:
 
        if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
                return -1;
 
        return 0;
 }
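
The userspace path now searches the session, user and user-session keyrings, in that order. For the lookup to succeed the key must have been loaded beforehand, e.g. with add_key(2); the uuid in the description below is a placeholder:

	/* illustrative: make the unlock key visible to the search above */
	key_serial_t id = add_key("user", "bcachefs:<uuid>",
				  &key, sizeof(key),
				  KEY_SPEC_USER_KEYRING);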
+
+#include "../crypto.h"
 #endif
 
 int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
@@ -511,9 +571,43 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
 
        ret = __bch2_request_key(key_description.buf, key);
        printbuf_exit(&key_description);
+
+#ifndef __KERNEL__
+       if (ret) {
+               char *passphrase = read_passphrase("Enter passphrase: ");
+               struct bch_encrypted_key sb_key;
+
+               bch2_passphrase_check(sb, passphrase,
+                                     key, &sb_key);
+               ret = 0;
+       }
+#endif
+
+       /* stash with memfd, pass memfd fd to mount */
+
        return ret;
 }
 
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *sb)
+{
+       key_serial_t key_id;
+       struct printbuf key_description = PRINTBUF;
+
+       prt_printf(&key_description, "bcachefs:");
+       pr_uuid(&key_description, sb->user_uuid.b);
+
+       key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
+       printbuf_exit(&key_description);
+       if (key_id < 0)
+               return errno;
+
+       keyctl_revoke(key_id);
+
+       return 0;
+}
+#endif
+
 int bch2_decrypt_sb_key(struct bch_fs *c,
                        struct bch_sb_field_crypt *crypt,
                        struct bch_key *key)
@@ -534,7 +628,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
 
        /* decrypt real key: */
        ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
-                            &sb_key, sizeof(sb_key));
+                                     &sb_key, sizeof(sb_key));
        if (ret)
                goto err;
 
@@ -584,7 +678,7 @@ int bch2_disable_encryption(struct bch_fs *c)
 
        mutex_lock(&c->sb_lock);
 
-       crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+       crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
        if (!crypt)
                goto out;
 
@@ -597,7 +691,7 @@ int bch2_disable_encryption(struct bch_fs *c)
        if (ret)
                goto out;
 
-       crypt->key.magic        = BCH_KEY_MAGIC;
+       crypt->key.magic        = cpu_to_le64(BCH_KEY_MAGIC);
        crypt->key.key          = key;
 
        SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
@@ -618,14 +712,14 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        mutex_lock(&c->sb_lock);
 
        /* Do we already have an encryption key? */
-       if (bch2_sb_get_crypt(c->disk_sb.sb))
+       if (bch2_sb_field_get(c->disk_sb.sb, crypt))
                goto err;
 
        ret = bch2_alloc_ciphers(c);
        if (ret)
                goto err;
 
-       key.magic = BCH_KEY_MAGIC;
+       key.magic = cpu_to_le64(BCH_KEY_MAGIC);
        get_random_bytes(&key.key, sizeof(key.key));
 
        if (keyed) {
@@ -646,7 +740,8 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        if (ret)
                goto err;
 
-       crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
+       crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
+                                    sizeof(*crypt) / sizeof(u64));
        if (!crypt) {
                ret = -BCH_ERR_ENOSPC_sb_crypt;
                goto err;
@@ -687,7 +782,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
                goto out;
        }
 
-       crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+       crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
        if (!crypt)
                goto out;
 
index 1ad1d5f03939ce12e3d7469f64db634450ba9f2c..13998388c545c476545b1e6cd418306f67dcf90e 100644 (file)
@@ -40,14 +40,16 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
  */
 #define csum_vstruct(_c, _type, _nonce, _i)                            \
 ({                                                                     \
-       const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
-       const void *end = vstruct_end(_i);                              \
+       const void *_start = ((const void *) (_i)) + sizeof((_i)->csum);\
                                                                        \
-       bch2_checksum(_c, _type, _nonce, start, end - start);           \
+       bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
 })
 
 int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
 int bch2_request_key(struct bch_sb *, struct bch_key *);
+#ifndef __KERNEL__
+int bch2_revoke_key(struct bch_sb *);
+#endif
 
 int bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
                 void *data, size_t);
@@ -72,6 +74,8 @@ static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type,
                : 0;
 }
 
+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt;
+
 int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
                        struct bch_key *);
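
The csum_vstruct() rewrite above also renames the macro-local to `_start`: statement-expression macros conventionally underscore-prefix their temporaries so they cannot collide with identifiers at the expansion site. A minimal standalone sketch of the hazard (names are hypothetical, not from bcachefs):

    /* Unhygienic: the macro local can shadow a caller variable. */
    #define min_bad(a, b) ({ int x = (a), y = (b); x < y ? x : y; })

    /* Kernel-style hygiene: underscore-prefixed locals. */
    #define min_ok(a, b)  ({ int _x = (a), _y = (b); _x < _y ? _x : _y; })

    int main(void)
    {
            int x = 10;

            /* min_bad(x, 3) would expand to `({ int x = (x), y = (3); ... })`:
             * the initializer reads the new, uninitialized x, not ours. */
            return min_ok(x, 3);    /* 3 */
    }
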
 
index c9ca7cce55f870b0a32921ce767cf4521aab5943..a8b148ec2a2b6b8ed1f33d10ad195b72afa112e0 100644 (file)
@@ -3,7 +3,6 @@
 #include "checksum.h"
 #include "compress.h"
 #include "extents.h"
-#include "io.h"
 #include "super-io.h"
 
 #include <linux/lz4.h>
@@ -571,7 +570,6 @@ void bch2_fs_compress_exit(struct bch_fs *c)
 static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 {
        size_t decompress_workspace_size = 0;
-       bool decompress_workspace_needed;
        ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
                                                 c->opts.encoded_extent_max);
        struct {
@@ -581,7 +579,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
                size_t                          decompress_workspace;
        } compression_types[] = {
                { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4,
-                       max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) },
+                       max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS),
+                       0 },
                { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
                        zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
                        zlib_inflate_workspacesize(), },
@@ -620,9 +619,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
                if (!(features & (1 << i->feature)))
                        continue;
 
-               if (i->decompress_workspace)
-                       decompress_workspace_needed = true;
-
                if (mempool_initialized(&c->compress_workspace[i->type]))
                        continue;
 
@@ -643,7 +639,8 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 static u64 compression_opt_to_feature(unsigned v)
 {
        unsigned type = bch2_compression_decode(v).type;
-       return 1ULL << bch2_compression_opt_to_feature[type];
+
+       return BIT_ULL(bch2_compression_opt_to_feature[type]);
 }
 
 int bch2_fs_compress_init(struct bch_fs *c)
@@ -700,14 +697,32 @@ err:
        return ret;
 }
 
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+       struct bch_compression_opt opt = bch2_compression_decode(v);
+
+       if (opt.type < BCH_COMPRESSION_OPT_NR)
+               prt_str(out, bch2_compression_opts[opt.type]);
+       else
+               prt_printf(out, "(unknown compression opt %u)", opt.type);
+       if (opt.level)
+               prt_printf(out, ":%u", opt.level);
+}
+
 void bch2_opt_compression_to_text(struct printbuf *out,
                                  struct bch_fs *c,
                                  struct bch_sb *sb,
                                  u64 v)
 {
-       struct bch_compression_opt opt = bch2_compression_decode(v);
+       return bch2_compression_opt_to_text(out, v);
+}
 
-       prt_str(out, bch2_compression_opts[opt.type]);
-       if (opt.level)
-               prt_printf(out, ":%u", opt.level);
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+       if (!bch2_compression_opt_valid(v)) {
+               prt_printf(err, "invalid compression opt %llu", v);
+               return -BCH_ERR_invalid_sb_opt_compression;
+       }
+
+       return 0;
 }
index 052ea303241fc31407edde0bcc2d3037d7691137..607fd5e232c902dbb39f3dac84ea2e214e6b106c 100644 (file)
@@ -4,12 +4,18 @@
 
 #include "extents_types.h"
 
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+       BCH_COMPRESSION_OPTS()
+#undef x
+};
+
 struct bch_compression_opt {
        u8              type:4,
                        level:4;
 };
 
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
 {
        return (struct bch_compression_opt) {
                .type   = v & 15,
@@ -17,17 +23,25 @@ static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
        };
 }
 
+static inline bool bch2_compression_opt_valid(unsigned v)
+{
+       struct bch_compression_opt opt = __bch2_compression_decode(v);
+
+       return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
+}
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+       return bch2_compression_opt_valid(v)
+               ? __bch2_compression_decode(v)
+               : (struct bch_compression_opt) { 0 };
+}
+
 static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
 {
        return opt.type|(opt.level << 4);
 }
 
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
-       BCH_COMPRESSION_OPTS()
-#undef x
-};
-
 static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
 {
        return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
@@ -44,12 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
 void bch2_fs_compress_exit(struct bch_fs *);
 int bch2_fs_compress_init(struct bch_fs *);
 
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
 int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
 void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+int bch2_opt_compression_validate(u64, struct printbuf *);
 
 #define bch2_opt_compression (struct bch_opt_fn) {             \
-       .parse          = bch2_opt_compression_parse,   \
-       .to_text        = bch2_opt_compression_to_text, \
+       .parse          = bch2_opt_compression_parse,           \
+       .to_text        = bch2_opt_compression_to_text,         \
+       .validate       = bch2_opt_compression_validate,        \
 }
 
 #endif /* _BCACHEFS_COMPRESS_H */
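
As the helpers above show, a compression option packs the type into the low nibble and the level into the high nibble of a single byte, and a level with no type is the one combination bch2_compression_opt_valid() rejects. A self-contained round-trip sketch (stand-in struct and illustrative values, not the bcachefs definitions):

    #include <assert.h>

    struct copt { unsigned type:4, level:4; };  /* stand-in for bch_compression_opt */

    static unsigned encode(struct copt o)  { return o.type | (o.level << 4); }
    static struct copt decode(unsigned v)
    {
            return (struct copt) { .type = v & 15, .level = v >> 4 };
    }

    int main(void)
    {
            struct copt o = { .type = 2 /* e.g. gzip */, .level = 7 };
            unsigned v = encode(o);                 /* 0x72 */

            assert(decode(v).type == 2 && decode(v).level == 7);
            /* v == 0x70 (a level without a type) is exactly what the new
             * bch2_opt_compression_validate() reports as invalid. */
            return 0;
    }
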
index 442a9b806a3c164d5ae4209ea4494d45ee815230..02a996e06a64e3d10483f7fcbffc0de66428f9ed 100644 (file)
@@ -43,12 +43,12 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb,
                prt_tab(out);
                prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i]));
                prt_newline(out);
-       };
+       }
 };
 
 int bch2_sb_counters_to_cpu(struct bch_fs *c)
 {
-       struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+       struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
        unsigned int i;
        unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
        u64 val = 0;
@@ -66,13 +66,13 @@ int bch2_sb_counters_to_cpu(struct bch_fs *c)
 
 int bch2_sb_counters_from_cpu(struct bch_fs *c)
 {
-       struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb);
+       struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters);
        struct bch_sb_field_counters *ret;
        unsigned int i;
        unsigned int nr = bch2_sb_counter_nr_entries(ctrs);
 
        if (nr < BCH_COUNTER_NR) {
-               ret = bch2_sb_resize_counters(&c->disk_sb,
+               ret = bch2_sb_field_resize(&c->disk_sb, counters,
                                               sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR);
 
                if (ret) {
index 114f86b45fd52ffc0e3be365f7eb4109e0358fdc..43ea21ad9ea338931e0cb7a54d13bf9f50874b77 100644 (file)
@@ -8,7 +8,6 @@
  * Inspired by CCAN's darray
  */
 
-#include "util.h"
 #include <linux/slab.h>
 
 #define DARRAY(type)                                                   \
@@ -19,20 +18,25 @@ struct {                                                            \
 
 typedef DARRAY(void) darray_void;
 
-static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+int __bch2_darray_resize(darray_void *, size_t, size_t, gfp_t);
+
+static inline int __darray_resize(darray_void *d, size_t element_size,
+                                 size_t new_size, gfp_t gfp)
 {
-       if (d->nr + more > d->size) {
-               size_t new_size = roundup_pow_of_two(d->nr + more);
-               void *data = krealloc_array(d->data, new_size, t_size, gfp);
+       return unlikely(new_size > d->size)
+               ? __bch2_darray_resize(d, element_size, new_size, gfp)
+               : 0;
+}
 
-               if (!data)
-                       return -ENOMEM;
+#define darray_resize_gfp(_d, _new_size, _gfp)                         \
+       __darray_resize((darray_void *) (_d), sizeof((_d)->data[0]), (_new_size), _gfp)
 
-               d->data = data;
-               d->size = new_size;
-       }
+#define darray_resize(_d, _new_size)                                   \
+       darray_resize_gfp(_d, _new_size, GFP_KERNEL)
 
-       return 0;
+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp)
+{
+       return __darray_resize(d, t_size, d->nr + more, gfp);
 }
 
 #define darray_make_room_gfp(_d, _more, _gfp)                          \
@@ -41,6 +45,8 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more,
 #define darray_make_room(_d, _more)                                    \
        darray_make_room_gfp(_d, _more, GFP_KERNEL)
 
+#define darray_room(_d)                ((_d).size - (_d).nr)
+
 #define darray_top(_d)         ((_d).data[(_d).nr])
 
 #define darray_push_gfp(_d, _item, _gfp)                               \
@@ -69,9 +75,15 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more,
        _ret;                                                           \
 })
 
+#define darray_remove_item(_d, _pos)                                   \
+       array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+
 #define darray_for_each(_d, _i)                                                \
        for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
 
+#define darray_for_each_reverse(_d, _i)                                        \
+       for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+
 #define darray_init(_d)                                                        \
 do {                                                                   \
        (_d)->data = NULL;                                              \
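
darray is bcachefs' small type-safe growable array: DARRAY() bakes the element type into the struct, and the new darray_resize()/darray_room() helpers above extend the same scheme. A usage sketch, assuming darray_push() and darray_exit() from the same header:

    typedef DARRAY(u64) darray_u64;

    static int darray_demo(void)
    {
            darray_u64 d;
            u64 *i, sum = 0;
            int ret = 0;

            darray_init(&d);

            for (u64 v = 0; v < 10; v++) {
                    ret = darray_push(&d, v);   /* grows via __darray_resize() */
                    if (ret)                    /* -ENOMEM on allocation failure */
                            goto out;
            }

            darray_for_each(d, i)
                    sum += *i;
    out:
            darray_exit(&d);
            return ret;
    }
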
index cfc6244637009b0676ba735d179b199ae995879b..55769d77e6e79c188fd5a94fb6db264a0066f8a1 100644 (file)
@@ -9,10 +9,11 @@
 #include "ec.h"
 #include "error.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "keylist.h"
 #include "move.h"
 #include "nocow_locking.h"
+#include "rebalance.h"
 #include "subvolume.h"
 #include "trace.h"
 
@@ -49,10 +50,6 @@ static void trace_move_extent_fail2(struct data_update *m,
        if (insert) {
                i = 0;
                bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) {
-                       struct bkey_s new_s;
-                       new_s.k = (void *) new.k;
-                       new_s.v = (void *) new.v;
-
                        if (((1U << i) & m->data_opts.rewrite_ptrs) &&
                            (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
                            !ptr->cached)
@@ -165,11 +162,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
                        if (((1U << i) & m->data_opts.rewrite_ptrs) &&
                            (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
                            !ptr->cached) {
-                               bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
-                               /*
-                                * See comment below:
                                bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
-                               */
                                rewrites_found |= 1U << i;
                        }
                        i++;
@@ -215,14 +208,8 @@ restart_drop_extra_replicas:
                        if (!p.ptr.cached &&
                            durability - ptr_durability >= m->op.opts.data_replicas) {
                                durability -= ptr_durability;
-                               bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
-                               /*
-                                * Currently, we're dropping unneeded replicas
-                                * instead of marking them as cached, since
-                                * cached data in stripe buckets prevents them
-                                * from being reused:
+
                                bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
-                               */
                                goto restart_drop_extra_replicas;
                        }
                }
@@ -252,19 +239,47 @@ restart_drop_extra_replicas:
 
                next_pos = insert->k.p;
 
+               /*
+                * Check for nonce offset inconsistency:
+                * This is debug code - we've been seeing this bug rarely, and
+                * it's been hard to reproduce, so this should give us some more
+                * information when it does occur:
+                */
+               struct printbuf err = PRINTBUF;
+               int invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), 0, &err);
+               printbuf_exit(&err);
+
+               if (invalid) {
+                       struct printbuf buf = PRINTBUF;
+
+                       prt_str(&buf, "about to insert invalid key in data update path");
+                       prt_str(&buf, "\nold: ");
+                       bch2_bkey_val_to_text(&buf, c, old);
+                       prt_str(&buf, "\nk:   ");
+                       bch2_bkey_val_to_text(&buf, c, k);
+                       prt_str(&buf, "\nnew: ");
+                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+                       bch2_print_string_as_lines(KERN_ERR, buf.buf);
+                       printbuf_exit(&buf);
+
+                       bch2_fatal_error(c);
+                       goto out;
+               }
+
                ret =   bch2_insert_snapshot_whiteouts(trans, m->btree_id,
                                                k.k->p, bkey_start_pos(&insert->k)) ?:
                        bch2_insert_snapshot_whiteouts(trans, m->btree_id,
-                                               k.k->p, insert->k.p);
-               if (ret)
-                       goto err;
-
-               ret   = bch2_trans_update(trans, &iter, insert,
+                                               k.k->p, insert->k.p) ?:
+                       bch2_bkey_set_needs_rebalance(c, insert,
+                                                     op->opts.background_target,
+                                                     op->opts.background_compression) ?:
+                       bch2_trans_update(trans, &iter, insert,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(trans, &op->res,
                                NULL,
-                               BTREE_INSERT_NOCHECK_RW|
-                               BTREE_INSERT_NOFAIL|
+                               BCH_TRANS_COMMIT_no_check_rw|
+                               BCH_TRANS_COMMIT_no_enospc|
                                m->data_opts.btree_insert_flags);
                if (!ret) {
                        bch2_btree_iter_set_pos(&iter, next_pos);
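
The combined update above leans on GCC's `?:` extension: `x ?: y` yields x when x is nonzero (without re-evaluating it), so a chain of 0-on-success calls runs each step in order and stops at the first error code, replacing the old explicit `if (ret) goto err;` ladder. In miniature:

    #include <stdio.h>

    static int step1(void) { return 0; }    /* success */
    static int step2(void) { return -5; }   /* fails; step3 is never run */
    static int step3(void) { return 0; }

    int main(void)
    {
            int ret = step1() ?: step2() ?: step3();

            printf("%d\n", ret);            /* -5 */
            return 0;
    }
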
@@ -285,11 +300,11 @@ next:
                }
                continue;
 nowork:
-               if (m->ctxt && m->ctxt->stats) {
+               if (m->stats) {
                        BUG_ON(k.k->p.offset <= iter.pos.offset);
-                       atomic64_inc(&m->ctxt->stats->keys_raced);
+                       atomic64_inc(&m->stats->keys_raced);
                        atomic64_add(k.k->p.offset - iter.pos.offset,
-                                    &m->ctxt->stats->sectors_raced);
+                                    &m->stats->sectors_raced);
                }
 
                this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
@@ -307,7 +322,7 @@ out:
 
 int bch2_data_update_index_update(struct bch_write_op *op)
 {
-       return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op));
+       return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op));
 }
 
 void bch2_data_update_read_done(struct data_update *m,
@@ -415,7 +430,7 @@ void bch2_update_unwritten_extent(struct btree_trans *trans,
                        break;
        }
 
-       if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
+       if (closure_nr_remaining(&cl) != 1) {
                bch2_trans_unlock(trans);
                closure_sync(&cl);
        }
@@ -443,6 +458,8 @@ int bch2_data_update_init(struct btree_trans *trans,
        bch2_bkey_buf_reassemble(&m->k, c, k);
        m->btree_id     = btree_id;
        m->data_opts    = data_opts;
+       m->ctxt         = ctxt;
+       m->stats        = ctxt ? ctxt->stats : NULL;
 
        bch2_write_op_init(&m->op, c, io_opts);
        m->op.pos       = bkey_start_pos(k.k);
@@ -491,7 +508,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 
                if (c->opts.nocow_enabled) {
                        if (ctxt) {
-                               move_ctxt_wait_event(ctxt, trans,
+                               move_ctxt_wait_event(ctxt,
                                                (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
                                                                          PTR_BUCKET_POS(c, &p.ptr), 0)) ||
                                                !atomic_read(&ctxt->read_sectors));
index 49e9055cbb5262a642532ea75856b1df10e5101f..9dc17b9d83795181798deb5af39401d4d6248581 100644 (file)
@@ -4,7 +4,7 @@
 #define _BCACHEFS_DATA_UPDATE_H
 
 #include "bkey_buf.h"
-#include "io_types.h"
+#include "io_write_types.h"
 
 struct moving_context;
 
@@ -23,6 +23,7 @@ struct data_update {
        struct bkey_buf         k;
        struct data_update_opts data_opts;
        struct moving_context   *ctxt;
+       struct bch_move_stats   *stats;
        struct bch_write_op     op;
 };
 
index ae47e1854b80a217c46c49236275afcf90f20ca7..57c5128db173f4579168c71b8c67749b1d63004c 100644 (file)
@@ -19,7 +19,6 @@
 #include "extents.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
 #include "super.h"
 
 #include <linux/console.h>
@@ -154,10 +153,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
        BUG_ON(b->nsets != 1);
 
        for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k))
-               if (k->type == KEY_TYPE_btree_ptr_v2) {
-                       struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
-                       v->mem_ptr = 0;
-               }
+               if (k->type == KEY_TYPE_btree_ptr_v2)
+                       ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0;
 
        v = c->verify_data;
        bkey_copy(&v->key, &b->key);
@@ -322,16 +319,16 @@ static ssize_t flush_buf(struct dump_iter *i)
 {
        if (i->buf.pos) {
                size_t bytes = min_t(size_t, i->buf.pos, i->size);
-               int err = copy_to_user(i->ubuf, i->buf.buf, bytes);
+               int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes);
 
-               if (err)
-                       return err;
+               i->ret   += copied;
+               i->ubuf  += copied;
+               i->size  -= copied;
+               i->buf.pos -= copied;
+               memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos);
 
-               i->ret   += bytes;
-               i->ubuf  += bytes;
-               i->size  -= bytes;
-               i->buf.pos -= bytes;
-               memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos);
+               if (copied != bytes)
+                       return -EFAULT;
        }
 
        return i->size ? 0 : i->ret;
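
The flush_buf() fix hinges on copy_to_user() returning the number of bytes it could *not* copy rather than an errno: the old code returned a positive byte count as an error and lost partially copied data. The new code credits the partial copy first and only then fails with -EFAULT. The corrected pattern in isolation (a schematic sketch, kernel context assumed):

    #include <linux/uaccess.h>

    static ssize_t flush_some(char __user *ubuf, const char *kbuf,
                              size_t bytes, ssize_t *total)
    {
            /* copy_to_user() returns the byte count it could NOT copy. */
            size_t copied = bytes - copy_to_user(ubuf, kbuf, bytes);

            *total += copied;               /* credit what did reach userspace */
            return copied != bytes ? -EFAULT : 0;
    }
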
@@ -369,7 +366,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
                               size_t size, loff_t *ppos)
 {
        struct dump_iter *i = file->private_data;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        ssize_t ret;
@@ -382,17 +379,17 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
        if (ret)
                return ret;
 
-       bch2_trans_init(&trans, i->c, 0, 0);
-       ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+       trans = bch2_trans_get(i->c);
+       ret = for_each_btree_key2(trans, iter, i->id, i->from,
                                  BTREE_ITER_PREFETCH|
                                  BTREE_ITER_ALL_SNAPSHOTS, k, ({
                bch2_bkey_val_to_text(&i->buf, i->c, k);
                prt_newline(&i->buf);
-               drop_locks_do(&trans, flush_buf(i));
+               drop_locks_do(trans, flush_buf(i));
        }));
        i->from = iter.pos;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (!ret)
                ret = flush_buf(i);
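
This hunk is the first of many in this release that convert `struct btree_trans` from a caller-owned stack object to a handle obtained from bch2_trans_get(); the same mechanical rewrite recurs in dirent.c, ec.c and elsewhere below. Schematically (do_work() is a hypothetical stand-in):

    /* Old pattern (removed throughout this commit): */
    struct btree_trans trans;

    bch2_trans_init(&trans, c, 0, 0);
    ret = do_work(&trans);
    bch2_trans_exit(&trans);

    /* New pattern: */
    struct btree_trans *trans = bch2_trans_get(c);

    ret = do_work(trans);
    bch2_trans_put(trans);
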
@@ -411,7 +408,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
                                       size_t size, loff_t *ppos)
 {
        struct dump_iter *i = file->private_data;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        ssize_t ret;
@@ -427,26 +424,26 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
        if (bpos_eq(SPOS_MAX, i->from))
                return i->ret;
 
-       bch2_trans_init(&trans, i->c, 0, 0);
+       trans = bch2_trans_get(i->c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) {
+       for_each_btree_node(trans, iter, i->id, i->from, 0, b, ret) {
                bch2_btree_node_to_text(&i->buf, i->c, b);
                i->from = !bpos_eq(SPOS_MAX, b->key.k.p)
                        ? bpos_successor(b->key.k.p)
                        : b->key.k.p;
 
-               ret = drop_locks_do(&trans, flush_buf(i));
+               ret = drop_locks_do(trans, flush_buf(i));
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (!ret)
                ret = flush_buf(i);
@@ -465,7 +462,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
                                       size_t size, loff_t *ppos)
 {
        struct dump_iter *i = file->private_data;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        ssize_t ret;
@@ -478,9 +475,9 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
        if (ret)
                return ret;
 
-       bch2_trans_init(&trans, i->c, 0, 0);
+       trans = bch2_trans_get(i->c);
 
-       ret = for_each_btree_key2(&trans, iter, i->id, i->from,
+       ret = for_each_btree_key2(trans, iter, i->id, i->from,
                                  BTREE_ITER_PREFETCH|
                                  BTREE_ITER_ALL_SNAPSHOTS, k, ({
                struct btree_path_level *l = &iter.path->l[0];
@@ -493,11 +490,11 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
                }
 
                bch2_bfloat_to_text(&i->buf, l->b, _k);
-               drop_locks_do(&trans, flush_buf(i));
+               drop_locks_do(trans, flush_buf(i));
        }));
        i->from = iter.pos;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (!ret)
                ret = flush_buf(i);
@@ -520,7 +517,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *
 
        prt_printf(out, "%px btree=%s l=%u ",
               b,
-              bch2_btree_ids[b->c.btree_id],
+              bch2_btree_id_str(b->c.btree_id),
               b->c.level);
        prt_newline(out);
 
@@ -922,18 +919,18 @@ void bch2_fs_debug_init(struct bch_fs *c)
             bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
             bd++) {
                bd->id = bd - c->btree_debug;
-               debugfs_create_file(bch2_btree_ids[bd->id],
+               debugfs_create_file(bch2_btree_id_str(bd->id),
                                    0400, c->btree_debug_dir, bd,
                                    &btree_debug_ops);
 
                snprintf(name, sizeof(name), "%s-formats",
-                        bch2_btree_ids[bd->id]);
+                        bch2_btree_id_str(bd->id));
 
                debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
                                    &btree_format_debug_ops);
 
                snprintf(name, sizeof(name), "%s-bfloat-failed",
-                        bch2_btree_ids[bd->id]);
+                        bch2_btree_id_str(bd->id));
 
                debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
                                    &bfloat_failed_debug_ops);
index 065ea59ee9fa5ecd8dcd68f12dd2ef8bd8e29497..0542d9948c24d42d1cf702d044b861b3f55af13e 100644 (file)
 
 #include <linux/dcache.h>
 
-unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
+static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
 {
-       unsigned len = bkey_val_bytes(d.k) -
-               offsetof(struct bch_dirent, d_name);
+       unsigned bkey_u64s = bkey_val_u64s(d.k);
+       unsigned bkey_bytes = bkey_u64s * sizeof(u64);
+       u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1];
+#if CPU_BIG_ENDIAN
+       unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8;
+#else
+       unsigned trailing_nuls = last_u64 ? __builtin_clzll(last_u64) / 8 : 64 / 8;
+#endif
+
+       return bkey_bytes -
+               offsetof(struct bch_dirent, d_name) -
+               trailing_nuls;
+}
 
-       return strnlen(d.v->d_name, len);
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
+{
+       return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
 }
 
 static u64 bch2_dirent_hash(const struct bch_hash_info *info,
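
Dirent names are stored NUL-padded out to a whole number of u64s; the new bch2_dirent_name_bytes() recovers the exact length by counting trailing NUL bytes in the last u64. On a little-endian host those padding bytes occupy the most significant byte positions (hence clz; big-endian uses ctz). A little-endian worked example:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void)
    {
            char pad[8] = "ab";     /* 'a', 'b', then six NUL padding bytes */
            uint64_t last;

            memcpy(&last, pad, 8);  /* 0x0000000000006261 on little-endian */

            /* 49 leading zero bits -> 49/8 = 6 trailing NUL bytes,
             * so the name length is 8 - 6 = 2. */
            assert(__builtin_clzll(last) / 8 == 6);
            return 0;
    }
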
@@ -41,7 +54,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
 static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
 {
        struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-       struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+       struct qstr name = bch2_dirent_get_name(d);
 
        return bch2_dirent_hash(info, &name);
 }
@@ -49,20 +62,20 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
 static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
 {
        struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
-       int len = bch2_dirent_name_bytes(l);
-       const struct qstr *r = _r;
+       const struct qstr l_name = bch2_dirent_get_name(l);
+       const struct qstr *r_name = _r;
 
-       return len - r->len ?: memcmp(l.v->d_name, r->name, len);
+       return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len);
 }
 
 static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
 {
        struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
        struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
-       int l_len = bch2_dirent_name_bytes(l);
-       int r_len = bch2_dirent_name_bytes(r);
+       const struct qstr l_name = bch2_dirent_get_name(l);
+       const struct qstr r_name = bch2_dirent_get_name(r);
 
-       return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
+       return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len);
 }
 
 static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
@@ -84,63 +97,62 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
        .is_visible     = dirent_is_visible,
 };
 
-int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_dirent_invalid(struct bch_fs *c, struct bkey_s_c k,
                        enum bkey_invalid_flags flags,
                        struct printbuf *err)
 {
        struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
-       unsigned len;
-
-       len = bch2_dirent_name_bytes(d);
-       if (!len) {
-               prt_printf(err, "empty name");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) {
-               prt_printf(err, "value too big (%zu > %u)",
-                      bkey_val_u64s(k.k), dirent_val_u64s(len));
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (len > BCH_NAME_MAX) {
-               prt_printf(err, "dirent name too big (%u > %u)",
-                      len, BCH_NAME_MAX);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (len == 1 && !memcmp(d.v->d_name, ".", 1)) {
-               prt_printf(err, "invalid name");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (len == 2 && !memcmp(d.v->d_name, "..", 2)) {
-               prt_printf(err, "invalid name");
-               return -BCH_ERR_invalid_bkey;
-       }
+       struct qstr d_name = bch2_dirent_get_name(d);
+       int ret = 0;
 
-       if (memchr(d.v->d_name, '/', len)) {
-               prt_printf(err, "invalid name");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(!d_name.len, c, err,
+                        dirent_empty_name,
+                        "empty name");
 
-       if (d.v->d_type != DT_SUBVOL &&
-           le64_to_cpu(d.v->d_inum) == d.k->p.inode) {
-               prt_printf(err, "dirent points to own directory");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), c, err,
+                        dirent_val_too_big,
+                        "value too big (%zu > %u)",
+                        bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
 
-       return 0;
+       /*
+        * Check new keys don't exceed the max length
+        * (older keys may be larger.)
+        */
+       bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX, c, err,
+                        dirent_name_too_long,
+                        "dirent name too big (%u > %u)",
+                        d_name.len, BCH_NAME_MAX);
+
+       bkey_fsck_err_on(d_name.len != strnlen(d_name.name, d_name.len), c, err,
+                        dirent_name_embedded_nul,
+                        "dirent has stray data after name's NUL");
+
+       bkey_fsck_err_on((d_name.len == 1 && !memcmp(d_name.name, ".", 1)) ||
+                        (d_name.len == 2 && !memcmp(d_name.name, "..", 2)), c, err,
+                        dirent_name_dot_or_dotdot,
+                        "invalid name");
+
+       bkey_fsck_err_on(memchr(d_name.name, '/', d_name.len), c, err,
+                        dirent_name_has_slash,
+                        "name with /");
+
+       bkey_fsck_err_on(d.v->d_type != DT_SUBVOL &&
+                        le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, err,
+                        dirent_to_itself,
+                        "dirent points to own directory");
+fsck_err:
+       return ret;
 }
 
 void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
                         struct bkey_s_c k)
 {
        struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+       struct qstr d_name = bch2_dirent_get_name(d);
 
        prt_printf(out, "%.*s -> %llu type %s",
-              bch2_dirent_name_bytes(d),
-              d.v->d_name,
+              d_name.len,
+              d_name.name,
               d.v->d_type != DT_SUBVOL
               ? le64_to_cpu(d.v->d_inum)
               : le32_to_cpu(d.v->d_child_subvol),
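
Judging from the surrounding rewrite (the new `int ret = 0` and the `fsck_err:` label), each bkey_fsck_err_on() folds the old four-line pattern into one call: test the condition, print into *err, record a named error type, set ret and jump to fsck_err. Roughly what the first check above replaces:

    if (!d_name.len) {
            prt_printf(err, "empty name");
            ret = -BCH_ERR_invalid_bkey;    /* now tagged dirent_empty_name */
            goto fsck_err;
    }
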
@@ -189,7 +201,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
                       const struct bch_hash_info *hash_info,
                       u8 type, const struct qstr *name, u64 dst_inum,
-                      u64 *dir_offset, int flags)
+                      u64 *dir_offset,
+                      bch_str_hash_flags_t str_hash_flags)
 {
        struct bkey_i_dirent *dirent;
        int ret;
@@ -200,7 +213,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
                return ret;
 
        ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-                           dir, &dirent->k_i, flags);
+                           dir, &dirent->k_i, str_hash_flags);
        *dir_offset = dirent->k.p.offset;
 
        return ret;
@@ -457,21 +470,19 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
                       const struct bch_hash_info *hash_info,
                       const struct qstr *name, subvol_inum *inum)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+       ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
                                          name, inum, 0);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (!ret)
-               bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+               bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -500,25 +511,25 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
 
 int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_dirent dirent;
        subvol_inum target;
        u32 snapshot;
        struct bkey_buf sk;
+       struct qstr name;
        int ret;
 
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents,
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents,
                           SPOS(inum.inum, ctx->pos, snapshot),
                           POS(inum.inum, U64_MAX), 0, k, ret) {
                if (k.k->type != KEY_TYPE_dirent)
@@ -526,7 +537,7 @@ retry:
 
                dirent = bkey_s_c_to_dirent(k);
 
-               ret = bch2_dirent_read_target(&trans, inum, dirent, &target);
+               ret = bch2_dirent_read_target(trans, inum, dirent, &target);
                if (ret < 0)
                        break;
                if (ret)
@@ -535,11 +546,13 @@ retry:
                /* dir_emit() can fault and block: */
                bch2_bkey_buf_reassemble(&sk, c, k);
                dirent = bkey_i_to_s_c_dirent(sk.k);
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
+
+               name = bch2_dirent_get_name(dirent);
 
                ctx->pos = dirent.k->p.offset;
-               if (!dir_emit(ctx, dirent.v->d_name,
-                             bch2_dirent_name_bytes(dirent),
+               if (!dir_emit(ctx, name.name,
+                             name.len,
                              target.inum,
                              vfs_d_type(dirent.v->d_type)))
                        break;
@@ -549,16 +562,16 @@ retry:
                 * read_target looks up subvolumes, we can overflow paths if the
                 * directory has many subvolumes in it
                 */
-               ret = btree_trans_too_many_iters(&trans);
+               ret = btree_trans_too_many_iters(trans);
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&sk, c);
 
        return ret;
index b42f4a13bc551debad140c8a6e59e39d2006a774..8a55245547ba0ad079568db4b4bb9aab679652f2 100644 (file)
@@ -7,7 +7,7 @@
 enum bkey_invalid_flags;
 extern const struct bch_hash_desc bch2_dirent_hash_desc;
 
-int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_dirent_invalid(struct bch_fs *, struct bkey_s_c,
                        enum bkey_invalid_flags, struct printbuf *);
 void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
@@ -24,7 +24,7 @@ struct bch_fs;
 struct bch_hash_info;
 struct bch_inode_info;
 
-unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
 
 static inline unsigned dirent_val_u64s(unsigned len)
 {
@@ -37,7 +37,8 @@ int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
 
 int bch2_dirent_create(struct btree_trans *, subvol_inum,
                       const struct bch_hash_info *, u8,
-                      const struct qstr *, u64, u64 *, int);
+                      const struct qstr *, u64, u64 *,
+                      bch_str_hash_flags_t);
 
 static inline unsigned vfs_d_type(unsigned type)
 {
index de14ca3a9895ddc2d756776252fde5dd0480fd03..4d0cb0ccff32f2c75fa66f932f517f00b9cfdf25 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "disk_groups.h"
+#include "sb-members.h"
 #include "super-io.h"
 
 #include <linux/sort.h>
@@ -24,28 +25,27 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
        struct bch_sb_field_disk_groups *groups =
                field_to_type(f, disk_groups);
        struct bch_disk_group *g, *sorted = NULL;
-       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        unsigned nr_groups = disk_groups_nr(groups);
        unsigned i, len;
        int ret = 0;
 
        for (i = 0; i < sb->nr_devices; i++) {
-               struct bch_member *m = mi->members + i;
-               unsigned g;
+               struct bch_member m = bch2_sb_member_get(sb, i);
+               unsigned group_id;
 
-               if (!BCH_MEMBER_GROUP(m))
+               if (!BCH_MEMBER_GROUP(&m))
                        continue;
 
-               g = BCH_MEMBER_GROUP(m) - 1;
+               group_id = BCH_MEMBER_GROUP(&m) - 1;
 
-               if (g >= nr_groups) {
+               if (group_id >= nr_groups) {
                        prt_printf(err, "disk %u has invalid label %u (have %u)",
-                              i, g, nr_groups);
+                                  i, group_id, nr_groups);
                        return -BCH_ERR_invalid_sb_disk_groups;
                }
 
-               if (BCH_GROUP_DELETED(&groups->entries[g])) {
-                       prt_printf(err, "disk %u has deleted label %u", i, g);
+               if (BCH_GROUP_DELETED(&groups->entries[group_id])) {
+                       prt_printf(err, "disk %u has deleted label %u", i, group_id);
                        return -BCH_ERR_invalid_sb_disk_groups;
                }
        }
@@ -151,22 +151,19 @@ const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
 
 int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
 {
-       struct bch_sb_field_members *mi;
        struct bch_sb_field_disk_groups *groups;
        struct bch_disk_groups_cpu *cpu_g, *old_g;
        unsigned i, g, nr_groups;
 
        lockdep_assert_held(&c->sb_lock);
 
-       mi              = bch2_sb_get_members(c->disk_sb.sb);
-       groups          = bch2_sb_get_disk_groups(c->disk_sb.sb);
+       groups          = bch2_sb_field_get(c->disk_sb.sb, disk_groups);
        nr_groups       = disk_groups_nr(groups);
 
        if (!groups)
                return 0;
 
-       cpu_g = kzalloc(sizeof(*cpu_g) +
-                       sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+       cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL);
        if (!cpu_g)
                return -BCH_ERR_ENOMEM_disk_groups_to_cpu;
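
struct_size() (from <linux/overflow.h>) computes sizeof(*cpu_g) + nr_groups * sizeof(cpu_g->entries[0]) for the flexible entries[] array, but saturates on overflow, so an absurd nr_groups makes the allocation fail instead of coming back undersized. The general shape, with a hypothetical struct:

    #include <linux/overflow.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct flex {
            unsigned        nr;
            u64             entries[];      /* flexible array member */
    };

    static struct flex *flex_alloc(unsigned nr)
    {
            /* sizeof(struct flex) + nr * sizeof(u64), overflow-checked: */
            struct flex *f = kzalloc(struct_size(f, entries, nr), GFP_KERNEL);

            if (f)
                    f->nr = nr;
            return f;
    }
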
 
@@ -178,17 +175,17 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
 
                dst->deleted    = BCH_GROUP_DELETED(src);
                dst->parent     = BCH_GROUP_PARENT(src);
+               memcpy(dst->label, src->label, sizeof(dst->label));
        }
 
        for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
-               struct bch_member *m = mi->members + i;
-               struct bch_disk_group_cpu *dst =
-                       &cpu_g->entries[BCH_MEMBER_GROUP(m)];
+               struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, i);
+               struct bch_disk_group_cpu *dst;
 
-               if (!bch2_member_exists(m))
+               if (!bch2_member_exists(&m))
                        continue;
 
-               g = BCH_MEMBER_GROUP(m);
+               g = BCH_MEMBER_GROUP(&m);
                while (g) {
                        dst = &cpu_g->entries[g - 1];
                        __set_bit(i, dst->devs.d);
@@ -299,7 +296,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
                                 const char *name, unsigned namelen)
 {
        struct bch_sb_field_disk_groups *groups =
-               bch2_sb_get_disk_groups(sb->sb);
+               bch2_sb_field_get(sb->sb, disk_groups);
        unsigned i, nr_groups = disk_groups_nr(groups);
        struct bch_disk_group *g;
 
@@ -317,7 +314,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
                         sizeof(struct bch_disk_group) * (nr_groups + 1)) /
                        sizeof(u64);
 
-               groups = bch2_sb_resize_disk_groups(sb, u64s);
+               groups = bch2_sb_field_resize(sb, disk_groups, u64s);
                if (!groups)
                        return -BCH_ERR_ENOSPC_disk_label_add;
 
@@ -341,7 +338,7 @@ static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
 int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
 {
        struct bch_sb_field_disk_groups *groups =
-               bch2_sb_get_disk_groups(sb->sb);
+               bch2_sb_field_get(sb->sb, disk_groups);
        int v = -1;
 
        do {
@@ -371,7 +368,7 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
                if (*next == '.')
                        next++;
 
-               groups = bch2_sb_get_disk_groups(sb->sb);
+               groups = bch2_sb_field_get(sb->sb, disk_groups);
 
                v = __bch2_disk_group_find(groups, parent, name, len);
                if (v < 0)
@@ -386,10 +383,60 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
        return v;
 }
 
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+       struct bch_disk_groups_cpu *groups;
+       struct bch_disk_group_cpu *g;
+       unsigned nr = 0;
+       u16 path[32];
+
+       out->atomic++;
+       rcu_read_lock();
+       groups = rcu_dereference(c->disk_groups);
+       if (!groups)
+               goto invalid;
+
+       while (1) {
+               if (nr == ARRAY_SIZE(path))
+                       goto invalid;
+
+               if (v >= groups->nr)
+                       goto invalid;
+
+               g = groups->entries + v;
+
+               if (g->deleted)
+                       goto invalid;
+
+               path[nr++] = v;
+
+               if (!g->parent)
+                       break;
+
+               v = g->parent - 1;
+       }
+
+       while (nr) {
+               v = path[--nr];
+               g = groups->entries + v;
+
+               prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+               if (nr)
+                       prt_printf(out, ".");
+       }
+out:
+       rcu_read_unlock();
+       out->atomic--;
+       return;
+invalid:
+       prt_printf(out, "invalid label %u", v);
+       goto out;
+}
+
+void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
 {
        struct bch_sb_field_disk_groups *groups =
-               bch2_sb_get_disk_groups(sb);
+               bch2_sb_field_get(sb, disk_groups);
        struct bch_disk_group *g;
        unsigned nr = 0;
        u16 path[32];
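
bch2_disk_path_to_text() turns a leaf label into its dotted path by walking parent pointers into a small on-stack array, then printing it back in reverse; a child group `fast` under `ssd` comes out as "ssd.fast". The same walk in miniature (hypothetical standalone types, not the bcachefs structures):

    #include <stdio.h>

    struct group { const char *label; unsigned parent; }; /* parent = index + 1, 0 = none */

    static void path_to_text(const struct group *g, unsigned v)
    {
            unsigned path[32], nr = 0;

            do {
                    path[nr++] = v;         /* leaf first... */
                    v = g[v].parent - 1;
            } while (g[path[nr - 1]].parent && nr < 32);

            while (nr--)                    /* ...printed root first */
                    printf("%s%s", g[path[nr]].label, nr ? "." : "\n");
    }

    int main(void)
    {
            const struct group groups[] = {
                    { "ssd",  0 },          /* root label */
                    { "fast", 1 },          /* child of groups[0] */
            };

            path_to_text(groups, 1);        /* prints "ssd.fast" */
            return 0;
    }
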
@@ -443,7 +490,7 @@ int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
        if (ret)
                return ret;
 
-       mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+       mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
        SET_BCH_MEMBER_GROUP(mi, v + 1);
        return 0;
 }
@@ -497,10 +544,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
        return -EINVAL;
 }
 
-void bch2_opt_target_to_text(struct printbuf *out,
-                            struct bch_fs *c,
-                            struct bch_sb *sb,
-                            u64 v)
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
 {
        struct target t = target_decode(v);
 
@@ -508,48 +552,71 @@ void bch2_opt_target_to_text(struct printbuf *out,
        case TARGET_NULL:
                prt_printf(out, "none");
                break;
-       case TARGET_DEV:
-               if (c) {
-                       struct bch_dev *ca;
-
-                       rcu_read_lock();
-                       ca = t.dev < c->sb.nr_devices
-                               ? rcu_dereference(c->devs[t.dev])
-                               : NULL;
-
-                       if (ca && percpu_ref_tryget(&ca->io_ref)) {
-                               prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
-                               percpu_ref_put(&ca->io_ref);
-                       } else if (ca) {
-                               prt_printf(out, "offline device %u", t.dev);
-                       } else {
-                               prt_printf(out, "invalid device %u", t.dev);
-                       }
-
-                       rcu_read_unlock();
+       case TARGET_DEV: {
+               struct bch_dev *ca;
+
+               out->atomic++;
+               rcu_read_lock();
+               ca = t.dev < c->sb.nr_devices
+                       ? rcu_dereference(c->devs[t.dev])
+                       : NULL;
+
+               if (ca && percpu_ref_tryget(&ca->io_ref)) {
+                       prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+                       percpu_ref_put(&ca->io_ref);
+               } else if (ca) {
+                       prt_printf(out, "offline device %u", t.dev);
                } else {
-                       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
-                       struct bch_member *m = mi->members + t.dev;
-
-                       if (bch2_dev_exists(sb, mi, t.dev)) {
-                               prt_printf(out, "Device ");
-                               pr_uuid(out, m->uuid.b);
-                               prt_printf(out, " (%u)", t.dev);
-                       } else {
-                               prt_printf(out, "Bad device %u", t.dev);
-                       }
+                       prt_printf(out, "invalid device %u", t.dev);
                }
+
+               rcu_read_unlock();
+               out->atomic--;
                break;
+       }
        case TARGET_GROUP:
-               if (c) {
-                       mutex_lock(&c->sb_lock);
-                       bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
-                       mutex_unlock(&c->sb_lock);
+               bch2_disk_path_to_text(out, c, t.group);
+               break;
+       default:
+               BUG();
+       }
+}
+
+static void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+       struct target t = target_decode(v);
+
+       switch (t.type) {
+       case TARGET_NULL:
+               prt_printf(out, "none");
+               break;
+       case TARGET_DEV: {
+               struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+               if (bch2_dev_exists(sb, t.dev)) {
+                       prt_printf(out, "Device ");
+                       pr_uuid(out, m.uuid.b);
+                       prt_printf(out, " (%u)", t.dev);
                } else {
-                       bch2_disk_path_to_text(out, sb, t.group);
+                       prt_printf(out, "Bad device %u", t.dev);
                }
                break;
+       }
+       case TARGET_GROUP:
+               bch2_disk_path_to_text_sb(out, sb, t.group);
+               break;
        default:
                BUG();
        }
 }
+
+void bch2_opt_target_to_text(struct printbuf *out,
+                            struct bch_fs *c,
+                            struct bch_sb *sb,
+                            u64 v)
+{
+       if (c)
+               bch2_target_to_text(out, c, v);
+       else
+               bch2_target_to_text_sb(out, sb, v);
+}
index bd7711767fd4f95537fb2ed38d615fdf6aeec250..441826fff224369b79698442e6b314cf5331c02c 100644 (file)
@@ -2,6 +2,8 @@
 #ifndef _BCACHEFS_DISK_GROUPS_H
 #define _BCACHEFS_DISK_GROUPS_H
 
+#include "disk_groups_types.h"
+
 extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
 
 static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
@@ -83,7 +85,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
 /* Exported for userspace bcachefs-tools: */
 int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
 
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
+void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
 
 int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
 void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
index f58e84a2bf88eb09772d5da414ebdb29906f93b7..c730f0933d29a9f63aec199385914fad6d083762 100644 (file)
 #include "btree_update.h"
 #include "btree_write_buffer.h"
 #include "buckets.h"
+#include "checksum.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
+#include "io_read.h"
 #include "keylist.h"
 #include "recovery.h"
 #include "replicas.h"
@@ -104,29 +105,26 @@ struct ec_bio {
 
 /* Stripes btree keys: */
 
-int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_stripe_invalid(struct bch_fs *c, struct bkey_s_c k,
                        enum bkey_invalid_flags flags,
                        struct printbuf *err)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+       int ret = 0;
 
-       if (bkey_eq(k.k->p, POS_MIN)) {
-               prt_printf(err, "stripe at POS_MIN");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (k.k->p.inode) {
-               prt_printf(err, "nonzero inode field");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(bkey_eq(k.k->p, POS_MIN) ||
+                        bpos_gt(k.k->p, POS(0, U32_MAX)), c, err,
+                        stripe_pos_bad,
+                        "stripe at bad pos");
 
-       if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) {
-               prt_printf(err, "incorrect value size (%zu < %u)",
-                      bkey_val_u64s(k.k), stripe_val_u64s(s));
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(bkey_val_u64s(k.k) < stripe_val_u64s(s), c, err,
+                        stripe_val_size_bad,
+                        "incorrect value size (%zu < %u)",
+                        bkey_val_u64s(k.k), stripe_val_u64s(s));
 
-       return bch2_bkey_ptrs_invalid(c, k, flags, err);
+       ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+       return ret;
 }
 
 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
@@ -152,6 +150,7 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
                if (i < nr_data)
                        prt_printf(out, "#%u", stripe_blockcount_get(s, i));
+               prt_printf(out, " gen %u", ptr->gen);
                if (ptr_stale(ca, ptr))
                        prt_printf(out, " stale");
        }
@@ -305,16 +304,21 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
                        struct bch_csum got = ec_block_checksum(buf, i, offset);
 
                        if (bch2_crc_cmp(want, got)) {
-                               struct printbuf buf2 = PRINTBUF;
+                               struct printbuf err = PRINTBUF;
+                               struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
+
+                               prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
+                                          want.hi, want.lo,
+                                          got.hi, got.lo,
+                                          bch2_csum_types[v->csum_type]);
+                               prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
+                               bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
+                               bch_err_ratelimited(ca, "%s", err.buf);
+                               printbuf_exit(&err);
 
-                               bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key));
-
-                               bch_err_ratelimited(c,
-                                       "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
-                                       (void *) _RET_IP_, i, j, v->csum_type,
-                                       want.lo, got.lo, buf2.buf);
-                               printbuf_exit(&buf2);
                                clear_bit(i, buf->valid);
+
+                               bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
                                break;
                        }
 
@@ -372,7 +376,11 @@ static void ec_block_endio(struct bio *bio)
        struct bch_dev *ca = ec_bio->ca;
        struct closure *cl = bio->bi_private;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca,
+                              bio_data_dir(bio)
+                              ? BCH_MEMBER_ERROR_write
+                              : BCH_MEMBER_ERROR_read,
+                              "erasure coding %s error: %s",
                               bio_data_dir(bio) ? "write" : "read",
                               bch2_blk_status_to_str(bio->bi_status)))
                clear_bit(ec_bio->idx, ec_bio->buf->valid);
@@ -473,14 +481,10 @@ err:
        return ret;
 }
 
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
-{
-       return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe));
-}
-
 /* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio)
 {
+       struct bch_fs *c = trans->c;
        struct ec_stripe_buf *buf;
        struct closure cl;
        struct bch_stripe *v;
@@ -495,7 +499,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
        if (!buf)
                return -BCH_ERR_ENOMEM_ec_read_extent;
 
-       ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+       ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf));
        if (ret) {
                bch_err_ratelimited(c,
                        "error doing reconstruct read: error %i looking up stripe", ret);
@@ -787,12 +791,10 @@ static void ec_stripe_delete_work(struct work_struct *work)
 {
        struct bch_fs *c =
                container_of(work, struct bch_fs, ec_stripe_delete_work);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        int ret;
        u64 idx;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        while (1) {
                mutex_lock(&c->ec_stripes_heap_lock);
                idx = stripe_idx_to_delete(c);
@@ -801,15 +803,15 @@ static void ec_stripe_delete_work(struct work_struct *work)
                if (!idx)
                        break;
 
-               ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL,
-                               ec_stripe_delete(&trans, idx));
+               ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                               ec_stripe_delete(trans, idx));
                if (ret) {
                        bch_err_fn(c, ret);
                        break;
                }
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
 }
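
The hunks above also show this commit's transaction-API conversion: the
stack-allocated struct btree_trans with bch2_trans_init()/bch2_trans_exit() is
replaced by heap-allocated bch2_trans_get()/bch2_trans_put(), and the
BTREE_INSERT_* commit flags become BCH_TRANS_COMMIT_*. A minimal sketch of the
new pattern (do_lookup() and do_update() are hypothetical stand-ins):

    static int example(struct bch_fs *c)
    {
            struct btree_trans *trans = bch2_trans_get(c); /* was bch2_trans_init() */
            int ret;

            /* lockrestart_do() retries the expression across transaction restarts: */
            ret = lockrestart_do(trans, do_lookup(trans));

            /* commit_do() wraps a retry loop around an update plus commit: */
            if (!ret)
                    ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                                    do_update(trans));

            bch2_trans_put(trans);                         /* was bch2_trans_exit() */
            return ret;
    }
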
@@ -981,8 +983,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
        while (1) {
                ret = commit_do(trans, NULL, NULL,
-                               BTREE_INSERT_NOCHECK_RW|
-                               BTREE_INSERT_NOFAIL,
+                               BCH_TRANS_COMMIT_no_check_rw|
+                               BCH_TRANS_COMMIT_no_enospc,
                        ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
                                                s, &bp_pos));
                if (ret)
@@ -998,24 +1000,22 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
 
 static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       ret = bch2_btree_write_buffer_flush(&trans);
+       ret = bch2_btree_write_buffer_flush(trans);
        if (ret)
                goto err;
 
        for (i = 0; i < nr_data; i++) {
-               ret = ec_stripe_update_bucket(&trans, s, i);
+               ret = ec_stripe_update_bucket(trans, s, i);
                if (ret)
                        break;
        }
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -1121,9 +1121,9 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        }
 
        ret = bch2_trans_do(c, &s->res, NULL,
-                           BTREE_INSERT_NOCHECK_RW|
-                           BTREE_INSERT_NOFAIL,
-                           ec_stripe_key_update(&trans,
+                           BCH_TRANS_COMMIT_no_check_rw|
+                           BCH_TRANS_COMMIT_no_enospc,
+                           ec_stripe_key_update(trans,
                                        bkey_i_to_stripe(&s->new_stripe.key),
                                        !s->have_existing_stripe));
        if (ret) {
@@ -1133,8 +1133,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 
        ret = ec_stripe_update_extents(c, &s->new_stripe);
        if (ret) {
-               bch_err(c, "error creating stripe: error updating pointers: %s",
-                       bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating stripe: error updating pointers");
                goto err;
        }
 err:
@@ -1374,6 +1373,15 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
                        h->nr_active_devs++;
 
        rcu_read_unlock();
+
+       /*
+        * If we only have redundancy + 1 devices, we're better off with just
+        * replication:
+        */
+       if (h->nr_active_devs < h->redundancy + 2)
+               bch_err(c, "insufficient devices available to create stripe (have %u, need %u) - mismatched bucket sizes?",
+                       h->nr_active_devs, h->redundancy + 2);
+
        list_add(&h->list, &c->ec_stripe_head_list);
        return h;
 }
@@ -1425,6 +1433,11 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
 
        h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark);
 found:
+       if (!IS_ERR_OR_NULL(h) &&
+           h->nr_active_devs < h->redundancy + 2) {
+               mutex_unlock(&h->lock);
+               h = NULL;
+       }
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
 }
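
The arithmetic behind the new redundancy + 2 check: a stripe spread across
nr_active_devs devices has nr_data = nr_active_devs - redundancy data blocks,
and with nr_data == 1 (i.e. nr_active_devs == redundancy + 1) the stripe holds
no more user data than (redundancy + 1)-way replication while still paying the
stripe-metadata overhead, so erasure coding only pays off from redundancy + 2
devices up. As a standalone sketch:

    /* Illustration only, not from this commit: */
    static bool ec_stripe_worthwhile(unsigned nr_active_devs, unsigned redundancy)
    {
            unsigned nr_data = nr_active_devs - redundancy;

            return nr_data >= 2; /* same as nr_active_devs >= redundancy + 2 */
    }
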
@@ -1682,8 +1695,6 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
        int ret;
 
        h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark);
-       if (!h)
-               bch_err(c, "no stripe head");
        if (IS_ERR_OR_NULL(h))
                return h;
 
@@ -1822,7 +1833,7 @@ void bch2_fs_ec_flush(struct bch_fs *c)
 
 int bch2_stripes_read(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        const struct bch_stripe *s;
@@ -1830,9 +1841,7 @@ int bch2_stripes_read(struct bch_fs *c)
        unsigned i;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                if (k.k->type != KEY_TYPE_stripe)
                        continue;
@@ -1855,9 +1864,9 @@ int bch2_stripes_read(struct bch_fs *c)
 
                bch2_stripes_heap_insert(c, m, k.k->p.offset);
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
index 885ae5d5165587e4f72312401edcf1b67dcff964..7d0237c9819f1a42561f5ec81512e1c4278d12fd 100644 (file)
@@ -8,7 +8,7 @@
 
 enum bkey_invalid_flags;
 
-int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
                        enum bkey_invalid_flags, struct printbuf *);
 void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
                         struct bkey_s_c);
@@ -199,7 +199,7 @@ struct ec_stripe_head {
        struct ec_stripe_new    *s;
 };
 
-int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *);
 
 void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
 
@@ -240,7 +240,7 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
                        bch2_ec_do_stripe_creates(c);
                        break;
                default:
-                       unreachable();
+                       BUG();
                }
 }
 
diff --git a/libbcachefs/errcode.c b/libbcachefs/errcode.c
index dc906fc9176fecf9f8a7240ff78780a742367735..d260ff9bbfeb7b9121f222a4362f37f95c927977 100644 (file)
@@ -12,8 +12,6 @@ static const char * const bch2_errcode_strs[] = {
        NULL
 };
 
-#define BCH_ERR_0      0
-
 static unsigned bch2_errcode_parents[] = {
 #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
        BCH_ERRCODES()
@@ -61,3 +59,10 @@ int __bch2_err_class(int err)
 
        return -err;
 }
+
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+       if (status == BLK_STS_REMOVED)
+               return "device removed";
+       return blk_status_to_str(status);
+}
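
BLK_STS_REMOVED is a bcachefs-private status (defined in errcode.h below), so
the wrapper has to translate it itself before deferring to the block layer's
blk_status_to_str(). Illustrative use, along the lines of ec_block_endio()
above:

    /* format a bio completion status for a log message: */
    bch_err_dev_ratelimited(ca, "write error: %s",
                            bch2_blk_status_to_str(bio->bi_status));
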
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 735eb24161139931ca1437d707389fe767b073ac..e5c3262cc3032d33e561b21a4a380568ae38f917 100644 (file)
@@ -3,6 +3,8 @@
 #define _BCACHEFS_ERRCODE_H
 
 #define BCH_ERRCODES()                                                         \
+       x(ERANGE,                       ERANGE_option_too_small)                \
+       x(ERANGE,                       ERANGE_option_too_big)                  \
        x(ENOMEM,                       ENOMEM_stripe_buf)                      \
        x(ENOMEM,                       ENOMEM_replicas_table)                  \
        x(ENOMEM,                       ENOMEM_cpu_replicas)                    \
@@ -71,7 +73,6 @@
        x(ENOMEM,                       ENOMEM_fsck_add_nlink)                  \
        x(ENOMEM,                       ENOMEM_journal_key_insert)              \
        x(ENOMEM,                       ENOMEM_journal_keys_sort)               \
-       x(ENOMEM,                       ENOMEM_journal_replay)                  \
        x(ENOMEM,                       ENOMEM_read_superblock_clean)           \
        x(ENOMEM,                       ENOMEM_fs_alloc)                        \
        x(ENOMEM,                       ENOMEM_fs_name_alloc)                   \
@@ -91,6 +92,7 @@
        x(ENOSPC,                       ENOSPC_sb_quota)                        \
        x(ENOSPC,                       ENOSPC_sb_replicas)                     \
        x(ENOSPC,                       ENOSPC_sb_members)                      \
+       x(ENOSPC,                       ENOSPC_sb_members_v2)                   \
        x(ENOSPC,                       ENOSPC_sb_crypt)                        \
        x(ENOSPC,                       ENOSPC_btree_slot)                      \
        x(ENOSPC,                       ENOSPC_snapshot_tree)                   \
        x(ENOENT,                       ENOENT_str_hash_set_must_replace)       \
        x(ENOENT,                       ENOENT_inode)                           \
        x(ENOENT,                       ENOENT_not_subvol)                      \
+       x(ENOENT,                       ENOENT_not_directory)                   \
        x(ENOENT,                       ENOENT_directory_dead)                  \
        x(ENOENT,                       ENOENT_subvolume)                       \
        x(ENOENT,                       ENOENT_snapshot_tree)                   \
        x(BCH_ERR_invalid_sb,           invalid_sb_crypt)                       \
        x(BCH_ERR_invalid_sb,           invalid_sb_clean)                       \
        x(BCH_ERR_invalid_sb,           invalid_sb_quota)                       \
+       x(BCH_ERR_invalid_sb,           invalid_sb_errors)                      \
+       x(BCH_ERR_invalid_sb,           invalid_sb_opt_compression)             \
        x(BCH_ERR_invalid,              invalid_bkey)                           \
        x(BCH_ERR_operation_blocked,    nocow_lock_blocked)                     \
+       x(EIO,                          btree_node_read_err)                    \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_fixable)            \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_want_retry)         \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_must_retry)         \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_bad_node)           \
+       x(BCH_ERR_btree_node_read_err,  btree_node_read_err_incompatible)       \
+       x(0,                            nopromote)                              \
+       x(BCH_ERR_nopromote,            nopromote_may_not)                      \
+       x(BCH_ERR_nopromote,            nopromote_already_promoted)             \
+       x(BCH_ERR_nopromote,            nopromote_unwritten)                    \
+       x(BCH_ERR_nopromote,            nopromote_congested)                    \
+       x(BCH_ERR_nopromote,            nopromote_in_flight)                    \
+       x(BCH_ERR_nopromote,            nopromote_enomem)
 
 enum bch_errcode {
        BCH_ERR_START           = 2048,
@@ -243,4 +261,8 @@ static inline long bch2_err_class(long err)
        return err < 0 ? __bch2_err_class(err) : err;
 }
 
+#define BLK_STS_REMOVED                ((__force blk_status_t)128)
+
+const char *bch2_blk_status_to_str(blk_status_t);
+
 #endif /* _BCACHFES_ERRCODE_H */
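
For context on how these tables are consumed: each x(class, err) entry becomes
both a BCH_ERR_##err enum constant and a parent link in bch2_errcode_parents[],
so a private code such as BCH_ERR_nopromote_congested matches BCH_ERR_nopromote
and eventually decays to a standard errno for userspace. Roughly (a paraphrased
sketch, not the verbatim implementation):

    static bool err_matches(int err, int class)
    {
            err = abs(err), class = abs(class);

            /* walk up the parent links until we leave the private range: */
            while (err >= BCH_ERR_START && err != class)
                    err = bch2_errcode_parents[err - BCH_ERR_START];

            return err == class;
    }
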
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 39009cf0c44866280198933936d61d7f22cca49b..7b28d37922fd0e47d82ac1d27403f031cc577c7b 100644 (file)
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "error.h"
-#include "io.h"
 #include "super.h"
 
 #define FSCK_ERR_RATELIMIT_NR  10
@@ -57,8 +56,9 @@ void bch2_io_error_work(struct work_struct *work)
        up_write(&c->state_lock);
 }
 
-void bch2_io_error(struct bch_dev *ca)
+void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
 {
+       atomic64_inc(&ca->errors[type]);
        //queue_work(system_long_wq, &ca->io_error_work);
 }
 
@@ -117,31 +117,34 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
        if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
                return NULL;
 
-       list_for_each_entry(s, &c->fsck_errors, list)
+       list_for_each_entry(s, &c->fsck_error_msgs, list)
                if (s->fmt == fmt) {
                        /*
                         * move it to the head of the list: repeated fsck errors
                         * are common
                         */
-                       list_move(&s->list, &c->fsck_errors);
+                       list_move(&s->list, &c->fsck_error_msgs);
                        return s;
                }
 
        s = kzalloc(sizeof(*s), GFP_NOFS);
        if (!s) {
-               if (!c->fsck_alloc_err)
+               if (!c->fsck_alloc_msgs_err)
                        bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
-               c->fsck_alloc_err = true;
+               c->fsck_alloc_msgs_err = true;
                return NULL;
        }
 
        INIT_LIST_HEAD(&s->list);
        s->fmt = fmt;
-       list_add(&s->list, &c->fsck_errors);
+       list_add(&s->list, &c->fsck_error_msgs);
        return s;
 }
 
-int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
+int bch2_fsck_err(struct bch_fs *c,
+                 enum bch_fsck_flags flags,
+                 enum bch_sb_error_id err,
+                 const char *fmt, ...)
 {
        struct fsck_err_state *s = NULL;
        va_list args;
@@ -149,11 +152,13 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
        struct printbuf buf = PRINTBUF, *out = &buf;
        int ret = -BCH_ERR_fsck_ignore;
 
+       bch2_sb_error_count(c, err);
+
        va_start(args, fmt);
        prt_vprintf(out, fmt, args);
        va_end(args);
 
-       mutex_lock(&c->fsck_error_lock);
+       mutex_lock(&c->fsck_error_msgs_lock);
        s = fsck_err_get(c, fmt);
        if (s) {
                /*
@@ -163,7 +168,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
                 */
                if (s->last_msg && !strcmp(buf.buf, s->last_msg)) {
                        ret = s->ret;
-                       mutex_unlock(&c->fsck_error_lock);
+                       mutex_unlock(&c->fsck_error_msgs_lock);
                        printbuf_exit(&buf);
                        return ret;
                }
@@ -258,7 +263,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...)
        if (s)
                s->ret = ret;
 
-       mutex_unlock(&c->fsck_error_lock);
+       mutex_unlock(&c->fsck_error_msgs_lock);
 
        printbuf_exit(&buf);
 
@@ -279,9 +284,9 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
 {
        struct fsck_err_state *s, *n;
 
-       mutex_lock(&c->fsck_error_lock);
+       mutex_lock(&c->fsck_error_msgs_lock);
 
-       list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
+       list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
                if (s->ratelimited && s->last_msg)
                        bch_err(c, "Saw %llu errors like:\n    %s", s->nr, s->last_msg);
 
@@ -290,5 +295,5 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
                kfree(s);
        }
 
-       mutex_unlock(&c->fsck_error_lock);
+       mutex_unlock(&c->fsck_error_msgs_lock);
 }
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 7ce9540052e53df99a90d4ac958c671ad375bdc7..d167d65986e0425f2c2e8b2d5503b5d0a6526c6c 100644 (file)
@@ -4,6 +4,7 @@
 
 #include <linux/list.h>
 #include <linux/printk.h>
+#include "sb-errors.h"
 
 struct bch_dev;
 struct bch_fs;
@@ -101,18 +102,26 @@ struct fsck_err_state {
        char                    *last_msg;
 };
 
-#define FSCK_CAN_FIX           (1 << 0)
-#define FSCK_CAN_IGNORE                (1 << 1)
-#define FSCK_NEED_FSCK         (1 << 2)
-#define FSCK_NO_RATELIMIT      (1 << 3)
+enum bch_fsck_flags {
+       FSCK_CAN_FIX            = 1 << 0,
+       FSCK_CAN_IGNORE         = 1 << 1,
+       FSCK_NEED_FSCK          = 1 << 2,
+       FSCK_NO_RATELIMIT       = 1 << 3,
+};
+
+#define fsck_err_count(_c, _err)       bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err)
 
-__printf(3, 4) __cold
-int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...);
+__printf(4, 5) __cold
+int bch2_fsck_err(struct bch_fs *,
+                 enum bch_fsck_flags,
+                 enum bch_sb_error_id,
+                 const char *, ...);
 void bch2_flush_fsck_errs(struct bch_fs *);
 
-#define __fsck_err(c, _flags, msg, ...)                                        \
+#define __fsck_err(c, _flags, _err_type, ...)                          \
 ({                                                                     \
-       int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);        \
+       int _ret = bch2_fsck_err(c, _flags, BCH_FSCK_ERR_##_err_type,   \
+                                __VA_ARGS__);                          \
                                                                        \
        if (_ret != -BCH_ERR_fsck_fix &&                                \
            _ret != -BCH_ERR_fsck_ignore) {                             \
@@ -127,26 +136,53 @@ void bch2_flush_fsck_errs(struct bch_fs *);
 
 /* XXX: mark in superblock that filesystem contains errors, if we ignore: */
 
-#define __fsck_err_on(cond, c, _flags, ...)                            \
-       (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
+#define __fsck_err_on(cond, c, _flags, _err_type, ...)                 \
+       (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false)
+
+#define need_fsck_err_on(cond, c, _err_type, ...)                              \
+       __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define need_fsck_err(c, _err_type, ...)                               \
+       __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err(c, _err_type, ...)                            \
+       __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
+
+#define mustfix_fsck_err_on(cond, c, _err_type, ...)                   \
+       __fsck_err_on(cond, c, FSCK_CAN_FIX, _err_type, __VA_ARGS__)
 
-#define need_fsck_err_on(cond, c, ...)                                 \
-       __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define fsck_err(c, _err_type, ...)                                    \
+       __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
 
-#define need_fsck_err(c, ...)                                          \
-       __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
+#define fsck_err_on(cond, c, _err_type, ...)                           \
+       __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
 
-#define mustfix_fsck_err(c, ...)                                       \
-       __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
+static inline void bch2_bkey_fsck_err(struct bch_fs *c,
+                                    struct printbuf *err_msg,
+                                    enum bch_sb_error_id err_type,
+                                    const char *fmt, ...)
+{
+       va_list args;
 
-#define mustfix_fsck_err_on(cond, c, ...)                              \
-       __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
+       va_start(args, fmt);
+       prt_vprintf(err_msg, fmt, args);
+       va_end(args);
 
-#define fsck_err(c, ...)                                               \
-       __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+}
 
-#define fsck_err_on(cond, c, ...)                                      \
-       __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
+#define bkey_fsck_err(c, _err_msg, _err_type, ...)                     \
+do {                                                                   \
+       prt_printf(_err_msg, __VA_ARGS__);                              \
+       bch2_sb_error_count(c, BCH_FSCK_ERR_##_err_type);               \
+       ret = -BCH_ERR_invalid_bkey;                                    \
+       goto fsck_err;                                                  \
+} while (0)
+
+#define bkey_fsck_err_on(cond, ...)                                    \
+do {                                                                   \
+       if (unlikely(cond))                                             \
+               bkey_fsck_err(__VA_ARGS__);                             \
+} while (0)
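
Note the control-flow contract the new bkey_fsck_err*() macros assume: the
enclosing function must declare int ret = 0; and provide a fsck_err: label,
since the macro assigns ret and jumps there on error. A minimal caller sketch,
following the pattern the extents.c hunks below adopt:

    int example_invalid(struct bch_fs *c, struct bkey_s_c k,
                        enum bkey_invalid_flags flags, struct printbuf *err)
    {
            int ret = 0;

            bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
                             btree_ptr_val_too_big, /* BCH_FSCK_ERR_ suffix */
                             "value too big (%zu > %u)",
                             bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
    fsck_err:
            return ret;
    }
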
 
 /*
  * Fatal errors: these don't indicate a bug, but we can't continue running in RW
@@ -179,26 +215,26 @@ do {                                                                      \
 void bch2_io_error_work(struct work_struct *);
 
 /* Does the error handling without logging a message */
-void bch2_io_error(struct bch_dev *);
+void bch2_io_error(struct bch_dev *, enum bch_member_error_type);
 
-#define bch2_dev_io_err_on(cond, ca, ...)                              \
+#define bch2_dev_io_err_on(cond, ca, _type, ...)                       \
 ({                                                                     \
        bool _ret = (cond);                                             \
                                                                        \
        if (_ret) {                                                     \
                bch_err_dev_ratelimited(ca, __VA_ARGS__);               \
-               bch2_io_error(ca);                                      \
+               bch2_io_error(ca, _type);                               \
        }                                                               \
        _ret;                                                           \
 })
 
-#define bch2_dev_inum_io_err_on(cond, ca, ...)                         \
+#define bch2_dev_inum_io_err_on(cond, ca, _type, ...)                  \
 ({                                                                     \
        bool _ret = (cond);                                             \
                                                                        \
        if (_ret) {                                                     \
                bch_err_inum_offset_ratelimited(ca, __VA_ARGS__);       \
-               bch2_io_error(ca);                                      \
+               bch2_io_error(ca, _type);                               \
        }                                                               \
        _ret;                                                           \
 })
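
With the extra _type argument, each call site now names the per-device counter
to bump; the erasure-coding endio earlier in this commit picks read vs. write
from the bio direction. An illustrative call site (message text hypothetical):

    if (bch2_dev_io_err_on(bio->bi_status, ca,
                           bio_data_dir(bio)
                           ? BCH_MEMBER_ERROR_write
                           : BCH_MEMBER_ERROR_read,
                           "IO error: %s",
                           bch2_blk_status_to_str(bio->bi_status))) {
            /* ca->errors[type] has been bumped; handle the failed bio */
    }
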
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index c13e0afc66eaa4430bc58b68960c22ea5fd4de5e..a864de231b69e297e85491dfd285928152c467b8 100644 (file)
@@ -13,6 +13,7 @@
 #include "btree_iter.h"
 #include "buckets.h"
 #include "checksum.h"
+#include "compress.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "error.h"
@@ -162,17 +163,19 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 
 /* KEY_TYPE_btree_ptr: */
 
-int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
                           enum bkey_invalid_flags flags,
                           struct printbuf *err)
 {
-       if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
-               prt_printf(err, "value too big (%zu > %u)",
-                      bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
+
+       bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
+                        btree_ptr_val_too_big,
+                        "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
 
-       return bch2_bkey_ptrs_invalid(c, k, flags, err);
+       ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+       return ret;
 }
 
 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -181,17 +184,20 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
        bch2_bkey_ptrs_to_text(out, c, k);
 }
 
-int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
                              enum bkey_invalid_flags flags,
                              struct printbuf *err)
 {
-       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
-               prt_printf(err, "value too big (%zu > %zu)",
-                      bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
+
+       bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
+                        btree_ptr_v2_val_too_big,
+                        "value too big (%zu > %zu)",
+                        bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
 
-       return bch2_bkey_ptrs_invalid(c, k, flags, err);
+       ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+       return ret;
 }
 
 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
@@ -372,19 +378,18 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 
 /* KEY_TYPE_reservation: */
 
-int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
                             enum bkey_invalid_flags flags,
                             struct printbuf *err)
 {
        struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+       int ret = 0;
 
-       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
-               prt_printf(err, "invalid nr_replicas (%u)",
-                      r.v->nr_replicas);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
+                        reservation_key_nr_replicas_invalid,
+                        "invalid nr_replicas (%u)", r.v->nr_replicas);
+fsck_err:
+       return ret;
 }
 
 void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -517,13 +522,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
        switch (type) {
        case BCH_EXTENT_ENTRY_crc32:
                set_common_fields(dst->crc32, src);
-               memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum));
+               dst->crc32.csum         = (u32 __force) *((__le32 *) &src.csum.lo);
                break;
        case BCH_EXTENT_ENTRY_crc64:
                set_common_fields(dst->crc64, src);
                dst->crc64.nonce        = src.nonce;
-               dst->crc64.csum_lo      = src.csum.lo;
-               dst->crc64.csum_hi      = *((__le16 *) &src.csum.hi);
+               dst->crc64.csum_lo      = (u64 __force) src.csum.lo;
+               dst->crc64.csum_hi      = (u64 __force) *((__le16 *) &src.csum.hi);
                break;
        case BCH_EXTENT_ENTRY_crc128:
                set_common_fields(dst->crc128, src);
@@ -757,18 +762,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
        return i;
 }
 
-static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
-       union bch_extent_entry *next = extent_entry_next(entry);
-
-       /* stripes have ptrs, but their layout doesn't work with this code */
-       BUG_ON(k.k->type == KEY_TYPE_stripe);
-
-       memmove_u64s_down(entry, next,
-                         (u64 *) bkey_val_end(k) - (u64 *) next);
-       k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
 /*
  * Returns pointer to the next entry after the one being dropped:
  */
@@ -915,11 +908,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
 
                bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
                        bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
-                       if (p1.ptr.dev          == p2.ptr.dev &&
-                           p1.ptr.gen          == p2.ptr.gen &&
-                           (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
-                           (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
-                               return true;
+                               if (p1.ptr.dev          == p2.ptr.dev &&
+                                   p1.ptr.gen          == p2.ptr.gen &&
+                                   (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+                                   (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+                                       return true;
 
                return false;
        } else {
@@ -992,10 +985,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
-       struct bch_extent_crc_unpacked crc;
-       const struct bch_extent_ptr *ptr;
-       const struct bch_extent_stripe_ptr *ec;
-       struct bch_dev *ca;
        bool first = true;
 
        if (c)
@@ -1006,9 +995,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                        prt_printf(out, " ");
 
                switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       ptr = entry_to_ptr(entry);
-                       ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+               case BCH_EXTENT_ENTRY_ptr: {
+                       const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+                       struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
                                ? bch_dev_bkey_exists(c, ptr->dev)
                                : NULL;
 
@@ -1030,10 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                                        prt_printf(out, " stale");
                        }
                        break;
+               }
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
-               case BCH_EXTENT_ENTRY_crc128:
-                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+               case BCH_EXTENT_ENTRY_crc128: {
+                       struct bch_extent_crc_unpacked crc =
+                               bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
                        prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
                               crc.compressed_size,
@@ -1042,12 +1033,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                               bch2_csum_types[crc.csum_type],
                               bch2_compression_types[crc.compression_type]);
                        break;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       ec = &entry->stripe_ptr;
+               }
+               case BCH_EXTENT_ENTRY_stripe_ptr: {
+                       const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
 
                        prt_printf(out, "ec: idx %llu block %u",
                               (u64) ec->idx, ec->block);
                        break;
+               }
+               case BCH_EXTENT_ENTRY_rebalance: {
+                       const struct bch_extent_rebalance *r = &entry->rebalance;
+
+                       prt_str(out, "rebalance: target ");
+                       if (c)
+                               bch2_target_to_text(out, c, r->target);
+                       else
+                               prt_printf(out, "%u", r->target);
+                       prt_str(out, " compression ");
+                       bch2_compression_opt_to_text(out, r->compression);
+                       break;
+               }
                default:
                        prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
                        return;
@@ -1057,8 +1062,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        }
 }
 
-static int extent_ptr_invalid(const struct bch_fs *c,
+static int extent_ptr_invalid(struct bch_fs *c,
                              struct bkey_s_c k,
+                             enum bkey_invalid_flags flags,
                              const struct bch_extent_ptr *ptr,
                              unsigned size_ondisk,
                              bool metadata,
@@ -1069,43 +1075,44 @@ static int extent_ptr_invalid(const struct bch_fs *c,
        u64 bucket;
        u32 bucket_offset;
        struct bch_dev *ca;
+       int ret = 0;
 
        if (!bch2_dev_exists2(c, ptr->dev)) {
-               prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
-               return -BCH_ERR_invalid_bkey;
+               /*
+                * If we're in the write path this key might have already been
+                * overwritten, and we could be seeing a device that doesn't
+                * exist anymore due to racing with device removal:
+                */
+               if (flags & BKEY_INVALID_WRITE)
+                       return 0;
+
+               bkey_fsck_err(c, err, ptr_to_invalid_device,
+                          "pointer to invalid device (%u)", ptr->dev);
        }
 
        ca = bch_dev_bkey_exists(c, ptr->dev);
        bkey_for_each_ptr(ptrs, ptr2)
-               if (ptr != ptr2 && ptr->dev == ptr2->dev) {
-                       prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
+                                ptr_to_duplicate_device,
+                                "multiple pointers to same device (%u)", ptr->dev);
 
        bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
 
-       if (bucket >= ca->mi.nbuckets) {
-               prt_printf(err, "pointer past last bucket (%llu > %llu)",
-                      bucket, ca->mi.nbuckets);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
-               prt_printf(err, "pointer before first bucket (%llu < %u)",
-                      bucket, ca->mi.first_bucket);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
-               prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+       bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+                        ptr_after_last_bucket,
+                        "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
+       bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+                        ptr_before_first_bucket,
+                        "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
+       bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+                        ptr_spans_multiple_buckets,
+                        "pointer spans multiple buckets (%u + %u > %u)",
                       bucket_offset, size_ondisk, ca->mi.bucket_size);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+fsck_err:
+       return ret;
 }
 
-int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
                           enum bkey_invalid_flags flags,
                           struct printbuf *err)
 {
@@ -1115,48 +1122,39 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
        unsigned size_ondisk = k.k->size;
        unsigned nonce = UINT_MAX;
        unsigned nr_ptrs = 0;
-       bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
-       int ret;
+       bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
+       int ret = 0;
 
        if (bkey_is_btree_ptr(k.k))
                size_ondisk = btree_sectors(c);
 
        bkey_extent_entry_for_each(ptrs, entry) {
-               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
-                       prt_printf(err, "invalid extent entry type (got %u, max %u)",
-                              __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
+                       extent_ptrs_invalid_entry,
+                       "invalid extent entry type (got %u, max %u)",
+                       __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
 
-               if (bkey_is_btree_ptr(k.k) &&
-                   !extent_entry_is_ptr(entry)) {
-                       prt_printf(err, "has non ptr field");
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
+                                !extent_entry_is_ptr(entry), c, err,
+                                btree_ptr_has_non_ptr,
+                                "has non ptr field");
 
                switch (extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_ptr:
-                       ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk,
-                                                false, err);
+                       ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
+                                                size_ondisk, false, err);
                        if (ret)
                                return ret;
 
-                       if (nr_ptrs && unwritten != entry->ptr.unwritten) {
-                               prt_printf(err, "extent with unwritten and written ptrs");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-
-                       if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
-                               prt_printf(err, "has unwritten ptrs");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
+                                        ptr_cached_and_erasure_coded,
+                                        "cached, erasure coded ptr");
 
-                       if (entry->ptr.cached && have_ec) {
-                               prt_printf(err, "cached, erasure coded ptr");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       if (!entry->ptr.unwritten)
+                               have_written = true;
+                       else
+                               have_unwritten = true;
 
-                       unwritten = entry->ptr.unwritten;
                        have_ec = false;
                        crc_since_last_ptr = false;
                        nr_ptrs++;
@@ -1166,72 +1164,77 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
                case BCH_EXTENT_ENTRY_crc128:
                        crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-                       if (crc.offset + crc.live_size >
-                           crc.uncompressed_size) {
-                               prt_printf(err, "checksum offset + key size > uncompressed size");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-
-                       size_ondisk = crc.compressed_size;
-
-                       if (!bch2_checksum_type_valid(c, crc.csum_type)) {
-                               prt_printf(err, "invalid checksum type");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-
-                       if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
-                               prt_printf(err, "invalid compression type");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
+                                        ptr_crc_uncompressed_size_too_small,
+                                        "checksum offset + key size > uncompressed size");
+                       bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
+                                        ptr_crc_csum_type_unknown,
+                                        "invalid checksum type");
+                       bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
+                                        ptr_crc_compression_type_unknown,
+                                        "invalid compression type");
 
                        if (bch2_csum_type_is_encryption(crc.csum_type)) {
                                if (nonce == UINT_MAX)
                                        nonce = crc.offset + crc.nonce;
-                               else if (nonce != crc.offset + crc.nonce) {
-                                       prt_printf(err, "incorrect nonce");
-                                       return -BCH_ERR_invalid_bkey;
-                               }
+                               else if (nonce != crc.offset + crc.nonce)
+                                       bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+                                                     "incorrect nonce");
                        }
 
-                       if (crc_since_last_ptr) {
-                               prt_printf(err, "redundant crc entry");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       bkey_fsck_err_on(crc_since_last_ptr, c, err,
+                                        ptr_crc_redundant,
+                                        "redundant crc entry");
                        crc_since_last_ptr = true;
+
+                       bkey_fsck_err_on(crc_is_encoded(crc) &&
+                                        (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+                                        (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+                                        ptr_crc_uncompressed_size_too_big,
+                                        "too large encoded extent");
+
+                       size_ondisk = crc.compressed_size;
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
-                       if (have_ec) {
-                               prt_printf(err, "redundant stripe entry");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       bkey_fsck_err_on(have_ec, c, err,
+                                        ptr_stripe_redundant,
+                                        "redundant stripe entry");
                        have_ec = true;
                        break;
-               case BCH_EXTENT_ENTRY_rebalance:
+               case BCH_EXTENT_ENTRY_rebalance: {
+                       const struct bch_extent_rebalance *r = &entry->rebalance;
+
+                       if (!bch2_compression_opt_valid(r->compression)) {
+                               struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+                               prt_printf(err, "invalid compression opt %u:%u",
+                                          opt.type, opt.level);
+                               return -BCH_ERR_invalid_bkey;
+                       }
                        break;
                }
+               }
        }
 
-       if (!nr_ptrs) {
-               prt_str(err, "no ptrs");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
-               prt_str(err, "too many ptrs");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (crc_since_last_ptr) {
-               prt_printf(err, "redundant crc entry");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (have_ec) {
-               prt_printf(err, "redundant stripe entry");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       bkey_fsck_err_on(!nr_ptrs, c, err,
+                        extent_ptrs_no_ptrs,
+                        "no ptrs");
+       bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
+                        extent_ptrs_too_many_ptrs,
+                        "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
+       bkey_fsck_err_on(have_written && have_unwritten, c, err,
+                        extent_ptrs_written_and_unwritten,
+                        "extent with unwritten and written ptrs");
+       bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
+                        extent_ptrs_unwritten,
+                        "has unwritten ptrs");
+       bkey_fsck_err_on(crc_since_last_ptr, c, err,
+                        extent_ptrs_redundant_crc,
+                        "redundant crc entry");
+       bkey_fsck_err_on(have_ec, c, err,
+                        extent_ptrs_redundant_stripe,
+                        "redundant stripe entry");
+fsck_err:
+       return ret;
 }
 
 void bch2_ptr_swab(struct bkey_s k)
@@ -1272,6 +1275,125 @@ void bch2_ptr_swab(struct bkey_s k)
        }
 }
 
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+
+       bkey_extent_entry_for_each(ptrs, entry)
+               if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+                       return &entry->rebalance;
+
+       return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+                                      unsigned target, unsigned compression)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       unsigned rewrite_ptrs = 0;
+
+       if (compression) {
+               unsigned compression_type = bch2_compression_opt_to_type(compression);
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
+               unsigned i = 0;
+
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+                       if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
+                               rewrite_ptrs = 0;
+                               goto incompressible;
+                       }
+
+                       if (!p.ptr.cached && p.crc.compression_type != compression_type)
+                               rewrite_ptrs |= 1U << i;
+                       i++;
+               }
+       }
+incompressible:
+       if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+               const struct bch_extent_ptr *ptr;
+               unsigned i = 0;
+
+               bkey_for_each_ptr(ptrs, ptr) {
+                       if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+                               rewrite_ptrs |= 1U << i;
+                       i++;
+               }
+       }
+
+       return rewrite_ptrs;
+}
+
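The return value is a bitmap with bit i set when the i-th pointer (in decode
order) needs rewriting for the given target/compression options. A small usage
sketch (target and compression assumed to come from the inode's options):

    unsigned ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
    unsigned nr   = hweight32(ptrs); /* how many pointers need moving */

    if (ptrs & 1U)
            /* first pointer is on the wrong target or miscompressed */;
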
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+       const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+       /*
+        * If it's an indirect extent, we don't delete the rebalance entry when
+        * done so that we know what options were applied - check if it still
+        * needs work done:
+        */
+       if (r &&
+           k.k->type == KEY_TYPE_reflink_v &&
+           !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+               r = NULL;
+
+       return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+                                 unsigned target, unsigned compression)
+{
+       struct bkey_s k = bkey_i_to_s(_k);
+       struct bch_extent_rebalance *r;
+       bool needs_rebalance;
+
+       if (!bkey_extent_is_direct_data(k.k))
+               return 0;
+
+       /* get existing rebalance entry: */
+       r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+       if (r) {
+               if (k.k->type == KEY_TYPE_reflink_v) {
+                       /*
+                        * indirect extents: existing options take precedence,
+                        * so that we don't move extents back and forth if
+                        * they're referenced by different inodes with different
+                        * options:
+                        */
+                       if (r->target)
+                               target = r->target;
+                       if (r->compression)
+                               compression = r->compression;
+               }
+
+               r->target       = target;
+               r->compression  = compression;
+       }
+
+       needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+       if (needs_rebalance && !r) {
+               union bch_extent_entry *new = bkey_val_end(k);
+
+               new->rebalance.type             = 1U << BCH_EXTENT_ENTRY_rebalance;
+               new->rebalance.compression      = compression;
+               new->rebalance.target           = target;
+               new->rebalance.unused           = 0;
+               k.k->u64s += extent_entry_u64s(new);
+       } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+               /*
+                * For indirect extents, don't delete the rebalance entry when
+                * we're finished so that we know we specifically moved it or
+                * compressed it to its current location/compression type
+                */
+               extent_entry_drop(k, (union bch_extent_entry *) r);
+       }
+
+       return 0;
+}
+
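bch2_bkey_set_needs_rebalance() is meant to run in the write path: it appends
a bch_extent_rebalance entry when the key still needs background work for the
current options, and drops a stale one otherwise (except on indirect extents,
per the comments above). A hedged usage sketch, assuming the usual io-opts
field names and a hypothetical new_key being inserted:

    ret = bch2_bkey_set_needs_rebalance(c, &new_key->k_i,
                                        opts.background_target,
                                        opts.background_compression);
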
 /* Generic extent code: */
 
 int bch2_cut_front_s(struct bpos where, struct bkey_s k)
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 6e9d23a06758685fd90372df237d8d83c8119b4c..a2ce8a3be13ca418a001d8ff93d9091565aed800 100644 (file)
@@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k,
        memcpy_u64s_small(dst, new, extent_entry_u64s(new));
 }
 
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+       union bch_extent_entry *next = extent_entry_next(entry);
+
+       /* stripes have ptrs, but their layout doesn't work with this code */
+       BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+       memmove_u64s_down(entry, next,
+                         (u64 *) bkey_val_end(k) - (u64 *) next);
+       k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
 {
        return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@@ -155,7 +167,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
                        common_fields(crc->crc32),
                };
 
-               memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum));
+               *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum;
                return ret;
        }
        case BCH_EXTENT_ENTRY_crc64: {
@@ -165,8 +177,8 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
                        .csum.lo                = (__force __le64) crc->crc64.csum_lo,
                };
 
-               u16 hi = crc->crc64.csum_hi;
-               memcpy(&ret.csum.hi, &hi, sizeof(hi));
+               *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi;
+
                return ret;
        }
        case BCH_EXTENT_ENTRY_crc128: {
@@ -190,6 +202,11 @@ static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
                crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
 }
 
+static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
+{
+       return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
+}
+
 /* bkey_ptrs: generically over any key type that has ptrs */
 
 struct bkey_ptrs_c {
@@ -383,12 +400,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
 
 /* KEY_TYPE_btree_ptr: */
 
-int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_btree_ptr_invalid(struct bch_fs *, struct bkey_s_c,
                           enum bkey_invalid_flags, struct printbuf *);
 void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
 
-int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_btree_ptr_v2_invalid(struct bch_fs *, struct bkey_s_c,
                              enum bkey_invalid_flags, struct printbuf *);
 void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
@@ -428,7 +445,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
 
 /* KEY_TYPE_reservation: */
 
-int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reservation_invalid(struct bch_fs *, struct bkey_s_c,
                             enum bkey_invalid_flags, struct printbuf *);
 void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
@@ -520,6 +537,7 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
        case KEY_TYPE_reflink_v:
        case KEY_TYPE_inline_data:
        case KEY_TYPE_indirect_inline_data:
+       case KEY_TYPE_error:
                return true;
        default:
                return false;
@@ -632,6 +650,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
 
 static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)
 {
+       struct bch_extent_ptr *dest;
+
        EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev));
 
        switch (k->k.type) {
@@ -641,10 +661,8 @@ static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr
                EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
 
                ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-
-               memcpy((void *) &k->v + bkey_val_bytes(&k->k),
-                      &ptr,
-                      sizeof(ptr));
+               dest = (struct bch_extent_ptr *)((void *) &k->v + bkey_val_bytes(&k->k));
+               *dest = ptr;
                k->k.u64s++;
                break;
        default:
@@ -687,11 +705,19 @@ void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
-int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
                           enum bkey_invalid_flags, struct printbuf *);
 
 void bch2_ptr_swab(struct bkey_s);
 
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
+                                      unsigned, unsigned);
+bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
+                                 unsigned, unsigned);
+
 /* Generic extent code: */
 
 enum bch_extent_overlap {
@@ -736,22 +762,4 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
        k->size = new_size;
 }
 
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static inline void extent_save(struct btree *b, struct bkey_packed *dst,
-                              struct bkey *src)
-{
-       struct bkey_format *f = &b->format;
-       struct bkey_i *dst_unpacked;
-
-       if ((dst_unpacked = packed_to_bkey(dst)))
-               dst_unpacked->k = *src;
-       else
-               BUG_ON(!bch2_bkey_pack_key(dst, src, f));
-}
-
 #endif /* _BCACHEFS_EXTENTS_H */
index bb5305441f275938d4f76535a9b9bf1aaccf7934..4496cf91a4c17bcde4e4a934eb0475007ff1311c 100644
@@ -51,7 +51,7 @@ int bch2_create_trans(struct btree_trans *trans,
                bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
 
                if (flags & BCH_CREATE_TMPFILE)
-                       new_inode->bi_flags |= BCH_INODE_UNLINKED;
+                       new_inode->bi_flags |= BCH_INODE_unlinked;
 
                ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
                if (ret)
index d433f4d5662da9fd8991823ca2c37d1c0aae3082..b0e8144ec5500cd37a2d35f71f399c1ebe424d53 100644
 #include "extent_update.h"
 #include "fs.h"
 #include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-pagecache.h"
 #include "fsck.h"
 #include "inode.h"
 #include "journal.h"
-#include "io.h"
+#include "io_misc.h"
 #include "keylist.h"
 #include "quota.h"
 #include "reflink.h"
 #include <linux/sched/signal.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/uio.h>
-#include <linux/writeback.h>
 
 #include <trace/events/writeback.h>
 
-static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool);
-
-struct folio_vec {
-       struct folio    *fv_folio;
-       size_t          fv_offset;
-       size_t          fv_len;
-};
-
-static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
-{
-       struct folio *folio     = page_folio(bv.bv_page);
-       size_t offset           = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
-               bv.bv_offset;
-       size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
-
-       return (struct folio_vec) {
-               .fv_folio       = folio,
-               .fv_offset      = offset,
-               .fv_len         = len,
-       };
-}
-
-static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
-                                                   struct bvec_iter iter)
-{
-       return biovec_to_foliovec(bio_iter_iovec(bio, iter));
-}
-
-#define __bio_for_each_folio(bvl, bio, iter, start)                    \
-       for (iter = (start);                                            \
-            (iter).bi_size &&                                          \
-               ((bvl = bio_iter_iovec_folio((bio), (iter))), 1);       \
-            bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
-
-/**
- * bio_for_each_folio - iterate over folios within a bio
- *
- * Like other non-_all versions, this iterates over what bio->bi_iter currently
- * points to. This version is for drivers, where the bio may have previously
- * been split or cloned.
- */
-#define bio_for_each_folio(bvl, bio, iter)                             \
-       __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
-
-/*
- * Use u64 for the end pos and sector helpers because if the folio covers the
- * max supported range of the mapping, the start offset of the next folio
- * overflows loff_t. This breaks much of the range-based processing in the
- * buffered write path.
- */
-static inline u64 folio_end_pos(struct folio *folio)
-{
-       return folio_pos(folio) + folio_size(folio);
-}
-
-static inline size_t folio_sectors(struct folio *folio)
-{
-       return PAGE_SECTORS << folio_order(folio);
-}
-
-static inline loff_t folio_sector(struct folio *folio)
-{
-       return folio_pos(folio) >> 9;
-}
-
-static inline u64 folio_end_sector(struct folio *folio)
-{
-       return folio_end_pos(folio) >> 9;
-}
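
A standalone sketch (plain userspace C, not part of this patch, with illustrative values) of the signed overflow these u64 helpers avoid when a folio ends at the top of the file range:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* A folio whose last byte sits at the highest offset a signed
	 * 64-bit file position (loff_t) can address: */
	int64_t  pos  = INT64_MAX - 4095;
	uint64_t size = 4096;

	/* pos + 4096 does not fit in int64_t (signed overflow, undefined
	 * behaviour); widening to u64 first is well defined: */
	uint64_t end = (uint64_t) pos + size;

	printf("end pos as u64: %" PRIu64 "\n", end);
	return 0;
}
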
-
-typedef DARRAY(struct folio *) folios;
-
-static int filemap_get_contig_folios_d(struct address_space *mapping,
-                                      loff_t start, u64 end,
-                                      int fgp_flags, gfp_t gfp,
-                                      folios *folios)
-{
-       struct folio *f;
-       u64 pos = start;
-       int ret = 0;
-
-       while (pos < end) {
-               if ((u64) pos >= (u64) start + (1ULL << 20))
-                       fgp_flags &= ~FGP_CREAT;
-
-               ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL);
-               if (ret)
-                       break;
-
-               f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp);
-               if (IS_ERR_OR_NULL(f))
-                       break;
-
-               BUG_ON(folios->nr && folio_pos(f) != pos);
-
-               pos = folio_end_pos(f);
-               darray_push(folios, f);
-       }
-
-       if (!folios->nr && !ret && (fgp_flags & FGP_CREAT))
-               ret = -ENOMEM;
-
-       return folios->nr ? 0 : ret;
-}
-
 struct nocow_flush {
        struct closure  *cl;
        struct bch_dev  *ca;
@@ -157,9 +52,9 @@ static void nocow_flush_endio(struct bio *_bio)
        bio_put(&bio->bio);
 }
 
-static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
-                                               struct bch_inode_info *inode,
-                                               struct closure *cl)
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
+                                        struct bch_inode_info *inode,
+                                        struct closure *cl)
 {
        struct nocow_flush *bio;
        struct bch_dev *ca;
@@ -190,2586 +85,84 @@ static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
                                   struct nocow_flush, bio);
                bio->cl                 = cl;
                bio->ca                 = ca;
-               bio->bio.bi_end_io      = nocow_flush_endio;
-               closure_bio_submit(&bio->bio, cl);
-       }
-}
-
-static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
-                                        struct bch_inode_info *inode)
-{
-       struct closure cl;
-
-       closure_init_stack(&cl);
-       bch2_inode_flush_nocow_writes_async(c, inode, &cl);
-       closure_sync(&cl);
-
-       return 0;
-}
-
-static inline bool bio_full(struct bio *bio, unsigned len)
-{
-       if (bio->bi_vcnt >= bio->bi_max_vecs)
-               return true;
-       if (bio->bi_iter.bi_size > UINT_MAX - len)
-               return true;
-       return false;
-}
-
-static inline struct address_space *faults_disabled_mapping(void)
-{
-       return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
-}
-
-static inline void set_fdm_dropped_locks(void)
-{
-       current->faults_disabled_mapping =
-               (void *) (((unsigned long) current->faults_disabled_mapping)|1);
-}
-
-static inline bool fdm_dropped_locks(void)
-{
-       return ((unsigned long) current->faults_disabled_mapping) & 1;
-}
-
-struct quota_res {
-       u64                             sectors;
-};
-
-struct bch_writepage_io {
-       struct bch_inode_info           *inode;
-
-       /* must be last: */
-       struct bch_write_op             op;
-};
-
-struct dio_write {
-       struct kiocb                    *req;
-       struct address_space            *mapping;
-       struct bch_inode_info           *inode;
-       struct mm_struct                *mm;
-       unsigned                        loop:1,
-                                       extending:1,
-                                       sync:1,
-                                       flush:1,
-                                       free_iov:1;
-       struct quota_res                quota_res;
-       u64                             written;
-
-       struct iov_iter                 iter;
-       struct iovec                    inline_vecs[2];
-
-       /* must be last: */
-       struct bch_write_op             op;
-};
-
-struct dio_read {
-       struct closure                  cl;
-       struct kiocb                    *req;
-       long                            ret;
-       bool                            should_dirty;
-       struct bch_read_bio             rbio;
-};
-
-/* pagecache_block must be held */
-static noinline int write_invalidate_inode_pages_range(struct address_space *mapping,
-                                             loff_t start, loff_t end)
-{
-       int ret;
-
-       /*
-        * XXX: the way this is currently implemented, we can spin if a process
-        * is continually redirtying a specific page
-        */
-       do {
-               if (!mapping->nrpages)
-                       return 0;
-
-               ret = filemap_write_and_wait_range(mapping, start, end);
-               if (ret)
-                       break;
-
-               if (!mapping->nrpages)
-                       return 0;
-
-               ret = invalidate_inode_pages2_range(mapping,
-                               start >> PAGE_SHIFT,
-                               end >> PAGE_SHIFT);
-       } while (ret == -EBUSY);
-
-       return ret;
-}
-
-/* quotas */
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-
-static void __bch2_quota_reservation_put(struct bch_fs *c,
-                                        struct bch_inode_info *inode,
-                                        struct quota_res *res)
-{
-       BUG_ON(res->sectors > inode->ei_quota_reserved);
-
-       bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-                       -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
-       inode->ei_quota_reserved -= res->sectors;
-       res->sectors = 0;
-}
-
-static void bch2_quota_reservation_put(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      struct quota_res *res)
-{
-       if (res->sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __bch2_quota_reservation_put(c, inode, res);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
-}
-
-static int bch2_quota_reservation_add(struct bch_fs *c,
-                                     struct bch_inode_info *inode,
-                                     struct quota_res *res,
-                                     u64 sectors,
-                                     bool check_enospc)
-{
-       int ret;
-
-       if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
-               return 0;
-
-       mutex_lock(&inode->ei_quota_lock);
-       ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
-                             check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
-       if (likely(!ret)) {
-               inode->ei_quota_reserved += sectors;
-               res->sectors += sectors;
-       }
-       mutex_unlock(&inode->ei_quota_lock);
-
-       return ret;
-}
-
-#else
-
-static void __bch2_quota_reservation_put(struct bch_fs *c,
-                                        struct bch_inode_info *inode,
-                                        struct quota_res *res) {}
-
-static void bch2_quota_reservation_put(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      struct quota_res *res) {}
-
-static int bch2_quota_reservation_add(struct bch_fs *c,
-                                     struct bch_inode_info *inode,
-                                     struct quota_res *res,
-                                     unsigned sectors,
-                                     bool check_enospc)
-{
-       return 0;
-}
-
-#endif
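
The #ifdef block above follows the usual compiled-out-feature pattern: call sites stay identical and the stubs collapse to no-ops. A minimal standalone sketch of the same pattern, with a hypothetical CONFIG symbol and function name:

#include <stdio.h>

/* #define CONFIG_EXAMPLE_QUOTA 1 */

#ifdef CONFIG_EXAMPLE_QUOTA
static int quota_reserve(unsigned sectors)
{
	/* real accounting would go here */
	printf("reserved %u sectors\n", sectors);
	return 0;
}
#else
/* Feature compiled out: same signature, no-op body, so callers need
 * no #ifdefs of their own: */
static int quota_reserve(unsigned sectors)
{
	(void) sectors;
	return 0;
}
#endif

int main(void)
{
	return quota_reserve(8);
}
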
-
-/* i_size updates: */
-
-struct inode_new_size {
-       loff_t          new_size;
-       u64             now;
-       unsigned        fields;
-};
-
-static int inode_set_size(struct bch_inode_info *inode,
-                         struct bch_inode_unpacked *bi,
-                         void *p)
-{
-       struct inode_new_size *s = p;
-
-       bi->bi_size = s->new_size;
-       if (s->fields & ATTR_ATIME)
-               bi->bi_atime = s->now;
-       if (s->fields & ATTR_MTIME)
-               bi->bi_mtime = s->now;
-       if (s->fields & ATTR_CTIME)
-               bi->bi_ctime = s->now;
-
-       return 0;
-}
-
-int __must_check bch2_write_inode_size(struct bch_fs *c,
-                                      struct bch_inode_info *inode,
-                                      loff_t new_size, unsigned fields)
-{
-       struct inode_new_size s = {
-               .new_size       = new_size,
-               .now            = bch2_current_time(c),
-               .fields         = fields,
-       };
-
-       return bch2_write_inode(c, inode, inode_set_size, &s, fields);
-}
-
-static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-                          struct quota_res *quota_res, s64 sectors)
-{
-       bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
-                               "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
-                               inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
-                               inode->ei_inode.bi_sectors);
-       inode->v.i_blocks += sectors;
-
-#ifdef CONFIG_BCACHEFS_QUOTA
-       if (quota_res &&
-           !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
-           sectors > 0) {
-               BUG_ON(sectors > quota_res->sectors);
-               BUG_ON(sectors > inode->ei_quota_reserved);
-
-               quota_res->sectors -= sectors;
-               inode->ei_quota_reserved -= sectors;
-       } else {
-               bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
-       }
-#endif
-}
-
-static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
-                          struct quota_res *quota_res, s64 sectors)
-{
-       if (sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __i_sectors_acct(c, inode, quota_res, sectors);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
-}
-
-/* page state: */
-
-/* stored in page->private: */
-
-#define BCH_FOLIO_SECTOR_STATE()       \
-       x(unallocated)                  \
-       x(reserved)                     \
-       x(dirty)                        \
-       x(dirty_reserved)               \
-       x(allocated)
-
-enum bch_folio_sector_state {
-#define x(n)   SECTOR_##n,
-       BCH_FOLIO_SECTOR_STATE()
-#undef x
-};
-
-static const char * const bch2_folio_sector_states[] = {
-#define x(n)   #n,
-       BCH_FOLIO_SECTOR_STATE()
-#undef x
-       NULL
-};
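
BCH_FOLIO_SECTOR_STATE() is an x-macro: one list expanded twice, so the enum and the name table can never drift apart. A self-contained sketch of the same idiom, with illustrative names:

#include <stdio.h>

#define EXAMPLE_STATES()	\
	x(unallocated)		\
	x(reserved)		\
	x(dirty)

enum example_state {
#define x(n)	STATE_##n,
	EXAMPLE_STATES()
#undef x
};

static const char * const example_state_names[] = {
#define x(n)	#n,
	EXAMPLE_STATES()
#undef x
	NULL
};

int main(void)
{
	/* Expands to: STATE_unallocated = 0, STATE_reserved = 1, ... */
	printf("%d = %s\n", STATE_dirty, example_state_names[STATE_dirty]);
	return 0;
}
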
-
-static inline enum bch_folio_sector_state
-folio_sector_dirty(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_unallocated:
-               return SECTOR_dirty;
-       case SECTOR_reserved:
-               return SECTOR_dirty_reserved;
-       default:
-               return state;
-       }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_undirty(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_dirty:
-               return SECTOR_unallocated;
-       case SECTOR_dirty_reserved:
-               return SECTOR_reserved;
-       default:
-               return state;
-       }
-}
-
-static inline enum bch_folio_sector_state
-folio_sector_reserve(enum bch_folio_sector_state state)
-{
-       switch (state) {
-       case SECTOR_unallocated:
-               return SECTOR_reserved;
-       case SECTOR_dirty:
-               return SECTOR_dirty_reserved;
-       default:
-               return state;
-       }
-}
-
-struct bch_folio_sector {
-       /* Uncompressed, fully allocated replicas (or on disk reservation): */
-       unsigned                nr_replicas:4;
-
-       /* Owns a PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
-       unsigned                replicas_reserved:4;
-
-       /* i_sectors: */
-       enum bch_folio_sector_state state:8;
-};
-
-struct bch_folio {
-       spinlock_t              lock;
-       atomic_t                write_count;
-       /*
-        * Is the sector state up to date with the btree?
-        * (Not the data itself)
-        */
-       bool                    uptodate;
-       struct bch_folio_sector s[];
-};
-
-static inline void folio_sector_set(struct folio *folio,
-                            struct bch_folio *s,
-                            unsigned i, unsigned n)
-{
-       s->s[i].state = n;
-}
-
-/* file offset (within the folio) to bch_folio_sector index */
-static inline int folio_pos_to_s(struct folio *folio, loff_t pos)
-{
-       u64 f_offset = pos - folio_pos(folio);
-       BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio));
-       return f_offset >> SECTOR_SHIFT;
-}
-
-static inline struct bch_folio *__bch2_folio(struct folio *folio)
-{
-       return folio_has_private(folio)
-               ? (struct bch_folio *) folio_get_private(folio)
-               : NULL;
-}
-
-static inline struct bch_folio *bch2_folio(struct folio *folio)
-{
-       EBUG_ON(!folio_test_locked(folio));
-
-       return __bch2_folio(folio);
-}
-
-/* for newly allocated folios: */
-static void __bch2_folio_release(struct folio *folio)
-{
-       kfree(folio_detach_private(folio));
-}
-
-static void bch2_folio_release(struct folio *folio)
-{
-       EBUG_ON(!folio_test_locked(folio));
-       __bch2_folio_release(folio);
-}
-
-/* for newly allocated folios: */
-static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
-       struct bch_folio *s;
-
-       s = kzalloc(sizeof(*s) +
-                   sizeof(struct bch_folio_sector) *
-                   folio_sectors(folio), gfp);
-       if (!s)
-               return NULL;
-
-       spin_lock_init(&s->lock);
-       folio_attach_private(folio, s);
-       return s;
-}
-
-static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp)
-{
-       return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp);
-}
-
-static unsigned bkey_to_sector_state(struct bkey_s_c k)
-{
-       if (bkey_extent_is_reservation(k))
-               return SECTOR_reserved;
-       if (bkey_extent_is_allocation(k.k))
-               return SECTOR_allocated;
-       return SECTOR_unallocated;
-}
-
-static void __bch2_folio_set(struct folio *folio,
-                            unsigned pg_offset, unsigned pg_len,
-                            unsigned nr_ptrs, unsigned state)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, sectors = folio_sectors(folio);
-
-       BUG_ON(pg_offset >= sectors);
-       BUG_ON(pg_offset + pg_len > sectors);
-
-       spin_lock(&s->lock);
-
-       for (i = pg_offset; i < pg_offset + pg_len; i++) {
-               s->s[i].nr_replicas     = nr_ptrs;
-               folio_sector_set(folio, s, i, state);
-       }
-
-       if (i == sectors)
-               s->uptodate = true;
-
-       spin_unlock(&s->lock);
-}
-
-/*
- * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the
- * extents btree:
- */
-static int bch2_folio_set(struct bch_fs *c, subvol_inum inum,
-                         struct folio **folios, unsigned nr_folios)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bch_folio *s;
-       u64 offset = folio_sector(folios[0]);
-       unsigned folio_idx;
-       u32 snapshot;
-       bool need_set = false;
-       int ret;
-
-       for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) {
-               s = bch2_folio_create(folios[folio_idx], GFP_KERNEL);
-               if (!s)
-                       return -ENOMEM;
-
-               need_set |= !s->uptodate;
-       }
-
-       if (!need_set)
-               return 0;
-
-       folio_idx = 0;
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
-                          SPOS(inum.inum, offset, snapshot),
-                          BTREE_ITER_SLOTS, k, ret) {
-               unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
-               unsigned state = bkey_to_sector_state(k);
-
-               while (folio_idx < nr_folios) {
-                       struct folio *folio = folios[folio_idx];
-                       u64 folio_start = folio_sector(folio);
-                       u64 folio_end   = folio_end_sector(folio);
-                       unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start;
-                       unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start;
-
-                       BUG_ON(k.k->p.offset < folio_start);
-                       BUG_ON(bkey_start_offset(k.k) > folio_end);
-
-                       if (!bch2_folio(folio)->uptodate)
-                               __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state);
-
-                       if (k.k->p.offset < folio_end)
-                               break;
-                       folio_idx++;
-               }
-
-               if (folio_idx == nr_folios)
-                       break;
-       }
-
-       offset = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
-err:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-       bch2_trans_exit(&trans);
-
-       return ret;
-}
-
-static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
-{
-       struct bvec_iter iter;
-       struct folio_vec fv;
-       unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
-               ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
-       unsigned state = bkey_to_sector_state(k);
-
-       bio_for_each_folio(fv, bio, iter)
-               __bch2_folio_set(fv.fv_folio,
-                                fv.fv_offset >> 9,
-                                fv.fv_len >> 9,
-                                nr_ptrs, state);
-}
-
-static void mark_pagecache_unallocated(struct bch_inode_info *inode,
-                                      u64 start, u64 end)
-{
-       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
-       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-       struct folio_batch fbatch;
-       unsigned i, j;
-
-       if (end <= start)
-               return;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(inode->v.i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-                       u64 folio_start = folio_sector(folio);
-                       u64 folio_end = folio_end_sector(folio);
-                       unsigned folio_offset = max(start, folio_start) - folio_start;
-                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-                       struct bch_folio *s;
-
-                       BUG_ON(end <= folio_start);
-
-                       folio_lock(folio);
-                       s = bch2_folio(folio);
-
-                       if (s) {
-                               spin_lock(&s->lock);
-                               for (j = folio_offset; j < folio_offset + folio_len; j++)
-                                       s->s[j].nr_replicas = 0;
-                               spin_unlock(&s->lock);
-                       }
-
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-}
-
-static void mark_pagecache_reserved(struct bch_inode_info *inode,
-                                   u64 start, u64 end)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
-       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
-       struct folio_batch fbatch;
-       s64 i_sectors_delta = 0;
-       unsigned i, j;
-
-       if (end <= start)
-               return;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(inode->v.i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-                       u64 folio_start = folio_sector(folio);
-                       u64 folio_end = folio_end_sector(folio);
-                       unsigned folio_offset = max(start, folio_start) - folio_start;
-                       unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
-                       struct bch_folio *s;
-
-                       BUG_ON(end <= folio_start);
-
-                       folio_lock(folio);
-                       s = bch2_folio(folio);
-
-                       if (s) {
-                               spin_lock(&s->lock);
-                               for (j = folio_offset; j < folio_offset + folio_len; j++) {
-                                       i_sectors_delta -= s->s[j].state == SECTOR_dirty;
-                                       folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state));
-                               }
-                               spin_unlock(&s->lock);
-                       }
-
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-
-       i_sectors_acct(c, inode, NULL, i_sectors_delta);
-}
-
-static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
-{
-       /* XXX: this should not be open coded */
-       return inode->ei_inode.bi_data_replicas
-               ? inode->ei_inode.bi_data_replicas - 1
-               : c->opts.data_replicas;
-}
-
-static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
-                                         unsigned nr_replicas)
-{
-       return max(0, (int) nr_replicas -
-                  s->nr_replicas -
-                  s->replicas_reserved);
-}
-
-static int bch2_get_folio_disk_reservation(struct bch_fs *c,
-                               struct bch_inode_info *inode,
-                               struct folio *folio, bool check_enospc)
-{
-       struct bch_folio *s = bch2_folio_create(folio, 0);
-       unsigned nr_replicas = inode_nr_replicas(c, inode);
-       struct disk_reservation disk_res = { 0 };
-       unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0;
-       int ret;
-
-       if (!s)
-               return -ENOMEM;
-
-       for (i = 0; i < sectors; i++)
-               disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
-
-       if (!disk_res_sectors)
-               return 0;
-
-       ret = bch2_disk_reservation_get(c, &disk_res,
-                                       disk_res_sectors, 1,
-                                       !check_enospc
-                                       ? BCH_DISK_RESERVATION_NOFAIL
-                                       : 0);
-       if (unlikely(ret))
-               return ret;
-
-       for (i = 0; i < sectors; i++)
-               s->s[i].replicas_reserved +=
-                       sectors_to_reserve(&s->s[i], nr_replicas);
-
-       return 0;
-}
-
-struct bch2_folio_reservation {
-       struct disk_reservation disk;
-       struct quota_res        quota;
-};
-
-static void bch2_folio_reservation_init(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct bch2_folio_reservation *res)
-{
-       memset(res, 0, sizeof(*res));
-
-       res->disk.nr_replicas = inode_nr_replicas(c, inode);
-}
-
-static void bch2_folio_reservation_put(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct bch2_folio_reservation *res)
-{
-       bch2_disk_reservation_put(c, &res->disk);
-       bch2_quota_reservation_put(c, inode, &res->quota);
-}
-
-static int bch2_folio_reservation_get(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct folio *folio,
-                       struct bch2_folio_reservation *res,
-                       unsigned offset, unsigned len)
-{
-       struct bch_folio *s = bch2_folio_create(folio, 0);
-       unsigned i, disk_sectors = 0, quota_sectors = 0;
-       int ret;
-
-       if (!s)
-               return -ENOMEM;
-
-       BUG_ON(!s->uptodate);
-
-       for (i = round_down(offset, block_bytes(c)) >> 9;
-            i < round_up(offset + len, block_bytes(c)) >> 9;
-            i++) {
-               disk_sectors += sectors_to_reserve(&s->s[i],
-                                               res->disk.nr_replicas);
-               quota_sectors += s->s[i].state == SECTOR_unallocated;
-       }
-
-       if (disk_sectors) {
-               ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
-               if (unlikely(ret))
-                       return ret;
-       }
-
-       if (quota_sectors) {
-               ret = bch2_quota_reservation_add(c, inode, &res->quota,
-                                                quota_sectors, true);
-               if (unlikely(ret)) {
-                       struct disk_reservation tmp = {
-                               .sectors = disk_sectors
-                       };
-
-                       bch2_disk_reservation_put(c, &tmp);
-                       res->disk.sectors -= disk_sectors;
-                       return ret;
-               }
-       }
-
-       return 0;
-}
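
bch2_folio_reservation_get() is all-or-nothing: when the quota step fails, the disk reservation taken just above is handed back before returning. A generic sketch of that acquire-with-rollback shape, with hypothetical resource names and a quota stub that always fails so the rollback path is visible:

#include <errno.h>

static int  get_disk(unsigned sectors)  { (void) sectors; return 0; }
static void put_disk(unsigned sectors)  { (void) sectors; }
static int  get_quota(unsigned sectors) { (void) sectors; return -EDQUOT; }

/* Take both reservations or neither: on quota failure, release the
 * disk reservation acquired first so the caller sees no side effects: */
static int reserve_both(unsigned sectors)
{
	int ret = get_disk(sectors);
	if (ret)
		return ret;

	ret = get_quota(sectors);
	if (ret) {
		put_disk(sectors);
		return ret;
	}
	return 0;
}

int main(void)
{
	return reserve_both(8) ? 1 : 0;
}
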
-
-static void bch2_clear_folio_bits(struct folio *folio)
-{
-       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_folio *s = bch2_folio(folio);
-       struct disk_reservation disk_res = { 0 };
-       int i, sectors = folio_sectors(folio), dirty_sectors = 0;
-
-       if (!s)
-               return;
-
-       EBUG_ON(!folio_test_locked(folio));
-       EBUG_ON(folio_test_writeback(folio));
-
-       for (i = 0; i < sectors; i++) {
-               disk_res.sectors += s->s[i].replicas_reserved;
-               s->s[i].replicas_reserved = 0;
-
-               dirty_sectors -= s->s[i].state == SECTOR_dirty;
-               folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state));
-       }
-
-       bch2_disk_reservation_put(c, &disk_res);
-
-       i_sectors_acct(c, inode, NULL, dirty_sectors);
-
-       bch2_folio_release(folio);
-}
-
-static void bch2_set_folio_dirty(struct bch_fs *c,
-                       struct bch_inode_info *inode,
-                       struct folio *folio,
-                       struct bch2_folio_reservation *res,
-                       unsigned offset, unsigned len)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, dirty_sectors = 0;
-
-       WARN_ON((u64) folio_pos(folio) + offset + len >
-               round_up((u64) i_size_read(&inode->v), block_bytes(c)));
-
-       BUG_ON(!s->uptodate);
-
-       spin_lock(&s->lock);
-
-       for (i = round_down(offset, block_bytes(c)) >> 9;
-            i < round_up(offset + len, block_bytes(c)) >> 9;
-            i++) {
-               unsigned sectors = sectors_to_reserve(&s->s[i],
-                                               res->disk.nr_replicas);
-
-               /*
-                * This can happen if we race with the error path in
-                * bch2_writepage_io_done():
-                */
-               sectors = min_t(unsigned, sectors, res->disk.sectors);
-
-               s->s[i].replicas_reserved += sectors;
-               res->disk.sectors -= sectors;
-
-               dirty_sectors += s->s[i].state == SECTOR_unallocated;
-
-               folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state));
-       }
-
-       spin_unlock(&s->lock);
-
-       i_sectors_acct(c, inode, &res->quota, dirty_sectors);
-
-       if (!folio_test_dirty(folio))
-               filemap_dirty_folio(inode->v.i_mapping, folio);
-}
-
-vm_fault_t bch2_page_fault(struct vm_fault *vmf)
-{
-       struct file *file = vmf->vma->vm_file;
-       struct address_space *mapping = file->f_mapping;
-       struct address_space *fdm = faults_disabled_mapping();
-       struct bch_inode_info *inode = file_bch_inode(file);
-       vm_fault_t ret;
-
-       if (fdm == mapping)
-               return VM_FAULT_SIGBUS;
-
-       /* Lock ordering: */
-       if (fdm > mapping) {
-               struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
-
-               if (bch2_pagecache_add_tryget(inode))
-                       goto got_lock;
-
-               bch2_pagecache_block_put(fdm_host);
-
-               bch2_pagecache_add_get(inode);
-               bch2_pagecache_add_put(inode);
-
-               bch2_pagecache_block_get(fdm_host);
-
-               /* Signal that lock has been dropped: */
-               set_fdm_dropped_locks();
-               return VM_FAULT_SIGBUS;
-       }
-
-       bch2_pagecache_add_get(inode);
-got_lock:
-       ret = filemap_fault(vmf);
-       bch2_pagecache_add_put(inode);
-
-       return ret;
-}
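
The fdm > mapping comparison above imposes a single global lock order by address, so the fault path and the DIO path can never each hold one lock while waiting on the other. A userspace sketch of ordering a lock pair the same way (pthread stand-ins; comparing unrelated pointers with > is unspecified in ISO C but is the established practice here):

#include <pthread.h>

/* Always take the lower-addressed mutex first, so any two threads
 * locking the same pair agree on the order and cannot deadlock: */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if (a > b) {
		pthread_mutex_t *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&a, &b);
	unlock_pair(&a, &b);
	return 0;
}
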
-
-vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
-{
-       struct folio *folio = page_folio(vmf->page);
-       struct file *file = vmf->vma->vm_file;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct address_space *mapping = file->f_mapping;
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation res;
-       unsigned len;
-       loff_t isize;
-       vm_fault_t ret;
-
-       bch2_folio_reservation_init(c, inode, &res);
-
-       sb_start_pagefault(inode->v.i_sb);
-       file_update_time(file);
-
-       /*
-        * Not strictly necessary, but helps avoid dio writes livelocking in
-        * write_invalidate_inode_pages_range() - can drop this if/when we get
-        * a write_invalidate_inode_pages_range() that works without dropping
-        * page lock before invalidating page
-        */
-       bch2_pagecache_add_get(inode);
-
-       folio_lock(folio);
-       isize = i_size_read(&inode->v);
-
-       if (folio->mapping != mapping || folio_pos(folio) >= isize) {
-               folio_unlock(folio);
-               ret = VM_FAULT_NOPAGE;
-               goto out;
-       }
-
-       len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio));
-
-       if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?:
-           bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) {
-               folio_unlock(folio);
-               ret = VM_FAULT_SIGBUS;
-               goto out;
-       }
-
-       bch2_set_folio_dirty(c, inode, folio, &res, 0, len);
-       bch2_folio_reservation_put(c, inode, &res);
-
-       folio_wait_stable(folio);
-       ret = VM_FAULT_LOCKED;
-out:
-       bch2_pagecache_add_put(inode);
-       sb_end_pagefault(inode->v.i_sb);
-
-       return ret;
-}
-
-void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length)
-{
-       if (offset || length < folio_size(folio))
-               return;
-
-       bch2_clear_folio_bits(folio);
-}
-
-bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask)
-{
-       if (folio_test_dirty(folio) || folio_test_writeback(folio))
-               return false;
-
-       bch2_clear_folio_bits(folio);
-       return true;
-}
-
-/* readpage(s): */
-
-static void bch2_readpages_end_io(struct bio *bio)
-{
-       struct folio_iter fi;
-
-       bio_for_each_folio_all(fi, bio) {
-               if (!bio->bi_status) {
-                       folio_mark_uptodate(fi.folio);
-               } else {
-                       folio_clear_uptodate(fi.folio);
-                       folio_set_error(fi.folio);
-               }
-               folio_unlock(fi.folio);
-       }
-
-       bio_put(bio);
-}
-
-struct readpages_iter {
-       struct address_space    *mapping;
-       unsigned                idx;
-       folios                  folios;
-};
-
-static int readpages_iter_init(struct readpages_iter *iter,
-                              struct readahead_control *ractl)
-{
-       struct folio **fi;
-       int ret;
-
-       memset(iter, 0, sizeof(*iter));
-
-       iter->mapping = ractl->mapping;
-
-       ret = filemap_get_contig_folios_d(iter->mapping,
-                               ractl->_index << PAGE_SHIFT,
-                               (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
-                               0, mapping_gfp_mask(iter->mapping),
-                               &iter->folios);
-       if (ret)
-               return ret;
-
-       darray_for_each(iter->folios, fi) {
-               ractl->_nr_pages -= 1U << folio_order(*fi);
-               __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
-               folio_put(*fi);
-               folio_put(*fi);
-       }
-
-       return 0;
-}
-
-static inline struct folio *readpage_iter_peek(struct readpages_iter *iter)
-{
-       if (iter->idx >= iter->folios.nr)
-               return NULL;
-       return iter->folios.data[iter->idx];
-}
-
-static inline void readpage_iter_advance(struct readpages_iter *iter)
-{
-       iter->idx++;
-}
-
-static bool extent_partial_reads_expensive(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_extent_crc_unpacked crc;
-       const union bch_extent_entry *i;
-
-       bkey_for_each_crc(k.k, ptrs, crc, i)
-               if (crc.csum_type || crc.compression_type)
-                       return true;
-       return false;
-}
-
-static int readpage_bio_extend(struct btree_trans *trans,
-                              struct readpages_iter *iter,
-                              struct bio *bio,
-                              unsigned sectors_this_extent,
-                              bool get_more)
-{
-       /* Don't hold btree locks while allocating memory: */
-       bch2_trans_unlock(trans);
-
-       while (bio_sectors(bio) < sectors_this_extent &&
-              bio->bi_vcnt < bio->bi_max_vecs) {
-               struct folio *folio = readpage_iter_peek(iter);
-               int ret;
-
-               if (folio) {
-                       readpage_iter_advance(iter);
-               } else {
-                       pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
-
-                       if (!get_more)
-                               break;
-
-                       folio = xa_load(&iter->mapping->i_pages, folio_offset);
-                       if (folio && !xa_is_value(folio))
-                               break;
-
-                       folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0);
-                       if (!folio)
-                               break;
-
-                       if (!__bch2_folio_create(folio, GFP_KERNEL)) {
-                               folio_put(folio);
-                               break;
-                       }
-
-                       ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL);
-                       if (ret) {
-                               __bch2_folio_release(folio);
-                               folio_put(folio);
-                               break;
-                       }
-
-                       folio_put(folio);
-               }
-
-               BUG_ON(folio_sector(folio) != bio_end_sector(bio));
-
-               BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0));
-       }
-
-       return bch2_trans_relock(trans);
-}
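
readpage_bio_extend() first drops its btree locks, allocates while unlocked, and ends with bch2_trans_relock() so the caller revalidates. A generic sketch of that drop-allocate-relock shape (pthread and malloc stand-ins, not the btree_trans API):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Caller holds @lock. Returns with @lock reacquired; the caller must
 * revalidate whatever the lock protected, since it was dropped: */
static void *alloc_unlocked(size_t size)
{
	void *p;

	pthread_mutex_unlock(&lock);	/* don't block others while allocating */
	p = malloc(size);		/* may block */
	pthread_mutex_lock(&lock);	/* reacquire; state may have changed */

	return p;
}

int main(void)
{
	pthread_mutex_lock(&lock);
	void *p = alloc_unlocked(64);
	pthread_mutex_unlock(&lock);
	free(p);
	return 0;
}
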
-
-static void bchfs_read(struct btree_trans *trans,
-                      struct bch_read_bio *rbio,
-                      subvol_inum inum,
-                      struct readpages_iter *readpages_iter)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_buf sk;
-       int flags = BCH_READ_RETRY_IF_STALE|
-               BCH_READ_MAY_PROMOTE;
-       u32 snapshot;
-       int ret = 0;
-
-       rbio->c = c;
-       rbio->start_time = local_clock();
-       rbio->subvol = inum.subvol;
-
-       bch2_bkey_buf_init(&sk);
-retry:
-       bch2_trans_begin(trans);
-       iter = (struct btree_iter) { NULL };
-
-       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
-                            SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
-                            BTREE_ITER_SLOTS);
-       while (1) {
-               struct bkey_s_c k;
-               unsigned bytes, sectors, offset_into_extent;
-               enum btree_id data_btree = BTREE_ID_extents;
-
-               /*
-                * read_extent -> io_time_reset may cause a transaction restart
-                * without returning an error, we need to check for that here:
-                */
-               ret = bch2_trans_relock(trans);
-               if (ret)
-                       break;
-
-               bch2_btree_iter_set_pos(&iter,
-                               POS(inum.inum, rbio->bio.bi_iter.bi_sector));
-
-               k = bch2_btree_iter_peek_slot(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               offset_into_extent = iter.pos.offset -
-                       bkey_start_offset(k.k);
-               sectors = k.k->size - offset_into_extent;
-
-               bch2_bkey_buf_reassemble(&sk, c, k);
-
-               ret = bch2_read_indirect_extent(trans, &data_btree,
-                                       &offset_into_extent, &sk);
-               if (ret)
-                       break;
-
-               k = bkey_i_to_s_c(sk.k);
-
-               sectors = min(sectors, k.k->size - offset_into_extent);
-
-               if (readpages_iter) {
-                       ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors,
-                                                 extent_partial_reads_expensive(k));
-                       if (ret)
-                               break;
-               }
-
-               bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
-               swap(rbio->bio.bi_iter.bi_size, bytes);
-
-               if (rbio->bio.bi_iter.bi_size == bytes)
-                       flags |= BCH_READ_LAST_FRAGMENT;
-
-               bch2_bio_page_state_set(&rbio->bio, k);
-
-               bch2_read_extent(trans, rbio, iter.pos,
-                                data_btree, k, offset_into_extent, flags);
-
-               if (flags & BCH_READ_LAST_FRAGMENT)
-                       break;
-
-               swap(rbio->bio.bi_iter.bi_size, bytes);
-               bio_advance(&rbio->bio, bytes);
-
-               ret = btree_trans_too_many_iters(trans);
-               if (ret)
-                       break;
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       if (ret) {
-               bch_err_inum_offset_ratelimited(c,
-                               iter.pos.inode,
-                               iter.pos.offset << 9,
-                               "read error %i from btree lookup", ret);
-               rbio->bio.bi_status = BLK_STS_IOERR;
-               bio_endio(&rbio->bio);
-       }
-
-       bch2_bkey_buf_exit(&sk, c);
-}
-
-void bch2_readahead(struct readahead_control *ractl)
-{
-       struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_io_opts opts;
-       struct btree_trans trans;
-       struct folio *folio;
-       struct readpages_iter readpages_iter;
-       int ret;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       ret = readpages_iter_init(&readpages_iter, ractl);
-       BUG_ON(ret);
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_pagecache_add_get(inode);
-
-       while ((folio = readpage_iter_peek(&readpages_iter))) {
-               unsigned n = min_t(unsigned,
-                                  readpages_iter.folios.nr -
-                                  readpages_iter.idx,
-                                  BIO_MAX_VECS);
-               struct bch_read_bio *rbio =
-                       rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ,
-                                                  GFP_KERNEL, &c->bio_read),
-                                 opts);
-
-               readpage_iter_advance(&readpages_iter);
-
-               rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-               rbio->bio.bi_end_io = bch2_readpages_end_io;
-               BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-               bchfs_read(&trans, rbio, inode_inum(inode),
-                          &readpages_iter);
-               bch2_trans_unlock(&trans);
-       }
-
-       bch2_pagecache_add_put(inode);
-
-       bch2_trans_exit(&trans);
-       darray_exit(&readpages_iter.folios);
-}
-
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
-                            subvol_inum inum, struct folio *folio)
-{
-       struct btree_trans trans;
-
-       bch2_folio_create(folio, __GFP_NOFAIL);
-
-       rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
-       rbio->bio.bi_iter.bi_sector = folio_sector(folio);
-       BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
-       bch2_trans_init(&trans, c, 0, 0);
-       bchfs_read(&trans, rbio, inum, NULL);
-       bch2_trans_exit(&trans);
-}
-
-static void bch2_read_single_folio_end_io(struct bio *bio)
-{
-       complete(bio->bi_private);
-}
-
-static int bch2_read_single_folio(struct folio *folio,
-                                 struct address_space *mapping)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_read_bio *rbio;
-       struct bch_io_opts opts;
-       int ret;
-       DECLARE_COMPLETION_ONSTACK(done);
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
-                        opts);
-       rbio->bio.bi_private = &done;
-       rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
-
-       __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
-       wait_for_completion(&done);
-
-       ret = blk_status_to_errno(rbio->bio.bi_status);
-       bio_put(&rbio->bio);
-
-       if (ret < 0)
-               return ret;
-
-       folio_mark_uptodate(folio);
-       return 0;
-}
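
bch2_read_single_folio() turns the async read into a synchronous one with an on-stack completion: the end_io callback fires complete(), and the submitter blocks in wait_for_completion(). A userspace sketch of the same completion primitive (pthread-based, illustrative, not the kernel API):

#include <pthread.h>
#include <stdbool.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	bool            done;
};

#define COMPLETION_INITIALIZER \
	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false }

/* Called from the I/O completion path: */
static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = true;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

/* Called by the submitter; blocks until complete() runs: */
static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

int main(void)
{
	struct completion done = COMPLETION_INITIALIZER;

	complete(&done);		/* normally called from end_io */
	wait_for_completion(&done);	/* returns immediately here */
	return 0;
}
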
-
-int bch2_read_folio(struct file *file, struct folio *folio)
-{
-       int ret;
-
-       ret = bch2_read_single_folio(folio, folio->mapping);
-       folio_unlock(folio);
-       return bch2_err_class(ret);
-}
-
-/* writepages: */
-
-struct bch_writepage_state {
-       struct bch_writepage_io *io;
-       struct bch_io_opts      opts;
-       struct bch_folio_sector *tmp;
-       unsigned                tmp_sectors;
-};
-
-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
-                                                                 struct bch_inode_info *inode)
-{
-       struct bch_writepage_state ret = { 0 };
-
-       bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode);
-       return ret;
-}
-
-static void bch2_writepage_io_done(struct bch_write_op *op)
-{
-       struct bch_writepage_io *io =
-               container_of(op, struct bch_writepage_io, op);
-       struct bch_fs *c = io->op.c;
-       struct bio *bio = &io->op.wbio.bio;
-       struct folio_iter fi;
-       unsigned i;
-
-       if (io->op.error) {
-               set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
-
-               bio_for_each_folio_all(fi, bio) {
-                       struct bch_folio *s;
-
-                       folio_set_error(fi.folio);
-                       mapping_set_error(fi.folio->mapping, -EIO);
-
-                       s = __bch2_folio(fi.folio);
-                       spin_lock(&s->lock);
-                       for (i = 0; i < folio_sectors(fi.folio); i++)
-                               s->s[i].nr_replicas = 0;
-                       spin_unlock(&s->lock);
-               }
-       }
-
-       if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
-               bio_for_each_folio_all(fi, bio) {
-                       struct bch_folio *s;
-
-                       s = __bch2_folio(fi.folio);
-                       spin_lock(&s->lock);
-                       for (i = 0; i < folio_sectors(fi.folio); i++)
-                               s->s[i].nr_replicas = 0;
-                       spin_unlock(&s->lock);
-               }
-       }
-
-       /*
-        * racing with fallocate can cause us to add fewer sectors than
-        * expected - but we shouldn't add more sectors than expected:
-        */
-       WARN_ON_ONCE(io->op.i_sectors_delta > 0);
-
-       /*
-        * (error (due to going RO) halfway through a page can screw that up
-        * slightly)
-        * XXX wtf?
-          BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
-        */
-
-       /*
-        * PageWriteback is effectively our ref on the inode - fixup i_blocks
-        * before calling end_page_writeback:
-        */
-       i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
-
-       bio_for_each_folio_all(fi, bio) {
-               struct bch_folio *s = __bch2_folio(fi.folio);
-
-               if (atomic_dec_and_test(&s->write_count))
-                       folio_end_writeback(fi.folio);
-       }
-
-       bio_put(&io->op.wbio.bio);
-}
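
Writeback completion above is reference counted: every write in flight holds a reference on the folio's write_count, and whoever drops the last one ends writeback. A standalone sketch of the atomic_dec_and_test() pattern (C11 atomics as a stand-in):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int write_count = 1;	/* submitter's initial reference */

static void put_write_ref(void)
{
	/* atomic_fetch_sub() returns the old value, so 1 means we just
	 * dropped the final reference: */
	if (atomic_fetch_sub(&write_count, 1) == 1)
		printf("last reference dropped: end writeback\n");
}

int main(void)
{
	put_write_ref();
	return 0;
}
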
-
-static void bch2_writepage_do_io(struct bch_writepage_state *w)
-{
-       struct bch_writepage_io *io = w->io;
-
-       w->io = NULL;
-       closure_call(&io->op.cl, bch2_write, NULL, NULL);
-}
-
-/*
- * Get a bch_writepage_io and add @page to it - appending to an existing one if
- * possible, else allocating a new one:
- */
-static void bch2_writepage_io_alloc(struct bch_fs *c,
-                                   struct writeback_control *wbc,
-                                   struct bch_writepage_state *w,
-                                   struct bch_inode_info *inode,
-                                   u64 sector,
-                                   unsigned nr_replicas)
-{
-       struct bch_write_op *op;
-
-       w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS,
-                                             REQ_OP_WRITE,
-                                             GFP_KERNEL,
-                                             &c->writepage_bioset),
-                            struct bch_writepage_io, op.wbio.bio);
-
-       w->io->inode            = inode;
-       op                      = &w->io->op;
-       bch2_write_op_init(op, c, w->opts);
-       op->target              = w->opts.foreground_target;
-       op->nr_replicas         = nr_replicas;
-       op->res.nr_replicas     = nr_replicas;
-       op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
-       op->subvol              = inode->ei_subvol;
-       op->pos                 = POS(inode->v.i_ino, sector);
-       op->end_io              = bch2_writepage_io_done;
-       op->devs_need_flush     = &inode->ei_devs_need_flush;
-       op->wbio.bio.bi_iter.bi_sector = sector;
-       op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
-}
-
-static int __bch2_writepage(struct folio *folio,
-                           struct writeback_control *wbc,
-                           void *data)
-{
-       struct bch_inode_info *inode = to_bch_ei(folio->mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_writepage_state *w = data;
-       struct bch_folio *s;
-       unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX;
-       loff_t i_size = i_size_read(&inode->v);
-       int ret;
-
-       EBUG_ON(!folio_test_uptodate(folio));
-
-       /* Is the folio fully inside i_size? */
-       if (folio_end_pos(folio) <= i_size)
-               goto do_io;
-
-       /* Is the folio fully outside i_size? (truncate in progress) */
-       if (folio_pos(folio) >= i_size) {
-               folio_unlock(folio);
-               return 0;
-       }
-
-       /*
-        * The folio straddles i_size.  It must be zeroed out on each and every
-        * writepage invocation because it may be mmapped.  "A file is mapped
-        * in multiples of the folio size.  For a file that is not a multiple of
- * the folio size, the remaining memory is zeroed when mapped, and
-        * writes to that region are not written out to the file."
-        */
-       folio_zero_segment(folio,
-                          i_size - folio_pos(folio),
-                          folio_size(folio));
-do_io:
-       f_sectors = folio_sectors(folio);
-       s = bch2_folio(folio);
-
-       if (f_sectors > w->tmp_sectors) {
-               kfree(w->tmp);
-               w->tmp = kzalloc(sizeof(struct bch_folio_sector) *
-                                f_sectors, __GFP_NOFAIL);
-               w->tmp_sectors = f_sectors;
-       }
-
-       /*
-        * Things get really hairy with errors during writeback:
-        */
-       ret = bch2_get_folio_disk_reservation(c, inode, folio, false);
-       BUG_ON(ret);
-
-       /* Before unlocking the page, get copy of reservations: */
-       spin_lock(&s->lock);
-       memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors);
-
-       for (i = 0; i < f_sectors; i++) {
-               if (s->s[i].state < SECTOR_dirty)
-                       continue;
-
-               nr_replicas_this_write =
-                       min_t(unsigned, nr_replicas_this_write,
-                             s->s[i].nr_replicas +
-                             s->s[i].replicas_reserved);
-       }
-
-       for (i = 0; i < f_sectors; i++) {
-               if (s->s[i].state < SECTOR_dirty)
-                       continue;
-
-               s->s[i].nr_replicas = w->opts.compression
-                       ? 0 : nr_replicas_this_write;
-
-               s->s[i].replicas_reserved = 0;
-               folio_sector_set(folio, s, i, SECTOR_allocated);
-       }
-       spin_unlock(&s->lock);
-
-       BUG_ON(atomic_read(&s->write_count));
-       atomic_set(&s->write_count, 1);
-
-       BUG_ON(folio_test_writeback(folio));
-       folio_start_writeback(folio);
-
-       folio_unlock(folio);
-
-       offset = 0;
-       while (1) {
-               unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
-               u64 sector;
-
-               while (offset < f_sectors &&
-                      w->tmp[offset].state < SECTOR_dirty)
-                       offset++;
-
-               if (offset == f_sectors)
-                       break;
-
-               while (offset + sectors < f_sectors &&
-                      w->tmp[offset + sectors].state >= SECTOR_dirty) {
-                       reserved_sectors += w->tmp[offset + sectors].replicas_reserved;
-                       dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty;
-                       sectors++;
-               }
-               BUG_ON(!sectors);
-
-               sector = folio_sector(folio) + offset;
-
-               if (w->io &&
-                   (w->io->op.res.nr_replicas != nr_replicas_this_write ||
-                    bio_full(&w->io->op.wbio.bio, sectors << 9) ||
-                    w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >=
-                    (BIO_MAX_VECS * PAGE_SIZE) ||
-                    bio_end_sector(&w->io->op.wbio.bio) != sector))
-                       bch2_writepage_do_io(w);
-
-               if (!w->io)
-                       bch2_writepage_io_alloc(c, wbc, w, inode, sector,
-                                               nr_replicas_this_write);
-
-               atomic_inc(&s->write_count);
-
-               BUG_ON(inode != w->io->inode);
-               BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio,
-                                    sectors << 9, offset << 9));
-
-               /* Check for writing past i_size: */
-               WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
-                         round_up(i_size, block_bytes(c)) &&
-                         !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
-                         "writing past i_size: %llu > %llu (unrounded %llu)\n",
-                         bio_end_sector(&w->io->op.wbio.bio) << 9,
-                         round_up(i_size, block_bytes(c)),
-                         i_size);
-
-               w->io->op.res.sectors += reserved_sectors;
-               w->io->op.i_sectors_delta -= dirty_sectors;
-               w->io->op.new_i_size = i_size;
-
-               offset += sectors;
-       }
-
-       if (atomic_dec_and_test(&s->write_count))
-               folio_end_writeback(folio);
-
-       return 0;
-}
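
The offset/sectors loop removed above is a classic run-coalescing walk: skip sectors whose state is below SECTOR_dirty, extend a run while states stay at or above it, and hand each run to the write path as one bio segment. A minimal userspace sketch of just the run-finding, assuming a hypothetical 16-sector folio and a simplified state enum:

/*
 * Userspace sketch of the dirty-run walk: states below SECTOR_dirty
 * are skipped, contiguous states at or above it form one run.
 */
#include <stdio.h>

enum sector_state { SECTOR_unallocated, SECTOR_dirty, SECTOR_allocated };

int main(void)
{
	enum sector_state s[16] = {
		[2] = SECTOR_dirty, [3] = SECTOR_dirty,
		[8] = SECTOR_dirty, [9] = SECTOR_dirty, [10] = SECTOR_allocated,
	};
	unsigned offset = 0, f_sectors = 16;

	while (1) {
		unsigned sectors = 0;

		while (offset < f_sectors && s[offset] < SECTOR_dirty)
			offset++;
		if (offset == f_sectors)
			break;
		while (offset + sectors < f_sectors &&
		       s[offset + sectors] >= SECTOR_dirty)
			sectors++;

		printf("run: sectors %u..%u\n", offset, offset + sectors - 1);
		offset += sectors;
	}
	return 0;
}

Each printed run corresponds to one bio_add_folio() call in the removed code.
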
-
-int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
-       struct bch_fs *c = mapping->host->i_sb->s_fs_info;
-       struct bch_writepage_state w =
-               bch_writepage_state_init(c, to_bch_ei(mapping->host));
-       struct blk_plug plug;
-       int ret;
-
-       blk_start_plug(&plug);
-       ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
-       if (w.io)
-               bch2_writepage_do_io(&w);
-       blk_finish_plug(&plug);
-       kfree(w.tmp);
-       return bch2_err_class(ret);
-}
-
-/* buffered writes: */
-
-int bch2_write_begin(struct file *file, struct address_space *mapping,
-                    loff_t pos, unsigned len,
-                    struct page **pagep, void **fsdata)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation *res;
-       struct folio *folio;
-       unsigned offset;
-       int ret = -ENOMEM;
-
-       res = kmalloc(sizeof(*res), GFP_KERNEL);
-       if (!res)
-               return -ENOMEM;
-
-       bch2_folio_reservation_init(c, inode, res);
-       *fsdata = res;
-
-       bch2_pagecache_add_get(inode);
-
-       folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
-                               FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE,
-                               mapping_gfp_mask(mapping));
-       if (IS_ERR_OR_NULL(folio))
-               goto err_unlock;
-
-       if (folio_test_uptodate(folio))
-               goto out;
-
-       offset = pos - folio_pos(folio);
-       len = min_t(size_t, len, folio_end_pos(folio) - pos);
-
-       /* If we're writing the entire folio, we don't need to read it in first: */
-       if (!offset && len == folio_size(folio))
-               goto out;
-
-       if (!offset && pos + len >= inode->v.i_size) {
-               folio_zero_segment(folio, len, folio_size(folio));
-               flush_dcache_folio(folio);
-               goto out;
-       }
-
-       if (folio_pos(folio) >= inode->v.i_size) {
-               folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio));
-               flush_dcache_folio(folio);
-               goto out;
-       }
-readpage:
-       ret = bch2_read_single_folio(folio, mapping);
-       if (ret)
-               goto err;
-out:
-       ret = bch2_folio_set(c, inode_inum(inode), &folio, 1);
-       if (ret)
-               goto err;
-
-       ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len);
-       if (ret) {
-               if (!folio_test_uptodate(folio)) {
-                       /*
-                        * If the folio hasn't been read in, we won't know if we
-                        * actually need a reservation - we don't need to read
-                        * here, we just need to check if the folio is
-                        * fully backed by uncompressed data:
-                        */
-                       goto readpage;
-               }
-
-               goto err;
-       }
-
-       *pagep = &folio->page;
-       return 0;
-err:
-       folio_unlock(folio);
-       folio_put(folio);
-       *pagep = NULL;
-err_unlock:
-       bch2_pagecache_add_put(inode);
-       kfree(res);
-       *fsdata = NULL;
-       return bch2_err_class(ret);
-}
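
bch2_write_begin() above skips the read in three cases: the write covers the whole folio, it starts at the folio and extends to or past EOF, or the folio lies entirely past EOF. A sketch of that predicate under those assumptions; write_needs_readpage() is a hypothetical helper, not a real one, and all units are bytes:

#include <assert.h>
#include <stdbool.h>

/*
 * Hypothetical predicate mirroring the three read-avoidance cases in
 * bch2_write_begin(); all names here are illustrative.
 */
static bool write_needs_readpage(unsigned offset, unsigned len,
				 unsigned folio_size,
				 long long pos, long long folio_pos,
				 long long i_size)
{
	if (!offset && len == folio_size)
		return false;	/* write covers the whole folio */
	if (!offset && pos + len >= i_size)
		return false;	/* tail of folio is past EOF: just zero it */
	if (folio_pos >= i_size)
		return false;	/* whole folio past EOF: just zero it */
	return true;		/* partial interior write: read first */
}

int main(void)
{
	/* whole-folio overwrite: no read */
	assert(!write_needs_readpage(0, 4096, 4096, 0, 0, 1 << 20));
	/* interior partial write: folio must be read in */
	assert(write_needs_readpage(512, 1024, 4096, 4608, 4096, 1 << 20));
	return 0;
}
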
-
-int bch2_write_end(struct file *file, struct address_space *mapping,
-                  loff_t pos, unsigned len, unsigned copied,
-                  struct page *page, void *fsdata)
-{
-       struct bch_inode_info *inode = to_bch_ei(mapping->host);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation *res = fsdata;
-       struct folio *folio = page_folio(page);
-       unsigned offset = pos - folio_pos(folio);
-
-       lockdep_assert_held(&inode->v.i_rwsem);
-       BUG_ON(offset + copied > folio_size(folio));
-
-       if (unlikely(copied < len && !folio_test_uptodate(folio))) {
-               /*
-                * The folio needs to be read in, but that would destroy
-                * our partial write - simplest thing is to just force
-                * userspace to redo the write:
-                */
-               folio_zero_range(folio, 0, folio_size(folio));
-               flush_dcache_folio(folio);
-               copied = 0;
-       }
-
-       spin_lock(&inode->v.i_lock);
-       if (pos + copied > inode->v.i_size)
-               i_size_write(&inode->v, pos + copied);
-       spin_unlock(&inode->v.i_lock);
-
-       if (copied) {
-               if (!folio_test_uptodate(folio))
-                       folio_mark_uptodate(folio);
-
-               bch2_set_folio_dirty(c, inode, folio, res, offset, copied);
-
-               inode->ei_last_dirtied = (unsigned long) current;
-       }
-
-       folio_unlock(folio);
-       folio_put(folio);
-       bch2_pagecache_add_put(inode);
-
-       bch2_folio_reservation_put(c, inode, res);
-       kfree(res);
-
-       return copied;
-}
-
-static noinline void folios_trunc(folios *folios, struct folio **fi)
-{
-       while (folios->data + folios->nr > fi) {
-               struct folio *f = darray_pop(folios);
-
-               folio_unlock(f);
-               folio_put(f);
-       }
-}
-
-static int __bch2_buffered_write(struct bch_inode_info *inode,
-                                struct address_space *mapping,
-                                struct iov_iter *iter,
-                                loff_t pos, unsigned len)
-{
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch2_folio_reservation res;
-       folios folios;
-       struct folio **fi, *f;
-       unsigned copied = 0, f_offset;
-       u64 end = pos + len, f_pos;
-       loff_t last_folio_pos = inode->v.i_size;
-       int ret = 0;
-
-       BUG_ON(!len);
-
-       bch2_folio_reservation_init(c, inode, &res);
-       darray_init(&folios);
-
-       ret = filemap_get_contig_folios_d(mapping, pos, end,
-                                  FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT,
-                                  mapping_gfp_mask(mapping),
-                                  &folios);
-       if (ret)
-               goto out;
-
-       BUG_ON(!folios.nr);
-
-       f = darray_first(folios);
-       if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
-               ret = bch2_read_single_folio(f, mapping);
-               if (ret)
-                       goto out;
-       }
-
-       f = darray_last(folios);
-       end = min(end, folio_end_pos(f));
-       last_folio_pos = folio_pos(f);
-       if (end != folio_end_pos(f) && !folio_test_uptodate(f)) {
-               if (end >= inode->v.i_size) {
-                       folio_zero_range(f, 0, folio_size(f));
-               } else {
-                       ret = bch2_read_single_folio(f, mapping);
-                       if (ret)
-                               goto out;
-               }
-       }
-
-       ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr);
-       if (ret)
-               goto out;
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-
-               /*
-                * XXX: per POSIX and fstests generic/275, on -ENOSPC we're
-                * supposed to write as much as we have disk space for.
-                *
-                * On failure here we should still write out a partial page if
-                * we aren't completely out of disk space - we don't do that
-                * yet:
-                */
-               ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len);
-               if (unlikely(ret)) {
-                       folios_trunc(&folios, fi);
-                       if (!folios.nr)
-                               goto out;
-
-                       end = min(end, folio_end_pos(darray_last(folios)));
-                       break;
-               }
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       if (mapping_writably_mapped(mapping))
-               darray_for_each(folios, fi)
-                       flush_dcache_folio(*fi);
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-               unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
-
-               if (!f_copied) {
-                       folios_trunc(&folios, fi);
-                       break;
-               }
-
-               if (!folio_test_uptodate(f) &&
-                   f_copied != folio_size(f) &&
-                   pos + copied + f_copied < inode->v.i_size) {
-                       folio_zero_range(f, 0, folio_size(f));
-                       folios_trunc(&folios, fi);
-                       break;
-               }
-
-               flush_dcache_folio(f);
-               copied += f_copied;
-
-               if (f_copied != f_len) {
-                       folios_trunc(&folios, fi + 1);
-                       break;
-               }
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       if (!copied)
-               goto out;
-
-       end = pos + copied;
-
-       spin_lock(&inode->v.i_lock);
-       if (end > inode->v.i_size)
-               i_size_write(&inode->v, end);
-       spin_unlock(&inode->v.i_lock);
-
-       f_pos = pos;
-       f_offset = pos - folio_pos(darray_first(folios));
-       darray_for_each(folios, fi) {
-               struct folio *f = *fi;
-               u64 f_len = min(end, folio_end_pos(f)) - f_pos;
-
-               if (!folio_test_uptodate(f))
-                       folio_mark_uptodate(f);
-
-               bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len);
-
-               f_pos = folio_end_pos(f);
-               f_offset = 0;
-       }
-
-       inode->ei_last_dirtied = (unsigned long) current;
-out:
-       darray_for_each(folios, fi) {
-               folio_unlock(*fi);
-               folio_put(*fi);
-       }
-
-       /*
-        * If the last folio added to the mapping starts beyond current EOF, we
-        * performed a short write but left around at least one post-EOF folio.
-        * Clean up the mapping before we return.
-        */
-       if (last_folio_pos >= inode->v.i_size)
-               truncate_pagecache(&inode->v, inode->v.i_size);
-
-       darray_exit(&folios);
-       bch2_folio_reservation_put(c, inode, &res);
-
-       return copied ?: ret;
-}
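
__bch2_buffered_write() above is structured as passes over a single folio batch: reserve space, copy from the iterator, then mark dirty, truncating the batch (folios_trunc()) whenever a pass comes up short so later passes only touch folios that fully succeeded. A toy userspace model of that truncate-on-failure shape, where a failing reserve() stands in for bch2_folio_reservation_get() returning -ENOSPC:

/*
 * Userspace model of the reserve/copy/dirty passes: each pass walks
 * the same batch, and a failure truncates the batch so later passes
 * only see fully-reserved entries. Plain ints stand in for folios.
 */
#include <stdio.h>

#define BATCH 5

static int reserve(int i) { return i < 3 ? 0 : -1; }	/* fails at 3 */

int main(void)
{
	int nr = BATCH;

	for (int i = 0; i < nr; i++)
		if (reserve(i)) {
			nr = i;			/* folios_trunc() analog */
			break;
		}

	for (int i = 0; i < nr; i++)		/* copy pass */
		printf("copy folio %d\n", i);
	for (int i = 0; i < nr; i++)		/* dirty pass */
		printf("dirty folio %d\n", i);
	return 0;
}
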
-
-static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       loff_t pos = iocb->ki_pos;
-       ssize_t written = 0;
-       int ret = 0;
-
-       bch2_pagecache_add_get(inode);
-
-       do {
-               unsigned offset = pos & (PAGE_SIZE - 1);
-               unsigned bytes = iov_iter_count(iter);
-again:
-               /*
-                * Bring in the user page that we will copy from _first_.
-                * Otherwise there's a nasty deadlock on copying from the
-                * same page as we're writing to, without it being marked
-                * up-to-date.
-                *
-                * Not only is this an optimisation, but it is also required
-                * to check that the address is actually valid when atomic
-                * usercopies are used below.
-                */
-               if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
-                       bytes = min_t(unsigned long, iov_iter_count(iter),
-                                     PAGE_SIZE - offset);
-
-                       if (unlikely(fault_in_iov_iter_readable(iter, bytes))) {
-                               ret = -EFAULT;
-                               break;
-                       }
-               }
-
-               if (unlikely(fatal_signal_pending(current))) {
-                       ret = -EINTR;
-                       break;
-               }
-
-               ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
-               if (unlikely(ret < 0))
-                       break;
-
-               cond_resched();
-
-               if (unlikely(ret == 0)) {
-                       /*
-                        * If we were unable to copy any data at all, we must
-                        * fall back to a single segment length write.
-                        *
-                        * If we didn't fall back here, we could livelock
-                        * because not all segments in the iov can be copied at
-                        * once without a pagefault.
-                        */
-                       bytes = min_t(unsigned long, PAGE_SIZE - offset,
-                                     iov_iter_single_seg_count(iter));
-                       goto again;
-               }
-               pos += ret;
-               written += ret;
-               ret = 0;
-
-               balance_dirty_pages_ratelimited(mapping);
-       } while (iov_iter_count(iter));
-
-       bch2_pagecache_add_put(inode);
-
-       return written ? written : ret;
-}
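
The again: path above (together with bch2_write_end() reporting zero bytes copied when a short copy hits a non-uptodate folio) guards against a livelock: if the atomic usercopy makes no progress across a multi-segment iov, the write is retried with at most one segment so every pass is guaranteed to advance. A toy model of that fallback, where copy_step() is a stand-in that faults once on multi-segment attempts:

#include <stdio.h>

static int first = 1;

/*
 * Pretend copy: multi-segment attempts fault once, single-segment
 * attempts always succeed (purely illustrative).
 */
static int copy_step(int bytes, int single_seg)
{
	if (!single_seg && first) {
		first = 0;
		return 0;
	}
	return bytes;
}

int main(void)
{
	int remaining = 8192, written = 0;

	while (remaining) {
		int bytes = remaining, ret = copy_step(bytes, 0);

		if (ret == 0)	/* zero progress: retry one segment */
			ret = copy_step(bytes > 4096 ? 4096 : bytes, 1);

		written += ret;
		remaining -= ret;
	}
	printf("wrote %d bytes\n", written);
	return 0;
}
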
-
-/* O_DIRECT reads */
-
-static void bio_check_or_release(struct bio *bio, bool check_dirty)
-{
-       if (check_dirty) {
-               bio_check_pages_dirty(bio);
-       } else {
-               bio_release_pages(bio, false);
-               bio_put(bio);
-       }
-}
-
-static void bch2_dio_read_complete(struct closure *cl)
-{
-       struct dio_read *dio = container_of(cl, struct dio_read, cl);
-
-       dio->req->ki_complete(dio->req, dio->ret);
-       bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-}
-
-static void bch2_direct_IO_read_endio(struct bio *bio)
-{
-       struct dio_read *dio = bio->bi_private;
-
-       if (bio->bi_status)
-               dio->ret = blk_status_to_errno(bio->bi_status);
-
-       closure_put(&dio->cl);
-}
-
-static void bch2_direct_IO_read_split_endio(struct bio *bio)
-{
-       struct dio_read *dio = bio->bi_private;
-       bool should_dirty = dio->should_dirty;
-
-       bch2_direct_IO_read_endio(bio);
-       bio_check_or_release(bio, should_dirty);
-}
-
-static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
-{
-       struct file *file = req->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct bch_io_opts opts;
-       struct dio_read *dio;
-       struct bio *bio;
-       loff_t offset = req->ki_pos;
-       bool sync = is_sync_kiocb(req);
-       size_t shorten;
-       ssize_t ret;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       if ((offset|iter->count) & (block_bytes(c) - 1))
-               return -EINVAL;
-
-       ret = min_t(loff_t, iter->count,
-                   max_t(loff_t, 0, i_size_read(&inode->v) - offset));
-
-       if (!ret)
-               return ret;
-
-       shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
-       iter->count -= shorten;
-
-       bio = bio_alloc_bioset(NULL,
-                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                              REQ_OP_READ,
-                              GFP_KERNEL,
-                              &c->dio_read_bioset);
-
-       bio->bi_end_io = bch2_direct_IO_read_endio;
-
-       dio = container_of(bio, struct dio_read, rbio.bio);
-       closure_init(&dio->cl, NULL);
-
-       /*
-        * this is a _really_ horrible hack just to avoid an atomic sub at the
-        * end:
-        */
-       if (!sync) {
-               set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
-               atomic_set(&dio->cl.remaining,
-                          CLOSURE_REMAINING_INITIALIZER -
-                          CLOSURE_RUNNING +
-                          CLOSURE_DESTRUCTOR);
-       } else {
-               atomic_set(&dio->cl.remaining,
-                          CLOSURE_REMAINING_INITIALIZER + 1);
-       }
-
-       dio->req        = req;
-       dio->ret        = ret;
-       /*
-        * This is one of the sketchier things I've encountered: we have to skip
-        * the dirtying of requests that are internal to the kernel (i.e. from
-        * loopback), because we'll deadlock on page_lock.
-        */
-       dio->should_dirty = iter_is_iovec(iter);
-
-       goto start;
-       while (iter->count) {
-               bio = bio_alloc_bioset(NULL,
-                                      bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                                      REQ_OP_READ,
-                                      GFP_KERNEL,
-                                      &c->bio_read);
-               bio->bi_end_io          = bch2_direct_IO_read_split_endio;
-start:
-               bio->bi_opf             = REQ_OP_READ|REQ_SYNC;
-               bio->bi_iter.bi_sector  = offset >> 9;
-               bio->bi_private         = dio;
-
-               ret = bio_iov_iter_get_pages(bio, iter);
-               if (ret < 0) {
-                       /* XXX: fault inject this path */
-                       bio->bi_status = BLK_STS_RESOURCE;
-                       bio_endio(bio);
-                       break;
-               }
-
-               offset += bio->bi_iter.bi_size;
-
-               if (dio->should_dirty)
-                       bio_set_pages_dirty(bio);
-
-               if (iter->count)
-                       closure_get(&dio->cl);
-
-               bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
-       }
-
-       iter->count += shorten;
-
-       if (sync) {
-               closure_sync(&dio->cl);
-               closure_debug_destroy(&dio->cl);
-               ret = dio->ret;
-               bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
-               return ret;
-       } else {
-               return -EIOCBQUEUED;
-       }
-}
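
The closure arithmetic above keeps the dio alive until the last bio completes: each split bio holds a reference dropped in its endio path, and the sync case pre-loads one extra reference that the submitter drops itself after closure_sync(). A toy model with a plain atomic counter in place of a closure; all names are illustrative:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int remaining;

static void endio(void)			/* per-bio completion */
{
	if (atomic_fetch_sub(&remaining, 1) == 1)
		printf("last ref dropped in endio (async path)\n");
}

int main(void)
{
	atomic_store(&remaining, 1 + 3);	/* +1 submitter ref, 3 bios */
	for (int i = 0; i < 3; i++)
		endio();
	if (atomic_fetch_sub(&remaining, 1) == 1)
		printf("submitter drops the final ref (sync path)\n");
	return 0;
}
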
-
-ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
-{
-       struct file *file = iocb->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct address_space *mapping = file->f_mapping;
-       size_t count = iov_iter_count(iter);
-       ssize_t ret;
-
-       if (!count)
-               return 0; /* skip atime */
-
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               struct blk_plug plug;
-
-               if (unlikely(mapping->nrpages)) {
-                       ret = filemap_write_and_wait_range(mapping,
-                                               iocb->ki_pos,
-                                               iocb->ki_pos + count - 1);
-                       if (ret < 0)
-                               goto out;
-               }
-
-               file_accessed(file);
-
-               blk_start_plug(&plug);
-               ret = bch2_direct_IO_read(iocb, iter);
-               blk_finish_plug(&plug);
-
-               if (ret >= 0)
-                       iocb->ki_pos += ret;
-       } else {
-               bch2_pagecache_add_get(inode);
-               ret = generic_file_read_iter(iocb, iter);
-               bch2_pagecache_add_put(inode);
-       }
-out:
-       return bch2_err_class(ret);
-}
-
-/* O_DIRECT writes */
-
-static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
-                                      u64 offset, u64 size,
-                                      unsigned nr_replicas, bool compressed)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 end = offset + size;
-       u32 snapshot;
-       bool ret = true;
-       int err;
-
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-
-       err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
-       if (err)
-               goto err;
-
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
-                          SPOS(inum.inum, offset, snapshot),
-                          BTREE_ITER_SLOTS, k, err) {
-               if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end)))
-                       break;
-
-               if (k.k->p.snapshot != snapshot ||
-                   nr_replicas > bch2_bkey_replicas(c, k) ||
-                   (!compressed && bch2_bkey_sectors_compressed(k))) {
-                       ret = false;
-                       break;
-               }
-       }
-
-       offset = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
-err:
-       if (bch2_err_matches(err, BCH_ERR_transaction_restart))
-               goto retry;
-       bch2_trans_exit(&trans);
-
-       return err ? false : ret;
-}
-
-static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct bch_inode_info *inode = dio->inode;
-       struct bio *bio = &dio->op.wbio.bio;
-
-       return bch2_check_range_allocated(c, inode_inum(inode),
-                               dio->op.pos.offset, bio_sectors(bio),
-                               dio->op.opts.data_replicas,
-                               dio->op.opts.compression != 0);
-}
-
-static void bch2_dio_write_loop_async(struct bch_write_op *);
-static __always_inline long bch2_dio_write_done(struct dio_write *dio);
-
-/*
- * We're going to return -EIOCBQUEUED, but we haven't finished consuming the
- * iov_iter yet, so we need to stash a copy of the iovec: it might be on the
- * caller's stack, and we're not guaranteed that it will live for the duration of
- * the IO:
- */
-static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
-{
-       struct iovec *iov = dio->inline_vecs;
-
-       /*
-        * iov_iter has a single embedded iovec - nothing to do:
-        */
-       if (iter_is_ubuf(&dio->iter))
-               return 0;
-
-       /*
-        * We don't currently handle non-iovec iov_iters here - return an error,
-        * and we'll fall back to doing the IO synchronously:
-        */
-       if (!iter_is_iovec(&dio->iter))
-               return -1;
-
-       if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
-               iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov),
-                                   GFP_KERNEL);
-               if (unlikely(!iov))
-                       return -ENOMEM;
-
-               dio->free_iov = true;
-       }
-
-       memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov));
-       dio->iter.__iov = iov;
-       return 0;
-}
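
The copy above exists because returning -EIOCBQUEUED hands the iov_iter to an async worker while the iovec array may live on the submitter's stack. A userspace sketch of the same stash-before-queueing move, using an inline array for small writes and a heap copy otherwise; struct async_write and stash_iov() are hypothetical:

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

struct async_write {
	struct iovec	*iov;
	int		nr_segs;
	struct iovec	inline_vecs[2];
	int		free_iov;
};

static int stash_iov(struct async_write *w, const struct iovec *iov, int nr)
{
	struct iovec *copy = w->inline_vecs;

	if (nr > 2) {
		copy = calloc(nr, sizeof(*copy));
		if (!copy)
			return -1;
		w->free_iov = 1;	/* completion path must free */
	}
	memcpy(copy, iov, nr * sizeof(*copy));
	w->iov = copy;
	w->nr_segs = nr;
	return 0;
}

int main(void)
{
	char buf[8];
	struct iovec stack_iov = { buf, sizeof(buf) };	/* caller's stack */
	struct async_write w = { 0 };

	return stash_iov(&w, &stack_iov, 1);
}

As in the dio path, whoever completes the request frees the copy when free_iov is set, mirroring dio->free_iov above.
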
-
-static void bch2_dio_write_flush_done(struct closure *cl)
-{
-       struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
-       struct bch_fs *c = dio->op.c;
-
-       closure_debug_destroy(cl);
-
-       dio->op.error = bch2_journal_error(&c->journal);
-
-       bch2_dio_write_done(dio);
-}
-
-static noinline void bch2_dio_write_flush(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct bch_inode_unpacked inode;
-       int ret;
-
-       dio->flush = 0;
-
-       closure_init(&dio->op.cl, NULL);
-
-       if (!dio->op.error) {
-               ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode);
-               if (ret) {
-                       dio->op.error = ret;
-               } else {
-                       bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl);
-                       bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl);
-               }
-       }
-
-       if (dio->sync) {
-               closure_sync(&dio->op.cl);
-               closure_debug_destroy(&dio->op.cl);
-       } else {
-               continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL);
-       }
-}
-
-static __always_inline long bch2_dio_write_done(struct dio_write *dio)
-{
-       struct kiocb *req = dio->req;
-       struct bch_inode_info *inode = dio->inode;
-       bool sync = dio->sync;
-       long ret;
-
-       if (unlikely(dio->flush)) {
-               bch2_dio_write_flush(dio);
-               if (!sync)
-                       return -EIOCBQUEUED;
-       }
-
-       bch2_pagecache_block_put(inode);
-
-       if (dio->free_iov)
-               kfree(dio->iter.__iov);
-
-       ret = dio->op.error ?: ((long) dio->written << 9);
-       bio_put(&dio->op.wbio.bio);
-
-       /* inode->i_dio_count is our ref on inode and thus bch_fs */
-       inode_dio_end(&inode->v);
-
-       if (ret < 0)
-               ret = bch2_err_class(ret);
-
-       if (!sync) {
-               req->ki_complete(req, ret);
-               ret = -EIOCBQUEUED;
-       }
-       return ret;
-}
-
-static __always_inline void bch2_dio_write_end(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct kiocb *req = dio->req;
-       struct bch_inode_info *inode = dio->inode;
-       struct bio *bio = &dio->op.wbio.bio;
-
-       req->ki_pos     += (u64) dio->op.written << 9;
-       dio->written    += dio->op.written;
-
-       if (dio->extending) {
-               spin_lock(&inode->v.i_lock);
-               if (req->ki_pos > inode->v.i_size)
-                       i_size_write(&inode->v, req->ki_pos);
-               spin_unlock(&inode->v.i_lock);
-       }
-
-       if (dio->op.i_sectors_delta || dio->quota_res.sectors) {
-               mutex_lock(&inode->ei_quota_lock);
-               __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta);
-               __bch2_quota_reservation_put(c, inode, &dio->quota_res);
-               mutex_unlock(&inode->ei_quota_lock);
-       }
-
-       bio_release_pages(bio, false);
-
-       if (unlikely(dio->op.error))
-               set_bit(EI_INODE_ERROR, &inode->ei_flags);
-}
-
-static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
-{
-       struct bch_fs *c = dio->op.c;
-       struct kiocb *req = dio->req;
-       struct address_space *mapping = dio->mapping;
-       struct bch_inode_info *inode = dio->inode;
-       struct bch_io_opts opts;
-       struct bio *bio = &dio->op.wbio.bio;
-       unsigned unaligned, iter_count;
-       bool sync = dio->sync, dropped_locks;
-       long ret;
-
-       bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-
-       while (1) {
-               iter_count = dio->iter.count;
-
-               EBUG_ON(current->faults_disabled_mapping);
-               current->faults_disabled_mapping = mapping;
-
-               ret = bio_iov_iter_get_pages(bio, &dio->iter);
-
-               dropped_locks = fdm_dropped_locks();
-
-               current->faults_disabled_mapping = NULL;
-
-               /*
-                * If the fault handler returned an error but also signalled
-                * that it dropped & retook ei_pagecache_lock, we just need to
-                * re-shoot down the page cache and retry:
-                */
-               if (dropped_locks && ret)
-                       ret = 0;
-
-               if (unlikely(ret < 0))
-                       goto err;
-
-               if (unlikely(dropped_locks)) {
-                       ret = write_invalidate_inode_pages_range(mapping,
-                                       req->ki_pos,
-                                       req->ki_pos + iter_count - 1);
-                       if (unlikely(ret))
-                               goto err;
-
-                       if (!bio->bi_iter.bi_size)
-                               continue;
-               }
-
-               unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
-               bio->bi_iter.bi_size -= unaligned;
-               iov_iter_revert(&dio->iter, unaligned);
-
-               if (!bio->bi_iter.bi_size) {
-                       /*
-                        * bio_iov_iter_get_pages was only able to get <
-                        * blocksize worth of pages:
-                        */
-                       ret = -EFAULT;
-                       goto err;
-               }
-
-               bch2_write_op_init(&dio->op, c, opts);
-               dio->op.end_io          = sync
-                       ? NULL
-                       : bch2_dio_write_loop_async;
-               dio->op.target          = dio->op.opts.foreground_target;
-               dio->op.write_point     = writepoint_hashed((unsigned long) current);
-               dio->op.nr_replicas     = dio->op.opts.data_replicas;
-               dio->op.subvol          = inode->ei_subvol;
-               dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
-               dio->op.devs_need_flush = &inode->ei_devs_need_flush;
-
-               if (sync)
-                       dio->op.flags |= BCH_WRITE_SYNC;
-               dio->op.flags |= BCH_WRITE_CHECK_ENOSPC;
-
-               ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
-                                                bio_sectors(bio), true);
-               if (unlikely(ret))
-                       goto err;
-
-               ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
-                                               dio->op.opts.data_replicas, 0);
-               if (unlikely(ret) &&
-                   !bch2_dio_write_check_allocated(dio))
-                       goto err;
-
-               task_io_account_write(bio->bi_iter.bi_size);
-
-               if (unlikely(dio->iter.count) &&
-                   !dio->sync &&
-                   !dio->loop &&
-                   bch2_dio_write_copy_iov(dio))
-                       dio->sync = sync = true;
-
-               dio->loop = true;
-               closure_call(&dio->op.cl, bch2_write, NULL, NULL);
-
-               if (!sync)
-                       return -EIOCBQUEUED;
-
-               bch2_dio_write_end(dio);
-
-               if (likely(!dio->iter.count) || dio->op.error)
-                       break;
-
-               bio_reset(bio, NULL, REQ_OP_WRITE);
+               bio->bio.bi_end_io      = nocow_flush_endio;
+               closure_bio_submit(&bio->bio, cl);
        }
-out:
-       return bch2_dio_write_done(dio);
-err:
-       dio->op.error = ret;
-
-       bio_release_pages(bio, false);
-
-       bch2_quota_reservation_put(c, inode, &dio->quota_res);
-       goto out;
 }
 
-static noinline __cold void bch2_dio_write_continue(struct dio_write *dio)
+static int bch2_inode_flush_nocow_writes(struct bch_fs *c,
+                                        struct bch_inode_info *inode)
 {
-       struct mm_struct *mm = dio->mm;
+       struct closure cl;
 
-       bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE);
+       closure_init_stack(&cl);
+       bch2_inode_flush_nocow_writes_async(c, inode, &cl);
+       closure_sync(&cl);
 
-       if (mm)
-               kthread_use_mm(mm);
-       bch2_dio_write_loop(dio);
-       if (mm)
-               kthread_unuse_mm(mm);
+       return 0;
 }
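
The new bch2_inode_flush_nocow_writes() is the usual sync-over-async wrapper: initialize a stack-based completion, point the async variant at it, and wait. The same shape with a POSIX semaphore standing in for a bcachefs closure; async_op() here is a stub that completes inline:

#include <semaphore.h>

struct waiter { sem_t done; };

static void op_complete(void *p)	/* runs in async context */
{
	sem_post(&((struct waiter *)p)->done);
}

/* stub: a real implementation would queue work and return */
static void async_op(void (*cb)(void *), void *p) { cb(p); }

static int do_op_sync(void)
{
	struct waiter w;

	sem_init(&w.done, 0, 0);
	async_op(op_complete, &w);	/* kick off the operation */
	sem_wait(&w.done);		/* closure_sync() analog */
	sem_destroy(&w.done);
	return 0;
}

int main(void)
{
	return do_op_sync();
}
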
 
-static void bch2_dio_write_loop_async(struct bch_write_op *op)
-{
-       struct dio_write *dio = container_of(op, struct dio_write, op);
-
-       bch2_dio_write_end(dio);
+/* i_size updates: */
 
-       if (likely(!dio->iter.count) || dio->op.error)
-               bch2_dio_write_done(dio);
-       else
-               bch2_dio_write_continue(dio);
-}
+struct inode_new_size {
+       loff_t          new_size;
+       u64             now;
+       unsigned        fields;
+};
 
-static noinline
-ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
+static int inode_set_size(struct btree_trans *trans,
+                         struct bch_inode_info *inode,
+                         struct bch_inode_unpacked *bi,
+                         void *p)
 {
-       struct file *file = req->ki_filp;
-       struct address_space *mapping = file->f_mapping;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct dio_write *dio;
-       struct bio *bio;
-       bool locked = true, extending;
-       ssize_t ret;
-
-       prefetch(&c->opts);
-       prefetch((void *) &c->opts + 64);
-       prefetch(&inode->ei_inode);
-       prefetch((void *) &inode->ei_inode + 64);
-
-       inode_lock(&inode->v);
-
-       ret = generic_write_checks(req, iter);
-       if (unlikely(ret <= 0))
-               goto err;
-
-       ret = file_remove_privs(file);
-       if (unlikely(ret))
-               goto err;
-
-       ret = file_update_time(file);
-       if (unlikely(ret))
-               goto err;
-
-       if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
-               goto err;
-
-       inode_dio_begin(&inode->v);
-       bch2_pagecache_block_get(inode);
-
-       extending = req->ki_pos + iter->count > inode->v.i_size;
-       if (!extending) {
-               inode_unlock(&inode->v);
-               locked = false;
-       }
+       struct inode_new_size *s = p;
 
-       bio = bio_alloc_bioset(NULL,
-                              bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-                              REQ_OP_WRITE,
-                              GFP_KERNEL,
-                              &c->dio_write_bioset);
-       dio = container_of(bio, struct dio_write, op.wbio.bio);
-       dio->req                = req;
-       dio->mapping            = mapping;
-       dio->inode              = inode;
-       dio->mm                 = current->mm;
-       dio->loop               = false;
-       dio->extending          = extending;
-       dio->sync               = is_sync_kiocb(req) || extending;
-       dio->flush              = iocb_is_dsync(req) && !c->opts.journal_flush_disabled;
-       dio->free_iov           = false;
-       dio->quota_res.sectors  = 0;
-       dio->written            = 0;
-       dio->iter               = *iter;
-       dio->op.c               = c;
-
-       if (unlikely(mapping->nrpages)) {
-               ret = write_invalidate_inode_pages_range(mapping,
-                                               req->ki_pos,
-                                               req->ki_pos + iter->count - 1);
-               if (unlikely(ret))
-                       goto err_put_bio;
-       }
+       bi->bi_size = s->new_size;
+       if (s->fields & ATTR_ATIME)
+               bi->bi_atime = s->now;
+       if (s->fields & ATTR_MTIME)
+               bi->bi_mtime = s->now;
+       if (s->fields & ATTR_CTIME)
+               bi->bi_ctime = s->now;
 
-       ret = bch2_dio_write_loop(dio);
-err:
-       if (locked)
-               inode_unlock(&inode->v);
-       return ret;
-err_put_bio:
-       bch2_pagecache_block_put(inode);
-       bio_put(bio);
-       inode_dio_end(&inode->v);
-       goto err;
+       return 0;
 }
 
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+int __must_check bch2_write_inode_size(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      loff_t new_size, unsigned fields)
 {
-       struct file *file = iocb->ki_filp;
-       struct bch_inode_info *inode = file_bch_inode(file);
-       ssize_t ret;
-
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               ret = bch2_direct_write(iocb, from);
-               goto out;
-       }
-
-       /* We can write back this queue in page reclaim */
-       current->backing_dev_info = inode_to_bdi(&inode->v);
-       inode_lock(&inode->v);
-
-       ret = generic_write_checks(iocb, from);
-       if (ret <= 0)
-               goto unlock;
+       struct inode_new_size s = {
+               .new_size       = new_size,
+               .now            = bch2_current_time(c),
+               .fields         = fields,
+       };
 
-       ret = file_remove_privs(file);
-       if (ret)
-               goto unlock;
+       return bch2_write_inode(c, inode, inode_set_size, &s, fields);
+}
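
bch2_write_inode_size() shows the update-via-callback idiom used throughout this file: pack the new values into a context struct and let bch2_write_inode() apply the setter against the authoritative inode, now inside a btree transaction (note the new trans argument these callbacks grow elsewhere in this diff). A reduced userspace sketch of the idiom; every name in it is illustrative:

#include <stdio.h>

struct inode_fields { long long size, mtime; };

typedef int (*inode_set_fn)(struct inode_fields *, void *);

static int write_inode(struct inode_fields *inode, inode_set_fn set, void *p)
{
	/* the real code applies this under a transaction, with retries */
	return set(inode, p);
}

struct new_size { long long size, now; };

static int set_size(struct inode_fields *bi, void *p)
{
	struct new_size *s = p;

	bi->size  = s->size;
	bi->mtime = s->now;
	return 0;
}

int main(void)
{
	struct inode_fields inode = { 0, 0 };
	struct new_size s = { 4096, 1700000000 };

	write_inode(&inode, set_size, &s);
	printf("size %lld mtime %lld\n", inode.size, inode.mtime);
	return 0;
}
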
 
-       ret = file_update_time(file);
-       if (ret)
-               goto unlock;
+void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+                          struct quota_res *quota_res, s64 sectors)
+{
+       bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c,
+                               "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)",
+                               inode->v.i_ino, (u64) inode->v.i_blocks, sectors,
+                               inode->ei_inode.bi_sectors);
+       inode->v.i_blocks += sectors;
 
-       ret = bch2_buffered_write(iocb, from);
-       if (likely(ret > 0))
-               iocb->ki_pos += ret;
-unlock:
-       inode_unlock(&inode->v);
-       current->backing_dev_info = NULL;
+#ifdef CONFIG_BCACHEFS_QUOTA
+       if (quota_res &&
+           !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) &&
+           sectors > 0) {
+               BUG_ON(sectors > quota_res->sectors);
+               BUG_ON(sectors > inode->ei_quota_reserved);
 
-       if (ret > 0)
-               ret = generic_write_sync(iocb, ret);
-out:
-       return bch2_err_class(ret);
+               quota_res->sectors -= sectors;
+               inode->ei_quota_reserved -= sectors;
+       } else {
+               bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
+       }
+#endif
 }
 
 /* fsync: */
@@ -2814,31 +207,29 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol,
                                 struct bpos start,
                                 struct bpos end)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, subvol, &start.snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents, start, end, 0, k, ret)
                if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) {
                        ret = 1;
                        break;
                }
        start = iter.pos;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
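
range_has_data() also shows this diff's btree_trans API change: bch2_trans_init()/bch2_trans_exit() on a stack struct become bch2_trans_get()/bch2_trans_put() on a pointer, while the retry:/goto shape for BCH_ERR_transaction_restart stays the same. A toy of that restart loop, where do_lookup() fakes two restarts before succeeding:

#include <stdio.h>

/*
 * Toy transaction-restart loop: a contended operation occasionally
 * fails with RESTART and must be retried from the top with fresh
 * state, the same retry:/goto shape used throughout this file.
 */
enum { OK = 0, RESTART = -1 };

static int attempts;
static int do_lookup(void) { return ++attempts < 3 ? RESTART : OK; }

int main(void)
{
	int ret;
retry:
	ret = do_lookup();
	if (ret == RESTART)
		goto retry;
	printf("succeeded after %d attempts\n", attempts);
	return 0;
}
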
 
@@ -2848,8 +239,8 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_folio *s;
-       unsigned start_offset = start & (PAGE_SIZE - 1);
-       unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
+       unsigned start_offset;
+       unsigned end_offset;
        unsigned i;
        struct folio *folio;
        s64 i_sectors_delta = 0;
@@ -2870,7 +261,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
 
                folio = __filemap_get_folio(mapping, index,
                                            FGP_LOCK|FGP_CREAT, GFP_KERNEL);
-               if (unlikely(IS_ERR_OR_NULL(folio))) {
+               if (IS_ERR_OR_NULL(folio)) {
                        ret = -ENOMEM;
                        goto out;
                }
@@ -2911,10 +302,10 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode,
                s->s[i].nr_replicas     = 0;
 
                i_sectors_delta -= s->s[i].state == SECTOR_dirty;
-               folio_sector_set(folio, s, i, SECTOR_unallocated);
+               bch2_folio_sector_set(folio, s, i, SECTOR_unallocated);
        }
 
-       i_sectors_acct(c, inode, NULL, i_sectors_delta);
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
        /*
         * Caller needs to know whether this folio will be written out by
@@ -2998,31 +389,12 @@ static int bch2_extend(struct mnt_idmap *idmap,
        return bch2_setattr_nonsize(idmap, inode, iattr);
 }
 
-static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
-                                  struct bch_inode_unpacked *bi,
-                                  void *p)
-{
-       bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
-       return 0;
-}
-
-static int bch2_truncate_start_fn(struct bch_inode_info *inode,
-                                 struct bch_inode_unpacked *bi, void *p)
-{
-       u64 *new_i_size = p;
-
-       bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
-       bi->bi_size = *new_i_size;
-       return 0;
-}
-
-int bch2_truncate(struct mnt_idmap *idmap,
+int bchfs_truncate(struct mnt_idmap *idmap,
                  struct bch_inode_info *inode, struct iattr *iattr)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
        struct bch_inode_unpacked inode_u;
-       u64 new_i_size = iattr->ia_size;
        s64 i_sectors_delta = 0;
        int ret = 0;
 
@@ -3071,6 +443,8 @@ int bch2_truncate(struct mnt_idmap *idmap,
        if (unlikely(ret < 0))
                goto err;
 
+       truncate_setsize(&inode->v, iattr->ia_size);
+
        /*
         * When extending, we're going to write the new i_size to disk
         * immediately so we need to flush anything above the current on disk
@@ -3092,32 +466,22 @@ int bch2_truncate(struct mnt_idmap *idmap,
        if (ret)
                goto err;
 
-       mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
-                              &new_i_size, 0);
-       mutex_unlock(&inode->ei_update_lock);
+       ret = bch2_truncate(c, inode_inum(inode), iattr->ia_size, &i_sectors_delta);
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
-       if (unlikely(ret))
+       if (unlikely(ret)) {
+               /*
+                * If we error here, VFS caches are now inconsistent with the btree
+                */
+               set_bit(EI_INODE_ERROR, &inode->ei_flags);
                goto err;
-
-       truncate_setsize(&inode->v, iattr->ia_size);
-
-       ret = bch2_fpunch(c, inode_inum(inode),
-                       round_up(iattr->ia_size, block_bytes(c)) >> 9,
-                       U64_MAX, &i_sectors_delta);
-       i_sectors_acct(c, inode, NULL, i_sectors_delta);
+       }
 
        bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks &&
                                !bch2_journal_error(&c->journal), c,
                                "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)",
                                inode->v.i_ino, (u64) inode->v.i_blocks,
                                inode->ei_inode.bi_sectors);
-       if (unlikely(ret))
-               goto err;
-
-       mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0);
-       mutex_unlock(&inode->ei_update_lock);
 
        ret = bch2_setattr_nonsize(idmap, inode, iattr);
 err:
@@ -3127,7 +491,8 @@ err:
 
 /* fallocate: */
 
-static int inode_update_times_fn(struct bch_inode_info *inode,
+static int inode_update_times_fn(struct btree_trans *trans,
+                                struct bch_inode_info *inode,
                                 struct bch_inode_unpacked *bi, void *p)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -3159,7 +524,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len
                ret = bch2_fpunch(c, inode_inum(inode),
                                  block_start >> 9, block_end >> 9,
                                  &i_sectors_delta);
-               i_sectors_acct(c, inode, NULL, i_sectors_delta);
+               bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
        }
 
        mutex_lock(&inode->ei_update_lock);
@@ -3181,175 +546,33 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       struct bkey_buf copy;
-       struct btree_trans trans;
-       struct btree_iter src, dst, del;
-       loff_t shift, new_size;
-       u64 src_start;
+       s64 i_sectors_delta = 0;
        int ret = 0;
 
        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;
 
        if (insert) {
-               if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
-                       return -EFBIG;
-
                if (offset >= inode->v.i_size)
                        return -EINVAL;
-
-               src_start       = U64_MAX;
-               shift           = len;
        } else {
                if (offset + len >= inode->v.i_size)
                        return -EINVAL;
-
-               src_start       = offset + len;
-               shift           = -len;
        }
 
-       new_size = inode->v.i_size + shift;
-
-       ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
+       ret = bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
        if (ret)
                return ret;
 
-       if (insert) {
-               i_size_write(&inode->v, new_size);
-               mutex_lock(&inode->ei_update_lock);
-               ret = bch2_write_inode_size(c, inode, new_size,
-                                           ATTR_MTIME|ATTR_CTIME);
-               mutex_unlock(&inode->ei_update_lock);
-       } else {
-               s64 i_sectors_delta = 0;
-
-               ret = bch2_fpunch(c, inode_inum(inode),
-                                 offset >> 9, (offset + len) >> 9,
-                                 &i_sectors_delta);
-               i_sectors_acct(c, inode, NULL, i_sectors_delta);
-
-               if (ret)
-                       return ret;
-       }
-
-       bch2_bkey_buf_init(&copy);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
-                       POS(inode->v.i_ino, src_start >> 9),
-                       BTREE_ITER_INTENT);
-       bch2_trans_copy_iter(&dst, &src);
-       bch2_trans_copy_iter(&del, &src);
-
-       while (ret == 0 ||
-              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(c, 0);
-               struct bkey_i delete;
-               struct bkey_s_c k;
-               struct bpos next_pos;
-               struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
-               struct bpos atomic_end;
-               unsigned trigger_flags = 0;
-               u32 snapshot;
-
-               bch2_trans_begin(&trans);
-
-               ret = bch2_subvolume_get_snapshot(&trans,
-                                       inode->ei_subvol, &snapshot);
-               if (ret)
-                       continue;
-
-               bch2_btree_iter_set_snapshot(&src, snapshot);
-               bch2_btree_iter_set_snapshot(&dst, snapshot);
-               bch2_btree_iter_set_snapshot(&del, snapshot);
-
-               bch2_trans_begin(&trans);
-
-               k = insert
-                       ? bch2_btree_iter_peek_prev(&src)
-                       : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX));
-               if ((ret = bkey_err(k)))
-                       continue;
-
-               if (!k.k || k.k->p.inode != inode->v.i_ino)
-                       break;
-
-               if (insert &&
-                   bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9)))
-                       break;
-reassemble:
-               bch2_bkey_buf_reassemble(&copy, c, k);
-
-               if (insert &&
-                   bkey_lt(bkey_start_pos(k.k), move_pos))
-                       bch2_cut_front(move_pos, copy.k);
-
-               copy.k->k.p.offset += shift >> 9;
-               bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
-
-               ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
-               if (ret)
-                       continue;
-
-               if (!bkey_eq(atomic_end, copy.k->k.p)) {
-                       if (insert) {
-                               move_pos = atomic_end;
-                               move_pos.offset -= shift >> 9;
-                               goto reassemble;
-                       } else {
-                               bch2_cut_back(atomic_end, copy.k);
-                       }
-               }
-
-               bkey_init(&delete.k);
-               delete.k.p = copy.k->k.p;
-               delete.k.size = copy.k->k.size;
-               delete.k.p.offset -= shift >> 9;
-               bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
-
-               next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
-
-               if (copy.k->k.size != k.k->size) {
-                       /* We might end up splitting compressed extents: */
-                       unsigned nr_ptrs =
-                               bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
-
-                       ret = bch2_disk_reservation_get(c, &disk_res,
-                                       copy.k->k.size, nr_ptrs,
-                                       BCH_DISK_RESERVATION_NOFAIL);
-                       BUG_ON(ret);
-               }
-
-               ret =   bch2_btree_iter_traverse(&del) ?:
-                       bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
-                       bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
-                       bch2_trans_commit(&trans, &disk_res, NULL,
-                                         BTREE_INSERT_NOFAIL);
-               bch2_disk_reservation_put(c, &disk_res);
-
-               if (!ret)
-                       bch2_btree_iter_set_pos(&src, next_pos);
-       }
-       bch2_trans_iter_exit(&trans, &del);
-       bch2_trans_iter_exit(&trans, &dst);
-       bch2_trans_iter_exit(&trans, &src);
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&copy, c);
+       if (insert)
+               i_size_write(&inode->v, inode->v.i_size + len);
 
-       if (ret)
-               return ret;
+       ret = bch2_fcollapse_finsert(c, inode_inum(inode), offset >> 9, len >> 9,
+                                    insert, &i_sectors_delta);
+       if (!ret && !insert)
+               i_size_write(&inode->v, inode->v.i_size - len);
+       bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
-       mutex_lock(&inode->ei_update_lock);
-       if (!insert) {
-               i_size_write(&inode->v, new_size);
-               ret = bch2_write_inode_size(c, inode, new_size,
-                                           ATTR_MTIME|ATTR_CTIME);
-       } else {
-               /* We need an inode update to update bi_journal_seq for fsync: */
-               ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-                                      ATTR_MTIME|ATTR_CTIME);
-       }
-       mutex_unlock(&inode->ei_update_lock);
        return ret;
 }
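
The extent-shifting machinery removed above (iterate, reassemble, cut, delete and reinsert at a shifted offset) now lives behind bch2_fcollapse_finsert(); note the i_size ordering the wrapper keeps: grow before inserting space, shrink only after a successful collapse. A toy of the underlying shift, in block units, ignoring extents that straddle the boundary (the real code splits them):

#include <stdio.h>

struct extent { long start, len; };

/* insert shifts everything at or after offset up by len; collapse
 * deletes [offset, offset + len) and shifts the rest down */
static void shift_extents(struct extent *e, int nr, long offset, long len,
			  int insert)
{
	long shift = insert ? len : -len;

	for (int i = 0; i < nr; i++)
		if (e[i].start >= offset)
			e[i].start += shift;
}

int main(void)
{
	struct extent e[] = { { 0, 8 }, { 16, 8 }, { 32, 8 } };

	shift_extents(e, 3, 16, 8, 1);		/* finsert 8 blocks at 16 */
	for (int i = 0; i < 3; i++)
		printf("extent %ld+%ld\n", e[i].start, e[i].len);
	return 0;
}
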
 
@@ -3357,16 +580,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                             u64 start_sector, u64 end_sector)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bpos end_pos = POS(inode->v.i_ino, end_sector);
        struct bch_io_opts opts;
        int ret = 0;
 
        bch2_inode_opts_get(&opts, c, &inode->ei_inode);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                        POS(inode->v.i_ino, start_sector),
                        BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
@@ -3379,9 +601,9 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                u64 hole_start, hole_end;
                u32 snapshot;
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
-               ret = bch2_subvolume_get_snapshot(&trans,
+               ret = bch2_subvolume_get_snapshot(trans,
                                        inode->ei_subvol, &snapshot);
                if (ret)
                        goto bkey_err;
@@ -3410,11 +632,15 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                }
 
                if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+                       /*
+                        * Lock ordering - can't be holding btree locks while
+                        * blocking on a folio lock:
+                        */
                        if (bch2_clamp_data_hole(&inode->v,
                                                 &hole_start,
                                                 &hole_end,
                                                 opts.data_replicas, true))
-                               ret = drop_locks_do(&trans,
+                               ret = drop_locks_do(trans,
                                        (bch2_clamp_data_hole(&inode->v,
                                                              &hole_start,
                                                              &hole_end,
@@ -3437,16 +663,16 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                                goto bkey_err;
                }
 
-               ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter,
+               ret = bch2_extent_fallocate(trans, inode_inum(inode), &iter,
                                            sectors, opts, &i_sectors_delta,
                                            writepoint_hashed((unsigned long) current));
                if (ret)
                        goto bkey_err;
 
-               i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+               bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 
-               drop_locks_do(&trans,
-                       (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+               drop_locks_do(trans,
+                       (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
 bkey_err:
                bch2_quota_reservation_put(c, inode, &quota_res);
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -3457,14 +683,14 @@ bkey_err:
                struct quota_res quota_res = { 0 };
                s64 i_sectors_delta = 0;
 
-               bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+               bch2_fpunch_at(trans, &iter, inode_inum(inode),
                               end_sector, &i_sectors_delta);
-               i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+               bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
                bch2_quota_reservation_put(c, inode, &quota_res);
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
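
The drop_locks_do() calls above encode the lock-ordering comment added in this hunk: folio locks may block, so btree locks must be dropped first and the operation retried. A pthread sketch of that drop-then-block-then-retake discipline; the locks and helpers here are illustrative:

#include <pthread.h>

static pthread_mutex_t btree_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t folio_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Blocking work that takes the folio lock; it must never run with
 * btree_lock held, or lock order is inverted against writeback.
 */
static void clamp_data_hole_blocking(void)
{
	pthread_mutex_lock(&folio_lock);
	/* ... block on folio contents ... */
	pthread_mutex_unlock(&folio_lock);
}

static void fallocate_step(void)
{
	pthread_mutex_lock(&btree_lock);
	/* non-blocking attempt happens here */
	pthread_mutex_unlock(&btree_lock);	/* drop_locks_do() analog */

	clamp_data_hole_blocking();		/* safe: no btree locks held */

	pthread_mutex_lock(&btree_lock);	/* retake and retry */
	pthread_mutex_unlock(&btree_lock);
}

int main(void)
{
	fallocate_step();
	return 0;
}
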
 
@@ -3570,26 +796,24 @@ static int quota_reserve_range(struct bch_inode_info *inode,
                               u64 start, u64 end)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        u32 snapshot;
        u64 sectors = end - start;
        u64 pos = start;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
        if (ret)
                goto err;
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             SPOS(inode->v.i_ino, pos, snapshot), 0);
 
-       while (!(ret = btree_trans_too_many_iters(&trans)) &&
+       while (!(ret = btree_trans_too_many_iters(trans)) &&
               (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k &&
               !(ret = bkey_err(k))) {
                if (bkey_extent_is_allocation(k.k)) {
@@ -3601,17 +825,14 @@ retry:
                bch2_btree_iter_advance(&iter);
        }
        pos = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
-
-       if (ret)
-               return ret;
+       bch2_trans_put(trans);
 
-       return bch2_quota_reservation_add(c, inode, res, sectors, true);
+       return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
 }
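
    The tail of quota_reserve_range() also picks up the `ret ?: expr` idiom:
    GNU C's binary conditional evaluates its left operand once and yields it
    when nonzero, so the old "if (ret) return ret;" pair collapses to one line.
    A hedged equivalent with invented helpers:

        int ret = setup();          /* hypothetical */
        return ret ?: do_work();    /* nonzero ret is returned as-is;
                                       only on success does do_work() run */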
 
 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
@@ -3653,7 +874,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 
        aligned_len = round_up((u64) len, block_bytes(c));
 
-       ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+       ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping,
                                pos_dst, pos_dst + len - 1);
        if (ret)
                goto err;
@@ -3665,7 +886,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
 
        file_update_time(file_dst);
 
-       mark_pagecache_unallocated(src, pos_src >> 9,
+       bch2_mark_pagecache_unallocated(src, pos_src >> 9,
                                   (pos_src + aligned_len) >> 9);
 
        ret = bch2_remap_range(c,
@@ -3681,7 +902,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
         */
        ret = min((u64) ret << 9, (u64) len);
 
-       i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
+       bch2_i_sectors_acct(c, dst, &quota_res, i_sectors_delta);
 
        spin_lock(&dst->v.i_lock);
        if (pos_dst + ret > dst->v.i_size)
@@ -3700,73 +921,11 @@ err:
 
 /* fseek: */
 
-static int folio_data_offset(struct folio *folio, loff_t pos,
-                            unsigned min_replicas)
-{
-       struct bch_folio *s = bch2_folio(folio);
-       unsigned i, sectors = folio_sectors(folio);
-
-       if (s)
-               for (i = folio_pos_to_s(folio, pos); i < sectors; i++)
-                       if (s->s[i].state >= SECTOR_dirty &&
-                           s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas)
-                               return i << SECTOR_SHIFT;
-
-       return -1;
-}
-
-static loff_t bch2_seek_pagecache_data(struct inode *vinode,
-                                      loff_t start_offset,
-                                      loff_t end_offset,
-                                      unsigned min_replicas,
-                                      bool nonblock)
-{
-       struct folio_batch fbatch;
-       pgoff_t start_index     = start_offset >> PAGE_SHIFT;
-       pgoff_t end_index       = end_offset >> PAGE_SHIFT;
-       pgoff_t index           = start_index;
-       unsigned i;
-       loff_t ret;
-       int offset;
-
-       folio_batch_init(&fbatch);
-
-       while (filemap_get_folios(vinode->i_mapping,
-                                 &index, end_index, &fbatch)) {
-               for (i = 0; i < folio_batch_count(&fbatch); i++) {
-                       struct folio *folio = fbatch.folios[i];
-
-                       if (!nonblock) {
-                               folio_lock(folio);
-                       } else if (!folio_trylock(folio)) {
-                               folio_batch_release(&fbatch);
-                               return -EAGAIN;
-                       }
-
-                       offset = folio_data_offset(folio,
-                                       max(folio_pos(folio), start_offset),
-                                       min_replicas);
-                       if (offset >= 0) {
-                               ret = clamp(folio_pos(folio) + offset,
-                                           start_offset, end_offset);
-                               folio_unlock(folio);
-                               folio_batch_release(&fbatch);
-                               return ret;
-                       }
-                       folio_unlock(folio);
-               }
-               folio_batch_release(&fbatch);
-               cond_resched();
-       }
-
-       return end_offset;
-}
-
 static loff_t bch2_seek_data(struct file *file, u64 offset)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        subvol_inum inum = inode_inum(inode);
@@ -3778,15 +937,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
        if (offset >= isize)
                return -ENXIO;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents,
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_extents,
                           SPOS(inode->v.i_ino, offset >> 9, snapshot),
                           POS(inode->v.i_ino, U64_MAX),
                           0, k, ret) {
@@ -3796,12 +955,12 @@ retry:
                } else if (k.k->p.offset >> 9 > isize)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                return ret;
 
@@ -3815,93 +974,11 @@ err:
        return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
 }
 
-static int folio_hole_offset(struct address_space *mapping, loff_t *offset,
-                             unsigned min_replicas, bool nonblock)
-{
-       struct folio *folio;
-       struct bch_folio *s;
-       unsigned i, sectors;
-       bool ret = true;
-
-       folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT,
-                                   !nonblock ? FGP_LOCK : 0, 0);
-       if (IS_ERR_OR_NULL(folio))
-               return true;
-
-       if (nonblock && !folio_trylock(folio)) {
-               folio_put(folio);
-               return -EAGAIN;
-       }
-
-       s = bch2_folio(folio);
-       if (!s)
-               goto unlock;
-
-       sectors = folio_sectors(folio);
-       for (i = folio_pos_to_s(folio, *offset); i < sectors; i++)
-               if (s->s[i].state < SECTOR_dirty ||
-                   s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) {
-                       *offset = max(*offset,
-                                     folio_pos(folio) + (i << SECTOR_SHIFT));
-                       goto unlock;
-               }
-
-       *offset = folio_end_pos(folio);
-       ret = false;
-unlock:
-       folio_unlock(folio);
-       folio_put(folio);
-       return ret;
-}
-
-static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
-                                      loff_t start_offset,
-                                      loff_t end_offset,
-                                      unsigned min_replicas,
-                                      bool nonblock)
-{
-       struct address_space *mapping = vinode->i_mapping;
-       loff_t offset = start_offset;
-
-       while (offset < end_offset &&
-              !folio_hole_offset(mapping, &offset, min_replicas, nonblock))
-               ;
-
-       return min(offset, end_offset);
-}
-
-static int bch2_clamp_data_hole(struct inode *inode,
-                               u64 *hole_start,
-                               u64 *hole_end,
-                               unsigned min_replicas,
-                               bool nonblock)
-{
-       loff_t ret;
-
-       ret = bch2_seek_pagecache_hole(inode,
-               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
-       if (ret < 0)
-               return ret;
-
-       *hole_start = ret;
-
-       if (*hole_start == *hole_end)
-               return 0;
-
-       ret = bch2_seek_pagecache_data(inode,
-               *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9;
-       if (ret < 0)
-               return ret;
-
-       *hole_end = ret;
-       return 0;
-}
-
 static loff_t bch2_seek_hole(struct file *file, u64 offset)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        subvol_inum inum = inode_inum(inode);
@@ -3913,15 +990,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
        if (offset >= isize)
                return -ENXIO;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_extents,
                           SPOS(inode->v.i_ino, offset >> 9, snapshot),
                           BTREE_ITER_SLOTS, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
@@ -3939,12 +1016,12 @@ retry:
                        offset = max(offset, bkey_start_offset(k.k) << 9);
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                return ret;
 
@@ -3981,28 +1058,10 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
 void bch2_fs_fsio_exit(struct bch_fs *c)
 {
        bioset_exit(&c->nocow_flush_bioset);
-       bioset_exit(&c->dio_write_bioset);
-       bioset_exit(&c->dio_read_bioset);
-       bioset_exit(&c->writepage_bioset);
 }
 
 int bch2_fs_fsio_init(struct bch_fs *c)
 {
-       if (bioset_init(&c->writepage_bioset,
-                       4, offsetof(struct bch_writepage_io, op.wbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_writepage_bioset_init;
-
-       if (bioset_init(&c->dio_read_bioset,
-                       4, offsetof(struct dio_read, rbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_dio_read_bioset_init;
-
-       if (bioset_init(&c->dio_write_bioset,
-                       4, offsetof(struct dio_write, op.wbio.bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_dio_write_bioset_init;
-
        if (bioset_init(&c->nocow_flush_bioset,
                        1, offsetof(struct nocow_flush, bio), 0))
                return -BCH_ERR_ENOMEM_nocow_flush_bioset_init;
index af905331542dd99191b43866ee423991764a8e66..ca70346e68dc3d9196c85ce7768b1c9f53e6e792 100644 (file)
 #ifndef NO_BCACHEFS_FS
 
 #include "buckets.h"
-#include "io_types.h"
+#include "fs.h"
+#include "io_write_types.h"
+#include "quota.h"
 
 #include <linux/uio.h>
 
-struct quota_res;
+struct folio_vec {
+       struct folio    *fv_folio;
+       size_t          fv_offset;
+       size_t          fv_len;
+};
+
+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv)
+{

+       struct folio *folio     = page_folio(bv.bv_page);
+       size_t offset           = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) +
+               bv.bv_offset;
+       size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len);
+
+       return (struct folio_vec) {
+               .fv_folio       = folio,
+               .fv_offset      = offset,
+               .fv_len         = len,
+       };
+}
+
+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio,
+                                                   struct bvec_iter iter)
+{
+       return biovec_to_foliovec(bio_iter_iovec(bio, iter));
+}
+
+#define __bio_for_each_folio(bvl, bio, iter, start)                    \
+       for (iter = (start);                                            \
+            (iter).bi_size &&                                          \
+               ((bvl = bio_iter_iovec_folio((bio), (iter))), 1);       \
+            bio_advance_iter_single((bio), &(iter), (bvl).fv_len))
+
+/**
+ * bio_for_each_folio - iterate over folios within a bio
+ *
+ * Like other non-_all versions, this iterates over what bio->bi_iter currently
+ * points to. This version is for drivers, where the bio may have previously
+ * been split or cloned.
+ */
+#define bio_for_each_folio(bvl, bio, iter)                             \
+       __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter)
+
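
    A hedged usage sketch for the new iterator — example_bio_bytes() is invented
    here, but it exercises the macro exactly as defined above: `iter` starts as a
    copy of bio->bi_iter, and each step produces one (folio, offset, length)
    triple:

        static size_t example_bio_bytes(struct bio *bio)
        {
                struct folio_vec fv;
                struct bvec_iter iter;
                size_t bytes = 0;

                bio_for_each_folio(fv, bio, iter)
                        bytes += fv.fv_len;   /* fv.fv_folio/fv_offset also valid here */

                return bytes;
        }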
+struct quota_res {
+       u64                             sectors;
+};
+
+#ifdef CONFIG_BCACHEFS_QUOTA
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+                                        struct bch_inode_info *inode,
+                                        struct quota_res *res)
+{
+       BUG_ON(res->sectors > inode->ei_quota_reserved);
+
+       bch2_quota_acct(c, inode->ei_qid, Q_SPC,
+                       -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
+       inode->ei_quota_reserved -= res->sectors;
+       res->sectors = 0;
+}
+
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      struct quota_res *res)
+{
+       if (res->sectors) {
+               mutex_lock(&inode->ei_quota_lock);
+               __bch2_quota_reservation_put(c, inode, res);
+               mutex_unlock(&inode->ei_quota_lock);
+       }
+}
+
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+                                     struct bch_inode_info *inode,
+                                     struct quota_res *res,
+                                     u64 sectors,
+                                     bool check_enospc)
+{
+       int ret;
+
+       if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags))
+               return 0;
+
+       mutex_lock(&inode->ei_quota_lock);
+       ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
+                             check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
+       if (likely(!ret)) {
+               inode->ei_quota_reserved += sectors;
+               res->sectors += sectors;
+       }
+       mutex_unlock(&inode->ei_quota_lock);
+
+       return ret;
+}
 
-int __must_check bch2_write_inode_size(struct bch_fs *,
-                                      struct bch_inode_info *,
-                                      loff_t, unsigned);
+#else
+
+static inline void __bch2_quota_reservation_put(struct bch_fs *c,
+                                        struct bch_inode_info *inode,
+                                        struct quota_res *res) {}
 
-int bch2_read_folio(struct file *, struct folio *);
+static inline void bch2_quota_reservation_put(struct bch_fs *c,
+                                      struct bch_inode_info *inode,
+                                      struct quota_res *res) {}
 
-int bch2_writepages(struct address_space *, struct writeback_control *);
-void bch2_readahead(struct readahead_control *);
+static inline int bch2_quota_reservation_add(struct bch_fs *c,
+                                     struct bch_inode_info *inode,
+                                     struct quota_res *res,
+                                     unsigned sectors,
+                                     bool check_enospc)
+{
+       return 0;
+}
 
-int bch2_write_begin(struct file *, struct address_space *, loff_t,
-                    unsigned, struct page **, void **);
-int bch2_write_end(struct file *, struct address_space *, loff_t,
-                  unsigned, unsigned, struct page *, void *);
+#endif
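
    These inlines are meant to be used as a reserve/act/account/put sequence;
    the fallocate hunks earlier in this diff follow it literally. A condensed
    sketch of the intended flow, with the allocation step elided:

        struct quota_res quota_res = { 0 };
        s64 i_sectors_delta = 0;

        ret = bch2_quota_reservation_add(c, inode, &quota_res, sectors, true);
        if (!ret) {
                /* ... allocate extents, filling i_sectors_delta ... */
                bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
        }
        bch2_quota_reservation_put(c, inode, &quota_res);   /* drops what's left */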
 
-ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
-ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
+void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *,
+                          struct quota_res *, s64);
+
+static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
+                                      struct quota_res *quota_res, s64 sectors)
+{
+       if (sectors) {
+               mutex_lock(&inode->ei_quota_lock);
+               __bch2_i_sectors_acct(c, inode, quota_res, sectors);
+               mutex_unlock(&inode->ei_quota_lock);
+       }
+}
+
+static inline struct address_space *faults_disabled_mapping(void)
+{
+       return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL);
+}
+
+static inline void set_fdm_dropped_locks(void)
+{
+       current->faults_disabled_mapping =
+               (void *) (((unsigned long) current->faults_disabled_mapping)|1);
+}
+
+static inline bool fdm_dropped_locks(void)
+{
+       return ((unsigned long) current->faults_disabled_mapping) & 1;
+}
+
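
    The three fdm helpers above are a pointer-tagging trick: struct address_space
    pointers are at least word-aligned, so bit 0 of
    current->faults_disabled_mapping is free to record "locks were dropped".
    A small sketch of the invariant ('mapping' is hypothetical; the alignment
    assumption is stated in the first assert):

        unsigned long v = (unsigned long) mapping;

        assert((v & 1) == 0);                   /* alignment keeps bit 0 clear */
        v |= 1;                                 /* set_fdm_dropped_locks()     */
        assert((struct address_space *) (v & ~1UL) == mapping);
        assert(v & 1);                          /* fdm_dropped_locks()         */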
+void bch2_inode_flush_nocow_writes_async(struct bch_fs *,
+                       struct bch_inode_info *, struct closure *);
+
+int __must_check bch2_write_inode_size(struct bch_fs *,
+                                      struct bch_inode_info *,
+                                      loff_t, unsigned);
 
 int bch2_fsync(struct file *, loff_t, loff_t, int);
 
-int bch2_truncate(struct mnt_idmap *,
+int bchfs_truncate(struct mnt_idmap *,
                  struct bch_inode_info *, struct iattr *);
 long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
 
@@ -39,11 +174,6 @@ loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
 
 loff_t bch2_llseek(struct file *, loff_t, int);
 
-vm_fault_t bch2_page_fault(struct vm_fault *);
-vm_fault_t bch2_page_mkwrite(struct vm_fault *);
-void bch2_invalidate_folio(struct folio *, size_t, size_t);
-bool bch2_release_folio(struct folio *, gfp_t);
-
 void bch2_fs_fsio_exit(struct bch_fs *);
 int bch2_fs_fsio_init(struct bch_fs *);
 #else
index dfa1bf73c8541187abbe87e7127213d337ee24e7..d7c1b05aa438568c4b7becceb00e20853ad87d3f 100644 (file)
@@ -31,7 +31,8 @@ struct flags_set {
        bool                    projinherit;
 };
 
-static int bch2_inode_flags_set(struct bch_inode_info *inode,
+static int bch2_inode_flags_set(struct btree_trans *trans,
+                               struct bch_inode_info *inode,
                                struct bch_inode_unpacked *bi,
                                void *p)
 {
@@ -44,13 +45,13 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode,
        unsigned newflags = s->flags;
        unsigned oldflags = bi->bi_flags & s->mask;
 
-       if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
+       if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;
 
        if (!S_ISREG(bi->bi_mode) &&
            !S_ISDIR(bi->bi_mode) &&
-           (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
+           (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags)
                return -EINVAL;
 
        if (s->set_projinherit) {
@@ -121,10 +122,14 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
 
        fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
 
-       return copy_to_user(arg, &fa, sizeof(fa));
+       if (copy_to_user(arg, &fa, sizeof(fa)))
+               return -EFAULT;
+
+       return 0;
 }
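
    The old `return copy_to_user(...)` was a real bug: copy_to_user() returns
    the number of bytes it could not copy, never an errno, so a partial copy
    leaked a positive count to userspace as the ioctl's return value. The
    canonical pattern, as now used above (uptr/data are placeholders):

        if (copy_to_user(uptr, &data, sizeof(data)))
                return -EFAULT;         /* map "bytes left" to an errno */
        return 0;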
 
-static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
+static int fssetxattr_inode_update_fn(struct btree_trans *trans,
+                                     struct bch_inode_info *inode,
                                      struct bch_inode_unpacked *bi,
                                      void *p)
 {
@@ -135,7 +140,7 @@ static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
                bi->bi_project = s->projid;
        }
 
-       return bch2_inode_flags_set(inode, bi, p);
+       return bch2_inode_flags_set(trans, inode, bi, p);
 }
 
 static int bch2_ioc_fssetxattr(struct bch_fs *c,
@@ -192,7 +197,8 @@ err:
        return ret;
 }
 
-static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode,
+static int bch2_reinherit_attrs_fn(struct btree_trans *trans,
+                                  struct bch_inode_info *inode,
                                   struct bch_inode_unpacked *bi,
                                   void *p)
 {
@@ -312,8 +318,8 @@ err:
        return ret;
 }
 
-static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
-                               struct bch_ioctl_subvolume arg)
+static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+                                         struct bch_ioctl_subvolume arg)
 {
        struct inode *dir;
        struct bch_inode_info *inode;
@@ -434,36 +440,48 @@ err1:
        return error;
 }
 
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+                                       struct bch_ioctl_subvolume arg)
+{
+       down_write(&c->snapshot_create_lock);
+       long ret = __bch2_ioctl_subvolume_create(c, filp, arg);
+       up_write(&c->snapshot_create_lock);
+
+       return ret;
+}
+
 static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
                                struct bch_ioctl_subvolume arg)
 {
+       struct filename *name;
        struct path path;
        struct inode *dir;
+       struct dentry *victim;
        int ret = 0;
 
        if (arg.flags)
                return -EINVAL;
 
-       ret = user_path_at(arg.dirfd,
-                       (const char __user *)(unsigned long)arg.dst_ptr,
-                       LOOKUP_FOLLOW, &path);
-       if (ret)
-               return ret;
+       name = getname((const char __user *)(unsigned long)arg.dst_ptr);
+       victim = filename_path_locked(arg.dirfd, name, &path);
+       putname(name);
+       if (IS_ERR(victim))
+               return PTR_ERR(victim);
 
-       if (path.dentry->d_sb->s_fs_info != c) {
+       if (victim->d_sb->s_fs_info != c) {
                ret = -EXDEV;
                goto err;
        }
 
-       dir = path.dentry->d_parent->d_inode;
-
-       ret = __bch2_unlink(dir, path.dentry, true);
-       if (ret)
-               goto err;
-
-       fsnotify_rmdir(dir, path.dentry);
-       d_delete(path.dentry);
+       dir = d_inode(path.dentry);
+       ret = __bch2_unlink(dir, victim, true);
+       if (!ret) {
+               fsnotify_rmdir(dir, victim);
+               d_delete(victim);
+       }
+       inode_unlock(dir);
 err:
+       dput(victim);
        path_put(&path);
        return ret;
 }
index f201980ef2c38e2dbbe5faf7138f0d2a2c479f16..d30f9bb056fd9790f97c4b08f839b480bf46397c 100644 (file)
@@ -5,29 +5,29 @@
 /* Inode flags: */
 
 /* bcachefs inode flags -> vfs inode flags: */
-static const unsigned bch_flags_to_vfs[] = {
-       [__BCH_INODE_SYNC]      = S_SYNC,
-       [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
-       [__BCH_INODE_APPEND]    = S_APPEND,
-       [__BCH_INODE_NOATIME]   = S_NOATIME,
+static const __maybe_unused unsigned bch_flags_to_vfs[] = {
+       [__BCH_INODE_sync]      = S_SYNC,
+       [__BCH_INODE_immutable] = S_IMMUTABLE,
+       [__BCH_INODE_append]    = S_APPEND,
+       [__BCH_INODE_noatime]   = S_NOATIME,
 };
 
 /* bcachefs inode flags -> FS_IOC_GETFLAGS: */
-static const unsigned bch_flags_to_uflags[] = {
-       [__BCH_INODE_SYNC]      = FS_SYNC_FL,
-       [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
-       [__BCH_INODE_APPEND]    = FS_APPEND_FL,
-       [__BCH_INODE_NODUMP]    = FS_NODUMP_FL,
-       [__BCH_INODE_NOATIME]   = FS_NOATIME_FL,
+static const __maybe_unused unsigned bch_flags_to_uflags[] = {
+       [__BCH_INODE_sync]      = FS_SYNC_FL,
+       [__BCH_INODE_immutable] = FS_IMMUTABLE_FL,
+       [__BCH_INODE_append]    = FS_APPEND_FL,
+       [__BCH_INODE_nodump]    = FS_NODUMP_FL,
+       [__BCH_INODE_noatime]   = FS_NOATIME_FL,
 };
 
 /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
-static const unsigned bch_flags_to_xflags[] = {
-       [__BCH_INODE_SYNC]      = FS_XFLAG_SYNC,
-       [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
-       [__BCH_INODE_APPEND]    = FS_XFLAG_APPEND,
-       [__BCH_INODE_NODUMP]    = FS_XFLAG_NODUMP,
-       [__BCH_INODE_NOATIME]   = FS_XFLAG_NOATIME,
+static const __maybe_unused unsigned bch_flags_to_xflags[] = {
+       [__BCH_INODE_sync]      = FS_XFLAG_SYNC,
+       [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE,
+       [__BCH_INODE_append]    = FS_XFLAG_APPEND,
+       [__BCH_INODE_nodump]    = FS_XFLAG_NODUMP,
+       [__BCH_INODE_noatime]   = FS_XFLAG_NOATIME,
        //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
 };
 
index 8d2f388b4327db8196e884866a40590371b86a5f..f76d403ccb766d479b280c8faae33cd98f001548 100644 (file)
 #include "fs-common.h"
 #include "fs-io.h"
 #include "fs-ioctl.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
+#include "fs-io-pagecache.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
 #include "journal.h"
 #include "keylist.h"
 #include "quota.h"
+#include "snapshot.h"
 #include "super.h"
 #include "xattr.h"
 
@@ -62,11 +66,11 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
        inode->v.i_mode = bi->bi_mode;
 
        if (fields & ATTR_ATIME)
-               inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
+               inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime));
        if (fields & ATTR_MTIME)
-               inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
+               inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime));
        if (fields & ATTR_CTIME)
-               inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
+               inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime));
 
        inode->ei_inode         = *bi;
 
@@ -78,29 +82,27 @@ int __must_check bch2_write_inode(struct bch_fs *c,
                                  inode_set_fn set,
                                  void *p, unsigned fields)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bch_inode_unpacked inode_u;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 512);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
+       ret   = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode),
                                BTREE_ITER_INTENT) ?:
-               (set ? set(inode, &inode_u, p) : 0) ?:
-               bch2_inode_write(&trans, &iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+               (set ? set(trans, inode, &inode_u, p) : 0) ?:
+               bch2_inode_write(trans, &iter, &inode_u) ?:
+               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 
        /*
         * the btree node lock protects inode->ei_inode, not ei_update_lock;
         * this is important for inode updates via bchfs_write_index_update
         */
        if (!ret)
-               bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
+               bch2_inode_update_after_write(trans, inode, &inode_u, fields);
 
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
@@ -110,7 +112,7 @@ retry:
                             inode_inum(inode).subvol,
                             inode_inum(inode).inum);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret < 0 ? ret : 0;
 }
 
@@ -178,7 +180,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 {
        struct bch_inode_unpacked inode_u;
        struct bch_inode_info *inode;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct bch_subvolume subvol;
        int ret;
 
@@ -192,18 +194,18 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
        if (!(inode->v.i_state & I_NEW))
                return &inode->v;
 
-       bch2_trans_init(&trans, c, 8, 0);
-       ret = lockrestart_do(&trans,
-               bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
-               bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
+       trans = bch2_trans_get(c);
+       ret = lockrestart_do(trans,
+               bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+               bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
 
        if (!ret)
-               bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
-       bch2_trans_exit(&trans);
+               bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+       bch2_trans_put(trans);
 
        if (ret) {
                iget_failed(&inode->v);
-               return ERR_PTR(ret);
+               return ERR_PTR(bch2_err_class(ret));
        }
 
        mutex_lock(&c->vfs_inodes_lock);
@@ -222,7 +224,7 @@ __bch2_create(struct mnt_idmap *idmap,
              unsigned flags)
 {
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct bch_inode_unpacked dir_u;
        struct bch_inode_info *inode, *old;
        struct bch_inode_unpacked inode_u;
@@ -252,13 +254,11 @@ __bch2_create(struct mnt_idmap *idmap,
        if (!(flags & BCH_CREATE_TMPFILE))
                mutex_lock(&dir->ei_update_lock);
 
-       bch2_trans_init(&trans, c, 8,
-                       2048 + (!(flags & BCH_CREATE_TMPFILE)
-                               ? dentry->d_name.len : 0));
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret   = bch2_create_trans(&trans,
+       ret   = bch2_create_trans(trans,
                                  inode_inum(dir), &dir_u, &inode_u,
                                  !(flags & BCH_CREATE_TMPFILE)
                                  ? &dentry->d_name : NULL,
@@ -274,9 +274,9 @@ retry:
        inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
        inum.inum = inode_u.bi_inum;
 
-       ret   = bch2_subvolume_get(&trans, inum.subvol, true,
+       ret   = bch2_subvolume_get(trans, inum.subvol, true,
                                   BTREE_ITER_WITH_UPDATES, &subvol) ?:
-               bch2_trans_commit(&trans, NULL, &journal_seq, 0);
+               bch2_trans_commit(trans, NULL, &journal_seq, 0);
        if (unlikely(ret)) {
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
@@ -287,13 +287,13 @@ err_before_quota:
        }
 
        if (!(flags & BCH_CREATE_TMPFILE)) {
-               bch2_inode_update_after_write(&trans, dir, &dir_u,
+               bch2_inode_update_after_write(trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
                mutex_unlock(&dir->ei_update_lock);
        }
 
        bch2_iget5_set(&inode->v, &inum);
-       bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
+       bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 
        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
        set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -333,7 +333,7 @@ err_before_quota:
                unlock_new_inode(&inode->v);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 err:
        posix_acl_release(default_acl);
        posix_acl_release(acl);
@@ -342,7 +342,7 @@ err_trans:
        if (!(flags & BCH_CREATE_TMPFILE))
                mutex_unlock(&dir->ei_update_lock);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        make_bad_inode(&inode->v);
        iput(&inode->v);
        inode = ERR_PTR(ret);
@@ -397,26 +397,25 @@ static int __bch2_link(struct bch_fs *c,
                       struct bch_inode_info *dir,
                       struct dentry *dentry)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bch_inode_unpacked dir_u, inode_u;
        int ret;
 
        mutex_lock(&inode->ei_update_lock);
-       bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_link_trans(&trans,
+       ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_link_trans(trans,
                                        inode_inum(dir),   &dir_u,
                                        inode_inum(inode), &inode_u,
                                        &dentry->d_name));
 
        if (likely(!ret)) {
-               bch2_inode_update_after_write(&trans, dir, &dir_u,
+               bch2_inode_update_after_write(trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
+               bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        mutex_unlock(&inode->ei_update_lock);
        return ret;
 }
@@ -447,24 +446,23 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct bch_inode_unpacked dir_u, inode_u;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        int ret;
 
        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
-       bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = commit_do(&trans, NULL, NULL,
-                       BTREE_INSERT_NOFAIL,
-               bch2_unlink_trans(&trans,
+       ret = commit_do(trans, NULL, NULL,
+                       BCH_TRANS_COMMIT_no_enospc,
+               bch2_unlink_trans(trans,
                                  inode_inum(dir), &dir_u,
                                  &inode_u, &dentry->d_name,
                                  deleting_snapshot));
        if (unlikely(ret))
                goto err;
 
-       bch2_inode_update_after_write(&trans, dir, &dir_u,
+       bch2_inode_update_after_write(trans, dir, &dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
-       bch2_inode_update_after_write(&trans, inode, &inode_u,
+       bch2_inode_update_after_write(trans, inode, &inode_u,
                                      ATTR_MTIME);
 
        if (inode_u.bi_subvol) {
@@ -475,8 +473,8 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
                set_nlink(&inode->v, 0);
        }
 err:
-       bch2_trans_exit(&trans);
        bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -539,7 +537,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
        struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
        struct bch_inode_unpacked dst_dir_u, src_dir_u;
        struct bch_inode_unpacked src_inode_u, dst_inode_u;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        enum bch_rename_mode mode = flags & RENAME_EXCHANGE
                ? BCH_RENAME_EXCHANGE
                : dst_dentry->d_inode
@@ -556,7 +554,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
                        return ret;
        }
 
-       bch2_trans_init(&trans, c, 8, 2048);
+       trans = bch2_trans_get(c);
 
        bch2_lock_inodes(INODE_UPDATE_LOCK,
                         src_dir,
@@ -583,8 +581,8 @@ static int bch2_rename2(struct mnt_idmap *idmap,
                        goto err;
        }
 
-       ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_rename_trans(&trans,
+       ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_rename_trans(trans,
                                          inode_inum(src_dir), &src_dir_u,
                                          inode_inum(dst_dir), &dst_dir_u,
                                          &src_inode_u,
@@ -599,21 +597,21 @@ static int bch2_rename2(struct mnt_idmap *idmap,
        BUG_ON(dst_inode &&
               dst_inode->v.i_ino != dst_inode_u.bi_inum);
 
-       bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
+       bch2_inode_update_after_write(trans, src_dir, &src_dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
 
        if (src_dir != dst_dir)
-               bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
+               bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
 
-       bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
+       bch2_inode_update_after_write(trans, src_inode, &src_inode_u,
                                      ATTR_CTIME);
 
        if (dst_inode)
-               bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
+               bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u,
                                              ATTR_CTIME);
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        bch2_fs_quota_transfer(c, src_inode,
                               bch_qid(&src_inode->ei_inode),
@@ -676,7 +674,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl = NULL;
@@ -697,13 +695,13 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap,
        if (ret)
                goto err;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
        kfree(acl);
        acl = NULL;
 
-       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+       ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inode_inum(inode),
                              BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
@@ -711,29 +709,29 @@ retry:
        bch2_setattr_copy(idmap, inode, &inode_u, attr);
 
        if (attr->ia_valid & ATTR_MODE) {
-               ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+               ret = bch2_acl_chmod(trans, inode_inum(inode), &inode_u,
                                     inode_u.bi_mode, &acl);
                if (ret)
                        goto btree_err;
        }
 
-       ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL);
+       ret =   bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                                 BCH_TRANS_COMMIT_no_enospc);
 btree_err:
-       bch2_trans_iter_exit(&trans, &inode_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
 
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
        if (unlikely(ret))
                goto err_trans;
 
-       bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
+       bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid);
 
        if (acl)
                set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
 err_trans:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 err:
        mutex_unlock(&inode->ei_update_lock);
 
@@ -755,9 +753,9 @@ static int bch2_getattr(struct mnt_idmap *idmap,
        stat->gid       = inode->v.i_gid;
        stat->rdev      = inode->v.i_rdev;
        stat->size      = i_size_read(&inode->v);
-       stat->atime     = inode->v.i_atime;
-       stat->mtime     = inode->v.i_mtime;
-       stat->ctime     = inode->v.i_ctime;
+       stat->atime     = inode_get_atime(&inode->v);
+       stat->mtime     = inode_get_mtime(&inode->v);
+       stat->ctime     = inode_get_ctime(&inode->v);
        stat->blksize   = block_bytes(c);
        stat->blocks    = inode->v.i_blocks;
 
@@ -766,15 +764,15 @@ static int bch2_getattr(struct mnt_idmap *idmap,
                stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
        }
 
-       if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
+       if (inode->ei_inode.bi_flags & BCH_INODE_immutable)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        stat->attributes_mask    |= STATX_ATTR_IMMUTABLE;
 
-       if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
+       if (inode->ei_inode.bi_flags & BCH_INODE_append)
                stat->attributes |= STATX_ATTR_APPEND;
        stat->attributes_mask    |= STATX_ATTR_APPEND;
 
-       if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
+       if (inode->ei_inode.bi_flags & BCH_INODE_nodump)
                stat->attributes |= STATX_ATTR_NODUMP;
        stat->attributes_mask    |= STATX_ATTR_NODUMP;
 
@@ -794,7 +792,7 @@ static int bch2_setattr(struct mnt_idmap *idmap,
                return ret;
 
        return iattr->ia_valid & ATTR_SIZE
-               ? bch2_truncate(idmap, inode, iattr)
+               ? bchfs_truncate(idmap, inode, iattr)
                : bch2_setattr_nonsize(idmap, inode, iattr);
 }
 
@@ -875,7 +873,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 {
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *ei = to_bch_ei(vinode);
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
@@ -896,18 +894,18 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 
        bch2_bkey_buf_init(&cur);
        bch2_bkey_buf_init(&prev);
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, ei->ei_subvol, &snapshot);
        if (ret)
                goto err;
 
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             SPOS(ei->v.i_ino, start, snapshot), 0);
 
-       while (!(ret = btree_trans_too_many_iters(&trans)) &&
+       while (!(ret = btree_trans_too_many_iters(trans)) &&
               (k = bch2_btree_iter_peek_upto(&iter, end)).k &&
               !(ret = bkey_err(k))) {
                enum btree_id data_btree = BTREE_ID_extents;
@@ -924,7 +922,7 @@ retry:
 
                bch2_bkey_buf_reassemble(&cur, c, k);
 
-               ret = bch2_read_indirect_extent(&trans, &data_btree,
+               ret = bch2_read_indirect_extent(trans, &data_btree,
                                        &offset_into_extent, &cur);
                if (ret)
                        break;
@@ -943,7 +941,7 @@ retry:
                cur.k->k.p.offset += cur.k->k.size;
 
                if (have_extent) {
-                       bch2_trans_unlock(&trans);
+                       bch2_trans_unlock(trans);
                        ret = bch2_fill_extent(c, info,
                                        bkey_i_to_s_c(prev.k), 0);
                        if (ret)
@@ -957,18 +955,18 @@ retry:
                        POS(iter.pos.inode, iter.pos.offset + sectors));
        }
        start = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
        if (!ret && have_extent) {
-               bch2_trans_unlock(&trans);
+               bch2_trans_unlock(trans);
                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
        return ret < 0 ? ret : 0;
@@ -1000,11 +998,16 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       int ret;
 
        if (!dir_emit_dots(file, ctx))
                return 0;
 
-       return bch2_readdir(c, inode_inum(inode), ctx);
+       ret = bch2_readdir(c, inode_inum(inode), ctx);
+       if (ret)
+               bch_err_fn(c, ret);
+
+       return bch2_err_class(ret);
 }
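
    bch2_vfs_readdir() now shows the error-boundary pattern this release applies
    across the VFS layer: log the precise private code with bch_err_fn(), then
    squash it with bch2_err_class() so userspace only sees the standard errno
    that the -BCH_ERR_* code refines. A hedged sketch with an invented op:

        ret = some_internal_op(c);      /* hypothetical; may return e.g.
                                           -BCH_ERR_ENOENT_dirent_doesnt_match_inode */
        if (ret)
                bch_err_fn(c, ret);     /* log the descriptive name */
        return bch2_err_class(ret);     /* the VFS sees plain -ENOENT */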
 
 static const struct file_operations bch_file_operations = {
@@ -1210,9 +1213,6 @@ static struct dentry *bch2_get_parent(struct dentry *child)
                .inum = inode->ei_inode.bi_dir,
        };
 
-       if (!parent_inum.inum)
-               return NULL;
-
        return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
 }
 
@@ -1221,7 +1221,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
        struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
        struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter1;
        struct btree_iter iter2;
        struct bkey_s_c k;
@@ -1229,29 +1229,30 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child
        struct bch_inode_unpacked inode_u;
        subvol_inum target;
        u32 snapshot;
-       unsigned name_len;
+       struct qstr dirent_name;
+       unsigned name_len = 0;
        int ret;
 
        if (!S_ISDIR(dir->v.i_mode))
                return -EINVAL;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
-       bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+       bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents,
                             POS(dir->ei_inode.bi_inum, 0), 0);
-       bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+       bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents,
                             POS(dir->ei_inode.bi_inum, 0), 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, dir->ei_subvol, &snapshot);
        if (ret)
                goto err;
 
        bch2_btree_iter_set_snapshot(&iter1, snapshot);
        bch2_btree_iter_set_snapshot(&iter2, snapshot);
 
-       ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+       ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u);
        if (ret)
                goto err;
 
@@ -1269,7 +1270,7 @@ retry:
                }
 
                d = bkey_s_c_to_dirent(k);
-               ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+               ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
                if (ret > 0)
                        ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
                if (ret)
@@ -1291,7 +1292,7 @@ retry:
                                continue;
 
                        d = bkey_s_c_to_dirent(k);
-                       ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+                       ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target);
                        if (ret < 0)
                                break;
                        if (ret)
@@ -1306,17 +1307,18 @@ retry:
        ret = -ENOENT;
        goto err;
 found:
-       name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX);
+       dirent_name = bch2_dirent_get_name(d);
 
-       memcpy(name, d.v->d_name, name_len);
+       name_len = min_t(unsigned, dirent_name.len, NAME_MAX);
+       memcpy(name, dirent_name.name, name_len);
        name[name_len] = '\0';
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_iter_exit(&trans, &iter1);
-       bch2_trans_iter_exit(&trans, &iter2);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter1);
+       bch2_trans_iter_exit(trans, &iter2);
+       bch2_trans_put(trans);
 
        return ret;
 }
@@ -1406,15 +1408,16 @@ static void bch2_destroy_inode(struct inode *vinode)
        call_rcu(&vinode->i_rcu, bch2_i_callback);
 }
 
-static int inode_update_times_fn(struct bch_inode_info *inode,
+static int inode_update_times_fn(struct btree_trans *trans,
+                                struct bch_inode_info *inode,
                                 struct bch_inode_unpacked *bi,
                                 void *p)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
 
-       bi->bi_atime    = timespec_to_bch2_time(c, inode->v.i_atime);
-       bi->bi_mtime    = timespec_to_bch2_time(c, inode->v.i_mtime);
-       bi->bi_ctime    = timespec_to_bch2_time(c, inode->v.i_ctime);
+       bi->bi_atime    = timespec_to_bch2_time(c, inode_get_atime(&inode->v));
+       bi->bi_mtime    = timespec_to_bch2_time(c, inode_get_mtime(&inode->v));
+       bi->bi_ctime    = timespec_to_bch2_time(c, inode_get_ctime(&inode->v));
 
        return 0;
 }
@@ -1589,7 +1592,7 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
 static char **split_devs(const char *_dev_name, unsigned *nr)
 {
        char *dev_name = NULL, **devs = NULL, *s;
-       size_t i, nr_devs = 0;
+       size_t i = 0, nr_devs = 0;
 
        dev_name = kstrdup(_dev_name, GFP_KERNEL);
        if (!dev_name)
@@ -1604,9 +1607,7 @@ static char **split_devs(const char *_dev_name, unsigned *nr)
                return NULL;
        }
 
-       for (i = 0, s = dev_name;
-            s;
-            (s = strchr(s, ':')) && (*s++ = '\0'))
+       while ((s = strsep(&dev_name, ":")))
                devs[i++] = s;
 
        *nr = nr_devs;
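
    The strsep() rewrite drops the index-juggling strchr() loop: strsep()
    NUL-terminates the current token in place, advances its cursor argument past
    the ':', and returns NULL when the string is exhausted. One behavioral note —
    unlike strtok(), it does return empty tokens for consecutive delimiters.
    Illustrative only:

        char buf[] = "sda:sdb";     /* mutable copy, as kstrdup() provides above */
        char *cur = buf, *tok;

        while ((tok = strsep(&cur, ":")))
                printk("dev: %s\n", tok);   /* prints "sda", then "sdb" */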
@@ -1649,7 +1650,7 @@ static int bch2_remount(struct super_block *sb, int *flags, char *data)
                up_write(&c->state_lock);
        }
 
-       if (opts.errors >= 0)
+       if (opt_defined(opts, errors))
                c->opts.errors = opts.errors;
 err:
        return bch2_err_class(ret);
@@ -1710,6 +1711,35 @@ static void bch2_put_super(struct super_block *sb)
        __bch2_fs_stop(c);
 }
 
+/*
+ * bcachefs doesn't currently integrate intwrite freeze protection but the
+ * internal write references serve the same purpose. Therefore reuse the
+ * read-only transition code to perform the quiesce. The caveat is that we don't
+ * currently have the ability to block tasks that want a write reference while
+ * the superblock is frozen. This is fine for now, but we should either add
+ * blocking support or find a way to integrate sb_start_intwrite() and friends.
+ */
+static int bch2_freeze(struct super_block *sb)
+{
+       struct bch_fs *c = sb->s_fs_info;
+
+       down_write(&c->state_lock);
+       bch2_fs_read_only(c);
+       up_write(&c->state_lock);
+       return 0;
+}
+
+static int bch2_unfreeze(struct super_block *sb)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       int ret;
+
+       down_write(&c->state_lock);
+       ret = bch2_fs_read_write(c);
+       up_write(&c->state_lock);
+       return ret;
+}
+
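
    With these hooks un-#if 0'd in bch_super_operations below, a VFS freeze
    (fsfreeze(8), or the FIFREEZE ioctl that freeze_super() services) now
    quiesces bcachefs by riding the existing read-only transition. A hypothetical
    userspace driver for the path:

        #include <fcntl.h>
        #include <sys/ioctl.h>
        #include <unistd.h>
        #include <linux/fs.h>

        int freeze_mount(const char *mnt)   /* hypothetical helper */
        {
                int fd = open(mnt, O_RDONLY);
                if (fd < 0)
                        return -1;
                int ret = ioctl(fd, FIFREEZE, 0);   /* FITHAW to undo */
                close(fd);
                return ret;
        }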
 static const struct super_operations bch_super_operations = {
        .alloc_inode    = bch2_alloc_inode,
        .destroy_inode  = bch2_destroy_inode,
@@ -1721,10 +1751,8 @@ static const struct super_operations bch_super_operations = {
        .show_options   = bch2_show_options,
        .remount_fs     = bch2_remount,
        .put_super      = bch2_put_super,
-#if 0
        .freeze_fs      = bch2_freeze,
        .unfreeze_fs    = bch2_unfreeze,
-#endif
 };
 
 static int bch2_set_super(struct super_block *s, void *data)
@@ -1878,7 +1906,7 @@ got_sb:
        vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
        ret = PTR_ERR_OR_ZERO(vinode);
        if (ret) {
-               bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "mounting: error getting root inode");
                goto err_put_super;
        }
 
index 6170d214d6489720791edd25a3811c2274baa64b..5edf1d4b9e6bdfa9a992bf895727228c79de4267 100644 (file)
@@ -174,7 +174,8 @@ static inline int bch2_set_projid(struct bch_fs *c,
 struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
 
 /* returns 0 if we want to do the update, or error is passed up */
-typedef int (*inode_set_fn)(struct bch_inode_info *,
+typedef int (*inode_set_fn)(struct btree_trans *,
+                           struct bch_inode_info *,
                            struct bch_inode_unpacked *, void *);
 
 void bch2_inode_update_after_write(struct btree_trans *,
@@ -196,7 +197,7 @@ int bch2_vfs_init(void);
 
 #else
 
-#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)       do {} while (0)
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)       ({ do {} while (0); })
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
                                               snapshot_id_list *s) {}
index 0852dbe988ad1fb5bc6cdcd8f2da821c287017a1..cc90279fdf4ee85c8d6db9f8108ce188c9fd19aa 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "bkey_buf.h"
+#include "btree_cache.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "darray.h"
@@ -11,7 +12,8 @@
 #include "fsck.h"
 #include "inode.h"
 #include "keylist.h"
-#include "subvolume.h"
+#include "recovery.h"
+#include "snapshot.h"
 #include "super.h"
 #include "xattr.h"
 
@@ -79,7 +81,7 @@ static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
        if (!ret)
                *subvol = le32_to_cpu(s.subvol);
        else if (bch2_err_matches(ret, ENOENT))
-               bch_err(trans->c, "snapshot %u not fonud", snapshot);
+               bch_err(trans->c, "snapshot %u not found", snapshot);
        return ret;
 
 }
@@ -125,9 +127,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
 
        ret = bch2_inode_unpack(k, inode);
 err:
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err(trans->c, "error fetching inode %llu: %s",
-                       inode_nr, bch2_err_str(ret));
+       bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -152,9 +152,7 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
        if (!ret)
                *snapshot = iter.pos.snapshot;
 err:
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err(trans->c, "error fetching inode %llu:%u: %s",
-                       inode_nr, *snapshot, bch2_err_str(ret));
+       bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -205,17 +203,16 @@ static int __write_inode(struct btree_trans *trans,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 }
 
-static int write_inode(struct btree_trans *trans,
-                      struct bch_inode_unpacked *inode,
-                      u32 snapshot)
+static int fsck_write_inode(struct btree_trans *trans,
+                           struct bch_inode_unpacked *inode,
+                           u32 snapshot)
 {
        int ret = commit_do(trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_LAZY_RW,
+                                 BCH_TRANS_COMMIT_no_enospc|
+                                 BCH_TRANS_COMMIT_lazy_rw,
                                  __write_inode(trans, inode, snapshot));
        if (ret)
-               bch_err(trans->c, "error in fsck: error updating inode: %s",
-                       bch2_err_str(ret));
+               bch_err_fn(trans->c, ret);
        return ret;
 }
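
Hunks throughout this file replace the open-coded pattern "if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(...)" with bch_err_msg()/bch_err_fn(). The real macros are defined elsewhere in the tree; this is only a hedged sketch of the behavior the converted call sites imply:

/* Hypothetical expansion inferred from the call sites: log only real
 * errors, staying silent for ret == 0 and for transaction restarts.
 */
#define bch_err_msg_sketch(c, ret, fmt, ...)					\
do {										\
	if ((ret) && !bch2_err_matches((ret), BCH_ERR_transaction_restart))	\
		bch_err(c, fmt ": %s", ##__VA_ARGS__, bch2_err_str(ret));	\
} while (0)
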
 
@@ -240,8 +237,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
        bch2_trans_iter_exit(trans, &iter);
 err:
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -276,14 +272,13 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
                goto create_lostfound;
        }
 
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret));
+       bch_err_fn(c, ret);
        if (ret)
                return ret;
 
        if (d_type != DT_DIR) {
                bch_err(c, "error looking up lost+found: not a directory");
-               return ret;
+               return -BCH_ERR_ENOENT_not_directory;
        }
 
        /*
@@ -299,8 +294,7 @@ create_lostfound:
                                lostfound, &lostfound_str,
                                0, 0, S_IFDIR|0700, 0, NULL, NULL,
                                (subvol_inum) { }, 0);
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err(c, "error creating lost+found: %s", bch2_err_str(ret));
+       bch_err_msg(c, ret, "creating lost+found");
        return ret;
 }
 
@@ -360,15 +354,10 @@ static int reattach_inode(struct btree_trans *trans,
                          u32 inode_snapshot)
 {
        int ret = commit_do(trans, NULL, NULL,
-                                 BTREE_INSERT_LAZY_RW|
-                                 BTREE_INSERT_NOFAIL,
+                                 BCH_TRANS_COMMIT_lazy_rw|
+                                 BCH_TRANS_COMMIT_no_enospc,
                        __reattach_inode(trans, inode, inode_snapshot));
-       if (ret) {
-               bch_err(trans->c, "error reattaching inode %llu: %s",
-                       inode->bi_inum, bch2_err_str(ret));
-               return ret;
-       }
-
+       bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
        return ret;
 }
 
@@ -408,6 +397,28 @@ static inline void snapshots_seen_init(struct snapshots_seen *s)
        memset(s, 0, sizeof(*s));
 }
 
+static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+       struct snapshots_seen_entry *i, n = {
+               .id     = id,
+               .equiv  = bch2_snapshot_equiv(c, id),
+       };
+       int ret = 0;
+
+       darray_for_each(s->ids, i) {
+               if (i->id == id)
+                       return 0;
+               if (i->id > id)
+                       break;
+       }
+
+       ret = darray_insert_item(&s->ids, i - s->ids.data, n);
+       if (ret)
+               bch_err(c, "error reallocating snapshots_seen table (size %zu)",
+                       s->ids.size);
+       return ret;
+}
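
snapshots_seen_add_inorder() above keeps s->ids sorted by id: it scans to the first entry with a larger id and inserts there, with darray_insert_item() shifting the tail. The same ordered-insert pattern over a plain C array, as a self-contained sketch (capacity checks elided, names illustrative):

#include <string.h>

struct entry { unsigned id, equiv; };

/* Insert n into d (sorted by id, *nr valid entries, capacity assumed). */
static void insert_inorder(struct entry *d, size_t *nr, struct entry n)
{
	size_t i;

	for (i = 0; i < *nr; i++) {
		if (d[i].id == n.id)
			return;				/* already present */
		if (d[i].id > n.id)
			break;				/* insertion point */
	}

	memmove(&d[i + 1], &d[i], (*nr - i) * sizeof(*d));
	d[i] = n;
	(*nr)++;
}
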
+
 static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
                                 enum btree_id btree_id, struct bpos pos)
 {
@@ -434,9 +445,10 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
                if (i->equiv == n.equiv) {
                        bch_err(c, "snapshot deletion did not finish:\n"
                                "  duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n",
-                               bch2_btree_ids[btree_id],
+                               bch2_btree_id_str(btree_id),
                                pos.inode, pos.offset,
                                i->id, n.id, n.equiv);
+                       set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
                        return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
                }
        }
@@ -452,7 +464,12 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
  * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
  * and @ancestor hasn't been overwritten in @seen
  *
- * That is, returns whether key in @ancestor snapshot is visible in @id snapshot
+ * @c:         filesystem handle
+ * @seen:      list of snapshot ids already seen at current position
+ * @id:		descendant snapshot id
+ * @ancestor:  ancestor snapshot id
+ *
+ * Returns:    whether key in @ancestor snapshot is visible in @id snapshot
  */
 static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
                                    u32 id, u32 ancestor)
@@ -497,14 +514,16 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see
  * snapshot id @dst, test whether there is some snapshot in which @dst is
  * visible.
  *
- * This assumes we're visiting @src keys in natural key order.
+ * @c:         filesystem handle
+ * @s:         list of snapshot IDs already seen at @src
+ * @src:       snapshot ID of src key
+ * @dst:       snapshot ID of dst key
+ * Returns:    true if there is some snapshot in which @dst is visible
  *
- * @s  - list of snapshot IDs already seen at @src
- * @src        - snapshot ID of src key
- * @dst        - snapshot ID of dst key
+ * Assumes we're visiting @src keys in natural key order
  */
-static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
-                      u32 src, u32 dst)
+static bool ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+                       u32 src, u32 dst)
 {
        return dst <= src
                ? key_visible_in_snapshot(c, s, dst, src)
@@ -595,10 +614,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
 
        w->first_this_inode = true;
 
-       if (trans_was_restarted(trans, restart_count))
-               return -BCH_ERR_transaction_restart_nested;
-
-       return 0;
+       return trans_was_restarted(trans, restart_count);
 }
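
This simplification works because trans_was_restarted() now itself evaluates to 0 or -BCH_ERR_transaction_restart_nested, so callers can fold the check into the return using GCC's `a ?: b` shorthand, which yields a when it is nonzero and b otherwise. The idiom in isolation:

/* `ret ?: next()` runs next() only when ret is 0, as used above. */
static int run_two_steps(int (*step1)(void), int (*step2)(void))
{
	int ret = step1();

	return ret ?: step2();
}
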
 
 static struct inode_walker_entry *
@@ -705,8 +721,9 @@ static int check_key_has_snapshot(struct btree_trans *trans,
        int ret = 0;
 
        if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
-                       "key in missing snapshot: %s",
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+                               bkey_in_missing_snapshot,
+                               "key in missing snapshot: %s",
+                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
                ret = bch2_btree_delete_at(trans, iter,
                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
 fsck_err:
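
The extra identifier now threaded through fsck_err_on()/mustfix_fsck_err_on() calls in this commit (bkey_in_missing_snapshot, hash_table_key_duplicate, and so on) names the error type, presumably so occurrences can be counted per type. A minimal standalone analog of that convention, with all names illustrative:

#include <stdio.h>

enum fsck_err_id { bkey_in_missing_snapshot, fsck_err_id_nr };

static unsigned long fsck_err_counts[fsck_err_id_nr];

/* Count the error by id, report it, and tell the caller to repair. */
static int fsck_err_on_sketch(int cond, enum fsck_err_id id, const char *msg)
{
	if (!cond)
		return 0;
	fsck_err_counts[id]++;
	fprintf(stderr, "fsck: %s\n", msg);
	return 1;
}
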
@@ -740,8 +757,8 @@ static int hash_redo_key(struct btree_trans *trans,
                                       BCH_HASH_SET_MUST_CREATE,
                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_LAZY_RW);
+                                 BCH_TRANS_COMMIT_no_enospc|
+                                 BCH_TRANS_COMMIT_lazy_rw);
 }
 
 static int hash_check_key(struct btree_trans *trans,
@@ -775,6 +792,7 @@ static int hash_check_key(struct btree_trans *trans,
 
                if (fsck_err_on(k.k->type == desc.key_type &&
                                !desc.cmp_bkey(k, hash_k), c,
+                               hash_table_key_duplicate,
                                "duplicate hash table keys:\n%s",
                                (printbuf_reset(&buf),
                                 bch2_bkey_val_to_text(&buf, c, hash_k),
@@ -793,13 +811,13 @@ out:
        printbuf_exit(&buf);
        return ret;
 bad_hash:
-       if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
-                    bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash,
+       if (fsck_err(c, hash_table_key_wrong_offset,
+                    "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+                    bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
                     (printbuf_reset(&buf),
                      bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
                ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
-               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       bch_err(c, "hash_redo_key err %s", bch2_err_str(ret));
+               bch_err_fn(c, ret);
                if (ret)
                        return ret;
                ret = -BCH_ERR_transaction_restart_nested;
@@ -830,52 +848,65 @@ static int check_inode(struct btree_trans *trans,
        if (ret)
                goto err;
 
-       /*
-        * if snapshot id isn't a leaf node, skip it - deletion in
-        * particular is not atomic, so on the internal snapshot nodes
-        * we can see inodes marked for deletion after a clean shutdown
-        */
-       if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot))
-               return 0;
-
        if (!bkey_is_inode(k.k))
                return 0;
 
        BUG_ON(bch2_inode_unpack(k, &u));
 
        if (!full &&
-           !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
-                           BCH_INODE_I_SECTORS_DIRTY|
-                           BCH_INODE_UNLINKED)))
+           !(u.bi_flags & (BCH_INODE_i_size_dirty|
+                           BCH_INODE_i_sectors_dirty|
+                           BCH_INODE_unlinked)))
                return 0;
 
        if (prev->bi_inum != u.bi_inum)
                *prev = u;
 
        if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
-                       inode_d_type(prev)      != inode_d_type(&u), c,
+                       inode_d_type(prev)      != inode_d_type(&u),
+                       c, inode_snapshot_mismatch,
                        "inodes in different snapshots don't match")) {
                bch_err(c, "repair not implemented yet");
                return -EINVAL;
        }
 
-       if (u.bi_flags & BCH_INODE_UNLINKED &&
+       if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
+           bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
+               struct bpos new_min_pos;
+
+               ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+               if (ret)
+                       goto err;
+
+               u.bi_flags &= ~(BCH_INODE_i_size_dirty|BCH_INODE_unlinked);
+
+               ret = __write_inode(trans, &u, iter->pos.snapshot);
+               bch_err_msg(c, ret, "in fsck updating inode");
+               if (ret)
+                       return ret;
+
+               if (!bpos_eq(new_min_pos, POS_MIN))
+                       bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
+               return 0;
+       }
+
+       if (u.bi_flags & BCH_INODE_unlinked &&
            (!c->sb.clean ||
-            fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
+            fsck_err(c, inode_unlinked_but_clean,
+                     "filesystem marked clean, but inode %llu unlinked",
                      u.bi_inum))) {
                bch2_trans_unlock(trans);
                bch2_fs_lazy_rw(c);
 
                ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
-               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       bch_err(c, "error in fsck: error while deleting inode: %s",
-                               bch2_err_str(ret));
+               bch_err_msg(c, ret, "in fsck deleting inode");
                return ret;
        }
 
-       if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
+       if (u.bi_flags & BCH_INODE_i_size_dirty &&
            (!c->sb.clean ||
-            fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
+            fsck_err(c, inode_i_size_dirty_but_clean,
+                     "filesystem marked clean, but inode %llu has i_size dirty",
                      u.bi_inum))) {
                bch_verbose(c, "truncating inode %llu", u.bi_inum);
 
@@ -891,9 +922,7 @@ static int check_inode(struct btree_trans *trans,
                                     iter->pos.snapshot),
                                POS(u.bi_inum, U64_MAX),
                                0, NULL);
-               if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       bch_err(c, "error in fsck: error truncating inode: %s",
-                               bch2_err_str(ret));
+               bch_err_msg(c, ret, "in fsck truncating inode");
                if (ret)
                        return ret;
 
@@ -901,15 +930,16 @@ static int check_inode(struct btree_trans *trans,
                 * We truncated without our normal sector accounting hook, just
                 * make sure we recalculate it:
                 */
-               u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+               u.bi_flags |= BCH_INODE_i_sectors_dirty;
 
-               u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+               u.bi_flags &= ~BCH_INODE_i_size_dirty;
                do_update = true;
        }
 
-       if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
+       if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
            (!c->sb.clean ||
-            fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
+            fsck_err(c, inode_i_sectors_dirty_but_clean,
+                     "filesystem marked clean, but inode %llu has i_sectors dirty",
                      u.bi_inum))) {
                s64 sectors;
 
@@ -918,33 +948,31 @@ static int check_inode(struct btree_trans *trans,
 
                sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
                if (sectors < 0) {
-                       bch_err(c, "error in fsck: error recounting inode sectors: %s",
-                               bch2_err_str(sectors));
+                       bch_err_msg(c, sectors, "in fsck recounting inode sectors");
                        return sectors;
                }
 
                u.bi_sectors = sectors;
-               u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+               u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
                do_update = true;
        }
 
-       if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
+       if (u.bi_flags & BCH_INODE_backptr_untrusted) {
                u.bi_dir = 0;
                u.bi_dir_offset = 0;
-               u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
+               u.bi_flags &= ~BCH_INODE_backptr_untrusted;
                do_update = true;
        }
 
        if (do_update) {
                ret = __write_inode(trans, &u, iter->pos.snapshot);
+               bch_err_msg(c, ret, "in fsck updating inode");
                if (ret)
-                       bch_err(c, "error in fsck: error updating inode: %s",
-                               bch2_err_str(ret));
+                       return ret;
        }
 err:
 fsck_err:
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -952,7 +980,7 @@ noinline_for_stack
 int bch2_check_inodes(struct bch_fs *c)
 {
        bool full = c->opts.fsck;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bch_inode_unpacked prev = { 0 };
        struct snapshots_seen s;
@@ -960,18 +988,16 @@ int bch2_check_inodes(struct bch_fs *c)
        int ret;
 
        snapshots_seen_init(&s);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
                        POS_MIN,
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_inode(&trans, &iter, k, &prev, &s, full));
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+               check_inode(trans, &iter, k, &prev, &s, full));
 
-       bch2_trans_exit(&trans);
        snapshots_seen_exit(&s);
-       if (ret)
-               bch_err_fn(c, ret);
+       bch2_trans_put(trans);
+       bch_err_fn(c, ret);
        return ret;
 }
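
The conversion visible here recurs through the rest of the file: struct btree_trans is no longer a stack variable set up with bch2_trans_init() and torn down with bch2_trans_exit(); callers take a handle from bch2_trans_get() and release it with bch2_trans_put(). The resulting shape, with the actual work stubbed out as a hypothetical helper:

static int walk_btree_example(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	int ret;

	ret = do_the_walk(trans);	/* hypothetical helper using the handle */

	bch2_trans_put(trans);		/* replaces bch2_trans_exit(&trans) */
	return ret;
}
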
 
@@ -997,25 +1023,6 @@ static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
                : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
 }
 
-static int inode_backpointer_exists(struct btree_trans *trans,
-                                   struct bch_inode_unpacked *inode,
-                                   u32 snapshot)
-{
-       struct btree_iter iter;
-       struct bkey_s_c_dirent d;
-       int ret;
-
-       d = dirent_get_by_pos(trans, &iter,
-                       SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
-       ret = bkey_err(d);
-       if (ret)
-               return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
-       ret = dirent_points_to_inode(d, inode);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
 static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 {
        struct bch_fs *c = trans->c;
@@ -1039,22 +1046,20 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
                        return -BCH_ERR_internal_fsck_err;
                }
 
-               if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
-                           "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
-                           w->last_pos.inode, i->snapshot,
-                           i->inode.bi_sectors, i->count)) {
+               if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty),
+                               c, inode_i_sectors_wrong,
+                               "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+                               w->last_pos.inode, i->snapshot,
+                               i->inode.bi_sectors, i->count)) {
                        i->inode.bi_sectors = i->count;
-                       ret = write_inode(trans, &i->inode, i->snapshot);
+                       ret = fsck_write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
                }
        }
 fsck_err:
-       if (ret)
-               bch_err_fn(c, ret);
-       if (!ret && trans_was_restarted(trans, restart_count))
-               ret = -BCH_ERR_transaction_restart_nested;
-       return ret;
+       bch_err_fn(c, ret);
+       return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 struct extent_end {
@@ -1122,7 +1127,8 @@ static int extent_ends_at(struct bch_fs *c,
 
 static int overlapping_extents_found(struct btree_trans *trans,
                                     enum btree_id btree,
-                                    struct bpos pos1, struct bkey pos2,
+                                    struct bpos pos1, struct snapshots_seen *pos1_seen,
+                                    struct bkey pos2,
                                     bool *fixed,
                                     struct extent_end *extent_end)
 {
@@ -1185,7 +1191,8 @@ static int overlapping_extents_found(struct btree_trans *trans,
        prt_printf(&buf, "\n  overwriting %s extent",
                   pos1.snapshot >= pos2.p.snapshot ? "first" : "second");
 
-       if (fsck_err(c, "overlapping extents%s", buf.buf)) {
+       if (fsck_err(c, extent_overlapping,
+                    "overlapping extents%s", buf.buf)) {
                struct btree_iter *old_iter = &iter1;
                struct disk_reservation res = { 0 };
 
@@ -1200,7 +1207,7 @@ static int overlapping_extents_found(struct btree_trans *trans,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
                                k1, k2) ?:
                        bch2_trans_commit(trans, &res, NULL,
-                               BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+                               BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc);
                bch2_disk_reservation_put(c, &res);
 
                if (ret)
@@ -1208,10 +1215,24 @@ static int overlapping_extents_found(struct btree_trans *trans,
 
                *fixed = true;
 
-               if (pos1.snapshot == pos2.p.snapshot)
+               if (pos1.snapshot == pos2.p.snapshot) {
+                       /*
+                        * We overwrote the first extent, and did the overwrite
+                        * in the same snapshot:
+                        */
                        extent_end->offset = bkey_start_offset(&pos2);
-               else
+               } else if (pos1.snapshot > pos2.p.snapshot) {
+                       /*
+                        * We overwrote the first extent in pos2's snapshot:
+                        */
+                       ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot);
+               } else {
+                       /*
+                        * We overwrote the second extent - restart
+                        * check_extent() from the top:
+                        */
                        ret = -BCH_ERR_transaction_restart_nested;
+               }
        }
 fsck_err:
 err:
@@ -1253,6 +1274,7 @@ static int check_overlapping_extents(struct btree_trans *trans,
                                                SPOS(iter->pos.inode,
                                                     i->offset,
                                                     i->snapshot),
+                                               &i->seen,
                                                *k.k, fixed, i);
                if (ret)
                        goto err;
@@ -1267,6 +1289,28 @@ err:
        return ret;
 }
 
+static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
+                               struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
+       unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
+
+       bkey_for_each_crc(k.k, ptrs, crc, i)
+               if (crc_is_encoded(crc) &&
+                   crc.uncompressed_size > encoded_extent_max_sectors) {
+                       struct printbuf buf = PRINTBUF;
+
+                       bch2_bkey_val_to_text(&buf, c, k);
+                       bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
+                       printbuf_exit(&buf);
+               }
+
+       return 0;
+}
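
In check_extent_overbig() above, c->opts.encoded_extent_max is stored in bytes while crc.uncompressed_size counts 512-byte sectors, hence the >> 9. For example, taking a 64 KiB limit purely for illustration:

unsigned encoded_extent_max = 64 << 10;		/* 65536 bytes (assumed value) */
unsigned max_sectors = encoded_extent_max >> 9;	/* 65536 / 512 = 128 sectors   */
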
+
 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                        struct bkey_s_c k,
                        struct inode_walker *inode,
@@ -1303,7 +1347,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                goto err;
 
        if (k.k->type != KEY_TYPE_whiteout) {
-               if (fsck_err_on(!i, c,
+               if (fsck_err_on(!i, c, extent_in_missing_inode,
                                "extent in missing inode:\n  %s",
                                (printbuf_reset(&buf),
                                 bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
@@ -1311,7 +1355,8 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 
                if (fsck_err_on(i &&
                                !S_ISREG(i->inode.bi_mode) &&
-                               !S_ISLNK(i->inode.bi_mode), c,
+                               !S_ISLNK(i->inode.bi_mode),
+                               c, extent_in_non_reg_inode,
                                "extent in non regular inode mode %o:\n  %s",
                                i->inode.bi_mode,
                                (printbuf_reset(&buf),
@@ -1341,9 +1386,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                        continue;
 
                if (k.k->type != KEY_TYPE_whiteout) {
-                       if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+                       if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
                                        k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
-                                       !bkey_extent_is_reservation(k), c,
+                                       !bkey_extent_is_reservation(k),
+                                       c, extent_past_end_of_inode,
                                        "extent type past end of inode %llu:%u, i_size %llu\n  %s",
                                        i->inode.bi_inum, i->snapshot, i->inode.bi_size,
                                        (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -1371,9 +1417,7 @@ out:
 err:
 fsck_err:
        printbuf_exit(&buf);
-
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 delete:
        ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
@@ -1388,7 +1432,7 @@ int bch2_check_extents(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
        struct snapshots_seen s;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct extent_ends extent_ends;
@@ -1397,26 +1441,49 @@ int bch2_check_extents(struct bch_fs *c)
 
        snapshots_seen_init(&s);
        extent_ends_init(&extent_ends);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
 
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
                        POS(BCACHEFS_ROOT_INO, 0),
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
                        &res, NULL,
-                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+                       BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
                bch2_disk_reservation_put(c, &res);
-               check_extent(&trans, &iter, k, &w, &s, &extent_ends);
+               check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+               check_extent_overbig(trans, &iter, k);
        })) ?:
-       check_i_sectors(&trans, &w);
+       check_i_sectors(trans, &w);
 
        bch2_disk_reservation_put(c, &res);
        extent_ends_exit(&extent_ends);
        inode_walker_exit(&w);
-       bch2_trans_exit(&trans);
        snapshots_seen_exit(&s);
+       bch2_trans_put(trans);
 
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
+       return ret;
+}
+
+int bch2_check_indirect_extents(struct bch_fs *c)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct disk_reservation res = { 0 };
+       int ret = 0;
+
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+                       POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       &res, NULL,
+                       BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc, ({
+               bch2_disk_reservation_put(c, &res);
+               check_extent_overbig(trans, &iter, k);
+       }));
+
+       bch2_disk_reservation_put(c, &res);
+       bch2_trans_put(trans);
+
+       bch_err_fn(c, ret);
        return ret;
 }
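
bch2_check_indirect_extents() follows the loop shape used throughout this file: for_each_btree_key_commit() runs the body for each key and commits the transaction with the given flags as it advances. Roughly, as a sketch inferred from the call sites (every helper below is a hypothetical stand-in, not real API):

static int for_each_key_commit_sketch(struct btree_trans *trans,
				      int (*body)(struct btree_trans *),
				      int (*commit)(struct btree_trans *))
{
	int ret = 0;

	while (have_next_key(trans)) {		/* hypothetical iterator */
		ret = body(trans) ?: commit(trans);
		if (ret)
			break;
	}
	return ret;
}
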
 
@@ -1444,21 +1511,19 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
                                continue;
                }
 
-               if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+               if (fsck_err_on(i->inode.bi_nlink != i->count,
+                               c, inode_dir_wrong_nlink,
                                "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
                                w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
                        i->inode.bi_nlink = i->count;
-                       ret = write_inode(trans, &i->inode, i->snapshot);
+                       ret = fsck_write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
                }
        }
 fsck_err:
-       if (ret)
-               bch_err_fn(c, ret);
-       if (!ret && trans_was_restarted(trans, restart_count))
-               ret = -BCH_ERR_transaction_restart_nested;
-       return ret;
+       bch_err_fn(c, ret);
+       return ret ?: trans_was_restarted(trans, restart_count);
 }
 
 static int check_dirent_target(struct btree_trans *trans,
@@ -1469,8 +1534,8 @@ static int check_dirent_target(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct bkey_i_dirent *n;
-       bool backpointer_exists = true;
        struct printbuf buf = PRINTBUF;
+       struct btree_iter bp_iter = { NULL };
        int ret = 0;
 
        if (!target->bi_dir &&
@@ -1484,34 +1549,47 @@ static int check_dirent_target(struct btree_trans *trans,
        }
 
        if (!inode_points_to_dirent(target, d)) {
-               ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
-               if (ret < 0)
+               struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+                                     SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+               ret = bkey_err(bp_dirent);
+               if (ret && !bch2_err_matches(ret, ENOENT))
                        goto err;
 
-               backpointer_exists = ret;
+               bool backpointer_exists = !ret;
                ret = 0;
 
-               if (fsck_err_on(S_ISDIR(target->bi_mode) &&
-                               backpointer_exists, c,
-                               "directory %llu with multiple links",
-                               target->bi_inum)) {
+               bch2_bkey_val_to_text(&buf, c, d.s_c);
+               prt_newline(&buf);
+               if (backpointer_exists)
+                       bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+               if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
+                               c, inode_dir_multiple_links,
+                               "directory %llu:%u with multiple links\n%s",
+                               target->bi_inum, target_snapshot, buf.buf)) {
                        ret = __remove_dirent(trans, d.k->p);
                        goto out;
                }
 
-               if (fsck_err_on(backpointer_exists &&
-                               !target->bi_nlink, c,
-                               "inode %llu type %s has multiple links but i_nlink 0",
-                               target->bi_inum, bch2_d_types[d.v->d_type])) {
+               /*
+                * hardlinked file with nlink 0:
+                * We're just adjusting nlink here so check_nlinks() will pick
+                * it up; it ignores inodes with nlink 0
+                */
+               if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+                               c, inode_multiple_links_but_nlink_0,
+                               "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+                               target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
                        target->bi_nlink++;
-                       target->bi_flags &= ~BCH_INODE_UNLINKED;
+                       target->bi_flags &= ~BCH_INODE_unlinked;
 
                        ret = __write_inode(trans, target, target_snapshot);
                        if (ret)
                                goto err;
                }
 
-               if (fsck_err_on(!backpointer_exists, c,
+               if (fsck_err_on(!backpointer_exists,
+                               c, inode_wrong_backpointer,
                                "inode %llu:%u has wrong backpointer:\n"
                                "got       %llu:%llu\n"
                                "should be %llu:%llu",
@@ -1529,7 +1607,8 @@ static int check_dirent_target(struct btree_trans *trans,
                }
        }
 
-       if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
+       if (fsck_err_on(d.v->d_type != inode_d_type(target),
+                       c, dirent_d_type_wrong,
                        "incorrect d_type: got %s, should be %s:\n%s",
                        bch2_d_type_str(d.v->d_type),
                        bch2_d_type_str(inode_d_type(target)),
@@ -1553,7 +1632,8 @@ static int check_dirent_target(struct btree_trans *trans,
        if (d.v->d_type == DT_SUBVOL &&
            target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
            (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
-            fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
+            fsck_err(c, dirent_d_parent_subvol_wrong,
+                     "dirent has wrong d_parent_subvol field: got %u, should be %u",
                      le32_to_cpu(d.v->d_parent_subvol),
                      target->bi_parent_subvol))) {
                n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
@@ -1573,10 +1653,9 @@ static int check_dirent_target(struct btree_trans *trans,
 out:
 err:
 fsck_err:
+       bch2_trans_iter_exit(trans, &bp_iter);
        printbuf_exit(&buf);
-
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1627,7 +1706,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
        dir->first_this_inode = false;
 
-       if (fsck_err_on(!i, c,
+       if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
                        "dirent in nonexisting directory:\n%s",
                        (printbuf_reset(&buf),
                         bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
@@ -1639,7 +1718,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
        if (!i)
                goto out;
 
-       if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
+       if (fsck_err_on(!S_ISDIR(i->inode.bi_mode),
+                       c, dirent_in_non_dir_inode,
                        "dirent in non directory inode type %s:\n%s",
                        bch2_d_type_str(inode_d_type(&i->inode)),
                        (printbuf_reset(&buf),
@@ -1673,7 +1753,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                if (ret && !bch2_err_matches(ret, ENOENT))
                        goto err;
 
-               if (fsck_err_on(ret, c,
+               if (fsck_err_on(ret, c, dirent_to_missing_subvol,
                                "dirent points to missing subvolume %u",
                                le32_to_cpu(d.v->d_child_subvol))) {
                        ret = __remove_dirent(trans, d.k->p);
@@ -1685,7 +1765,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                if (ret && !bch2_err_matches(ret, ENOENT))
                        goto err;
 
-               if (fsck_err_on(ret, c,
+               if (fsck_err_on(ret, c, subvol_to_missing_root,
                                "subvolume %u points to missing subvolume root %llu",
                                target_subvol,
                                target_inum)) {
@@ -1694,7 +1774,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                        goto err;
                }
 
-               if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+               if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
+                               c, subvol_root_wrong_bi_subvol,
                                "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
                                target_inum,
                                subvol_root.bi_subvol, target_subvol)) {
@@ -1713,7 +1794,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                if (ret)
                        goto err;
 
-               if (fsck_err_on(!target->inodes.nr, c,
+               if (fsck_err_on(!target->inodes.nr,
+                               c, dirent_to_missing_inode,
                                "dirent points to missing inode: (equiv %u)\n%s",
                                equiv.snapshot,
                                (printbuf_reset(&buf),
@@ -1740,9 +1822,7 @@ out:
 err:
 fsck_err:
        printbuf_exit(&buf);
-
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1756,29 +1836,26 @@ int bch2_check_dirents(struct bch_fs *c)
        struct inode_walker target = inode_walker_init();
        struct snapshots_seen s;
        struct bch_hash_info hash_info;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
        snapshots_seen_init(&s);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents,
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
                        POS(BCACHEFS_ROOT_INO, 0),
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                        k,
                        NULL, NULL,
-                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s));
+                       BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+               check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        snapshots_seen_exit(&s);
        inode_walker_exit(&dir);
        inode_walker_exit(&target);
-
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1804,7 +1881,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
                *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode);
        inode->first_this_inode = false;
 
-       if (fsck_err_on(!i, c,
+       if (fsck_err_on(!i, c, xattr_in_missing_inode,
                        "xattr for missing inode %llu",
                        k.k->p.inode))
                return bch2_btree_delete_at(trans, iter, 0);
@@ -1814,8 +1891,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
 
        ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
 fsck_err:
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1826,25 +1902,19 @@ int bch2_check_xattrs(struct bch_fs *c)
 {
        struct inode_walker inode = inode_walker_init();
        struct bch_hash_info hash_info;
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+       ret = bch2_trans_run(c,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
                        POS(BCACHEFS_ROOT_INO, 0),
                        BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
                        k,
                        NULL, NULL,
-                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_xattr(&trans, &iter, k, &hash_info, &inode));
-
-       bch2_trans_exit(&trans);
-
-       if (ret)
-               bch_err_fn(c, ret);
+                       BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+               check_xattr(trans, &iter, k, &hash_info, &inode)));
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1860,7 +1930,8 @@ static int check_root_trans(struct btree_trans *trans)
        if (ret && !bch2_err_matches(ret, ENOENT))
                return ret;
 
-       if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+       if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
+                               "root subvol missing")) {
                struct bkey_i_subvolume root_subvol;
 
                snapshot        = U32_MAX;
@@ -1872,14 +1943,13 @@ static int check_root_trans(struct btree_trans *trans)
                root_subvol.v.snapshot  = cpu_to_le32(snapshot);
                root_subvol.v.inode     = cpu_to_le64(inum);
                ret = commit_do(trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|
-                                     BTREE_INSERT_LAZY_RW,
-                       __bch2_btree_insert(trans, BTREE_ID_subvolumes,
+                                     BCH_TRANS_COMMIT_no_enospc|
+                                     BCH_TRANS_COMMIT_lazy_rw,
+                       bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
                                            &root_subvol.k_i, 0));
-               if (ret) {
-                       bch_err(c, "error writing root subvol: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "writing root subvol");
+               if (ret)
                        goto err;
-               }
 
        }
 
@@ -1887,16 +1957,17 @@ static int check_root_trans(struct btree_trans *trans)
        if (ret && !bch2_err_matches(ret, ENOENT))
                return ret;
 
-       if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
-           mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+       if (mustfix_fsck_err_on(ret, c, root_dir_missing,
+                               "root directory missing") ||
+           mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode),
+                               c, root_inode_not_dir,
                                "root inode not a directory")) {
                bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
                                0, NULL);
                root_inode.bi_inum = inum;
 
                ret = __write_inode(trans, &root_inode, snapshot);
-               if (ret)
-                       bch_err(c, "error writing root inode: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "writing root inode");
        }
 err:
 fsck_err:
@@ -1909,12 +1980,10 @@ int bch2_check_root(struct bch_fs *c)
        int ret;
 
        ret = bch2_trans_do(c, NULL, NULL,
-                            BTREE_INSERT_NOFAIL|
-                            BTREE_INSERT_LAZY_RW,
-               check_root_trans(&trans));
-
-       if (ret)
-               bch_err_fn(c, ret);
+                            BCH_TRANS_COMMIT_no_enospc|
+                            BCH_TRANS_COMMIT_lazy_rw,
+               check_root_trans(trans));
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1995,7 +2064,8 @@ static int check_path(struct btree_trans *trans,
                }
 
                if (bch2_err_matches(ret, ENOENT)) {
-                       if (fsck_err(c,  "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+                       if (fsck_err(c,  inode_unreachable,
+                                    "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
                                     inode->bi_inum, snapshot,
                                     bch2_d_type_str(inode_d_type(inode)),
                                     inode->bi_nlink,
@@ -2035,12 +2105,13 @@ static int check_path(struct btree_trans *trans,
                                pr_err("%llu:%u", i->inum, i->snapshot);
                        pr_err("%llu:%u", inode->bi_inum, snapshot);
 
-                       if (!fsck_err(c, "directory structure loop"))
+                       if (!fsck_err(c, dir_loop,
+                                     "directory structure loop"))
                                return 0;
 
                        ret = commit_do(trans, NULL, NULL,
-                                             BTREE_INSERT_NOFAIL|
-                                             BTREE_INSERT_LAZY_RW,
+                                             BCH_TRANS_COMMIT_no_enospc|
+                                             BCH_TRANS_COMMIT_lazy_rw,
                                        remove_backpointer(trans, inode));
                        if (ret) {
                                bch_err(c, "error removing dirent: %i", ret);
@@ -2051,8 +2122,7 @@ static int check_path(struct btree_trans *trans,
                }
        }
 fsck_err:
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -2063,16 +2133,14 @@ fsck_err:
  */
 int bch2_check_directory_structure(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_inode_unpacked u;
        pathbuf path = { 0, };
        int ret;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -2086,24 +2154,20 @@ int bch2_check_directory_structure(struct bch_fs *c)
                        break;
                }
 
-               if (u.bi_flags & BCH_INODE_UNLINKED)
+               if (u.bi_flags & BCH_INODE_unlinked)
                        continue;
 
-               ret = check_path(&trans, &path, &u, iter.pos.snapshot);
+               ret = check_path(trans, &path, &u, iter.pos.snapshot);
                if (ret)
                        break;
        }
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        darray_exit(&path);
-
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
-/* check_nlink pass: */
-
 struct nlink_table {
        size_t          nr;
        size_t          size;
@@ -2150,7 +2214,7 @@ static int nlink_cmp(const void *_l, const void *_r)
        const struct nlink *l = _l;
        const struct nlink *r = _r;
 
-       return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
+       return cmp_int(l->inum, r->inum);
 }
 
 static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
@@ -2185,15 +2249,13 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                                       struct nlink_table *t,
                                       u64 start, u64 *end)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_inode_unpacked u;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_inodes,
+       for_each_btree_key(trans, iter, BTREE_ID_inodes,
                           POS(0, start),
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
@@ -2222,8 +2284,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                }
 
        }
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
@@ -2235,7 +2297,7 @@ noinline_for_stack
 static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
                                     u64 range_start, u64 range_end)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct snapshots_seen s;
        struct btree_iter iter;
        struct bkey_s_c k;
@@ -2244,9 +2306,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
 
        snapshots_seen_init(&s);
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
                           BTREE_ITER_INTENT|
                           BTREE_ITER_PREFETCH|
                           BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
@@ -2266,12 +2326,12 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
                        break;
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret)
                bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        snapshots_seen_exit(&s);
        return ret;
 }
@@ -2306,7 +2366,8 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
                link = &links->d[++*idx];
        }
 
-       if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
+       if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count,
+                       c, inode_wrong_nlink,
                        "inode %llu type %s has wrong i_nlink (%u, should be %u)",
                        u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
                        bch2_inode_nlink_get(&u), link->count)) {
@@ -2322,24 +2383,19 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                               struct nlink_table *links,
                               u64 range_start, u64 range_end)
 {
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        size_t idx = 0;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes,
-                       POS(0, range_start),
-                       BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end));
-
-       bch2_trans_exit(&trans);
-
+       ret = bch2_trans_run(c,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+                               POS(0, range_start),
+                               BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+                               NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+                       check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
        if (ret < 0) {
-               bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+               bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
                return ret;
        }
 
@@ -2376,9 +2432,7 @@ int bch2_check_nlinks(struct bch_fs *c)
        } while (next_iter_range_start != U64_MAX);
 
        kvfree(links.d);
-
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -2419,14 +2473,12 @@ int bch2_fix_reflink_p(struct bch_fs *c)
                return 0;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                                BTREE_ID_extents, POS_MIN,
                                BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
                                BTREE_ITER_ALL_SNAPSHOTS, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-                       fix_reflink_p_key(&trans, &iter, k)));
-
-       if (ret)
-               bch_err_fn(c, ret);
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+                       fix_reflink_p_key(trans, &iter, k)));
+       bch_err_fn(c, ret);
        return ret;
 }
index 90c87b5089a01403bceabeeed390fa3adfea760e..da991e8cf27eb493ed5aac5a3e3da606ae089968 100644 (file)
@@ -4,6 +4,7 @@
 
 int bch2_check_inodes(struct bch_fs *);
 int bch2_check_extents(struct bch_fs *);
+int bch2_check_indirect_extents(struct bch_fs *);
 int bch2_check_dirents(struct bch_fs *);
 int bch2_check_xattrs(struct bch_fs *);
 int bch2_check_root(struct bch_fs *);
index fea21e1e5721e302504a751443835e3b91941590..b9d6dbf3a54b26bacc211b3d4779fddc68460af6 100644 (file)
@@ -6,11 +6,13 @@
 #include "bkey_methods.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "compress.h"
 #include "error.h"
 #include "extents.h"
 #include "extent_update.h"
 #include "inode.h"
 #include "str_hash.h"
+#include "snapshot.h"
 #include "subvolume.h"
 #include "varint.h"
 
 
 #include <asm/unaligned.h>
 
-const char * const bch2_inode_opts[] = {
 #define x(name, ...)   #name,
+const char * const bch2_inode_opts[] = {
        BCH_INODE_OPTS()
-#undef  x
        NULL,
 };
 
+static const char * const bch2_inode_flag_strs[] = {
+       BCH_INODE_FLAGS()
+       NULL
+};
+#undef  x
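
Note the restructuring here: the single `#define x(name, ...) #name,` now precedes both string tables, with one `#undef x` after them, so the same x-macro stringifies both lists. The idiom in self-contained form, with a made-up flag list:

#define EXAMPLE_FLAGS()	\
	x(dirty)	\
	x(unlinked)

#define x(name) #name,
static const char * const example_flag_strs[] = {
	EXAMPLE_FLAGS()
	NULL
};
#undef x

#define x(name) EXAMPLE_FLAG_##name,
enum example_flag { EXAMPLE_FLAGS() };
#undef x
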
+
 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
 
 static int inode_decode_field(const u8 *in, const u8 *end,
@@ -119,8 +126,7 @@ static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed,
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                struct bch_inode_unpacked unpacked;
 
-               int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i),
-                                          &unpacked);
+               ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked);
                BUG_ON(ret);
                BUG_ON(unpacked.bi_inum         != inode->bi_inum);
                BUG_ON(unpacked.bi_hash_seed    != inode->bi_hash_seed);
@@ -317,7 +323,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
        return bch2_inode_unpack_slowpath(k, unpacked);
 }
 
-int bch2_inode_peek(struct btree_trans *trans,
+static int bch2_inode_peek_nowarn(struct btree_trans *trans,
                    struct btree_iter *iter,
                    struct bch_inode_unpacked *inode,
                    subvol_inum inum, unsigned flags)
@@ -351,9 +357,20 @@ err:
        return ret;
 }
 
-int bch2_inode_write(struct btree_trans *trans,
+int bch2_inode_peek(struct btree_trans *trans,
+                   struct btree_iter *iter,
+                   struct bch_inode_unpacked *inode,
+                   subvol_inum inum, unsigned flags)
+{
+       int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+       bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum);
+       return ret;
+}
+
+int bch2_inode_write_flags(struct btree_trans *trans,
                     struct btree_iter *iter,
-                    struct bch_inode_unpacked *inode)
+                    struct bch_inode_unpacked *inode,
+                    enum btree_update_flags flags)
 {
        struct bkey_inode_buf *inode_p;
 
@@ -363,7 +380,7 @@ int bch2_inode_write(struct btree_trans *trans,
 
        bch2_inode_pack_inlined(inode_p, inode);
        inode_p->inode.k.p.snapshot = iter->snapshot;
-       return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
+       return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
 }
 
 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
@@ -387,117 +404,121 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
        return &inode_p->inode.k_i;
 }
 
-static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
+static int __bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k, struct printbuf *err)
 {
        struct bch_inode_unpacked unpacked;
+       int ret = 0;
 
-       if (k.k->p.inode) {
-               prt_printf(err, "nonzero k.p.inode");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(k.k->p.inode, c, err,
+                        inode_pos_inode_nonzero,
+                        "nonzero k.p.inode");
 
-       if (k.k->p.offset < BLOCKDEV_INODE_MAX) {
-               prt_printf(err, "fs inode in blockdev range");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(k.k->p.offset < BLOCKDEV_INODE_MAX, c, err,
+                        inode_pos_blockdev_range,
+                        "fs inode in blockdev range");
 
-       if (bch2_inode_unpack(k, &unpacked)) {
-               prt_printf(err, "invalid variable length fields");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) {
-               prt_printf(err, "invalid data checksum type (%u >= %u",
-                       unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(bch2_inode_unpack(k, &unpacked), c, err,
+                        inode_unpack_error,
+                        "invalid variable length fields");
 
-       if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
-               prt_printf(err, "invalid data checksum type (%u >= %u)",
-                      unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1, c, err,
+                        inode_checksum_type_invalid,
+                        "invalid data checksum type (%u >= %u)",
+                        unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1);
 
-       if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
-           unpacked.bi_nlink != 0) {
-               prt_printf(err, "flagged as unlinked but bi_nlink != 0");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(unpacked.bi_compression &&
+                        !bch2_compression_opt_valid(unpacked.bi_compression - 1), c, err,
+                        inode_compression_type_invalid,
+                        "invalid compression opt %u", unpacked.bi_compression - 1);
 
-       if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) {
-               prt_printf(err, "subvolume root but not a directory");
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on((unpacked.bi_flags & BCH_INODE_unlinked) &&
+                        unpacked.bi_nlink != 0, c, err,
+                        inode_unlinked_but_nlink_nonzero,
+                        "flagged as unlinked but bi_nlink != 0");
 
-       return 0;
+       bkey_fsck_err_on(unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode), c, err,
+                        inode_subvol_root_but_not_dir,
+                        "subvolume root but not a directory");
+fsck_err:
+       return ret;
 }
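/*
 * The open-coded "if (bad) { prt_printf(); return -BCH_ERR_invalid_bkey; }"
 * blocks above were converted to bkey_fsck_err_on().  A minimal sketch of
 * the macro's shape, assuming the per-error-type superblock counters this
 * release introduces (the real definition lives in the error headers):
 *
 *	#define bkey_fsck_err_on(_cond, _c, _err, _err_type, ...)	\
 *	do {								\
 *		if (unlikely(_cond)) {					\
 *			prt_printf(_err, __VA_ARGS__);			\
 *			bch2_sb_error_count(_c, BCH_FSCK_ERR_##_err_type); \
 *			ret = -BCH_ERR_invalid_bkey;			\
 *			goto fsck_err;					\
 *		}							\
 *	} while (0)
 *
 * which is why every converted function gains an "int ret = 0" and a
 * fsck_err: label.
 */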
 
-int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_invalid(struct bch_fs *c, struct bkey_s_c k,
                       enum bkey_invalid_flags flags,
                       struct printbuf *err)
 {
        struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+       int ret = 0;
 
-       if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
-               prt_printf(err, "invalid str hash type (%llu >= %u)",
-                      INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+                        inode_str_hash_invalid,
+                        "invalid str hash type (%llu >= %u)",
+                        INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
 
-       return __bch2_inode_invalid(k, err);
+       ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+       return ret;
 }
 
-int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
 {
        struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+       int ret = 0;
 
-       if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
-               prt_printf(err, "invalid str hash type (%llu >= %u)",
-                      INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+                        inode_str_hash_invalid,
+                        "invalid str hash type (%llu >= %u)",
+                        INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR);
 
-       return __bch2_inode_invalid(k, err);
+       ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+       return ret;
 }
 
-int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
                          enum bkey_invalid_flags flags,
                          struct printbuf *err)
 {
        struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k);
+       int ret = 0;
 
-       if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
-           INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) {
-               prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)",
-                      INODEv3_FIELDS_START(inode.v),
-                      INODEv3_FIELDS_START_INITIAL,
-                      bkey_val_u64s(inode.k));
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL ||
+                        INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k), c, err,
+                        inode_v3_fields_start_bad,
+                        "invalid fields_start (got %llu, min %u max %zu)",
+                        INODEv3_FIELDS_START(inode.v),
+                        INODEv3_FIELDS_START_INITIAL,
+                        bkey_val_u64s(inode.k));
 
-       if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) {
-               prt_printf(err, "invalid str hash type (%llu >= %u)",
-                      INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR, c, err,
+                        inode_str_hash_invalid,
+                        "invalid str hash type (%llu >= %u)",
+                        INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR);
 
-       return __bch2_inode_invalid(k, err);
+       ret = __bch2_inode_invalid(c, k, err);
+fsck_err:
+       return ret;
 }
 
 static void __bch2_inode_unpacked_to_text(struct printbuf *out,
                                          struct bch_inode_unpacked *inode)
 {
-       prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu",
-              inode->bi_mode, inode->bi_flags,
+       prt_printf(out, "mode=%o ", inode->bi_mode);
+
+       prt_str(out, "flags=");
+       prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
+       prt_printf(out, " (%x)", inode->bi_flags);
+
+       prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
               inode->bi_journal_seq,
               inode->bi_size,
               inode->bi_sectors,
               inode->bi_version);
 
 #define x(_name, _bits)                                                \
-       prt_printf(out, " "#_name " %llu", (u64) inode->_name);
+       prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
        BCH_INODE_FIELDS_v3()
 #undef  x
 }
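/*
 * prt_bitflags() walks the NULL-terminated bch2_inode_flag_strs array and
 * prints the name of each set bit, comma separated, with the raw value
 * kept in hex parentheses.  As an illustrative example (hypothetical flag
 * values), an unlinked regular file with a dirty i_size would render
 * roughly as:
 *
 *	mode=100644 flags=i_size_dirty,unlinked (a0) journal_seq=...
 */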
@@ -520,23 +541,25 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
        __bch2_inode_unpacked_to_text(out, &inode);
 }
 
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+static inline u64 bkey_inode_flags(struct bkey_s_c k)
 {
        switch (k.k->type) {
        case KEY_TYPE_inode:
-               return bkey_s_c_to_inode(k).v->bi_flags &
-                       cpu_to_le32(BCH_INODE_UNLINKED);
+               return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
        case KEY_TYPE_inode_v2:
-               return bkey_s_c_to_inode_v2(k).v->bi_flags &
-                       cpu_to_le32(BCH_INODE_UNLINKED);
+               return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
        case KEY_TYPE_inode_v3:
-               return bkey_s_c_to_inode_v3(k).v->bi_flags &
-                       cpu_to_le64(BCH_INODE_UNLINKED);
+               return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
        default:
-               return false;
+               return 0;
        }
 }
 
+static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+{
+       return bkey_inode_flags(k) & BCH_INODE_unlinked;
+}
+
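/*
 * Note the byte-order cleanup folded into this refactor: the old checks
 * masked the on-disk little-endian bi_flags field with cpu_to_le32/64() of
 * the flag - and the v2 case masked a __le64 field with a 32-bit-converted
 * constant, which is fragile on big-endian hosts.  bkey_inode_flags()
 * converts to host byte order once, so callers can test BCH_INODE_unlinked
 * directly.
 */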
 int bch2_trans_mark_inode(struct btree_trans *trans,
                          enum btree_id btree_id, unsigned level,
                          struct bkey_s_c old,
@@ -598,16 +621,17 @@ int bch2_mark_inode(struct btree_trans *trans,
        return 0;
 }
 
-int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_inode_generation_invalid(struct bch_fs *c, struct bkey_s_c k,
                                  enum bkey_invalid_flags flags,
                                  struct printbuf *err)
 {
-       if (k.k->p.inode) {
-               prt_printf(err, "nonzero k.p.inode");
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
 
-       return 0;
+       bkey_fsck_err_on(k.k->p.inode, c, err,
+                        inode_pos_inode_nonzero,
+                        "nonzero k.p.inode");
+fsck_err:
+       return ret;
 }
 
 void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -768,6 +792,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i delete;
+       struct bpos end = POS(inum.inum, U64_MAX);
        u32 snapshot;
        int ret = 0;
 
@@ -776,7 +801,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
         * extent iterator:
         */
        bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
-                            BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+                            BTREE_ITER_INTENT);
 
        while (1) {
                bch2_trans_begin(trans);
@@ -787,7 +812,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
 
                bch2_btree_iter_set_snapshot(&iter, snapshot);
 
-               k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX));
+               k = bch2_btree_iter_peek_upto(&iter, end);
                ret = bkey_err(k);
                if (ret)
                        goto err;
@@ -798,9 +823,14 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
                bkey_init(&delete.k);
                delete.k.p = iter.pos;
 
+               if (iter.flags & BTREE_ITER_IS_EXTENTS)
+                       bch2_key_resize(&delete.k,
+                                       bpos_min(end, k.k->p).offset -
+                                       iter.pos.offset);
+
                ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
                      bch2_trans_commit(trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL);
+                                       BCH_TRANS_COMMIT_no_enospc);
 err:
                if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        break;
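/*
 * Why the resize matters: bkeys position at their end, so the freshly
 * bkey_init()ed whiteout at iter.pos has size 0 and would delete only a
 * single slot on an extents btree.  bch2_key_resize() - roughly, per its
 * definition in bkey.h - grows the key forward from its start:
 *
 *	static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
 *	{
 *		k->p.offset -= k->size;
 *		k->p.offset += new_size;
 *		k->size = new_size;
 *	}
 *
 * so the deletion spans [iter.pos, min(end, k.k->p)) and one commit can
 * remove an entire extent instead of iterating it slot by slot.
 */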
@@ -812,7 +842,7 @@ err:
 
 int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_i_inode_generation delete;
        struct bch_inode_unpacked inode_u;
@@ -820,8 +850,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
        u32 snapshot;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 1024);
-
        /*
         * If this was a directory, there shouldn't be any real dirents left -
         * but there could be whiteouts (from hash collisions) that we should
@@ -830,19 +858,19 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
         * XXX: the dirent code could ideally delete whiteouts when they're
         * no longer needed
         */
-       ret   = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
-               bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
-               bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
+       ret   = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?:
+               bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?:
+               bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents);
        if (ret)
                goto err;
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
 
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                goto err;
 
-       k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes,
+       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
                               SPOS(0, inum.inum, snapshot),
                               BTREE_ITER_INTENT|BTREE_ITER_CACHED);
        ret = bkey_err(k);
@@ -850,7 +878,7 @@ retry:
                goto err;
 
        if (!bkey_is_inode(k.k)) {
-               bch2_fs_inconsistent(trans.c,
+               bch2_fs_inconsistent(c,
                                     "inode %llu:%u not found when deleting",
                                     inum.inum, snapshot);
                ret = -EIO;
@@ -863,15 +891,28 @@ retry:
        delete.k.p = iter.pos;
        delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
 
-       ret   = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?:
-               bch2_trans_commit(&trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL);
+       ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                               BCH_TRANS_COMMIT_no_enospc);
 err:
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
+       return ret;
+}
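/*
 * This function shows the tree-wide transaction API conversion in this
 * release.  The old on-stack pattern
 *
 *	struct btree_trans trans;
 *	bch2_trans_init(&trans, c, 0, 1024);
 *	...
 *	bch2_trans_exit(&trans);
 *
 * becomes
 *
 *	struct btree_trans *trans = bch2_trans_get(c);
 *	...
 *	bch2_trans_put(trans);
 *
 * so helpers now take a struct btree_trans * and every "&trans" call site
 * in this diff drops the address-of.
 */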
+
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans,
+                                 subvol_inum inum,
+                                 struct bch_inode_unpacked *inode)
+{
+       struct btree_iter iter;
+       int ret;
+
+       ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0);
+       if (!ret)
+               bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -892,13 +933,13 @@ int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
                            struct bch_inode_unpacked *inode)
 {
        return bch2_trans_do(c, NULL, NULL, 0,
-               bch2_inode_find_by_inum_trans(&trans, inum, inode));
+               bch2_inode_find_by_inum_trans(trans, inum, inode));
 }
 
 int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
 {
-       if (bi->bi_flags & BCH_INODE_UNLINKED)
-               bi->bi_flags &= ~BCH_INODE_UNLINKED;
+       if (bi->bi_flags & BCH_INODE_unlinked)
+               bi->bi_flags &= ~BCH_INODE_unlinked;
        else {
                if (bi->bi_nlink == U32_MAX)
                        return -EINVAL;
@@ -911,13 +952,13 @@ int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
 
 void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi)
 {
-       if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) {
+       if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_unlinked)) {
                bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero",
                                        bi->bi_inum);
                return;
        }
 
-       if (bi->bi_flags & BCH_INODE_UNLINKED) {
+       if (bi->bi_flags & BCH_INODE_unlinked) {
                bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum);
                return;
        }
@@ -925,7 +966,7 @@ void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *
        if (bi->bi_nlink)
                bi->bi_nlink--;
        else
-               bi->bi_flags |= BCH_INODE_UNLINKED;
+               bi->bi_flags |= BCH_INODE_unlinked;
 }
 
 struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode)
@@ -950,6 +991,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
                opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
 }
 
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+       struct bch_inode_unpacked inode;
+       int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+       if (ret)
+               return ret;
+
+       bch2_inode_opts_get(opts, trans->c, &inode);
+       return 0;
+}
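/*
 * lockrestart_do() wraps its expression in the standard restart loop; a
 * sketch of its shape (the real macro lives in the btree update headers):
 *
 *	#define lockrestart_do(_trans, _do)				\
 *	({								\
 *		int _ret;						\
 *		do {							\
 *			bch2_trans_begin(_trans);			\
 *			_ret = (_do);					\
 *		} while (bch2_err_matches(_ret,				\
 *				BCH_ERR_transaction_restart));		\
 *		_ret;							\
 *	})
 *
 * so bch2_inum_opts_get() retries the inode lookup until it finishes
 * without a transaction restart.
 */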
+
 int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
 {
        struct bch_fs *c = trans->c;
@@ -1004,7 +1057,7 @@ retry:
 
        ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
                bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL);
+                               BCH_TRANS_COMMIT_no_enospc);
 err:
        bch2_trans_iter_exit(trans, &iter);
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1013,60 +1066,98 @@ err:
        return ret ?: -BCH_ERR_transaction_restart_nested;
 }
 
-static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos)
+static int may_delete_deleted_inode(struct btree_trans *trans,
+                                   struct btree_iter *iter,
+                                   struct bpos pos,
+                                   bool *need_another_pass)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter iter;
+       struct btree_iter inode_iter;
        struct bkey_s_c k;
        struct bch_inode_unpacked inode;
        int ret;
 
-       if (bch2_snapshot_is_internal_node(c, pos.snapshot))
-               return 0;
-
-       if (!fsck_err_on(c->sb.clean, c,
-                        "filesystem marked as clean but have deleted inode %llu:%u",
-                        pos.offset, pos.snapshot))
-               return 0;
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
+       k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED);
        ret = bkey_err(k);
        if (ret)
                return ret;
 
        ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode;
        if (fsck_err_on(!bkey_is_inode(k.k), c,
+                       deleted_inode_missing,
                        "nonexistent inode %llu:%u in deleted_inodes btree",
                        pos.offset, pos.snapshot))
                goto delete;
 
        ret = bch2_inode_unpack(k, &inode);
        if (ret)
-               goto err;
+               goto out;
+
+       if (fsck_err_on(S_ISDIR(inode.bi_mode), c,
+                       deleted_inode_is_dir,
+                       "directory %llu:%u in deleted_inodes btree",
+                       pos.offset, pos.snapshot))
+               goto delete;
 
-       if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c,
+       if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), c,
+                       deleted_inode_not_unlinked,
                        "non-deleted inode %llu:%u in deleted_inodes btree",
                        pos.offset, pos.snapshot))
                goto delete;
 
-       return 1;
-err:
+       if (c->sb.clean &&
+           !fsck_err(c,
+                     deleted_inode_but_clean,
+                     "filesystem marked as clean but have deleted inode %llu:%u",
+                     pos.offset, pos.snapshot)) {
+               ret = 0;
+               goto out;
+       }
+
+       if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
+               struct bpos new_min_pos;
+
+               ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
+               if (ret)
+                       goto out;
+
+               inode.bi_flags &= ~BCH_INODE_unlinked;
+
+               ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
+                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+               bch_err_msg(c, ret, "clearing inode unlinked flag");
+               if (ret)
+                       goto out;
+
+               /*
+                * We'll need another write buffer flush to pick up the new
+                * unlinked inodes in the snapshot leaves:
+                */
+               *need_another_pass = true;
+               goto out;
+       }
+
+       ret = 1;
+out:
 fsck_err:
+       bch2_trans_iter_exit(trans, &inode_iter);
        return ret;
 delete:
-       return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+       ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+       goto out;
 }
 
 int bch2_delete_dead_inodes(struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
+       bool need_another_pass;
        int ret;
+again:
+       need_another_pass = false;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       ret = bch2_btree_write_buffer_flush_sync(&trans);
+       ret = bch2_btree_write_buffer_flush_sync(trans);
        if (ret)
                goto err;
 
@@ -1076,26 +1167,34 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
         * but we can't retry because the btree write buffer won't have been
         * flushed and we'd spin:
         */
-       for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+       for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
                           BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
-               ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p));
+               ret = commit_do(trans, NULL, NULL,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               BCH_TRANS_COMMIT_lazy_rw,
+                       may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
                if (ret < 0)
                        break;
 
                if (ret) {
                        if (!test_bit(BCH_FS_RW, &c->flags)) {
-                               bch2_trans_unlock(&trans);
+                               bch2_trans_unlock(trans);
                                bch2_fs_lazy_rw(c);
                        }
 
-                       ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot);
+                       bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
+
+                       ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
                        if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                break;
                }
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (!ret && need_another_pass)
+               goto again;
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
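/*
 * Net control flow after this change: flush the btree write buffer so the
 * deleted_inodes btree is current, scan it, and let
 * may_delete_deleted_inode() decide per key.  When an unlinked inode sits
 * at an interior snapshot node, the key is first propagated to the
 * snapshot leaves and need_another_pass is set - those per-leaf unlinked
 * inodes only reach deleted_inodes through the write buffer, hence the
 * goto again for one more flush-and-scan round.
 */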
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 22b24405649f0200cc3785bb0e4b431dc02ea72a..88818a332b1e5fcaa5fd9b350d958ef582c05161 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -3,16 +3,17 @@
 #define _BCACHEFS_INODE_H
 
 #include "bkey.h"
+#include "bkey_methods.h"
 #include "opts.h"
 
 enum bkey_invalid_flags;
 extern const char * const bch2_inode_opts[];
 
-int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_invalid(struct bch_fs *, struct bkey_s_c,
                       enum bkey_invalid_flags, struct printbuf *);
-int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_v2_invalid(struct bch_fs *, struct bkey_s_c,
                          enum bkey_invalid_flags, struct printbuf *);
-int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
                          enum bkey_invalid_flags, struct printbuf *);
 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
@@ -52,7 +53,7 @@ static inline bool bkey_is_inode(const struct bkey *k)
                k->type == KEY_TYPE_inode_v3;
 }
 
-int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_inode_generation_invalid(struct bch_fs *, struct bkey_s_c,
                                  enum bkey_invalid_flags, struct printbuf *);
 void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
@@ -101,8 +102,16 @@ void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *)
 
 int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
                    struct bch_inode_unpacked *, subvol_inum, unsigned);
-int bch2_inode_write(struct btree_trans *, struct btree_iter *,
-                    struct bch_inode_unpacked *);
+
+int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
+                    struct bch_inode_unpacked *, enum btree_update_flags);
+
+static inline int bch2_inode_write(struct btree_trans *trans,
+                    struct btree_iter *iter,
+                    struct bch_inode_unpacked *inode)
+{
+       return bch2_inode_write_flags(trans, iter, inode, 0);
+}
 
 void bch2_inode_init_early(struct bch_fs *,
                           struct bch_inode_unpacked *);
@@ -118,6 +127,9 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *,
 
 int bch2_inode_rm(struct bch_fs *, subvol_inum);
 
+int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *,
+                                 subvol_inum,
+                                 struct bch_inode_unpacked *);
 int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
                                  struct bch_inode_unpacked *);
 int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
@@ -174,7 +186,7 @@ static inline unsigned nlink_bias(umode_t mode)
 
 static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
 {
-       return bi->bi_flags & BCH_INODE_UNLINKED
+       return bi->bi_flags & BCH_INODE_unlinked
                  ? 0
                  : bi->bi_nlink + nlink_bias(bi->bi_mode);
 }
@@ -184,10 +196,10 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
 {
        if (nlink) {
                bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
-               bi->bi_flags &= ~BCH_INODE_UNLINKED;
+               bi->bi_flags &= ~BCH_INODE_unlinked;
        } else {
                bi->bi_nlink = 0;
-               bi->bi_flags |= BCH_INODE_UNLINKED;
+               bi->bi_flags |= BCH_INODE_unlinked;
        }
 }
 
@@ -197,6 +209,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
 struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
 void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
                         struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
 
 int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
 int bch2_delete_dead_inodes(struct bch_fs *);
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
deleted file mode 100644
index 5bacc6a..0000000
--- a/libbcachefs/io.c
+++ /dev/null
@@ -1,3059 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some low level IO code, and hacks for various block layer limitations
- *
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
- * Copyright 2012 Google, Inc.
- */
-
-#include "bcachefs.h"
-#include "alloc_background.h"
-#include "alloc_foreground.h"
-#include "bkey_buf.h"
-#include "bset.h"
-#include "btree_update.h"
-#include "buckets.h"
-#include "checksum.h"
-#include "compress.h"
-#include "clock.h"
-#include "data_update.h"
-#include "debug.h"
-#include "disk_groups.h"
-#include "ec.h"
-#include "error.h"
-#include "extent_update.h"
-#include "inode.h"
-#include "io.h"
-#include "journal.h"
-#include "keylist.h"
-#include "move.h"
-#include "nocow_locking.h"
-#include "rebalance.h"
-#include "subvolume.h"
-#include "super.h"
-#include "super-io.h"
-#include "trace.h"
-
-#include <linux/blkdev.h>
-#include <linux/prefetch.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-
-const char *bch2_blk_status_to_str(blk_status_t status)
-{
-       if (status == BLK_STS_REMOVED)
-               return "device removed";
-       return blk_status_to_str(status);
-}
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-       const struct bch_devs_mask *devs;
-       unsigned d, nr = 0, total = 0;
-       u64 now = local_clock(), last;
-       s64 congested;
-       struct bch_dev *ca;
-
-       if (!target)
-               return false;
-
-       rcu_read_lock();
-       devs = bch2_target_to_mask(c, target) ?:
-               &c->rw_devs[BCH_DATA_user];
-
-       for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
-               ca = rcu_dereference(c->devs[d]);
-               if (!ca)
-                       continue;
-
-               congested = atomic_read(&ca->congested);
-               last = READ_ONCE(ca->congested_last);
-               if (time_after64(now, last))
-                       congested -= (now - last) >> 12;
-
-               total += max(congested, 0LL);
-               nr++;
-       }
-       rcu_read_unlock();
-
-       return bch2_rand_range(nr * CONGESTED_MAX) < total;
-}
-
-static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
-                                      u64 now, int rw)
-{
-       u64 latency_capable =
-               ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
-       /* ideally we'd be taking into account the device's variance here: */
-       u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
-       s64 latency_over = io_latency - latency_threshold;
-
-       if (latency_threshold && latency_over > 0) {
-               /*
-                * bump up congested by approximately latency_over * 4 /
-                * latency_threshold - we don't need much accuracy here so don't
-                * bother with the divide:
-                */
-               if (atomic_read(&ca->congested) < CONGESTED_MAX)
-                       atomic_add(latency_over >>
-                                  max_t(int, ilog2(latency_threshold) - 2, 0),
-                                  &ca->congested);
-
-               ca->congested_last = now;
-       } else if (atomic_read(&ca->congested) > 0) {
-               atomic_dec(&ca->congested);
-       }
-}
-
-void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
-{
-       atomic64_t *latency = &ca->cur_latency[rw];
-       u64 now = local_clock();
-       u64 io_latency = time_after64(now, submit_time)
-               ? now - submit_time
-               : 0;
-       u64 old, new, v = atomic64_read(latency);
-
-       do {
-               old = v;
-
-               /*
-                * If the io latency was reasonably close to the current
-                * latency, skip doing the update and atomic operation - most of
-                * the time:
-                */
-               if (abs((int) (old - io_latency)) < (old >> 1) &&
-                   now & ~(~0U << 5))
-                       break;
-
-               new = ewma_add(old, io_latency, 5);
-       } while ((v = atomic64_cmpxchg(latency, old, new)) != old);
-
-       bch2_congested_acct(ca, io_latency, now, rw);
-
-       __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
-}
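/*
 * ewma_add(old, val, 5) maintains an exponentially weighted moving
 * average with weight 2^5; roughly, per the helper in util.h:
 *
 *	new = (old * (2^5 - 1) + val) >> 5
 *
 * The early break above is a throttle, not a correctness condition: when
 * a sample lands within ~50% of the current average, the update is
 * skipped unless the low 5 bits of the clock are zero, so the cmpxchg
 * loop runs on outliers and otherwise only about one time in 32.
 */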
-
-#else
-
-static bool bch2_target_congested(struct bch_fs *c, u16 target)
-{
-       return false;
-}
-
-#endif
-
-/* Allocate, free from mempool: */
-
-void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
-{
-       struct bvec_iter_all iter;
-       struct bio_vec *bv;
-
-       bio_for_each_segment_all(bv, bio, iter)
-               if (bv->bv_page != ZERO_PAGE(0))
-                       mempool_free(bv->bv_page, &c->bio_bounce_pages);
-       bio->bi_vcnt = 0;
-}
-
-static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
-{
-       struct page *page;
-
-       if (likely(!*using_mempool)) {
-               page = alloc_page(GFP_NOFS);
-               if (unlikely(!page)) {
-                       mutex_lock(&c->bio_bounce_pages_lock);
-                       *using_mempool = true;
-                       goto pool_alloc;
-
-               }
-       } else {
-pool_alloc:
-               page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
-       }
-
-       return page;
-}
-
-void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
-                              size_t size)
-{
-       bool using_mempool = false;
-
-       while (size) {
-               struct page *page = __bio_alloc_page_pool(c, &using_mempool);
-               unsigned len = min_t(size_t, PAGE_SIZE, size);
-
-               BUG_ON(!bio_add_page(bio, page, len, 0));
-               size -= len;
-       }
-
-       if (using_mempool)
-               mutex_unlock(&c->bio_bounce_pages_lock);
-}
-
-/* Extent update path: */
-
-int bch2_sum_sector_overwrites(struct btree_trans *trans,
-                              struct btree_iter *extent_iter,
-                              struct bkey_i *new,
-                              bool *usage_increasing,
-                              s64 *i_sectors_delta,
-                              s64 *disk_sectors_delta)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c old;
-       unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
-       bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
-       int ret = 0;
-
-       *usage_increasing       = false;
-       *i_sectors_delta        = 0;
-       *disk_sectors_delta     = 0;
-
-       bch2_trans_copy_iter(&iter, extent_iter);
-
-       for_each_btree_key_upto_continue_norestart(iter,
-                               new->k.p, BTREE_ITER_SLOTS, old, ret) {
-               s64 sectors = min(new->k.p.offset, old.k->p.offset) -
-                       max(bkey_start_offset(&new->k),
-                           bkey_start_offset(old.k));
-
-               *i_sectors_delta += sectors *
-                       (bkey_extent_is_allocation(&new->k) -
-                        bkey_extent_is_allocation(old.k));
-
-               *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
-               *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
-                       ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
-                       : 0;
-
-               if (!*usage_increasing &&
-                   (new->k.p.snapshot != old.k->p.snapshot ||
-                    new_replicas > bch2_bkey_replicas(c, old) ||
-                    (!new_compressed && bch2_bkey_sectors_compressed(old))))
-                       *usage_increasing = true;
-
-               if (bkey_ge(old.k->p, new->k.p))
-                       break;
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
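/*
 * A worked example with made-up numbers: writing a 64-sector, 2-replica
 * extent over a 64-sector, 1-replica fully-allocated extent in the same
 * snapshot gives
 *
 *	i_sectors_delta    = 64 * (1 - 1)    =  0
 *	disk_sectors_delta = 64 * 2 - 64 * 1 = 64
 *
 * and usage_increasing is set because the new key carries more replicas
 * than the old one.
 */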
-
-static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
-                                                   struct btree_iter *extent_iter,
-                                                   u64 new_i_size,
-                                                   s64 i_sectors_delta)
-{
-       struct btree_iter iter;
-       struct bkey_i *k;
-       struct bkey_i_inode_v3 *inode;
-       unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
-       int ret;
-
-       k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
-                             SPOS(0,
-                                  extent_iter->pos.inode,
-                                  extent_iter->snapshot),
-                             BTREE_ITER_CACHED);
-       ret = PTR_ERR_OR_ZERO(k);
-       if (unlikely(ret))
-               return ret;
-
-       if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
-               k = bch2_inode_to_v3(trans, k);
-               ret = PTR_ERR_OR_ZERO(k);
-               if (unlikely(ret))
-                       goto err;
-       }
-
-       inode = bkey_i_to_inode_v3(k);
-
-       if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
-           new_i_size > le64_to_cpu(inode->v.bi_size)) {
-               inode->v.bi_size = cpu_to_le64(new_i_size);
-               inode_update_flags = 0;
-       }
-
-       if (i_sectors_delta) {
-               le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
-               inode_update_flags = 0;
-       }
-
-       if (inode->k.p.snapshot != iter.snapshot) {
-               inode->k.p.snapshot = iter.snapshot;
-               inode_update_flags = 0;
-       }
-
-       ret = bch2_trans_update(trans, &iter, &inode->k_i,
-                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
-                               inode_update_flags);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
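/*
 * The inode update itself is unconditional (see the fsync comment in
 * bch2_extent_update() below), but it starts out flagged
 * BTREE_UPDATE_NOJOURNAL and only loses that flag when i_size, i_sectors
 * or the key's snapshot actually changed - so the steady-state overwrite
 * doesn't journal a redundant inode key.
 */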
-
-int bch2_extent_update(struct btree_trans *trans,
-                      subvol_inum inum,
-                      struct btree_iter *iter,
-                      struct bkey_i *k,
-                      struct disk_reservation *disk_res,
-                      u64 new_i_size,
-                      s64 *i_sectors_delta_total,
-                      bool check_enospc)
-{
-       struct bpos next_pos;
-       bool usage_increasing;
-       s64 i_sectors_delta = 0, disk_sectors_delta = 0;
-       int ret;
-
-       /*
-        * This traverses the iterator without changing iter->path->pos to
-        * search_key() (which is pos + 1 for extents): we want there to be a
-        * path already traversed at iter->pos because
-        * bch2_trans_extent_update() will use it to attempt extent merging
-        */
-       ret = __bch2_btree_iter_traverse(iter);
-       if (ret)
-               return ret;
-
-       ret = bch2_extent_trim_atomic(trans, iter, k);
-       if (ret)
-               return ret;
-
-       next_pos = k->k.p;
-
-       ret = bch2_sum_sector_overwrites(trans, iter, k,
-                       &usage_increasing,
-                       &i_sectors_delta,
-                       &disk_sectors_delta);
-       if (ret)
-               return ret;
-
-       if (disk_res &&
-           disk_sectors_delta > (s64) disk_res->sectors) {
-               ret = bch2_disk_reservation_add(trans->c, disk_res,
-                                       disk_sectors_delta - disk_res->sectors,
-                                       !check_enospc || !usage_increasing
-                                       ? BCH_DISK_RESERVATION_NOFAIL : 0);
-               if (ret)
-                       return ret;
-       }
-
-       /*
-        * Note:
-        * We always have to do an inode update - even when i_size/i_sectors
-        * aren't changing - for fsync to work properly; fsync relies on
-        * inode->bi_journal_seq which is updated by the trigger code:
-        */
-       ret =   bch2_extent_update_i_size_sectors(trans, iter,
-                                                 min(k->k.p.offset << 9, new_i_size),
-                                                 i_sectors_delta) ?:
-               bch2_trans_update(trans, iter, k, 0) ?:
-               bch2_trans_commit(trans, disk_res, NULL,
-                               BTREE_INSERT_NOCHECK_RW|
-                               BTREE_INSERT_NOFAIL);
-       if (unlikely(ret))
-               return ret;
-
-       if (i_sectors_delta_total)
-               *i_sectors_delta_total += i_sectors_delta;
-       bch2_btree_iter_set_pos(iter, next_pos);
-       return 0;
-}
-
-/* Overwrites whatever was present with zeroes: */
-int bch2_extent_fallocate(struct btree_trans *trans,
-                         subvol_inum inum,
-                         struct btree_iter *iter,
-                         unsigned sectors,
-                         struct bch_io_opts opts,
-                         s64 *i_sectors_delta,
-                         struct write_point_specifier write_point)
-{
-       struct bch_fs *c = trans->c;
-       struct disk_reservation disk_res = { 0 };
-       struct closure cl;
-       struct open_buckets open_buckets;
-       struct bkey_s_c k;
-       struct bkey_buf old, new;
-       unsigned sectors_allocated;
-       bool have_reservation = false;
-       bool unwritten = opts.nocow &&
-           c->sb.version >= bcachefs_metadata_version_unwritten_extents;
-       int ret;
-
-       bch2_bkey_buf_init(&old);
-       bch2_bkey_buf_init(&new);
-       closure_init_stack(&cl);
-       open_buckets.nr = 0;
-retry:
-       sectors_allocated = 0;
-
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
-
-       if (!have_reservation) {
-               unsigned new_replicas =
-                       max(0, (int) opts.data_replicas -
-                           (int) bch2_bkey_nr_ptrs_fully_allocated(k));
-               /*
-                * Get a disk reservation before (in the nocow case) calling
-                * into the allocator:
-                */
-               ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
-               if (unlikely(ret))
-                       goto out;
-
-               bch2_bkey_buf_reassemble(&old, c, k);
-       }
-
-       if (have_reservation) {
-               if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
-                       goto out;
-
-               bch2_key_resize(&new.k->k, sectors);
-       } else if (!unwritten) {
-               struct bkey_i_reservation *reservation;
-
-               bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
-               reservation = bkey_reservation_init(new.k);
-               reservation->k.p = iter->pos;
-               bch2_key_resize(&reservation->k, sectors);
-               reservation->v.nr_replicas = opts.data_replicas;
-       } else {
-               struct bkey_i_extent *e;
-               struct bch_devs_list devs_have;
-               struct write_point *wp;
-               struct bch_extent_ptr *ptr;
-
-               devs_have.nr = 0;
-
-               bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX);
-
-               e = bkey_extent_init(new.k);
-               e->k.p = iter->pos;
-
-               ret = bch2_alloc_sectors_start_trans(trans,
-                               opts.foreground_target,
-                               false,
-                               write_point,
-                               &devs_have,
-                               opts.data_replicas,
-                               opts.data_replicas,
-                               BCH_WATERMARK_normal, 0, &cl, &wp);
-               if (ret) {
-                       bch2_trans_unlock(trans);
-                       closure_sync(&cl);
-                       if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-                               goto retry;
-                       return ret;
-               }
-
-               sectors = min(sectors, wp->sectors_free);
-               sectors_allocated = sectors;
-
-               bch2_key_resize(&e->k, sectors);
-
-               bch2_open_bucket_get(c, wp, &open_buckets);
-               bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false);
-               bch2_alloc_sectors_done(c, wp);
-
-               extent_for_each_ptr(extent_i_to_s(e), ptr)
-                       ptr->unwritten = true;
-       }
-
-       have_reservation = true;
-
-       ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
-                                0, i_sectors_delta, true);
-out:
-       if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) {
-               bch2_trans_unlock(trans);
-               closure_sync(&cl);
-       }
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-               bch2_trans_begin(trans);
-               goto retry;
-       }
-
-       if (!ret && sectors_allocated)
-               bch2_increment_clock(c, sectors_allocated, WRITE);
-
-       bch2_open_buckets_put(c, &open_buckets);
-       bch2_disk_reservation_put(c, &disk_res);
-       bch2_bkey_buf_exit(&new, c);
-       bch2_bkey_buf_exit(&old, c);
-
-       return ret;
-}
-
-/*
- * Returns -BCH_ERR_transaction_restart if we had to drop locks:
- */
-int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-                  subvol_inum inum, u64 end,
-                  s64 *i_sectors_delta)
-{
-       struct bch_fs *c        = trans->c;
-       unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
-       struct bpos end_pos = POS(inum.inum, end);
-       struct bkey_s_c k;
-       int ret = 0, ret2 = 0;
-       u32 snapshot;
-
-       while (!ret ||
-              bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-               struct disk_reservation disk_res =
-                       bch2_disk_reservation_init(c, 0);
-               struct bkey_i delete;
-
-               if (ret)
-                       ret2 = ret;
-
-               bch2_trans_begin(trans);
-
-               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-               if (ret)
-                       continue;
-
-               bch2_btree_iter_set_snapshot(iter, snapshot);
-
-               /*
-                * peek_upto() doesn't have ideal semantics for extents:
-                */
-               k = bch2_btree_iter_peek_upto(iter, end_pos);
-               if (!k.k)
-                       break;
-
-               ret = bkey_err(k);
-               if (ret)
-                       continue;
-
-               bkey_init(&delete.k);
-               delete.k.p = iter->pos;
-
-               /* create the biggest key we can */
-               bch2_key_resize(&delete.k, max_sectors);
-               bch2_cut_back(end_pos, &delete);
-
-               ret = bch2_extent_update(trans, inum, iter, &delete,
-                               &disk_res, 0, i_sectors_delta, false);
-               bch2_disk_reservation_put(c, &disk_res);
-       }
-
-       return ret ?: ret2;
-}
-
-int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
-               s64 *i_sectors_delta)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       int ret;
-
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            POS(inum.inum, start),
-                            BTREE_ITER_INTENT);
-
-       ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
-
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               ret = 0;
-
-       return ret;
-}
-
-static int bch2_write_index_default(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct bkey_buf sk;
-       struct keylist *keys = &op->insert_keys;
-       struct bkey_i *k = bch2_keylist_front(keys);
-       struct btree_trans trans;
-       struct btree_iter iter;
-       subvol_inum inum = {
-               .subvol = op->subvol,
-               .inum   = k->k.p.inode,
-       };
-       int ret;
-
-       BUG_ON(!inum.subvol);
-
-       bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-
-       do {
-               bch2_trans_begin(&trans);
-
-               k = bch2_keylist_front(keys);
-               bch2_bkey_buf_copy(&sk, c, k);
-
-               ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
-                                                 &sk.k->k.p.snapshot);
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-
-               bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                                    bkey_start_pos(&sk.k->k),
-                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
-               ret = bch2_extent_update(&trans, inum, &iter, sk.k,
-                                        &op->res,
-                                        op->new_i_size, &op->i_sectors_delta,
-                                        op->flags & BCH_WRITE_CHECK_ENOSPC);
-               bch2_trans_iter_exit(&trans, &iter);
-
-               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-                       continue;
-               if (ret)
-                       break;
-
-               if (bkey_ge(iter.pos, k->k.p))
-                       bch2_keylist_pop_front(&op->insert_keys);
-               else
-                       bch2_cut_front(iter.pos, k);
-       } while (!bch2_keylist_empty(keys));
-
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&sk, c);
-
-       return ret;
-}
-
-/* Writes */
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
-                              enum bch_data_type type,
-                              const struct bkey_i *k,
-                              bool nocow)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-       const struct bch_extent_ptr *ptr;
-       struct bch_write_bio *n;
-       struct bch_dev *ca;
-
-       BUG_ON(c->opts.nochanges);
-
-       bkey_for_each_ptr(ptrs, ptr) {
-               BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
-                      !c->devs[ptr->dev]);
-
-               ca = bch_dev_bkey_exists(c, ptr->dev);
-
-               if (to_entry(ptr + 1) < ptrs.end) {
-                       n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
-                                               GFP_NOFS, &ca->replica_set));
-
-                       n->bio.bi_end_io        = wbio->bio.bi_end_io;
-                       n->bio.bi_private       = wbio->bio.bi_private;
-                       n->parent               = wbio;
-                       n->split                = true;
-                       n->bounce               = false;
-                       n->put_bio              = true;
-                       n->bio.bi_opf           = wbio->bio.bi_opf;
-                       bio_inc_remaining(&wbio->bio);
-               } else {
-                       n = wbio;
-                       n->split                = false;
-               }
-
-               n->c                    = c;
-               n->dev                  = ptr->dev;
-               n->have_ioref           = nocow || bch2_dev_get_ioref(ca,
-                                       type == BCH_DATA_btree ? READ : WRITE);
-               n->nocow                = nocow;
-               n->submit_time          = local_clock();
-               n->inode_offset         = bkey_start_offset(&k->k);
-               n->bio.bi_iter.bi_sector = ptr->offset;
-
-               if (likely(n->have_ioref)) {
-                       this_cpu_add(ca->io_done->sectors[WRITE][type],
-                                    bio_sectors(&n->bio));
-
-                       bio_set_dev(&n->bio, ca->disk_sb.bdev);
-
-                       if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
-                               bio_endio(&n->bio);
-                               continue;
-                       }
-
-                       submit_bio(&n->bio);
-               } else {
-                       n->bio.bi_status        = BLK_STS_REMOVED;
-                       bio_endio(&n->bio);
-               }
-       }
-}
-
-static void __bch2_write(struct bch_write_op *);
-
-static void bch2_write_done(struct closure *cl)
-{
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bch_fs *c = op->c;
-
-       bch2_disk_reservation_put(c, &op->res);
-       if (!(op->flags & BCH_WRITE_MOVE))
-               bch2_write_ref_put(c, BCH_WRITE_REF_write);
-       bch2_keylist_free(&op->insert_keys, op->inline_keys);
-
-       bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
-
-       EBUG_ON(cl->parent);
-       closure_debug_destroy(cl);
-       if (op->end_io)
-               op->end_io(op);
-}
-
-static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
-{
-       struct keylist *keys = &op->insert_keys;
-       struct bch_extent_ptr *ptr;
-       struct bkey_i *src, *dst = keys->keys, *n;
-
-       for (src = keys->keys; src != keys->top; src = n) {
-               n = bkey_next(src);
-
-               if (bkey_extent_is_direct_data(&src->k)) {
-                       bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
-                                           test_bit(ptr->dev, op->failed.d));
-
-                       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src)))
-                               return -EIO;
-               }
-
-               if (dst != src)
-                       memmove_u64s_down(dst, src, src->k.u64s);
-               dst = bkey_next(dst);
-       }
-
-       keys->top = dst;
-       return 0;
-}
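/*
 * This is a compact-in-place pass over the keylist: pointers to devices
 * recorded in op->failed are dropped, an extent that loses its last
 * pointer fails the whole write with -EIO, and the surviving (possibly
 * shrunken) keys are slid down with memmove_u64s_down() so the keylist
 * stays contiguous.
 */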
-
-/**
- * __bch2_write_index - after a write, update index to point to new data
- */
-static void __bch2_write_index(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct keylist *keys = &op->insert_keys;
-       struct bkey_i *k;
-       unsigned dev;
-       int ret = 0;
-
-       if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
-               ret = bch2_write_drop_io_error_ptrs(op);
-               if (ret)
-                       goto err;
-       }
-
-       /*
-        * probably not the ideal place to hook this in, but I don't
-        * particularly want to plumb io_opts all the way through the btree
-        * update stack right now
-        */
-       for_each_keylist_key(keys, k)
-               bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
-
-       if (!bch2_keylist_empty(keys)) {
-               u64 sectors_start = keylist_sectors(keys);
-
-               ret = !(op->flags & BCH_WRITE_MOVE)
-                       ? bch2_write_index_default(op)
-                       : bch2_data_update_index_update(op);
-
-               BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
-               BUG_ON(keylist_sectors(keys) && !ret);
-
-               op->written += sectors_start - keylist_sectors(keys);
-
-               if (ret && !bch2_err_matches(ret, EROFS)) {
-                       struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
-
-                       bch_err_inum_offset_ratelimited(c,
-                               k->k.p.inode, k->k.p.offset << 9,
-                               "write error while doing btree update: %s",
-                               bch2_err_str(ret));
-               }
-
-               if (ret)
-                       goto err;
-       }
-out:
-       /* If a bucket wasn't written, we can't erasure code it: */
-       for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
-               bch2_open_bucket_write_error(c, &op->open_buckets, dev);
-
-       bch2_open_buckets_put(c, &op->open_buckets);
-       return;
-err:
-       keys->top = keys->keys;
-       op->error = ret;
-       op->flags |= BCH_WRITE_DONE;
-       goto out;
-}
-
-static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
-{
-       if (state != wp->state) {
-               u64 now = ktime_get_ns();
-
-               if (wp->last_state_change &&
-                   time_after64(now, wp->last_state_change))
-                       wp->time[wp->state] += now - wp->last_state_change;
-               wp->state = state;
-               wp->last_state_change = now;
-       }
-}
-
-static inline void wp_update_state(struct write_point *wp, bool running)
-{
-       enum write_point_state state;
-
-       state = running                  ? WRITE_POINT_running :
-               !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
-                                        : WRITE_POINT_stopped;
-
-       __wp_update_state(wp, state);
-}
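/*
 * Write point state accounting: wp_update_state() classifies a write
 * point as running (actively processing a write), waiting_io (writes
 * queued, waiting on I/O) or stopped (idle); bch2_write_index() below
 * additionally flips waiting_io to waiting_work once completed I/O is
 * queued for index updates.  __wp_update_state() banks the nanoseconds
 * spent in the previous state into wp->time[] on every transition.
 */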
-
-static void bch2_write_index(struct closure *cl)
-{
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct write_point *wp = op->wp;
-       struct workqueue_struct *wq = index_update_wq(op);
-       unsigned long flags;
-
-       if ((op->flags & BCH_WRITE_DONE) &&
-           (op->flags & BCH_WRITE_MOVE))
-               bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
-
-       spin_lock_irqsave(&wp->writes_lock, flags);
-       if (wp->state == WRITE_POINT_waiting_io)
-               __wp_update_state(wp, WRITE_POINT_waiting_work);
-       list_add_tail(&op->wp_list, &wp->writes);
-       spin_unlock_irqrestore(&wp->writes_lock, flags);
-
-       queue_work(wq, &wp->index_update_work);
-}
-
-static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
-{
-       op->wp = wp;
-
-       if (wp->state == WRITE_POINT_stopped) {
-               spin_lock_irq(&wp->writes_lock);
-               __wp_update_state(wp, WRITE_POINT_waiting_io);
-               spin_unlock_irq(&wp->writes_lock);
-       }
-}
-
-void bch2_write_point_do_index_updates(struct work_struct *work)
-{
-       struct write_point *wp =
-               container_of(work, struct write_point, index_update_work);
-       struct bch_write_op *op;
-
-       while (1) {
-               spin_lock_irq(&wp->writes_lock);
-               op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
-               if (op)
-                       list_del(&op->wp_list);
-               wp_update_state(wp, op != NULL);
-               spin_unlock_irq(&wp->writes_lock);
-
-               if (!op)
-                       break;
-
-               op->flags |= BCH_WRITE_IN_WORKER;
-
-               __bch2_write_index(op);
-
-               if (!(op->flags & BCH_WRITE_DONE))
-                       __bch2_write(op);
-               else
-                       bch2_write_done(&op->cl);
-       }
-}
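-
-/*
- * bch2_write_queue(), bch2_write_index() and the worker above form a small
- * producer/consumer pair: completing write ops are appended to wp->writes
- * under writes_lock, and index_update_work drains the list one op at a
- * time, resuming __bch2_write() for ops that still have data to submit.
- */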
-
-static void bch2_write_endio(struct bio *bio)
-{
-       struct closure *cl              = bio->bi_private;
-       struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
-       struct bch_write_bio *wbio      = to_wbio(bio);
-       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
-       struct bch_fs *c                = wbio->c;
-       struct bch_dev *ca              = bch_dev_bkey_exists(c, wbio->dev);
-
-       if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
-                                   op->pos.inode,
-                                   wbio->inode_offset << 9,
-                                   "data write error: %s",
-                                   bch2_blk_status_to_str(bio->bi_status))) {
-               set_bit(wbio->dev, op->failed.d);
-               op->flags |= BCH_WRITE_IO_ERROR;
-       }
-
-       if (wbio->nocow)
-               set_bit(wbio->dev, op->devs_need_flush->d);
-
-       if (wbio->have_ioref) {
-               bch2_latency_acct(ca, wbio->submit_time, WRITE);
-               percpu_ref_put(&ca->io_ref);
-       }
-
-       if (wbio->bounce)
-               bch2_bio_free_pages_pool(c, bio);
-
-       if (wbio->put_bio)
-               bio_put(bio);
-
-       if (parent)
-               bio_endio(&parent->bio);
-       else
-               closure_put(cl);
-}
-
-static void init_append_extent(struct bch_write_op *op,
-                              struct write_point *wp,
-                              struct bversion version,
-                              struct bch_extent_crc_unpacked crc)
-{
-       struct bkey_i_extent *e;
-
-       op->pos.offset += crc.uncompressed_size;
-
-       e = bkey_extent_init(op->insert_keys.top);
-       e->k.p          = op->pos;
-       e->k.size       = crc.uncompressed_size;
-       e->k.version    = version;
-
-       if (crc.csum_type ||
-           crc.compression_type ||
-           crc.nonce)
-               bch2_extent_crc_append(&e->k_i, crc);
-
-       bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
-                                      op->flags & BCH_WRITE_CACHED);
-
-       bch2_keylist_push(&op->insert_keys);
-}
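-
-/*
- * Worked example for the above: appending an 8 sector extent while op->pos
- * is at (inode 1, offset 100) first advances op->pos.offset to 108; the new
- * key then gets p = POS(1, 108) and size = 8. Extent keys are positioned at
- * their end, so bkey_start_offset() recovers 100.
- */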
-
-static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
-                                       struct write_point *wp,
-                                       struct bio *src,
-                                       bool *page_alloc_failed,
-                                       void *buf)
-{
-       struct bch_write_bio *wbio;
-       struct bio *bio;
-       unsigned output_available =
-               min(wp->sectors_free << 9, src->bi_iter.bi_size);
-       unsigned pages = DIV_ROUND_UP(output_available +
-                                     (buf
-                                      ? ((unsigned long) buf & (PAGE_SIZE - 1))
-                                      : 0), PAGE_SIZE);
-
-       pages = min(pages, BIO_MAX_VECS);
-
-       bio = bio_alloc_bioset(NULL, pages, 0,
-                              GFP_NOFS, &c->bio_write);
-       wbio                    = wbio_init(bio);
-       wbio->put_bio           = true;
-       /* copy WRITE_SYNC flag */
-       wbio->bio.bi_opf        = src->bi_opf;
-
-       if (buf) {
-               bch2_bio_map(bio, buf, output_available);
-               return bio;
-       }
-
-       wbio->bounce            = true;
-
-       /*
-        * We can't use the mempool for more than c->opts.encoded_extent_max
-        * worth of pages, but we'd like to allocate more if we can:
-        */
-       bch2_bio_alloc_pages_pool(c, bio,
-                                 min_t(unsigned, output_available,
-                                       c->opts.encoded_extent_max));
-
-       if (bio->bi_iter.bi_size < output_available)
-               *page_alloc_failed =
-                       bch2_bio_alloc_pages(bio,
-                                            output_available -
-                                            bio->bi_iter.bi_size,
-                                            GFP_NOFS) != 0;
-
-       return bio;
-}
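-
-/*
- * The page math above, concretely (assuming 4KiB pages): with
- * wp->sectors_free = 24 (12KiB) and a 16KiB src bio, output_available is
- * 12KiB; an ec_buf starting 512 bytes into a page adds that misalignment
- * before rounding, so DIV_ROUND_UP(12KiB + 512, 4KiB) = 4 pages, not 3.
- */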
-
-static int bch2_write_rechecksum(struct bch_fs *c,
-                                struct bch_write_op *op,
-                                unsigned new_csum_type)
-{
-       struct bio *bio = &op->wbio.bio;
-       struct bch_extent_crc_unpacked new_crc;
-       int ret;
-
-       /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
-
-       if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
-           bch2_csum_type_is_encryption(new_csum_type))
-               new_csum_type = op->crc.csum_type;
-
-       ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
-                                 NULL, &new_crc,
-                                 op->crc.offset, op->crc.live_size,
-                                 new_csum_type);
-       if (ret)
-               return ret;
-
-       bio_advance(bio, op->crc.offset << 9);
-       bio->bi_iter.bi_size = op->crc.live_size << 9;
-       op->crc = new_crc;
-       return 0;
-}
-
-static int bch2_write_decrypt(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct nonce nonce = extent_nonce(op->version, op->crc);
-       struct bch_csum csum;
-       int ret;
-
-       if (!bch2_csum_type_is_encryption(op->crc.csum_type))
-               return 0;
-
-       /*
-        * If we need to decrypt data in the write path, we'll no longer be able
-        * to verify the existing checksum (poly1305 mac, in this case) after
-        * it's decrypted - this is the last point we'll be able to reverify the
-        * checksum:
-        */
-       csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
-       if (bch2_crc_cmp(op->crc.csum, csum))
-               return -EIO;
-
-       ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
-       op->crc.csum_type = 0;
-       op->crc.csum = (struct bch_csum) { 0, 0 };
-       return ret;
-}
-
-static enum prep_encoded_ret {
-       PREP_ENCODED_OK,
-       PREP_ENCODED_ERR,
-       PREP_ENCODED_CHECKSUM_ERR,
-       PREP_ENCODED_DO_WRITE,
-} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
-{
-       struct bch_fs *c = op->c;
-       struct bio *bio = &op->wbio.bio;
-
-       if (!(op->flags & BCH_WRITE_DATA_ENCODED))
-               return PREP_ENCODED_OK;
-
-       BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
-
-       /* Can we just write the entire extent as is? */
-       if (op->crc.uncompressed_size == op->crc.live_size &&
-           op->crc.compressed_size <= wp->sectors_free &&
-           (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
-            op->incompressible)) {
-               if (!crc_is_compressed(op->crc) &&
-                   op->csum_type != op->crc.csum_type &&
-                   bch2_write_rechecksum(c, op, op->csum_type) &&
-                   !c->opts.no_data_io)
-                       return PREP_ENCODED_CHECKSUM_ERR;
-
-               return PREP_ENCODED_DO_WRITE;
-       }
-
-       /*
-        * If the data is compressed and we couldn't write the entire extent as
-        * is, we have to decompress it:
-        */
-       if (crc_is_compressed(op->crc)) {
-               struct bch_csum csum;
-
-               if (bch2_write_decrypt(op))
-                       return PREP_ENCODED_CHECKSUM_ERR;
-
-               /* Last point we can still verify checksum: */
-               csum = bch2_checksum_bio(c, op->crc.csum_type,
-                                        extent_nonce(op->version, op->crc),
-                                        bio);
-               if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
-                       return PREP_ENCODED_CHECKSUM_ERR;
-
-               if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
-                       return PREP_ENCODED_ERR;
-       }
-
-       /*
-        * No longer have compressed data after this point - data might be
-        * encrypted:
-        */
-
-       /*
-        * If the data is checksummed and we're only writing a subset,
-        * rechecksum and adjust bio to point to currently live data:
-        */
-       if ((op->crc.live_size != op->crc.uncompressed_size ||
-            op->crc.csum_type != op->csum_type) &&
-           bch2_write_rechecksum(c, op, op->csum_type) &&
-           !c->opts.no_data_io)
-               return PREP_ENCODED_CHECKSUM_ERR;
-
-       /*
-        * If we want to compress the data, it has to be decrypted:
-        */
-       if ((op->compression_opt ||
-            bch2_csum_type_is_encryption(op->crc.csum_type) !=
-            bch2_csum_type_is_encryption(op->csum_type)) &&
-           bch2_write_decrypt(op))
-               return PREP_ENCODED_CHECKSUM_ERR;
-
-       return PREP_ENCODED_OK;
-}
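-
-/*
- * The contract above, as consumed by bch2_write_extent() below:
- * PREP_ENCODED_OK           - data is now plain (at most encrypted); take
- *                             the normal compress/checksum path
- * PREP_ENCODED_DO_WRITE     - the extent can be written out exactly as is
- * PREP_ENCODED_ERR          - hard error (e.g. decompression failed)
- * PREP_ENCODED_CHECKSUM_ERR - the existing checksum didn't verify
- */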
-
-static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
-                            struct bio **_dst)
-{
-       struct bch_fs *c = op->c;
-       struct bio *src = &op->wbio.bio, *dst = src;
-       struct bvec_iter saved_iter;
-       void *ec_buf;
-       unsigned total_output = 0, total_input = 0;
-       bool bounce = false;
-       bool page_alloc_failed = false;
-       int ret, more = 0;
-
-       BUG_ON(!bio_sectors(src));
-
-       ec_buf = bch2_writepoint_ec_buf(c, wp);
-
-       switch (bch2_write_prep_encoded_data(op, wp)) {
-       case PREP_ENCODED_OK:
-               break;
-       case PREP_ENCODED_ERR:
-               ret = -EIO;
-               goto err;
-       case PREP_ENCODED_CHECKSUM_ERR:
-               goto csum_err;
-       case PREP_ENCODED_DO_WRITE:
-               /* XXX look for bug here */
-               if (ec_buf) {
-                       dst = bch2_write_bio_alloc(c, wp, src,
-                                                  &page_alloc_failed,
-                                                  ec_buf);
-                       bio_copy_data(dst, src);
-                       bounce = true;
-               }
-               init_append_extent(op, wp, op->version, op->crc);
-               goto do_write;
-       }
-
-       if (ec_buf ||
-           op->compression_opt ||
-           (op->csum_type &&
-            !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
-           (bch2_csum_type_is_encryption(op->csum_type) &&
-            !(op->flags & BCH_WRITE_PAGES_OWNED))) {
-               dst = bch2_write_bio_alloc(c, wp, src,
-                                          &page_alloc_failed,
-                                          ec_buf);
-               bounce = true;
-       }
-
-       saved_iter = dst->bi_iter;
-
-       do {
-               struct bch_extent_crc_unpacked crc = { 0 };
-               struct bversion version = op->version;
-               size_t dst_len, src_len;
-
-               if (page_alloc_failed &&
-                   dst->bi_iter.bi_size  < (wp->sectors_free << 9) &&
-                   dst->bi_iter.bi_size < c->opts.encoded_extent_max)
-                       break;
-
-               BUG_ON(op->compression_opt &&
-                      (op->flags & BCH_WRITE_DATA_ENCODED) &&
-                      bch2_csum_type_is_encryption(op->crc.csum_type));
-               BUG_ON(op->compression_opt && !bounce);
-
-               crc.compression_type = op->incompressible
-                       ? BCH_COMPRESSION_TYPE_incompressible
-                       : op->compression_opt
-                       ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
-                                           op->compression_opt)
-                       : 0;
-               if (!crc_is_compressed(crc)) {
-                       dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
-                       dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
-
-                       if (op->csum_type)
-                               dst_len = min_t(unsigned, dst_len,
-                                               c->opts.encoded_extent_max);
-
-                       if (bounce) {
-                               swap(dst->bi_iter.bi_size, dst_len);
-                               bio_copy_data(dst, src);
-                               swap(dst->bi_iter.bi_size, dst_len);
-                       }
-
-                       src_len = dst_len;
-               }
-
-               BUG_ON(!src_len || !dst_len);
-
-               if (bch2_csum_type_is_encryption(op->csum_type)) {
-                       if (bversion_zero(version)) {
-                               version.lo = atomic64_inc_return(&c->key_version);
-                       } else {
-                               crc.nonce = op->nonce;
-                               op->nonce += src_len >> 9;
-                       }
-               }
-
-               if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
-                   !crc_is_compressed(crc) &&
-                   bch2_csum_type_is_encryption(op->crc.csum_type) ==
-                   bch2_csum_type_is_encryption(op->csum_type)) {
-                       u8 compression_type = crc.compression_type;
-                       u16 nonce = crc.nonce;
-                       /*
-                        * Note: when we're using rechecksum(), we need to be
-                        * checksumming @src because it has all the data our
-                        * existing checksum covers - if we bounced (because we
-                        * were trying to compress), @dst will only have the
-                        * part of the data the new checksum will cover.
-                        *
-                        * But normally we want to be checksumming post bounce,
-                        * because part of the reason for bouncing is so the
-                        * data can't be modified (by userspace) while it's in
-                        * flight.
-                        */
-                       if (bch2_rechecksum_bio(c, src, version, op->crc,
-                                       &crc, &op->crc,
-                                       src_len >> 9,
-                                       bio_sectors(src) - (src_len >> 9),
-                                       op->csum_type))
-                               goto csum_err;
-                       /*
-                        * bch2_rechecksum_bio() sets compression_type on crc
-                        * from op->crc; this isn't always correct, as sometimes
-                        * we're changing an extent from uncompressed to
-                        * incompressible.
-                        */
-                       crc.compression_type = compression_type;
-                       crc.nonce = nonce;
-               } else {
-                       if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
-                           bch2_rechecksum_bio(c, src, version, op->crc,
-                                       NULL, &op->crc,
-                                       src_len >> 9,
-                                       bio_sectors(src) - (src_len >> 9),
-                                       op->crc.csum_type))
-                               goto csum_err;
-
-                       crc.compressed_size     = dst_len >> 9;
-                       crc.uncompressed_size   = src_len >> 9;
-                       crc.live_size           = src_len >> 9;
-
-                       swap(dst->bi_iter.bi_size, dst_len);
-                       ret = bch2_encrypt_bio(c, op->csum_type,
-                                              extent_nonce(version, crc), dst);
-                       if (ret)
-                               goto err;
-
-                       crc.csum = bch2_checksum_bio(c, op->csum_type,
-                                        extent_nonce(version, crc), dst);
-                       crc.csum_type = op->csum_type;
-                       swap(dst->bi_iter.bi_size, dst_len);
-               }
-
-               init_append_extent(op, wp, version, crc);
-
-               if (dst != src)
-                       bio_advance(dst, dst_len);
-               bio_advance(src, src_len);
-               total_output    += dst_len;
-               total_input     += src_len;
-       } while (dst->bi_iter.bi_size &&
-                src->bi_iter.bi_size &&
-                wp->sectors_free &&
-                !bch2_keylist_realloc(&op->insert_keys,
-                                     op->inline_keys,
-                                     ARRAY_SIZE(op->inline_keys),
-                                     BKEY_EXTENT_U64s_MAX));
-
-       more = src->bi_iter.bi_size != 0;
-
-       dst->bi_iter = saved_iter;
-
-       if (dst == src && more) {
-               BUG_ON(total_output != total_input);
-
-               dst = bio_split(src, total_input >> 9,
-                               GFP_NOFS, &c->bio_write);
-               wbio_init(dst)->put_bio = true;
-               /* copy WRITE_SYNC flag */
-               dst->bi_opf             = src->bi_opf;
-       }
-
-       dst->bi_iter.bi_size = total_output;
-do_write:
-       *_dst = dst;
-       return more;
-csum_err:
-       bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
-       ret = -EIO;
-err:
-       if (to_wbio(dst)->bounce)
-               bch2_bio_free_pages_pool(c, dst);
-       if (to_wbio(dst)->put_bio)
-               bio_put(dst);
-
-       return ret;
-}
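-
-/*
- * Return contract for the above: a negative error code, or "more" -
- * nonzero when src still holds data that didn't fit in this write point,
- * in which case __bch2_write() allocates another write point and calls
- * back in.
- */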
-
-static bool bch2_extent_is_writeable(struct bch_write_op *op,
-                                    struct bkey_s_c k)
-{
-       struct bch_fs *c = op->c;
-       struct bkey_s_c_extent e;
-       struct extent_ptr_decoded p;
-       const union bch_extent_entry *entry;
-       unsigned replicas = 0;
-
-       if (k.k->type != KEY_TYPE_extent)
-               return false;
-
-       e = bkey_s_c_to_extent(k);
-       extent_for_each_ptr_decode(e, p, entry) {
-               if (p.crc.csum_type ||
-                   crc_is_compressed(p.crc) ||
-                   p.has_ec)
-                       return false;
-
-               replicas += bch2_extent_ptr_durability(c, &p);
-       }
-
-       return replicas >= op->opts.data_replicas;
-}
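-
-/*
- * i.e. an extent may be overwritten in place only if it's plain replicated
- * data - unchecksummed, uncompressed, not erasure coded - and already has
- * at least as many copies as the write requires.
- */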
-
-static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       const struct bch_extent_ptr *ptr;
-       struct bkey_i *k;
-
-       for_each_keylist_key(&op->insert_keys, k) {
-               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
-
-               bkey_for_each_ptr(ptrs, ptr)
-                       bch2_bucket_nocow_unlock(&c->nocow_locks,
-                                              PTR_BUCKET_POS(c, ptr),
-                                              BUCKET_NOCOW_LOCK_UPDATE);
-       }
-}
-
-static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
-                                                 struct btree_iter *iter,
-                                                 struct bkey_i *orig,
-                                                 struct bkey_s_c k,
-                                                 u64 new_i_size)
-{
-       struct bkey_i *new;
-       struct bkey_ptrs ptrs;
-       struct bch_extent_ptr *ptr;
-       int ret;
-
-       if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
-               /* trace this */
-               return 0;
-       }
-
-       new = bch2_bkey_make_mut_noupdate(trans, k);
-       ret = PTR_ERR_OR_ZERO(new);
-       if (ret)
-               return ret;
-
-       bch2_cut_front(bkey_start_pos(&orig->k), new);
-       bch2_cut_back(orig->k.p, new);
-
-       ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
-       bkey_for_each_ptr(ptrs, ptr)
-               ptr->unwritten = 0;
-
-       /*
-        * Note that we're not calling bch2_subvol_get_snapshot() in this path -
-        * that was done when we kicked off the write, and here it's important
-        * that we update the extent that we wrote to - even if a snapshot has
-        * since been created. The write is still outstanding, so we're ok
-        * w.r.t. snapshot atomicity:
-        */
-       return  bch2_extent_update_i_size_sectors(trans, iter,
-                                       min(new->k.p.offset << 9, new_i_size), 0) ?:
-               bch2_trans_update(trans, iter, new,
-                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_i *orig;
-       struct bkey_s_c k;
-       int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_keylist_key(&op->insert_keys, orig) {
-               ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
-                                    bkey_start_pos(&orig->k), orig->k.p,
-                                    BTREE_ITER_INTENT, k,
-                                    NULL, NULL, BTREE_INSERT_NOFAIL, ({
-                       bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
-               }));
-
-               if (ret && !bch2_err_matches(ret, EROFS)) {
-                       struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
-
-                       bch_err_inum_offset_ratelimited(c,
-                               k->k.p.inode, k->k.p.offset << 9,
-                               "write error while doing btree update: %s",
-                               bch2_err_str(ret));
-               }
-
-               if (ret) {
-                       op->error = ret;
-                       break;
-               }
-       }
-
-       bch2_trans_exit(&trans);
-}
-
-static void __bch2_nocow_write_done(struct bch_write_op *op)
-{
-       bch2_nocow_write_unlock(op);
-
-       if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) {
-               op->error = -EIO;
-       } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN))
-               bch2_nocow_write_convert_unwritten(op);
-}
-
-static void bch2_nocow_write_done(struct closure *cl)
-{
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-
-       __bch2_nocow_write_done(op);
-       bch2_write_done(cl);
-}
-
-static void bch2_nocow_write(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_ptrs_c ptrs;
-       const struct bch_extent_ptr *ptr;
-       struct {
-               struct bpos     b;
-               unsigned        gen;
-               struct nocow_lock_bucket *l;
-       } buckets[BCH_REPLICAS_MAX];
-       unsigned nr_buckets = 0;
-       u32 snapshot;
-       int ret, i;
-
-       if (op->flags & BCH_WRITE_MOVE)
-               return;
-
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-
-       ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot);
-       if (unlikely(ret))
-               goto err;
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            SPOS(op->pos.inode, op->pos.offset, snapshot),
-                            BTREE_ITER_SLOTS);
-       while (1) {
-               struct bio *bio = &op->wbio.bio;
-
-               nr_buckets = 0;
-
-               k = bch2_btree_iter_peek_slot(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               /* fall back to normal cow write path? */
-               if (unlikely(k.k->p.snapshot != snapshot ||
-                            !bch2_extent_is_writeable(op, k)))
-                       break;
-
-               if (bch2_keylist_realloc(&op->insert_keys,
-                                       op->inline_keys,
-                                       ARRAY_SIZE(op->inline_keys),
-                                       k.k->u64s))
-                       break;
-
-               /* Get iorefs before dropping btree locks: */
-               ptrs = bch2_bkey_ptrs_c(k);
-               bkey_for_each_ptr(ptrs, ptr) {
-                       buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
-                       buckets[nr_buckets].gen = ptr->gen;
-                       buckets[nr_buckets].l =
-                               bucket_nocow_lock(&c->nocow_locks,
-                                                 bucket_to_u64(buckets[nr_buckets].b));
-
-                       prefetch(buckets[nr_buckets].l);
-
-                       if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
-                               goto err_get_ioref;
-
-                       nr_buckets++;
-
-                       if (ptr->unwritten)
-                               op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
-               }
-
-               /* Unlock before taking nocow locks, doing IO: */
-               bkey_reassemble(op->insert_keys.top, k);
-               bch2_trans_unlock(&trans);
-
-               bch2_cut_front(op->pos, op->insert_keys.top);
-               if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
-                       bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
-
-               for (i = 0; i < nr_buckets; i++) {
-                       struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
-                       struct nocow_lock_bucket *l = buckets[i].l;
-                       bool stale;
-
-                       __bch2_bucket_nocow_lock(&c->nocow_locks, l,
-                                                bucket_to_u64(buckets[i].b),
-                                                BUCKET_NOCOW_LOCK_UPDATE);
-
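-                       /*
-                        * now that the bucket is nocow locked, recheck the
-                        * pointer's generation: if the bucket was reused
-                        * while we weren't holding btree locks, we must not
-                        * overwrite it:
-                        */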
-                       rcu_read_lock();
-                       stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
-                       rcu_read_unlock();
-
-                       if (unlikely(stale))
-                               goto err_bucket_stale;
-               }
-
-               bio = &op->wbio.bio;
-               if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) {
-                       bio = bio_split(bio, k.k->p.offset - op->pos.offset,
-                                       GFP_KERNEL, &c->bio_write);
-                       wbio_init(bio)->put_bio = true;
-                       bio->bi_opf = op->wbio.bio.bi_opf;
-               } else {
-                       op->flags |= BCH_WRITE_DONE;
-               }
-
-               op->pos.offset += bio_sectors(bio);
-               op->written += bio_sectors(bio);
-
-               bio->bi_end_io  = bch2_write_endio;
-               bio->bi_private = &op->cl;
-               bio->bi_opf |= REQ_OP_WRITE;
-               closure_get(&op->cl);
-               bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
-                                         op->insert_keys.top, true);
-
-               bch2_keylist_push(&op->insert_keys);
-               if (op->flags & BCH_WRITE_DONE)
-                       break;
-               bch2_btree_iter_advance(&iter);
-       }
-out:
-       bch2_trans_iter_exit(&trans, &iter);
-err:
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               goto retry;
-
-       if (ret) {
-               bch_err_inum_offset_ratelimited(c,
-                               op->pos.inode,
-                               op->pos.offset << 9,
-                               "%s: btree lookup error %s",
-                               __func__, bch2_err_str(ret));
-               op->error = ret;
-               op->flags |= BCH_WRITE_DONE;
-       }
-
-       bch2_trans_exit(&trans);
-
-       /* fall back to the COW write path? */
-       if (!(op->flags & BCH_WRITE_DONE)) {
-               closure_sync(&op->cl);
-               __bch2_nocow_write_done(op);
-               op->insert_keys.top = op->insert_keys.keys;
-       } else if (op->flags & BCH_WRITE_SYNC) {
-               closure_sync(&op->cl);
-               bch2_nocow_write_done(&op->cl);
-       } else {
-               /*
-                * XXX
-                * needs to run in process context (hence the punt to a
-                * workqueue) because ei_quota_lock is a mutex
-                */
-               continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op));
-       }
-       return;
-err_get_ioref:
-       for (i = 0; i < nr_buckets; i++)
-               percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
-
-       /* Fall back to COW path: */
-       goto out;
-err_bucket_stale:
-       while (--i >= 0)
-               bch2_bucket_nocow_unlock(&c->nocow_locks,
-                                        buckets[i].b,
-                                        BUCKET_NOCOW_LOCK_UPDATE);
-       for (i = 0; i < nr_buckets; i++)
-               percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
-
-       /* We can retry this: */
-       ret = -BCH_ERR_transaction_restart;
-       goto out;
-}
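-
-/*
- * The nocow path above, in order: look up the extent and grab device
- * iorefs while btree locks are held; drop btree locks; take per-bucket
- * nocow locks and recheck bucket gens (bailing out to the COW path if
- * anything went stale); then submit IO in place to the existing pointers,
- * converting unwritten extents once the IO completes.
- */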
-
-static void __bch2_write(struct bch_write_op *op)
-{
-       struct bch_fs *c = op->c;
-       struct write_point *wp = NULL;
-       struct bio *bio = NULL;
-       unsigned nofs_flags;
-       int ret;
-
-       nofs_flags = memalloc_nofs_save();
-
-       if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
-               bch2_nocow_write(op);
-               if (op->flags & BCH_WRITE_DONE)
-                       goto out_nofs_restore;
-       }
-again:
-       memset(&op->failed, 0, sizeof(op->failed));
-
-       do {
-               struct bkey_i *key_to_write;
-               unsigned key_to_write_offset = op->insert_keys.top_p -
-                       op->insert_keys.keys_p;
-
-               /* +1 for possible cache device: */
-               if (op->open_buckets.nr + op->nr_replicas + 1 >
-                   ARRAY_SIZE(op->open_buckets.v))
-                       break;
-
-               if (bch2_keylist_realloc(&op->insert_keys,
-                                       op->inline_keys,
-                                       ARRAY_SIZE(op->inline_keys),
-                                       BKEY_EXTENT_U64s_MAX))
-                       break;
-
-               /*
-                * The copygc thread is now global, which means it's no longer
-                * freeing up space on specific disks, which means that
-                * allocations for specific disks may hang arbitrarily long:
-                */
-               ret = bch2_trans_do(c, NULL, NULL, 0,
-                       bch2_alloc_sectors_start_trans(&trans,
-                               op->target,
-                               op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
-                               op->write_point,
-                               &op->devs_have,
-                               op->nr_replicas,
-                               op->nr_replicas_required,
-                               op->watermark,
-                               op->flags,
-                               (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
-                                             BCH_WRITE_ONLY_SPECIFIED_DEVS))
-                               ? NULL : &op->cl, &wp));
-               if (unlikely(ret)) {
-                       if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
-                               break;
-
-                       goto err;
-               }
-
-               EBUG_ON(!wp);
-
-               bch2_open_bucket_get(c, wp, &op->open_buckets);
-               ret = bch2_write_extent(op, wp, &bio);
-
-               bch2_alloc_sectors_done_inlined(c, wp);
-err:
-               if (ret <= 0) {
-                       op->flags |= BCH_WRITE_DONE;
-
-                       if (ret < 0) {
-                               op->error = ret;
-                               break;
-                       }
-               }
-
-               bio->bi_end_io  = bch2_write_endio;
-               bio->bi_private = &op->cl;
-               bio->bi_opf |= REQ_OP_WRITE;
-
-               closure_get(bio->bi_private);
-
-               key_to_write = (void *) (op->insert_keys.keys_p +
-                                        key_to_write_offset);
-
-               bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
-                                         key_to_write, false);
-       } while (ret);
-
-       /*
-        * Sync or no?
-        *
-        * If we're running asynchronously, we may still want to block
-        * synchronously here if we weren't able to submit all of the IO at
-        * once, as that signals backpressure to the caller.
-        */
-       if ((op->flags & BCH_WRITE_SYNC) ||
-           (!(op->flags & BCH_WRITE_DONE) &&
-            !(op->flags & BCH_WRITE_IN_WORKER))) {
-               closure_sync(&op->cl);
-               __bch2_write_index(op);
-
-               if (!(op->flags & BCH_WRITE_DONE))
-                       goto again;
-               bch2_write_done(&op->cl);
-       } else {
-               bch2_write_queue(op, wp);
-               continue_at(&op->cl, bch2_write_index, NULL);
-       }
-out_nofs_restore:
-       memalloc_nofs_restore(nofs_flags);
-}
-
-static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
-{
-       struct bio *bio = &op->wbio.bio;
-       struct bvec_iter iter;
-       struct bkey_i_inline_data *id;
-       unsigned sectors;
-       int ret;
-
-       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
-       op->flags |= BCH_WRITE_DONE;
-
-       bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
-
-       ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
-                                  ARRAY_SIZE(op->inline_keys),
-                                  BKEY_U64s + DIV_ROUND_UP(data_len, 8));
-       if (ret) {
-               op->error = ret;
-               goto err;
-       }
-
-       sectors = bio_sectors(bio);
-       op->pos.offset += sectors;
-
-       id = bkey_inline_data_init(op->insert_keys.top);
-       id->k.p         = op->pos;
-       id->k.version   = op->version;
-       id->k.size      = sectors;
-
-       iter = bio->bi_iter;
-       iter.bi_size = data_len;
-       memcpy_from_bio(id->v.data, bio, iter);
-
-       while (data_len & 7)
-               id->v.data[data_len++] = '\0';
-       set_bkey_val_bytes(&id->k, data_len);
-       bch2_keylist_push(&op->insert_keys);
-
-       __bch2_write_index(op);
-err:
-       bch2_write_done(&op->cl);
-}
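-
-/*
- * Worked example for the padding above: with data_len = 13, the while loop
- * appends three '\0' bytes and set_bkey_val_bytes() records 16 bytes
- * (2 u64s) - matching the BKEY_U64s + DIV_ROUND_UP(13, 8) u64s reserved
- * from the keylist above.
- */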
-
-/**
- * bch2_write - handle a write to a cache device or flash only volume
- *
- * This is the starting point for any data to end up in a cache device; it could
- * be from a normal write, or a writeback write, or a write to a flash only
- * volume - it's also used by the moving garbage collector to compact data in
- * mostly empty buckets.
- *
- * It first writes the data to the cache, creating a list of keys to be inserted
- * (if the data won't fit in a single open bucket, there will be multiple keys);
- * after the data is written the keys are journalled, and once they have been
- * added to the next journal write they're inserted into the btree.
- *
- * If op->discard is true, instead of inserting the data it invalidates the
- * region of the cache represented by op->bio and op->inode.
- */
-void bch2_write(struct closure *cl)
-{
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bio *bio = &op->wbio.bio;
-       struct bch_fs *c = op->c;
-       unsigned data_len;
-
-       EBUG_ON(op->cl.parent);
-       BUG_ON(!op->nr_replicas);
-       BUG_ON(!op->write_point.v);
-       BUG_ON(bkey_eq(op->pos, POS_MAX));
-
-       op->start_time = local_clock();
-       bch2_keylist_init(&op->insert_keys, op->inline_keys);
-       wbio_init(bio)->put_bio = false;
-
-       if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
-               bch_err_inum_offset_ratelimited(c,
-                       op->pos.inode,
-                       op->pos.offset << 9,
-                       "misaligned write");
-               op->error = -EIO;
-               goto err;
-       }
-
-       if (c->opts.nochanges) {
-               op->error = -BCH_ERR_erofs_no_writes;
-               goto err;
-       }
-
-       if (!(op->flags & BCH_WRITE_MOVE) &&
-           !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
-               op->error = -BCH_ERR_erofs_no_writes;
-               goto err;
-       }
-
-       this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio));
-       bch2_increment_clock(c, bio_sectors(bio), WRITE);
-
-       data_len = min_t(u64, bio->bi_iter.bi_size,
-                        op->new_i_size - (op->pos.offset << 9));
-
-       if (c->opts.inline_data &&
-           data_len <= min(block_bytes(c) / 2, 1024U)) {
-               bch2_write_data_inline(op, data_len);
-               return;
-       }
-
-       __bch2_write(op);
-       return;
-err:
-       bch2_disk_reservation_put(c, &op->res);
-
-       closure_debug_destroy(&op->cl);
-       if (op->end_io)
-               op->end_io(op);
-}
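-
-/*
- * A hedged sketch of the caller side, modelled on how the fs-io paths in
- * this tree drive bch2_write(); bch2_write_op_init() comes from io.h,
- * my_write_done() is a hypothetical completion callback, and the field
- * values are illustrative rather than a drop-in recipe:
- *
- *     bch2_write_op_init(&op, c, io_opts);
- *     op.write_point  = writepoint_hashed((unsigned long) current);
- *     op.nr_replicas  = io_opts.data_replicas;
- *     op.subvol       = inum.subvol;
- *     op.pos          = POS(inum.inum, op.wbio.bio.bi_iter.bi_sector);
- *     op.end_io       = my_write_done;
- *
- *     closure_call(&op.cl, bch2_write, NULL, NULL);
- */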
-
-static const char * const bch2_write_flags[] = {
-#define x(f)   #f,
-       BCH_WRITE_FLAGS()
-#undef x
-       NULL
-};
-
-void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
-{
-       prt_str(out, "pos: ");
-       bch2_bpos_to_text(out, op->pos);
-       prt_newline(out);
-       printbuf_indent_add(out, 2);
-
-       prt_str(out, "started: ");
-       bch2_pr_time_units(out, local_clock() - op->start_time);
-       prt_newline(out);
-
-       prt_str(out, "flags: ");
-       prt_bitflags(out, bch2_write_flags, op->flags);
-       prt_newline(out);
-
-       prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
-       prt_newline(out);
-
-       printbuf_indent_sub(out, 2);
-}
-
-/* Cache promotion on read */
-
-struct promote_op {
-       struct rcu_head         rcu;
-       u64                     start_time;
-
-       struct rhash_head       hash;
-       struct bpos             pos;
-
-       struct data_update      write;
-       struct bio_vec          bi_inline_vecs[]; /* must be last */
-};
-
-static const struct rhashtable_params bch_promote_params = {
-       .head_offset    = offsetof(struct promote_op, hash),
-       .key_offset     = offsetof(struct promote_op, pos),
-       .key_len        = sizeof(struct bpos),
-};
-
-static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
-                                 struct bpos pos,
-                                 struct bch_io_opts opts,
-                                 unsigned flags)
-{
-       if (!(flags & BCH_READ_MAY_PROMOTE))
-               return false;
-
-       if (!opts.promote_target)
-               return false;
-
-       if (bch2_bkey_has_target(c, k, opts.promote_target))
-               return false;
-
-       if (bkey_extent_is_unwritten(k))
-               return false;
-
-       if (bch2_target_congested(c, opts.promote_target)) {
-               /* XXX trace this */
-               return false;
-       }
-
-       if (rhashtable_lookup_fast(&c->promote_table, &pos,
-                                  bch_promote_params))
-               return false;
-
-       return true;
-}
-
-static void promote_free(struct bch_fs *c, struct promote_op *op)
-{
-       int ret;
-
-       bch2_data_update_exit(&op->write);
-
-       ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-                                    bch_promote_params);
-       BUG_ON(ret);
-       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
-       kfree_rcu(op, rcu);
-}
-
-static void promote_done(struct bch_write_op *wop)
-{
-       struct promote_op *op =
-               container_of(wop, struct promote_op, write.op);
-       struct bch_fs *c = op->write.op.c;
-
-       bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
-                              op->start_time);
-       promote_free(c, op);
-}
-
-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
-{
-       struct bio *bio = &op->write.op.wbio.bio;
-
-       trace_and_count(op->write.op.c, read_promote, &rbio->bio);
-
-       /* we now own pages: */
-       BUG_ON(!rbio->bounce);
-       BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs);
-
-       memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
-              sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-       swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
-
-       bch2_data_update_read_done(&op->write, rbio->pick.crc);
-}
-
-static struct promote_op *__promote_alloc(struct btree_trans *trans,
-                                         enum btree_id btree_id,
-                                         struct bkey_s_c k,
-                                         struct bpos pos,
-                                         struct extent_ptr_decoded *pick,
-                                         struct bch_io_opts opts,
-                                         unsigned sectors,
-                                         struct bch_read_bio **rbio)
-{
-       struct bch_fs *c = trans->c;
-       struct promote_op *op = NULL;
-       struct bio *bio;
-       unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-       int ret;
-
-       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
-               return NULL;
-
-       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
-       if (!op)
-               goto err;
-
-       op->start_time = local_clock();
-       op->pos = pos;
-
-       /*
-        * We don't use the mempool here because extents that aren't
-        * checksummed or compressed can be too big for the mempool:
-        */
-       *rbio = kzalloc(sizeof(struct bch_read_bio) +
-                       sizeof(struct bio_vec) * pages,
-                       GFP_NOFS);
-       if (!*rbio)
-               goto err;
-
-       rbio_init(&(*rbio)->bio, opts);
-       bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
-
-       if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
-                                GFP_NOFS))
-               goto err;
-
-       (*rbio)->bounce         = true;
-       (*rbio)->split          = true;
-       (*rbio)->kmalloc        = true;
-
-       if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
-                                         bch_promote_params))
-               goto err;
-
-       bio = &op->write.op.wbio.bio;
-       bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
-
-       ret = bch2_data_update_init(trans, NULL, &op->write,
-                       writepoint_hashed((unsigned long) current),
-                       opts,
-                       (struct data_update_opts) {
-                               .target         = opts.promote_target,
-                               .extra_replicas = 1,
-                               .write_flags    = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
-                       },
-                       btree_id, k);
-       /*
-        * possible errors: -BCH_ERR_nocow_lock_blocked,
-        * -BCH_ERR_ENOSPC_disk_reservation:
-        */
-       if (ret) {
-               ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
-                                       bch_promote_params);
-               BUG_ON(ret);
-               goto err;
-       }
-
-       op->write.op.end_io = promote_done;
-
-       return op;
-err:
-       if (*rbio)
-               bio_free_pages(&(*rbio)->bio);
-       kfree(*rbio);
-       *rbio = NULL;
-       kfree(op);
-       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
-       return NULL;
-}
-
-noinline
-static struct promote_op *promote_alloc(struct btree_trans *trans,
-                                       struct bvec_iter iter,
-                                       struct bkey_s_c k,
-                                       struct extent_ptr_decoded *pick,
-                                       struct bch_io_opts opts,
-                                       unsigned flags,
-                                       struct bch_read_bio **rbio,
-                                       bool *bounce,
-                                       bool *read_full)
-{
-       struct bch_fs *c = trans->c;
-       bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
-       /* data might have to be decompressed in the write path: */
-       unsigned sectors = promote_full
-               ? max(pick->crc.compressed_size, pick->crc.live_size)
-               : bvec_iter_sectors(iter);
-       struct bpos pos = promote_full
-               ? bkey_start_pos(k.k)
-               : POS(k.k->p.inode, iter.bi_sector);
-       struct promote_op *promote;
-
-       if (!should_promote(c, k, pos, opts, flags))
-               return NULL;
-
-       promote = __promote_alloc(trans,
-                                 k.k->type == KEY_TYPE_reflink_v
-                                 ? BTREE_ID_reflink
-                                 : BTREE_ID_extents,
-                                 k, pos, pick, opts, sectors, rbio);
-       if (!promote)
-               return NULL;
-
-       *bounce         = true;
-       *read_full      = promote_full;
-       return promote;
-}
-
-/* Read */
-
-#define READ_RETRY_AVOID       1
-#define READ_RETRY             2
-#define READ_ERR               3
-
-enum rbio_context {
-       RBIO_CONTEXT_NULL,
-       RBIO_CONTEXT_HIGHPRI,
-       RBIO_CONTEXT_UNBOUND,
-};
-
-static inline struct bch_read_bio *
-bch2_rbio_parent(struct bch_read_bio *rbio)
-{
-       return rbio->split ? rbio->parent : rbio;
-}
-
-__always_inline
-static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
-                          enum rbio_context context,
-                          struct workqueue_struct *wq)
-{
-       if (context <= rbio->context) {
-               fn(&rbio->work);
-       } else {
-               rbio->work.func         = fn;
-               rbio->context           = context;
-               queue_work(wq, &rbio->work);
-       }
-}
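-
-/*
- * Note on the above: rbio_context values are ordered (NULL < HIGHPRI <
- * UNBOUND) and a punt only ever escalates: if we're already running in a
- * context at least as unrestricted as the one requested, fn runs inline;
- * otherwise the work is bounced to the given workqueue and rbio->context
- * is raised so later punts stay put.
- */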
-
-static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
-{
-       BUG_ON(rbio->bounce && !rbio->split);
-
-       if (rbio->promote)
-               promote_free(rbio->c, rbio->promote);
-       rbio->promote = NULL;
-
-       if (rbio->bounce)
-               bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
-
-       if (rbio->split) {
-               struct bch_read_bio *parent = rbio->parent;
-
-               if (rbio->kmalloc)
-                       kfree(rbio);
-               else
-                       bio_put(&rbio->bio);
-
-               rbio = parent;
-       }
-
-       return rbio;
-}
-
-/*
- * Only called on a top level bch_read_bio to complete an entire read request,
- * not a split:
- */
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
-       if (rbio->start_time)
-               bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read],
-                                      rbio->start_time);
-       bio_endio(&rbio->bio);
-}
-
-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
-                                    struct bvec_iter bvec_iter,
-                                    struct bch_io_failures *failed,
-                                    unsigned flags)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_buf sk;
-       struct bkey_s_c k;
-       int ret;
-
-       flags &= ~BCH_READ_LAST_FRAGMENT;
-       flags |= BCH_READ_MUST_CLONE;
-
-       bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
-                            rbio->read_pos, BTREE_ITER_SLOTS);
-retry:
-       rbio->bio.bi_status = 0;
-
-       k = bch2_btree_iter_peek_slot(&iter);
-       if (bkey_err(k))
-               goto err;
-
-       bch2_bkey_buf_reassemble(&sk, c, k);
-       k = bkey_i_to_s_c(sk.k);
-       bch2_trans_unlock(&trans);
-
-       if (!bch2_bkey_matches_ptr(c, k,
-                                  rbio->pick.ptr,
-                                  rbio->data_pos.offset -
-                                  rbio->pick.crc.offset)) {
-               /* extent we wanted to read no longer exists: */
-               rbio->hole = true;
-               goto out;
-       }
-
-       ret = __bch2_read_extent(&trans, rbio, bvec_iter,
-                                rbio->read_pos,
-                                rbio->data_btree,
-                                k, 0, failed, flags);
-       if (ret == READ_RETRY)
-               goto retry;
-       if (ret)
-               goto err;
-out:
-       bch2_rbio_done(rbio);
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&sk, c);
-       return;
-err:
-       rbio->bio.bi_status = BLK_STS_IOERR;
-       goto out;
-}
-
-static void bch2_rbio_retry(struct work_struct *work)
-{
-       struct bch_read_bio *rbio =
-               container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c        = rbio->c;
-       struct bvec_iter iter   = rbio->bvec_iter;
-       unsigned flags          = rbio->flags;
-       subvol_inum inum = {
-               .subvol = rbio->subvol,
-               .inum   = rbio->read_pos.inode,
-       };
-       struct bch_io_failures failed = { .nr = 0 };
-
-       trace_and_count(c, read_retry, &rbio->bio);
-
-       if (rbio->retry == READ_RETRY_AVOID)
-               bch2_mark_io_failure(&failed, &rbio->pick);
-
-       rbio->bio.bi_status = 0;
-
-       rbio = bch2_rbio_free(rbio);
-
-       flags |= BCH_READ_IN_RETRY;
-       flags &= ~BCH_READ_MAY_PROMOTE;
-
-       if (flags & BCH_READ_NODECODE) {
-               bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
-       } else {
-               flags &= ~BCH_READ_LAST_FRAGMENT;
-               flags |= BCH_READ_MUST_CLONE;
-
-               __bch2_read(c, rbio, iter, inum, &failed, flags);
-       }
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
-                           blk_status_t error)
-{
-       rbio->retry = retry;
-
-       if (rbio->flags & BCH_READ_IN_RETRY)
-               return;
-
-       if (retry == READ_ERR) {
-               rbio = bch2_rbio_free(rbio);
-
-               rbio->bio.bi_status = error;
-               bch2_rbio_done(rbio);
-       } else {
-               bch2_rbio_punt(rbio, bch2_rbio_retry,
-                              RBIO_CONTEXT_UNBOUND, system_unbound_wq);
-       }
-}
-
-static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
-                                  struct bch_read_bio *rbio)
-{
-       struct bch_fs *c = rbio->c;
-       u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
-       struct bch_extent_crc_unpacked new_crc;
-       struct btree_iter iter;
-       struct bkey_i *new;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       if (crc_is_compressed(rbio->pick.crc))
-               return 0;
-
-       k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
-                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       if ((ret = bkey_err(k)))
-               goto out;
-
-       if (bversion_cmp(k.k->version, rbio->version) ||
-           !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
-               goto out;
-
-       /* Extent was merged? */
-       if (bkey_start_offset(k.k) < data_offset ||
-           k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
-               goto out;
-
-       if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
-                       rbio->pick.crc, NULL, &new_crc,
-                       bkey_start_offset(k.k) - data_offset, k.k->size,
-                       rbio->pick.crc.csum_type)) {
-               bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
-               ret = 0;
-               goto out;
-       }
-
-       /*
-        * going to be temporarily appending another checksum entry:
-        */
-       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
-                                sizeof(struct bch_extent_crc128));
-       if ((ret = PTR_ERR_OR_ZERO(new)))
-               goto out;
-
-       bkey_reassemble(new, k);
-
-       if (!bch2_bkey_narrow_crcs(new, new_crc))
-               goto out;
-
-       ret = bch2_trans_update(trans, &iter, new,
-                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-out:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
-{
-       bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
-                     __bch2_rbio_narrow_crcs(&trans, rbio));
-}
-
-/* Inner part that may run in process context */
-static void __bch2_read_endio(struct work_struct *work)
-{
-       struct bch_read_bio *rbio =
-               container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c        = rbio->c;
-       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
-       struct bio *src         = &rbio->bio;
-       struct bio *dst         = &bch2_rbio_parent(rbio)->bio;
-       struct bvec_iter dst_iter = rbio->bvec_iter;
-       struct bch_extent_crc_unpacked crc = rbio->pick.crc;
-       struct nonce nonce = extent_nonce(rbio->version, crc);
-       unsigned nofs_flags;
-       struct bch_csum csum;
-       int ret;
-
-       nofs_flags = memalloc_nofs_save();
-
-       /* Reset iterator for checksumming and copying bounced data: */
-       if (rbio->bounce) {
-               src->bi_iter.bi_size            = crc.compressed_size << 9;
-               src->bi_iter.bi_idx             = 0;
-               src->bi_iter.bi_bvec_done       = 0;
-       } else {
-               src->bi_iter                    = rbio->bvec_iter;
-       }
-
-       csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-       if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
-               goto csum_err;
-
-       /*
-        * XXX
-        * We need to rework the narrow_crcs path to deliver the read completion
-        * first, and then punt to a different workqueue, otherwise we're
-        * holding up reads while doing btree updates which is bad for memory
-        * reclaim.
-        */
-       if (unlikely(rbio->narrow_crcs))
-               bch2_rbio_narrow_crcs(rbio);
-
-       if (rbio->flags & BCH_READ_NODECODE)
-               goto nodecode;
-
-       /* Adjust crc to point to subset of data we want: */
-       crc.offset     += rbio->offset_into_extent;
-       crc.live_size   = bvec_iter_sectors(rbio->bvec_iter);
-
-       if (crc_is_compressed(crc)) {
-               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-               if (ret)
-                       goto decrypt_err;
-
-               if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
-                   !c->opts.no_data_io)
-                       goto decompression_err;
-       } else {
-               /* don't need to decrypt the entire bio: */
-               nonce = nonce_add(nonce, crc.offset << 9);
-               bio_advance(src, crc.offset << 9);
-
-               BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
-               src->bi_iter.bi_size = dst_iter.bi_size;
-
-               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-               if (ret)
-                       goto decrypt_err;
-
-               if (rbio->bounce) {
-                       struct bvec_iter src_iter = src->bi_iter;
-                       bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
-               }
-       }
-
-       if (rbio->promote) {
-               /*
-        * Re-encrypt the data we decrypted, so it's consistent with
-                * rbio->crc:
-                */
-               ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
-               if (ret)
-                       goto decrypt_err;
-
-               promote_start(rbio->promote, rbio);
-               rbio->promote = NULL;
-       }
-nodecode:
-       if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
-               rbio = bch2_rbio_free(rbio);
-               bch2_rbio_done(rbio);
-       }
-out:
-       memalloc_nofs_restore(nofs_flags);
-       return;
-csum_err:
-       /*
-        * Checksum error: if the bio wasn't bounced, we may have been
-        * reading into buffers owned by userspace (that userspace can
-        * scribble over) - retry the read, bouncing it this time:
-        */
-       if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
-               rbio->flags |= BCH_READ_MUST_BOUNCE;
-               bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
-               goto out;
-       }
-
-       bch_err_inum_offset_ratelimited(ca,
-               rbio->read_pos.inode,
-               rbio->read_pos.offset << 9,
-               "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
-               rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
-               csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
-       bch2_io_error(ca);
-       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-       goto out;
-decompression_err:
-       bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
-                                       rbio->read_pos.offset << 9,
-                                       "decompression error");
-       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
-       goto out;
-decrypt_err:
-       bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
-                                       rbio->read_pos.offset << 9,
-                                       "decrypt error");
-       bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
-       goto out;
-}
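
__bch2_read_endio() brackets everything in memalloc_nofs_save()/memalloc_nofs_restore(), so allocations inside the span implicitly behave as GFP_NOFS and can't recurse into filesystem reclaim while a read is being completed. Because the old flag word is returned and later restored, the scopes nest. A userspace sketch of that save/restore shape — the flag word and helpers here are stand-ins, not the kernel API:

/* Userspace sketch of the memalloc_nofs_save()/restore() scope pattern:
 * the previous flag word is returned, so scopes nest correctly. */
#include <stdio.h>

static unsigned current_flags;

static unsigned nofs_save(void)
{
	unsigned old = current_flags;

	current_flags |= 1;	/* "allocations must not recurse into the FS" */
	return old;
}

static void nofs_restore(unsigned old)
{
	current_flags = old;	/* the enclosing scope's setting comes back */
}

int main(void)
{
	unsigned outer = nofs_save();
	unsigned inner = nofs_save();	/* nested scope: already set, still safe */

	nofs_restore(inner);		/* inner restore keeps the flag set */
	printf("after inner restore: %u\n", current_flags);
	nofs_restore(outer);
	printf("after outer restore: %u\n", current_flags);
	return 0;
}
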
-
-static void bch2_read_endio(struct bio *bio)
-{
-       struct bch_read_bio *rbio =
-               container_of(bio, struct bch_read_bio, bio);
-       struct bch_fs *c        = rbio->c;
-       struct bch_dev *ca      = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
-       struct workqueue_struct *wq = NULL;
-       enum rbio_context context = RBIO_CONTEXT_NULL;
-
-       if (rbio->have_ioref) {
-               bch2_latency_acct(ca, rbio->submit_time, READ);
-               percpu_ref_put(&ca->io_ref);
-       }
-
-       if (!rbio->split)
-               rbio->bio.bi_end_io = rbio->end_io;
-
-       if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
-                                   rbio->read_pos.inode,
-                                   rbio->read_pos.offset,
-                                   "data read error: %s",
-                              bch2_blk_status_to_str(bio->bi_status))) {
-               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
-               return;
-       }
-
-       if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
-           ptr_stale(ca, &rbio->pick.ptr)) {
-               trace_and_count(c, read_reuse_race, &rbio->bio);
-
-               if (rbio->flags & BCH_READ_RETRY_IF_STALE)
-                       bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
-               else
-                       bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
-               return;
-       }
-
-       if (rbio->narrow_crcs ||
-           rbio->promote ||
-           crc_is_compressed(rbio->pick.crc) ||
-           bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
-               context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
-       else if (rbio->pick.crc.csum_type)
-               context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
-
-       bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
-}
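
The tail of bch2_read_endio() chooses where to finish the read based on how heavy the remaining work is: narrowing crcs, promotion, decompression or decryption may block, so those go to the unbound workqueue; checksum-only verification goes to the high-priority one; and reads needing neither complete in the current context. A condensed, self-contained sketch of that decision — the struct and enum are invented for illustration:

/* Condensed sketch of the completion-context decision above. */
#include <stdbool.h>
#include <stdio.h>

enum ctx { CTX_NULL, CTX_HIGHPRI, CTX_UNBOUND };

struct read_state {
	bool narrow_crcs, promote, compressed, encrypted, csummed;
};

static enum ctx pick_context(const struct read_state *r)
{
	if (r->narrow_crcs || r->promote || r->compressed || r->encrypted)
		return CTX_UNBOUND;	/* may block: btree update, crypto, decompress */
	if (r->csummed)
		return CTX_HIGHPRI;	/* cheap, but can't checksum in IRQ context */
	return CTX_NULL;		/* nothing left to do; complete inline */
}

int main(void)
{
	struct read_state r = { .csummed = true };

	printf("context = %d\n", pick_context(&r));
	return 0;
}
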
-
-int __bch2_read_indirect_extent(struct btree_trans *trans,
-                               unsigned *offset_into_extent,
-                               struct bkey_buf *orig_k)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       u64 reflink_offset;
-       int ret;
-
-       reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
-               *offset_into_extent;
-
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
-                              POS(0, reflink_offset), 0);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       if (k.k->type != KEY_TYPE_reflink_v &&
-           k.k->type != KEY_TYPE_indirect_inline_data) {
-               bch_err_inum_offset_ratelimited(trans->c,
-                       orig_k->k->k.p.inode,
-                       orig_k->k->k.p.offset << 9,
-                       "%llu len %u points to nonexistent indirect extent %llu",
-                       orig_k->k->k.p.offset,
-                       orig_k->k->k.size,
-                       reflink_offset);
-               bch2_inconsistent_error(trans->c);
-               ret = -EIO;
-               goto err;
-       }
-
-       *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
-       bch2_bkey_buf_reassemble(orig_k, trans->c, k);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
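
The reflink lookup is plain offset arithmetic: the position in the reflink btree is the pointer's idx plus how far into the logical extent the read starts, and offset_into_extent is then recomputed relative to the start of the indirect extent actually found. A worked sketch — all numbers invented:

/* Worked sketch of the indirect extent offset arithmetic. */
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t reflink_idx        = 1000;	/* from the reflink_p value */
	uint64_t offset_into_extent = 16;	/* sectors into the logical extent */

	/* position looked up in the reflink btree */
	uint64_t reflink_offset = reflink_idx + offset_into_extent;

	/* suppose the indirect extent found there starts at sector 992 */
	uint64_t indirect_start = 992;
	uint64_t new_offset     = reflink_offset - indirect_start;

	printf("lookup at %" PRIu64 ", offset into indirect extent %" PRIu64 "\n",
	       reflink_offset, new_offset);
	return 0;
}
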
-
-static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
-                                                  struct bkey_s_c k,
-                                                  struct bch_extent_ptr ptr)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
-       struct btree_iter iter;
-       struct printbuf buf = PRINTBUF;
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-                            PTR_BUCKET_POS(c, &ptr),
-                            BTREE_ITER_CACHED);
-
-       prt_printf(&buf, "Attempting to read from stale dirty pointer:");
-       printbuf_indent_add(&buf, 2);
-       prt_newline(&buf);
-
-       bch2_bkey_val_to_text(&buf, c, k);
-       prt_newline(&buf);
-
-       prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
-
-       ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
-       if (!ret) {
-               prt_newline(&buf);
-               bch2_bkey_val_to_text(&buf, c, k);
-       }
-
-       bch2_fs_inconsistent(c, "%s", buf.buf);
-
-       bch2_trans_iter_exit(trans, &iter);
-       printbuf_exit(&buf);
-}
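
This function also shows the usual printbuf idiom: declare with PRINTBUF, append pieces with prt_printf() and friends, emit the finished message once, then printbuf_exit() to free the buffer. A rough userspace stand-in for that idiom — the real API additionally handles indentation and tolerates allocation failure:

/* Rough stand-in for the printbuf idiom (PRINTBUF / prt_printf /
 * printbuf_exit): build up a message, emit it once, free at the end. */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

struct pbuf { char *s; size_t len, cap; };

static void prt(struct pbuf *b, const char *fmt, ...)
{
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsnprintf(NULL, 0, fmt, ap);	/* measure */
	va_end(ap);

	if (b->len + n + 1 > b->cap) {
		b->cap = (b->len + n + 1) * 2;
		b->s   = realloc(b->s, b->cap);
		if (!b->s)
			abort();
	}

	va_start(ap, fmt);
	vsnprintf(b->s + b->len, n + 1, fmt, ap);	/* append */
	va_end(ap);
	b->len += n;
}

int main(void)
{
	struct pbuf b = { 0 };

	prt(&b, "Attempting to read from stale dirty pointer:");
	prt(&b, "\n  memory gen: %u", 42);	/* invented value */

	fprintf(stderr, "%s\n", b.s);	/* emit the whole message once */
	free(b.s);			/* printbuf_exit() equivalent */
	return 0;
}
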
-
-int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
-                      struct bvec_iter iter, struct bpos read_pos,
-                      enum btree_id data_btree, struct bkey_s_c k,
-                      unsigned offset_into_extent,
-                      struct bch_io_failures *failed, unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct extent_ptr_decoded pick;
-       struct bch_read_bio *rbio = NULL;
-       struct bch_dev *ca = NULL;
-       struct promote_op *promote = NULL;
-       bool bounce = false, read_full = false, narrow_crcs = false;
-       struct bpos data_pos = bkey_start_pos(k.k);
-       int pick_ret;
-
-       if (bkey_extent_is_inline_data(k.k)) {
-               unsigned bytes = min_t(unsigned, iter.bi_size,
-                                      bkey_inline_data_bytes(k.k));
-
-               swap(iter.bi_size, bytes);
-               memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
-               swap(iter.bi_size, bytes);
-               bio_advance_iter(&orig->bio, &iter, bytes);
-               zero_fill_bio_iter(&orig->bio, iter);
-               goto out_read_done;
-       }
-retry_pick:
-       pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
-
-       /* hole or reservation - just zero fill: */
-       if (!pick_ret)
-               goto hole;
-
-       if (pick_ret < 0) {
-               bch_err_inum_offset_ratelimited(c,
-                               read_pos.inode, read_pos.offset << 9,
-                               "no device to read from");
-               goto err;
-       }
-
-       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
-
-       /*
-        * Stale dirty pointers are treated as IO errors, but @failed isn't
-        * allocated unless we're in the retry path - so if we're not in the
-        * retry path, don't check here; it'll be caught in bch2_read_endio()
-        * and we'll end up in the retry path:
-        */
-       if ((flags & BCH_READ_IN_RETRY) &&
-           !pick.ptr.cached &&
-           unlikely(ptr_stale(ca, &pick.ptr))) {
-               read_from_stale_dirty_pointer(trans, k, pick.ptr);
-               bch2_mark_io_failure(failed, &pick);
-               goto retry_pick;
-       }
-
-       /*
-        * Unlock the iterator while the btree node's lock is still in
-        * cache, before doing the IO:
-        */
-       bch2_trans_unlock(trans);
-
-       if (flags & BCH_READ_NODECODE) {
-               /*
-                * can happen if we retry, and the extent we were going to read
-                * has been merged in the meantime:
-                */
-               if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
-                       goto hole;
-
-               iter.bi_size    = pick.crc.compressed_size << 9;
-               goto get_bio;
-       }
-
-       if (!(flags & BCH_READ_LAST_FRAGMENT) ||
-           bio_flagged(&orig->bio, BIO_CHAIN))
-               flags |= BCH_READ_MUST_CLONE;
-
-       narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
-               bch2_can_narrow_extent_crcs(k, pick.crc);
-
-       if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
-               flags |= BCH_READ_MUST_BOUNCE;
-
-       EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
-
-       if (crc_is_compressed(pick.crc) ||
-           (pick.crc.csum_type != BCH_CSUM_none &&
-            (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-             (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
-              (flags & BCH_READ_USER_MAPPED)) ||
-             (flags & BCH_READ_MUST_BOUNCE)))) {
-               read_full = true;
-               bounce = true;
-       }
-
-       if (orig->opts.promote_target)
-               promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
-                                       &rbio, &bounce, &read_full);
-
-       if (!read_full) {
-               EBUG_ON(crc_is_compressed(pick.crc));
-               EBUG_ON(pick.crc.csum_type &&
-                       (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
-                        bvec_iter_sectors(iter) != pick.crc.live_size ||
-                        pick.crc.offset ||
-                        offset_into_extent));
-
-               data_pos.offset += offset_into_extent;
-               pick.ptr.offset += pick.crc.offset +
-                       offset_into_extent;
-               offset_into_extent              = 0;
-               pick.crc.compressed_size        = bvec_iter_sectors(iter);
-               pick.crc.uncompressed_size      = bvec_iter_sectors(iter);
-               pick.crc.offset                 = 0;
-               pick.crc.live_size              = bvec_iter_sectors(iter);
-       }
-get_bio:
-       if (rbio) {
-               /*
-                * The promote path already allocated a bounce rbio for us:
-                * promote needs a bio big enough for uncompressing data in the
-                * write path, but we're not going to use all of it here:
-                */
-               EBUG_ON(rbio->bio.bi_iter.bi_size <
-                      pick.crc.compressed_size << 9);
-               rbio->bio.bi_iter.bi_size =
-                       pick.crc.compressed_size << 9;
-       } else if (bounce) {
-               unsigned sectors = pick.crc.compressed_size;
-
-               rbio = rbio_init(bio_alloc_bioset(NULL,
-                                                 DIV_ROUND_UP(sectors, PAGE_SECTORS),
-                                                 0,
-                                                 GFP_NOFS,
-                                                 &c->bio_read_split),
-                                orig->opts);
-
-               bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
-               rbio->bounce    = true;
-               rbio->split     = true;
-       } else if (flags & BCH_READ_MUST_CLONE) {
-               /*
-                * Have to clone if there were any splits, due to error
-                * reporting issues: if a split errored and retrying didn't
-                * work, then when it reports the error to its parent (us) we
-                * can't tell whether the error was from our bio (and we should
-                * retry) or from the whole bio (in which case we don't want to
-                * retry and lose the error).
-                */
-               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
-                                                &c->bio_read_split),
-                                orig->opts);
-               rbio->bio.bi_iter = iter;
-               rbio->split     = true;
-       } else {
-               rbio = orig;
-               rbio->bio.bi_iter = iter;
-               EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
-       }
-
-       EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
-
-       rbio->c                 = c;
-       rbio->submit_time       = local_clock();
-       if (rbio->split)
-               rbio->parent    = orig;
-       else
-               rbio->end_io    = orig->bio.bi_end_io;
-       rbio->bvec_iter         = iter;
-       rbio->offset_into_extent= offset_into_extent;
-       rbio->flags             = flags;
-       rbio->have_ioref        = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
-       rbio->narrow_crcs       = narrow_crcs;
-       rbio->hole              = 0;
-       rbio->retry             = 0;
-       rbio->context           = 0;
-       /* XXX: only initialize this if needed */
-       rbio->devs_have         = bch2_bkey_devs(k);
-       rbio->pick              = pick;
-       rbio->subvol            = orig->subvol;
-       rbio->read_pos          = read_pos;
-       rbio->data_btree        = data_btree;
-       rbio->data_pos          = data_pos;
-       rbio->version           = k.k->version;
-       rbio->promote           = promote;
-       INIT_WORK(&rbio->work, NULL);
-
-       rbio->bio.bi_opf        = orig->bio.bi_opf;
-       rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
-       rbio->bio.bi_end_io     = bch2_read_endio;
-
-       if (rbio->bounce)
-               trace_and_count(c, read_bounce, &rbio->bio);
-
-       this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
-       bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
-
-       /*
-        * If it's being moved internally, we don't want to flag it as a cache
-        * hit:
-        */
-       if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
-               bch2_bucket_io_time_reset(trans, pick.ptr.dev,
-                       PTR_BUCKET_NR(ca, &pick.ptr), READ);
-
-       if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
-               bio_inc_remaining(&orig->bio);
-               trace_and_count(c, read_split, &orig->bio);
-       }
-
-       if (!rbio->pick.idx) {
-               if (!rbio->have_ioref) {
-                       bch_err_inum_offset_ratelimited(c,
-                                       read_pos.inode,
-                                       read_pos.offset << 9,
-                                       "no device to read from");
-                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-                       goto out;
-               }
-
-               this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
-                            bio_sectors(&rbio->bio));
-               bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
-
-               if (unlikely(c->opts.no_data_io)) {
-                       if (likely(!(flags & BCH_READ_IN_RETRY)))
-                               bio_endio(&rbio->bio);
-               } else {
-                       if (likely(!(flags & BCH_READ_IN_RETRY)))
-                               submit_bio(&rbio->bio);
-                       else
-                               submit_bio_wait(&rbio->bio);
-               }
-
-               /*
-                * We just submitted IO which may block; we expect relock fail
-                * events and shouldn't count them:
-                */
-               trans->notrace_relock_fail = true;
-       } else {
-               /* Attempting reconstruct read: */
-               if (bch2_ec_read_extent(c, rbio)) {
-                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-                       goto out;
-               }
-
-               if (likely(!(flags & BCH_READ_IN_RETRY)))
-                       bio_endio(&rbio->bio);
-       }
-out:
-       if (likely(!(flags & BCH_READ_IN_RETRY))) {
-               return 0;
-       } else {
-               int ret;
-
-               rbio->context = RBIO_CONTEXT_UNBOUND;
-               bch2_read_endio(&rbio->bio);
-
-               ret = rbio->retry;
-               rbio = bch2_rbio_free(rbio);
-
-               if (ret == READ_RETRY_AVOID) {
-                       bch2_mark_io_failure(failed, &pick);
-                       ret = READ_RETRY;
-               }
-
-               if (!ret)
-                       goto out_read_done;
-
-               return ret;
-       }
-
-err:
-       if (flags & BCH_READ_IN_RETRY)
-               return READ_ERR;
-
-       orig->bio.bi_status = BLK_STS_IOERR;
-       goto out_read_done;
-
-hole:
-       /*
-        * won't normally happen in the BCH_READ_NODECODE
-        * (bch2_move_extent()) path, but if we retry and the extent we wanted
-        * to read no longer exists we have to signal that:
-        */
-       if (flags & BCH_READ_NODECODE)
-               orig->hole = true;
-
-       zero_fill_bio_iter(&orig->bio, iter);
-out_read_done:
-       if (flags & BCH_READ_LAST_FRAGMENT)
-               bch2_rbio_done(orig);
-       return 0;
-}
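
Note the return contract here: without BCH_READ_IN_RETRY the function completes asynchronously and returns 0; with it, the read is driven synchronously and READ_RETRY / READ_RETRY_AVOID come back to the caller's retry loop, with READ_RETRY_AVOID first recording the failed device in @failed. A toy model of a caller driving that contract — try_read() and the bitmask stand-in are invented:

/* Toy model of a caller driving the BCH_READ_IN_RETRY return contract. */
#include <stdio.h>

enum { READ_DONE, READ_RETRY, READ_RETRY_AVOID, READ_ERR };

static int try_read(int attempt)
{
	/* pretend the first replica is stale, the second one succeeds */
	return attempt == 0 ? READ_RETRY_AVOID : READ_DONE;
}

int main(void)
{
	unsigned avoided = 0;	/* stands in for struct bch_io_failures */

	for (int attempt = 0;; attempt++) {
		int ret = try_read(attempt);

		if (ret == READ_RETRY_AVOID)
			avoided |= 1U << attempt;	/* don't pick that device again */

		if (ret == READ_DONE || ret == READ_ERR) {
			printf("finished after %d attempts (avoided=%#x)\n",
			       attempt + 1, avoided);
			return ret == READ_ERR;
		}
	}
}
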
-
-void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                struct bvec_iter bvec_iter, subvol_inum inum,
-                struct bch_io_failures *failed, unsigned flags)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_buf sk;
-       struct bkey_s_c k;
-       u32 snapshot;
-       int ret;
-
-       BUG_ON(flags & BCH_READ_NODECODE);
-
-       bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-       iter = (struct btree_iter) { NULL };
-
-       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
-       if (ret)
-               goto err;
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
-                            SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
-                            BTREE_ITER_SLOTS);
-       while (1) {
-               unsigned bytes, sectors, offset_into_extent;
-               enum btree_id data_btree = BTREE_ID_extents;
-
-               /*
-                * read_extent -> io_time_reset may cause a transaction restart
-                * without returning an error; we need to check for that here:
-                */
-               ret = bch2_trans_relock(&trans);
-               if (ret)
-                       break;
-
-               bch2_btree_iter_set_pos(&iter,
-                               POS(inum.inum, bvec_iter.bi_sector));
-
-               k = bch2_btree_iter_peek_slot(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       break;
-
-               offset_into_extent = iter.pos.offset -
-                       bkey_start_offset(k.k);
-               sectors = k.k->size - offset_into_extent;
-
-               bch2_bkey_buf_reassemble(&sk, c, k);
-
-               ret = bch2_read_indirect_extent(&trans, &data_btree,
-                                       &offset_into_extent, &sk);
-               if (ret)
-                       break;
-
-               k = bkey_i_to_s_c(sk.k);
-
-               /*
-                * With indirect extents, the amount of data to read is the min
-                * of the original extent and the indirect extent:
-                */
-               sectors = min(sectors, k.k->size - offset_into_extent);
-
-               bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
-               swap(bvec_iter.bi_size, bytes);
-
-               if (bvec_iter.bi_size == bytes)
-                       flags |= BCH_READ_LAST_FRAGMENT;
-
-               ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
-                                        data_btree, k,
-                                        offset_into_extent, failed, flags);
-               if (ret)
-                       break;
-
-               if (flags & BCH_READ_LAST_FRAGMENT)
-                       break;
-
-               swap(bvec_iter.bi_size, bytes);
-               bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
-
-               ret = btree_trans_too_many_iters(&trans);
-               if (ret)
-                       break;
-       }
-err:
-       bch2_trans_iter_exit(&trans, &iter);
-
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-           ret == READ_RETRY ||
-           ret == READ_RETRY_AVOID)
-               goto retry;
-
-       bch2_trans_exit(&trans);
-       bch2_bkey_buf_exit(&sk, c);
-
-       if (ret) {
-               bch_err_inum_offset_ratelimited(c, inum.inum,
-                                               bvec_iter.bi_sector << 9,
-                                               "read error %i from btree lookup", ret);
-               rbio->bio.bi_status = BLK_STS_IOERR;
-               bch2_rbio_done(rbio);
-       }
-}
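
The fragment loop above never copies the bvec_iter: it temporarily shrinks bi_size with swap(), hands the limited view to __bch2_read_extent(), then swaps the size back and advances. The check between the swaps (bi_size == bytes) is also what detects the final fragment. The trick in isolation:

/* The bi_size swap trick: limit an iterator to one fragment without
 * copying it, then restore and advance. */
#include <stdio.h>

struct iter { unsigned pos, size; };

int main(void)
{
	struct iter it = { .pos = 0, .size = 4096 };
	unsigned bytes = 512;	/* this fragment's length */
	unsigned tmp;

	tmp = it.size; it.size = bytes; bytes = tmp;	/* swap(): limit the view */
	printf("submit %u bytes at %u\n", it.size, it.pos);
	/* (it.size == bytes here would mean this was the last fragment) */
	tmp = it.size; it.size = bytes; bytes = tmp;	/* swap back */

	it.pos  += 512;		/* bio_advance_iter() */
	it.size -= 512;
	printf("remaining: %u bytes at %u\n", it.size, it.pos);
	return 0;
}
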
-
-void bch2_fs_io_exit(struct bch_fs *c)
-{
-       if (c->promote_table.tbl)
-               rhashtable_destroy(&c->promote_table);
-       mempool_exit(&c->bio_bounce_pages);
-       bioset_exit(&c->bio_write);
-       bioset_exit(&c->bio_read_split);
-       bioset_exit(&c->bio_read);
-}
-
-int bch2_fs_io_init(struct bch_fs *c)
-{
-       if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_bio_read_init;
-
-       if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_bio_read_split_init;
-
-       if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
-                       BIOSET_NEED_BVECS))
-               return -BCH_ERR_ENOMEM_bio_write_init;
-
-       if (mempool_init_page_pool(&c->bio_bounce_pages,
-                                  max_t(unsigned,
-                                        c->opts.btree_node_size,
-                                        c->opts.encoded_extent_max) /
-                                  PAGE_SIZE, 0))
-               return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
-
-       if (rhashtable_init(&c->promote_table, &bch_promote_params))
-               return -BCH_ERR_ENOMEM_promote_table_init;
-
-       return 0;
-}
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
deleted file mode 100644 (file)
index 1476380..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_H
-#define _BCACHEFS_IO_H
-
-#include "checksum.h"
-#include "bkey_buf.h"
-#include "io_types.h"
-
-#define to_wbio(_bio)                  \
-       container_of((_bio), struct bch_write_bio, bio)
-
-#define to_rbio(_bio)                  \
-       container_of((_bio), struct bch_read_bio, bio)
-
-void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
-void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void bch2_latency_acct(struct bch_dev *, u64, int);
-#else
-static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
-#endif
-
-void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
-                              enum bch_data_type, const struct bkey_i *, bool);
-
-#define BLK_STS_REMOVED                ((__force blk_status_t)128)
-
-const char *bch2_blk_status_to_str(blk_status_t);
-
-#define BCH_WRITE_FLAGS()              \
-       x(ALLOC_NOWAIT)                 \
-       x(CACHED)                       \
-       x(DATA_ENCODED)                 \
-       x(PAGES_STABLE)                 \
-       x(PAGES_OWNED)                  \
-       x(ONLY_SPECIFIED_DEVS)          \
-       x(WROTE_DATA_INLINE)            \
-       x(FROM_INTERNAL)                \
-       x(CHECK_ENOSPC)                 \
-       x(SYNC)                         \
-       x(MOVE)                         \
-       x(IN_WORKER)                    \
-       x(DONE)                         \
-       x(IO_ERROR)                     \
-       x(CONVERT_UNWRITTEN)
-
-enum __bch_write_flags {
-#define x(f)   __BCH_WRITE_##f,
-       BCH_WRITE_FLAGS()
-#undef x
-};
-
-enum bch_write_flags {
-#define x(f)   BCH_WRITE_##f = 1U << __BCH_WRITE_##f,
-       BCH_WRITE_FLAGS()
-#undef x
-};
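
BCH_WRITE_FLAGS() is an x-macro: the same list expands once into bit indices and once into masks, so the two enums can never drift apart, and the list can be expanded a third time into name strings for debug output. A compact self-contained example of the pattern:

/* One list, expanded into bit indices, masks, and name strings. */
#include <stdio.h>

#define FLAGS()		\
	x(SYNC)		\
	x(MOVE)		\
	x(DONE)

enum flag_bits {
#define x(f) __FLAG_##f,
	FLAGS()
#undef x
	__FLAG_NR,
};

enum flag_masks {
#define x(f) FLAG_##f = 1U << __FLAG_##f,
	FLAGS()
#undef x
};

static const char * const flag_names[] = {
#define x(f) #f,
	FLAGS()
#undef x
};

int main(void)
{
	unsigned v = FLAG_SYNC | FLAG_DONE;

	for (unsigned i = 0; i < __FLAG_NR; i++)
		if (v & (1U << i))
			printf("%s\n", flag_names[i]);
	return 0;
}
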
-
-static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
-{
-       return op->watermark == BCH_WATERMARK_copygc
-               ? op->c->copygc_wq
-               : op->c->btree_update_wq;
-}
-
-int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
-                              struct bkey_i *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, subvol_inum,
-                      struct btree_iter *, struct bkey_i *,
-                      struct disk_reservation *, u64, s64 *, bool);
-int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *,
-                         unsigned, struct bch_io_opts, s64 *,
-                         struct write_point_specifier);
-
-int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
-                  subvol_inum, u64, s64 *);
-int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
-
-static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
-                                     struct bch_io_opts opts)
-{
-       op->c                   = c;
-       op->end_io              = NULL;
-       op->flags               = 0;
-       op->written             = 0;
-       op->error               = 0;
-       op->csum_type           = bch2_data_checksum_type(c, opts);
-       op->compression_opt     = opts.compression;
-       op->nr_replicas         = 0;
-       op->nr_replicas_required = c->opts.data_replicas_required;
-       op->watermark           = BCH_WATERMARK_normal;
-       op->incompressible      = 0;
-       op->open_buckets.nr     = 0;
-       op->devs_have.nr        = 0;
-       op->target              = 0;
-       op->opts                = opts;
-       op->subvol              = 0;
-       op->pos                 = POS_MAX;
-       op->version             = ZERO_VERSION;
-       op->write_point         = (struct write_point_specifier) { 0 };
-       op->res                 = (struct disk_reservation) { 0 };
-       op->new_i_size          = U64_MAX;
-       op->i_sectors_delta     = 0;
-       op->devs_need_flush     = NULL;
-}
-
-void bch2_write(struct closure *);
-
-void bch2_write_point_do_index_updates(struct work_struct *);
-
-static inline struct bch_write_bio *wbio_init(struct bio *bio)
-{
-       struct bch_write_bio *wbio = to_wbio(bio);
-
-       memset(&wbio->wbio, 0, sizeof(wbio->wbio));
-       return wbio;
-}
-
-void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *);
-
-struct bch_devs_mask;
-struct cache_promote_op;
-struct extent_ptr_decoded;
-
-int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
-                               struct bkey_buf *);
-
-static inline int bch2_read_indirect_extent(struct btree_trans *trans,
-                                           enum btree_id *data_btree,
-                                           unsigned *offset_into_extent,
-                                           struct bkey_buf *k)
-{
-       if (k->k->k.type != KEY_TYPE_reflink_p)
-               return 0;
-
-       *data_btree = BTREE_ID_reflink;
-       return __bch2_read_indirect_extent(trans, offset_into_extent, k);
-}
-
-enum bch_read_flags {
-       BCH_READ_RETRY_IF_STALE         = 1 << 0,
-       BCH_READ_MAY_PROMOTE            = 1 << 1,
-       BCH_READ_USER_MAPPED            = 1 << 2,
-       BCH_READ_NODECODE               = 1 << 3,
-       BCH_READ_LAST_FRAGMENT          = 1 << 4,
-
-       /* internal: */
-       BCH_READ_MUST_BOUNCE            = 1 << 5,
-       BCH_READ_MUST_CLONE             = 1 << 6,
-       BCH_READ_IN_RETRY               = 1 << 7,
-};
-
-int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
-                      struct bvec_iter, struct bpos, enum btree_id,
-                      struct bkey_s_c, unsigned,
-                      struct bch_io_failures *, unsigned);
-
-static inline void bch2_read_extent(struct btree_trans *trans,
-                       struct bch_read_bio *rbio, struct bpos read_pos,
-                       enum btree_id data_btree, struct bkey_s_c k,
-                       unsigned offset_into_extent, unsigned flags)
-{
-       __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
-                          data_btree, k, offset_into_extent, NULL, flags);
-}
-
-void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
-                subvol_inum, struct bch_io_failures *, unsigned flags);
-
-static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                            subvol_inum inum)
-{
-       struct bch_io_failures failed = { .nr = 0 };
-
-       BUG_ON(rbio->_state);
-
-       rbio->c = c;
-       rbio->start_time = local_clock();
-       rbio->subvol = inum.subvol;
-
-       __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
-                   BCH_READ_RETRY_IF_STALE|
-                   BCH_READ_MAY_PROMOTE|
-                   BCH_READ_USER_MAPPED);
-}
-
-static inline struct bch_read_bio *rbio_init(struct bio *bio,
-                                            struct bch_io_opts opts)
-{
-       struct bch_read_bio *rbio = to_rbio(bio);
-
-       rbio->_state    = 0;
-       rbio->promote   = NULL;
-       rbio->opts      = opts;
-       return rbio;
-}
-
-void bch2_fs_io_exit(struct bch_fs *);
-int bch2_fs_io_init(struct bch_fs *);
-
-#endif /* _BCACHEFS_IO_H */
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
deleted file mode 100644 (file)
index 737f16d..0000000
+++ /dev/null
@@ -1,165 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_IO_TYPES_H
-#define _BCACHEFS_IO_TYPES_H
-
-#include "alloc_types.h"
-#include "btree_types.h"
-#include "buckets_types.h"
-#include "extents_types.h"
-#include "keylist_types.h"
-#include "opts.h"
-#include "super_types.h"
-
-#include <linux/llist.h>
-#include <linux/workqueue.h>
-
-struct bch_read_bio {
-       struct bch_fs           *c;
-       u64                     start_time;
-       u64                     submit_time;
-
-       /*
-        * Reads will often have to be split, and if the extent being read from
-        * was checksummed or compressed we'll also have to allocate bounce
-        * buffers and copy the data back into the original bio.
-        *
-        * If we didn't have to split, we have to save and restore the original
-        * bi_end_io - @split below indicates which:
-        */
-       union {
-       struct bch_read_bio     *parent;
-       bio_end_io_t            *end_io;
-       };
-
-       /*
-        * Saved copy of bio->bi_iter, from submission time - allows us to
-        * resubmit on IO error, and also to copy data back to the original bio
-        * when we're bouncing:
-        */
-       struct bvec_iter        bvec_iter;
-
-       unsigned                offset_into_extent;
-
-       u16                     flags;
-       union {
-       struct {
-       u16                     bounce:1,
-                               split:1,
-                               kmalloc:1,
-                               have_ioref:1,
-                               narrow_crcs:1,
-                               hole:1,
-                               retry:2,
-                               context:2;
-       };
-       u16                     _state;
-       };
-
-       struct bch_devs_list    devs_have;
-
-       struct extent_ptr_decoded pick;
-
-       /*
-        * pos we read from - different from data_pos for indirect extents:
-        */
-       u32                     subvol;
-       struct bpos             read_pos;
-
-       /*
-        * start pos of data we read (may not be pos of data we want) - for
-        * promote, narrow extents paths:
-        */
-       enum btree_id           data_btree;
-       struct bpos             data_pos;
-       struct bversion         version;
-
-       struct promote_op       *promote;
-
-       struct bch_io_opts      opts;
-
-       struct work_struct      work;
-
-       struct bio              bio;
-};
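
The flags bitfield is unioned with a bare u16 _state, which is why rbio_init() can reset every flag with a single store and BUG_ON(rbio->_state) can assert "nothing set" cheaply. A standalone illustration — the exact bit placement is implementation-defined, which is fine here since _state is only zeroed and tested against zero:

/* Flags-unioned-with-_state layout in isolation. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct rbio_state {
	union {
		struct {
			uint16_t bounce:1,
				 split:1,
				 retry:2,
				 context:2;
		};
		uint16_t _state;	/* aliases all of the bits above */
	};
};

int main(void)
{
	struct rbio_state s = { ._state = 0 };

	s.bounce = 1;
	s.retry  = 2;
	printf("_state = %#x\n", s._state);	/* both flags visible via the alias */

	s._state = 0;				/* wholesale reset, as rbio_init() does */
	assert(!s.bounce && !s.retry);
	return 0;
}
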
-
-struct bch_write_bio {
-       struct_group(wbio,
-       struct bch_fs           *c;
-       struct bch_write_bio    *parent;
-
-       u64                     submit_time;
-       u64                     inode_offset;
-
-       struct bch_devs_list    failed;
-       u8                      dev;
-
-       unsigned                split:1,
-                               bounce:1,
-                               put_bio:1,
-                               have_ioref:1,
-                               nocow:1,
-                               used_mempool:1,
-                               first_btree_write:1;
-       );
-
-       struct bio              bio;
-};
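
struct_group(wbio, ...) gives the leading fields a second, named view, so wbio_init() can memset() exactly that prefix without touching the struct bio that follows. Roughly what the macro expands to, demonstrated standalone — member names abbreviated:

/* Approximate expansion of struct_group(): an anonymous union of the bare
 * fields and a named struct covering the same bytes, so the prefix can be
 * memset() without clobbering what follows. */
#include <stdio.h>
#include <string.h>

struct write_bio {
	union {
		struct {
			int dev;
			unsigned split:1, bounce:1;
		};
		struct {
			int dev;
			unsigned split:1, bounce:1;
		} wbio;			/* named view of the same fields */
	};
	char bio[64];			/* stands in for the embedded struct bio */
};

int main(void)
{
	struct write_bio w;

	memset(w.bio, 'x', sizeof(w.bio));	/* pretend the bio is live */
	w.dev = 3;
	memset(&w.wbio, 0, sizeof(w.wbio));	/* wbio_init(): clear the prefix only */
	printf("dev=%d, bio untouched: %c\n", w.dev, w.bio[0]);	/* 0, 'x' */
	return 0;
}
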
-
-struct bch_write_op {
-       struct closure          cl;
-       struct bch_fs           *c;
-       void                    (*end_io)(struct bch_write_op *);
-       u64                     start_time;
-
-       unsigned                written; /* sectors */
-       u16                     flags;
-       s16                     error; /* dio write path expects it to hold -ERESTARTSYS... */
-
-       unsigned                compression_opt:8;
-       unsigned                csum_type:4;
-       unsigned                nr_replicas:4;
-       unsigned                nr_replicas_required:4;
-       unsigned                watermark:3;
-       unsigned                incompressible:1;
-       unsigned                stripe_waited:1;
-
-       struct bch_devs_list    devs_have;
-       u16                     target;
-       u16                     nonce;
-       struct bch_io_opts      opts;
-
-       u32                     subvol;
-       struct bpos             pos;
-       struct bversion         version;
-
-       /* For BCH_WRITE_DATA_ENCODED: */
-       struct bch_extent_crc_unpacked crc;
-
-       struct write_point_specifier write_point;
-
-       struct write_point      *wp;
-       struct list_head        wp_list;
-
-       struct disk_reservation res;
-
-       struct open_buckets     open_buckets;
-
-       u64                     new_i_size;
-       s64                     i_sectors_delta;
-
-       struct bch_devs_mask    failed;
-
-       struct keylist          insert_keys;
-       u64                     inline_keys[BKEY_EXTENT_U64s_MAX * 2];
-
-       /*
-        * Bitmask of devices that have had nocow writes issued to them since
-        * last flush:
-        */
-       struct bch_devs_mask    *devs_need_flush;
-
-       /* Must be last: */
-       struct bch_write_bio    wbio;
-};
-
-#endif /* _BCACHEFS_IO_TYPES_H */
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 80a612c0577f25d4228aabc3567d4e7aebe3de61..7d448136434bd8f2b63674298387e84c214753bb 100644 (file)
@@ -63,6 +63,7 @@ journal_seq_to_buf(struct journal *j, u64 seq)
 static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
 {
        unsigned i;
+
        for (i = 0; i < ARRAY_SIZE(p->list); i++)
                INIT_LIST_HEAD(&p->list[i]);
        INIT_LIST_HEAD(&p->flushed);
@@ -131,13 +132,21 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
        return stuck;
 }
 
-/* journal entry close/open: */
-
-void __bch2_journal_buf_put(struct journal *j)
+/*
+ * Final processing when the last reference of a journal buffer has been
+ * dropped. Drop the pin list reference acquired at journal entry open and write
+ * the buffer, if requested.
+ */
+void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
 
-       closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+       lockdep_assert_held(&j->lock);
+
+       if (__bch2_journal_pin_put(j, seq))
+               bch2_journal_reclaim_fast(j);
+       if (write)
+               closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
 }
 
 /*
@@ -203,13 +212,11 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
        buf->data->last_seq     = cpu_to_le64(buf->last_seq);
        BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
 
-       __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
-
        cancel_delayed_work(&j->write_work);
 
        bch2_journal_space_available(j);
 
-       bch2_journal_buf_put(j, old.idx);
+       __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq));
 }
 
 void bch2_journal_halt(struct journal *j)
@@ -354,11 +361,6 @@ static int journal_entry_open(struct journal *j)
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
-       if (j->res_get_blocked_start)
-               bch2_time_stats_update(j->blocked_time,
-                                      j->res_get_blocked_start);
-       j->res_get_blocked_start = 0;
-
        mod_delayed_work(c->io_complete_wq,
                         &j->write_work,
                         msecs_to_jiffies(c->opts.journal_flush_delay));
@@ -458,15 +460,12 @@ retry:
        __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
        ret = journal_entry_open(j);
 
-       if (ret == JOURNAL_ERR_max_in_flight)
+       if (ret == JOURNAL_ERR_max_in_flight) {
+               track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+                                  &j->max_in_flight_start, true);
                trace_and_count(c, journal_entry_full, c);
-unlock:
-       if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
-           !j->res_get_blocked_start) {
-               j->res_get_blocked_start = local_clock() ?: 1;
-               trace_and_count(c, journal_full, c);
        }
-
+unlock:
        can_discard = j->can_discard;
        spin_unlock(&j->lock);
 
@@ -514,42 +513,11 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
        int ret;
 
        closure_wait_event(&j->async_wait,
-                  (ret = __journal_res_get(j, res, flags)) !=
-                  -BCH_ERR_journal_res_get_blocked||
+                  (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
                   (flags & JOURNAL_RES_GET_NONBLOCK));
        return ret;
 }
 
-/* journal_preres: */
-
-static bool journal_preres_available(struct journal *j,
-                                    struct journal_preres *res,
-                                    unsigned new_u64s,
-                                    unsigned flags)
-{
-       bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
-
-       if (!ret && mutex_trylock(&j->reclaim_lock)) {
-               bch2_journal_reclaim(j);
-               mutex_unlock(&j->reclaim_lock);
-       }
-
-       return ret;
-}
-
-int __bch2_journal_preres_get(struct journal *j,
-                             struct journal_preres *res,
-                             unsigned new_u64s,
-                             unsigned flags)
-{
-       int ret;
-
-       closure_wait_event(&j->preres_wait,
-                  (ret = bch2_journal_error(j)) ||
-                  journal_preres_available(j, res, new_u64s, flags));
-       return ret;
-}
-
 /* journal_entry_res: */
 
 void bch2_journal_entry_res_resize(struct journal *j,
@@ -588,8 +556,13 @@ out:
 
 /**
  * bch2_journal_flush_seq_async - wait for a journal entry to be written
+ * @j:         journal object
+ * @seq:       seq to flush
+ * @parent:    closure object to wait with
+ * Returns:    1 if @seq has already been flushed, 0 if @seq is being flushed,
+ *             -EIO if @seq will never be flushed
  *
- * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
+ * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if
  * necessary
  */
 int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
@@ -829,12 +802,12 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                break;
 
                        ret = bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                               bch2_trans_mark_metadata_bucket(trans, ca,
                                                ob[nr_got]->bucket, BCH_DATA_journal,
                                                ca->mi.bucket_size));
                        if (ret) {
                                bch2_open_bucket_put(c, ob[nr_got]);
-                               bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret));
+                               bch_err_msg(c, ret, "marking new journal buckets");
                                break;
                        }
 
@@ -910,7 +883,7 @@ err_unblock:
        if (ret && !new_fs)
                for (i = 0; i < nr_got; i++)
                        bch2_trans_run(c,
-                               bch2_trans_mark_metadata_bucket(&trans, ca,
+                               bch2_trans_mark_metadata_bucket(trans, ca,
                                                bu[i], BCH_DATA_free, 0));
 err_free:
        if (!new_fs)
@@ -944,7 +917,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
                goto unlock;
 
        while (ja->nr < nr) {
-               struct disk_reservation disk_res = { 0, 0 };
+               struct disk_reservation disk_res = { 0, 0, 0 };
 
                /*
                 * note: journal buckets aren't really counted as _sectors_ used yet, so
@@ -1008,6 +981,25 @@ err:
        return ret;
 }
 
+int bch2_fs_journal_alloc(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_online_member(ca, c, i) {
+               if (ca->journal.nr)
+                       continue;
+
+               int ret = bch2_dev_journal_alloc(ca);
+               if (ret) {
+                       percpu_ref_put(&ca->io_ref);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
 /* startup/shutdown: */
 
 static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
@@ -1159,9 +1151,9 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 {
        struct journal_device *ja = &ca->journal;
        struct bch_sb_field_journal *journal_buckets =
-               bch2_sb_get_journal(sb);
+               bch2_sb_field_get(sb, journal);
        struct bch_sb_field_journal_v2 *journal_buckets_v2 =
-               bch2_sb_get_journal_v2(sb);
+               bch2_sb_field_get(sb, journal_v2);
        unsigned i, nr_bvecs;
 
        ja->nr = 0;
@@ -1260,6 +1252,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        union journal_res_state s;
        struct bch_dev *ca;
        unsigned long now = jiffies;
+       u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
        u64 seq;
        unsigned i;
 
@@ -1273,21 +1266,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        prt_printf(out, "dirty journal entries:\t%llu/%llu\n",  fifo_used(&j->pin), j->pin.size);
        prt_printf(out, "seq:\t\t\t%llu\n",                     journal_cur_seq(j));
        prt_printf(out, "seq_ondisk:\t\t%llu\n",                j->seq_ondisk);
-       prt_printf(out, "last_seq:\t\t%llu\n",          journal_last_seq(j));
+       prt_printf(out, "last_seq:\t\t%llu\n",                  journal_last_seq(j));
        prt_printf(out, "last_seq_ondisk:\t%llu\n",             j->last_seq_ondisk);
-       prt_printf(out, "flushed_seq_ondisk:\t%llu\n",  j->flushed_seq_ondisk);
-       prt_printf(out, "prereserved:\t\t%u/%u\n",              j->prereserved.reserved, j->prereserved.remaining);
-       prt_printf(out, "watermark:\t\t%s\n",           bch2_watermarks[j->watermark]);
-       prt_printf(out, "each entry reserved:\t%u\n",   j->entry_u64s_reserved);
+       prt_printf(out, "flushed_seq_ondisk:\t%llu\n",          j->flushed_seq_ondisk);
+       prt_printf(out, "watermark:\t\t%s\n",                   bch2_watermarks[j->watermark]);
+       prt_printf(out, "each entry reserved:\t%u\n",           j->entry_u64s_reserved);
        prt_printf(out, "nr flush writes:\t%llu\n",             j->nr_flush_writes);
-       prt_printf(out, "nr noflush writes:\t%llu\n",   j->nr_noflush_writes);
-       prt_printf(out, "nr direct reclaim:\t%llu\n",   j->nr_direct_reclaim);
+       prt_printf(out, "nr noflush writes:\t%llu\n",           j->nr_noflush_writes);
+       prt_printf(out, "average write size:\t");
+       prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
+       prt_newline(out);
+       prt_printf(out, "nr direct reclaim:\t%llu\n",           j->nr_direct_reclaim);
        prt_printf(out, "nr background reclaim:\t%llu\n",       j->nr_background_reclaim);
        prt_printf(out, "reclaim kicked:\t\t%u\n",              j->reclaim_kicked);
-       prt_printf(out, "reclaim runs in:\t%u ms\n",    time_after(j->next_reclaim, now)
+       prt_printf(out, "reclaim runs in:\t%u ms\n",            time_after(j->next_reclaim, now)
               ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
-       prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
-       prt_printf(out, "current entry error:\t%s\n",   bch2_journal_errors[j->cur_entry_error]);
+       prt_printf(out, "current entry sectors:\t%u\n",         j->cur_entry_sectors);
+       prt_printf(out, "current entry error:\t%s\n",           bch2_journal_errors[j->cur_entry_error]);
        prt_printf(out, "current entry:\t\t");
 
        switch (s.cur_entry_offset) {
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 008a2e25a4fac93df681233b679c9fc6eaca95d0..c85d01cf49484984d08d20a2159f84b2506f96a1 100644 (file)
@@ -252,9 +252,10 @@ static inline bool journal_entry_empty(struct jset *j)
        return true;
 }
 
-void __bch2_journal_buf_put(struct journal *);
-
-static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
+/*
+ * Drop a reference on a buffer index and return the resulting reservation
+ * state; callers check whether the count for that index has hit zero.
+ */
+static inline union journal_res_state journal_state_buf_put(struct journal *j, unsigned idx)
 {
        union journal_res_state s;
 
@@ -264,9 +265,30 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
                                    .buf2_count = idx == 2,
                                    .buf3_count = idx == 3,
                                    }).v, &j->reservations.counter);
+       return s;
+}
+
+void bch2_journal_buf_put_final(struct journal *, u64, bool);
+
+static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+       union journal_res_state s;
+
+       s = journal_state_buf_put(j, idx);
+       if (!journal_state_count(s, idx))
+               bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+}
+
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
+{
+       union journal_res_state s;
 
-       if (!journal_state_count(s, idx) && idx == s.unwritten_idx)
-               __bch2_journal_buf_put(j);
+       s = journal_state_buf_put(j, idx);
+       if (!journal_state_count(s, idx)) {
+               spin_lock(&j->lock);
+               bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+               spin_unlock(&j->lock);
+       }
 }
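
The put path is now split: journal_state_buf_put() is the lock-free refcount drop, and bch2_journal_buf_put_final() is the serialized final step, taken under j->lock only when the count actually reached zero; __bch2_journal_buf_put() serves callers like __journal_entry_close() that already hold the lock. A generic userspace sketch of that drop-then-lock-on-last pattern — the journal code additionally re-checks the unwritten index under the lock:

/* Drop-then-lock-on-last-reference pattern in isolation. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int refcount = 2;

static void put_final(void)
{
	/* the expensive, serialized part: runs under the lock, exactly once */
	printf("final put: write out the buffer\n");
}

/* caller already holds the lock (cf. __bch2_journal_buf_put()) */
static void __put(void)
{
	if (atomic_fetch_sub(&refcount, 1) == 1)
		put_final();
}

/* takes the lock only when this was the last ref (cf. bch2_journal_buf_put()) */
static void put(void)
{
	if (atomic_fetch_sub(&refcount, 1) == 1) {
		pthread_mutex_lock(&lock);
		put_final();
		pthread_mutex_unlock(&lock);
	}
}

int main(void)
{
	put();				/* drops 2 -> 1: nothing to do */
	pthread_mutex_lock(&lock);
	__put();			/* drops 1 -> 0: final processing */
	pthread_mutex_unlock(&lock);
	return 0;
}
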
 
 /*
@@ -286,7 +308,7 @@ static inline void bch2_journal_res_put(struct journal *j,
                                       BCH_JSET_ENTRY_btree_keys,
                                       0, 0, 0);
 
-       bch2_journal_buf_put(j, res->idx);
+       bch2_journal_buf_put(j, res->idx, res->seq);
 
        res->ref = 0;
 }
@@ -373,104 +395,6 @@ out:
        return 0;
 }
 
-/* journal_preres: */
-
-static inline void journal_set_watermark(struct journal *j)
-{
-       union journal_preres_state s = READ_ONCE(j->prereserved);
-       unsigned watermark = BCH_WATERMARK_stripe;
-
-       if (fifo_free(&j->pin) < j->pin.size / 4)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
-       if (fifo_free(&j->pin) < j->pin.size / 8)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
-       if (s.reserved > s.remaining)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc);
-       if (!s.remaining)
-               watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
-       if (watermark == j->watermark)
-               return;
-
-       swap(watermark, j->watermark);
-       if (watermark > j->watermark)
-               journal_wake(j);
-}
-
-static inline void bch2_journal_preres_put(struct journal *j,
-                                          struct journal_preres *res)
-{
-       union journal_preres_state s = { .reserved = res->u64s };
-
-       if (!res->u64s)
-               return;
-
-       s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
-       res->u64s = 0;
-
-       if (unlikely(s.waiting)) {
-               clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
-                         (unsigned long *) &j->prereserved.v);
-               closure_wake_up(&j->preres_wait);
-       }
-
-       if (s.reserved <= s.remaining && j->watermark)
-               journal_set_watermark(j);
-}
-
-int __bch2_journal_preres_get(struct journal *,
-                       struct journal_preres *, unsigned, unsigned);
-
-static inline int bch2_journal_preres_get_fast(struct journal *j,
-                                              struct journal_preres *res,
-                                              unsigned new_u64s,
-                                              unsigned flags,
-                                              bool set_waiting)
-{
-       int d = new_u64s - res->u64s;
-       union journal_preres_state old, new;
-       u64 v = atomic64_read(&j->prereserved.counter);
-       enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-       int ret;
-
-       do {
-               old.v = new.v = v;
-               ret = 0;
-
-               if (watermark == BCH_WATERMARK_reclaim ||
-                   new.reserved + d < new.remaining) {
-                       new.reserved += d;
-                       ret = 1;
-               } else if (set_waiting && !new.waiting)
-                       new.waiting = true;
-               else
-                       return 0;
-       } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
-                                      old.v, new.v)) != old.v);
-
-       if (ret)
-               res->u64s += d;
-       return ret;
-}
-
-static inline int bch2_journal_preres_get(struct journal *j,
-                                         struct journal_preres *res,
-                                         unsigned new_u64s,
-                                         unsigned flags)
-{
-       if (new_u64s <= res->u64s)
-               return 0;
-
-       if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
-               return 0;
-
-       if (flags & JOURNAL_RES_GET_NONBLOCK)
-               return -BCH_ERR_journal_preres_get_blocked;
-
-       return __bch2_journal_preres_get(j, res, new_u64s, flags);
-}
-
 /* journal_entry_res: */
 
 void bch2_journal_entry_res_resize(struct journal *,
@@ -512,6 +436,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
 int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
                                unsigned nr);
 int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_fs_journal_alloc(struct bch_fs *);
 
 void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
 
diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c
index f861ae2f176a8d0d87467250272b472cdc466c94..109c1157eba1d0c18aa510b94ac134356324e8af 100644 (file)
@@ -8,12 +8,12 @@
 #include "checksum.h"
 #include "disk_groups.h"
 #include "error.h"
-#include "io.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "replicas.h"
+#include "sb-clean.h"
 #include "trace.h"
 
 static struct nonce journal_nonce(const struct jset *jset)
@@ -140,7 +140,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
                if (!dup->csum_good)
                        goto replace;
 
-               fsck_err(c, "found duplicate but non identical journal entries (seq %llu)",
+               fsck_err(c, journal_entry_replicas_data_mismatch,
+                        "found duplicate but non identical journal entries (seq %llu)",
                         le64_to_cpu(j->seq));
                i = dup;
                goto found;
@@ -208,38 +209,47 @@ static void journal_entry_null_range(void *start, void *end)
 #define JOURNAL_ENTRY_BAD      7
 
 static void journal_entry_err_msg(struct printbuf *out,
+                                 u32 version,
                                  struct jset *jset,
                                  struct jset_entry *entry)
 {
-       prt_str(out, "invalid journal entry ");
-       if (entry)
-               prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]);
-
-       if (!jset)
-               prt_printf(out, "in superblock");
-       else if (!entry)
-               prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq));
-       else
-               prt_printf(out, "at offset %zi/%u seq %llu",
-                          (u64 *) entry - jset->_data,
-                          le32_to_cpu(jset->u64s),
-                          le64_to_cpu(jset->seq));
+       prt_str(out, "invalid journal entry, version=");
+       bch2_version_to_text(out, version);
+
+       if (entry) {
+               prt_str(out, " type=");
+               prt_str(out, bch2_jset_entry_types[entry->type]);
+       }
+
+       if (!jset) {
+               prt_printf(out, " in superblock");
+       } else {
+
+               prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));
+
+               if (entry)
+                       prt_printf(out, " offset=%zi/%u",
+                                  (u64 *) entry - jset->_data,
+                                  le32_to_cpu(jset->u64s));
+       }
+
        prt_str(out, ": ");
 }
 
-#define journal_entry_err(c, jset, entry, msg, ...)                    \
+#define journal_entry_err(c, version, jset, entry, _err, msg, ...)     \
 ({                                                                     \
-       struct printbuf buf = PRINTBUF;                                 \
+       struct printbuf _buf = PRINTBUF;                                \
                                                                        \
-       journal_entry_err_msg(&buf, jset, entry);                       \
-       prt_printf(&buf, msg, ##__VA_ARGS__);                           \
+       journal_entry_err_msg(&_buf, version, jset, entry);             \
+       prt_printf(&_buf, msg, ##__VA_ARGS__);                          \
                                                                        \
-       switch (write) {                                                \
+       switch (flags & BKEY_INVALID_WRITE) {                           \
        case READ:                                                      \
-               mustfix_fsck_err(c, "%s", buf.buf);                     \
+               mustfix_fsck_err(c, _err, "%s", _buf.buf);              \
                break;                                                  \
        case WRITE:                                                     \
-               bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\
+               bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);            \
+               bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
                if (bch2_fs_inconsistent(c)) {                          \
                        ret = -BCH_ERR_fsck_errors_not_fixed;           \
                        goto fsck_err;                                  \
@@ -247,12 +257,12 @@ static void journal_entry_err_msg(struct printbuf *out,
                break;                                                  \
        }                                                               \
                                                                        \
-       printbuf_exit(&buf);                                            \
+       printbuf_exit(&_buf);                                           \
        true;                                                           \
 })
 
-#define journal_entry_err_on(cond, c, jset, entry, msg, ...)           \
-       ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false)
+#define journal_entry_err_on(cond, ...)                                        \
+       ((cond) ? journal_entry_err(__VA_ARGS__) : false)
 
 #define FSCK_DELETED_KEY       5
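The rewritten journal_entry_err_msg() switches from positional prose to key=value fields and now always includes the on-disk version. A minimal standalone sketch of the resulting message shape (all field values here are invented, and the major.minor rendering is an assumption about bch2_version_to_text()):

    #include <stdio.h>

    /* Illustrative only: mimics the layout built by journal_entry_err_msg() */
    int main(void)
    {
            unsigned ver_major = 1, ver_minor = 3;  /* assumed rendering */
            unsigned long long seq = 8192;
            long offset = 16;
            unsigned u64s = 512;

            printf("invalid journal entry, version=%u.%u type=%s seq=%llu offset=%li/%u: %s\n",
                   ver_major, ver_minor, "btree_keys", seq, offset, u64s, "k->u64s 0");
            return 0;
    }

The journal_entry_err() macro wraps this message builder and, on the write path, additionally bumps the superblock error counter via bch2_sb_error_count() before deciding whether the error is fatal.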
 
@@ -261,13 +271,18 @@ static int journal_validate_key(struct bch_fs *c,
                                struct jset_entry *entry,
                                unsigned level, enum btree_id btree_id,
                                struct bkey_i *k,
-                               unsigned version, int big_endian, int write)
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
+       int write = flags & BKEY_INVALID_WRITE;
        void *next = vstruct_next(entry);
        struct printbuf buf = PRINTBUF;
        int ret = 0;
 
-       if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) {
+       if (journal_entry_err_on(!k->k.u64s,
+                                c, version, jset, entry,
+                                journal_entry_bkey_u64s_0,
+                                "k->u64s 0")) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
                return FSCK_DELETED_KEY;
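journal_validate_key() and every per-type validator now take a single enum bkey_invalid_flags bitmask in place of the old bare int write; the read/write direction is recovered by masking with BKEY_INVALID_WRITE, and call sites can OR in context bits such as BKEY_INVALID_JOURNAL (visible further down in journal_entry_btree_keys_validate()). A hedged sketch of that convention, with invented bit values:

    #include <stdio.h>

    /* Bit values are invented; the real enum lives in the bcachefs headers */
    enum bkey_invalid_flags {
            BKEY_INVALID_WRITE   = (1 << 0),
            BKEY_INVALID_JOURNAL = (1 << 1),
    };

    static void validate(enum bkey_invalid_flags flags)
    {
            int write = flags & BKEY_INVALID_WRITE;

            printf("direction=%s journal=%d\n",
                   write ? "write" : "read",
                   !!(flags & BKEY_INVALID_JOURNAL));
    }

    int main(void)
    {
            validate(BKEY_INVALID_JOURNAL);                       /* read side */
            validate(BKEY_INVALID_WRITE | BKEY_INVALID_JOURNAL);  /* write side */
            return 0;
    }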
@@ -275,7 +290,8 @@ static int journal_validate_key(struct bch_fs *c,
 
        if (journal_entry_err_on((void *) bkey_next(k) >
                                 (void *) vstruct_next(entry),
-                                c, jset, entry,
+                                c, version, jset, entry,
+                                journal_entry_bkey_past_end,
                                 "extends past end of journal entry")) {
                entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
                journal_entry_null_range(vstruct_next(entry), next);
@@ -283,7 +299,8 @@ static int journal_validate_key(struct bch_fs *c,
        }
 
        if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
-                                c, jset, entry,
+                                c, version, jset, entry,
+                                journal_entry_bkey_bad_format,
                                 "bad format %u", k->k.format)) {
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -298,11 +315,7 @@ static int journal_validate_key(struct bch_fs *c,
        if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                              __btree_node_type(level, btree_id), write, &buf)) {
                printbuf_reset(&buf);
-               prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:",
-                          bch2_jset_entry_types[entry->type],
-                          (u64 *) entry - jset->_data,
-                          le32_to_cpu(jset->u64s),
-                          le64_to_cpu(jset->seq));
+               journal_entry_err_msg(&buf, version, jset, entry);
                prt_newline(&buf);
                printbuf_indent_add(&buf, 2);
 
@@ -311,7 +324,8 @@ static int journal_validate_key(struct bch_fs *c,
                bch2_bkey_invalid(c, bkey_i_to_s_c(k),
                                  __btree_node_type(level, btree_id), write, &buf);
 
-               mustfix_fsck_err(c, "%s", buf.buf);
+               mustfix_fsck_err(c, journal_entry_bkey_invalid,
+                                "%s", buf.buf);
 
                le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
                memmove(k, bkey_next(k), next - (void *) bkey_next(k));
@@ -330,9 +344,10 @@ fsck_err:
 }
 
 static int journal_entry_btree_keys_validate(struct bch_fs *c,
-                                            struct jset *jset,
-                                            struct jset_entry *entry,
-                                            unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct bkey_i *k = entry->start;
 
@@ -341,7 +356,7 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
                                               entry->level,
                                               entry->btree_id,
                                               k, version, big_endian,
-                                              write|BKEY_INVALID_JOURNAL);
+                                              flags|BKEY_INVALID_JOURNAL);
                if (ret == FSCK_DELETED_KEY)
                        continue;
 
@@ -362,23 +377,25 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs
                        prt_newline(out);
                        prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
                }
-               prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+               prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
                bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
                first = false;
        }
 }
 
 static int journal_entry_btree_root_validate(struct bch_fs *c,
-                                            struct jset *jset,
-                                            struct jset_entry *entry,
-                                            unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct bkey_i *k = entry->start;
        int ret = 0;
 
        if (journal_entry_err_on(!entry->u64s ||
                                 le16_to_cpu(entry->u64s) != k->k.u64s,
-                                c, jset, entry,
+                                c, version, jset, entry,
+                                journal_entry_btree_root_bad_size,
                                 "invalid btree root journal entry: wrong number of keys")) {
                void *next = vstruct_next(entry);
                /*
@@ -392,7 +409,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c,
        }
 
        return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
-                                   version, big_endian, write);
+                                   version, big_endian, flags);
 fsck_err:
        return ret;
 }
@@ -404,9 +421,10 @@ static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
-                                           struct jset *jset,
-                                           struct jset_entry *entry,
-                                           unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        /* obsolete, don't care: */
        return 0;
@@ -418,14 +436,16 @@ static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_blacklist_validate(struct bch_fs *c,
-                                           struct jset *jset,
-                                           struct jset_entry *entry,
-                                           unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        int ret = 0;
 
        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
-                                c, jset, entry,
+                                c, version, jset, entry,
+                                journal_entry_blacklist_bad_size,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
@@ -443,15 +463,17 @@ static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
-                                              struct jset *jset,
-                                              struct jset_entry *entry,
-                                              unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_blacklist_v2 *bl_entry;
        int ret = 0;
 
        if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
-                                c, jset, entry,
+                                c, version, jset, entry,
+                                journal_entry_blacklist_v2_bad_size,
                "invalid journal seq blacklist entry: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                goto out;
@@ -461,7 +483,8 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
 
        if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
                                 le64_to_cpu(bl_entry->end),
-                                c, jset, entry,
+                                c, version, jset, entry,
+                                journal_entry_blacklist_v2_start_past_end,
                "invalid journal seq blacklist entry: start > end")) {
                journal_entry_null_range(entry, vstruct_next(entry));
        }
@@ -482,9 +505,10 @@ static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_
 }
 
 static int journal_entry_usage_validate(struct bch_fs *c,
-                                       struct jset *jset,
-                                       struct jset_entry *entry,
-                                       unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_usage *u =
                container_of(entry, struct jset_entry_usage, entry);
@@ -492,7 +516,8 @@ static int journal_entry_usage_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes < sizeof(*u),
-                                c, jset, entry,
+                                c, version, jset, entry,
+                                journal_entry_usage_bad_size,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -514,9 +539,10 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 static int journal_entry_data_usage_validate(struct bch_fs *c,
-                                       struct jset *jset,
-                                       struct jset_entry *entry,
-                                       unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_data_usage *u =
                container_of(entry, struct jset_entry_data_usage, entry);
@@ -525,7 +551,8 @@ static int journal_entry_data_usage_validate(struct bch_fs *c,
 
        if (journal_entry_err_on(bytes < sizeof(*u) ||
                                 bytes < sizeof(*u) + u->r.nr_devs,
-                                c, jset, entry,
+                                c, version, jset, entry,
+                                journal_entry_data_usage_bad_size,
                                 "invalid journal entry usage: bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -546,9 +573,10 @@ static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_clock_validate(struct bch_fs *c,
-                                       struct jset *jset,
-                                       struct jset_entry *entry,
-                                       unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_clock *clock =
                container_of(entry, struct jset_entry_clock, entry);
@@ -556,13 +584,17 @@ static int journal_entry_clock_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes != sizeof(*clock),
-                                c, jset, entry, "bad size")) {
+                                c, version, jset, entry,
+                                journal_entry_clock_bad_size,
+                                "bad size")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
 
        if (journal_entry_err_on(clock->rw > 1,
-                                c, jset, entry, "bad rw")) {
+                                c, version, jset, entry,
+                                journal_entry_clock_bad_rw,
+                                "bad rw")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
@@ -581,9 +613,10 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 static int journal_entry_dev_usage_validate(struct bch_fs *c,
-                                           struct jset *jset,
-                                           struct jset_entry *entry,
-                                           unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        struct jset_entry_dev_usage *u =
                container_of(entry, struct jset_entry_dev_usage, entry);
@@ -593,7 +626,9 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
        int ret = 0;
 
        if (journal_entry_err_on(bytes < expected,
-                                c, jset, entry, "bad size (%u < %u)",
+                                c, version, jset, entry,
+                                journal_entry_dev_usage_bad_size,
+                                "bad size (%u < %u)",
                                 bytes, expected)) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
@@ -602,13 +637,17 @@ static int journal_entry_dev_usage_validate(struct bch_fs *c,
        dev = le32_to_cpu(u->dev);
 
        if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
-                                c, jset, entry, "bad dev")) {
+                                c, version, jset, entry,
+                                journal_entry_dev_usage_bad_dev,
+                                "bad dev")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
 
        if (journal_entry_err_on(u->pad,
-                                c, jset, entry, "bad pad")) {
+                                c, version, jset, entry,
+                                journal_entry_dev_usage_bad_pad,
+                                "bad pad")) {
                journal_entry_null_range(entry, vstruct_next(entry));
                return ret;
        }
@@ -641,9 +680,10 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
 }
 
 static int journal_entry_log_validate(struct bch_fs *c,
-                                     struct jset *jset,
-                                     struct jset_entry *entry,
-                                     unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        return 0;
 }
@@ -658,9 +698,10 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 static int journal_entry_overwrite_validate(struct bch_fs *c,
-                                     struct jset *jset,
-                                     struct jset_entry *entry,
-                                     unsigned version, int big_endian, int write)
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        return journal_entry_btree_keys_validate(c, jset, entry,
                                version, big_endian, READ);
@@ -674,7 +715,8 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
 
 struct jset_entry_ops {
        int (*validate)(struct bch_fs *, struct jset *,
-                       struct jset_entry *, unsigned, int, int);
+                       struct jset_entry *, unsigned, int,
+                       enum bkey_invalid_flags);
        void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
 };
 
@@ -691,11 +733,12 @@ static const struct jset_entry_ops bch2_jset_entry_ops[] = {
 int bch2_journal_entry_validate(struct bch_fs *c,
                                struct jset *jset,
                                struct jset_entry *entry,
-                               unsigned version, int big_endian, int write)
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
 {
        return entry->type < BCH_JSET_ENTRY_NR
                ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
-                               version, big_endian, write)
+                               version, big_endian, flags)
                : 0;
 }
 
@@ -711,22 +754,23 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
-                                int write)
+                                enum bkey_invalid_flags flags)
 {
        struct jset_entry *entry;
+       unsigned version = le32_to_cpu(jset->version);
        int ret = 0;
 
        vstruct_for_each(jset, entry) {
-               if (journal_entry_err_on(vstruct_next(entry) >
-                                        vstruct_last(jset), c, jset, entry,
+               if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
+                               c, version, jset, entry,
+                               journal_entry_past_jset_end,
                                "journal entry extends past end of jset")) {
                        jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
                        break;
                }
 
                ret = bch2_journal_entry_validate(c, jset, entry,
-                                       le32_to_cpu(jset->version),
-                                       JSET_BIG_ENDIAN(jset), write);
+                                       version, JSET_BIG_ENDIAN(jset), flags);
                if (ret)
                        break;
        }
@@ -737,7 +781,7 @@ fsck_err:
 static int jset_validate(struct bch_fs *c,
                         struct bch_dev *ca,
                         struct jset *jset, u64 sector,
-                        int write)
+                        enum bkey_invalid_flags flags)
 {
        unsigned version;
        int ret = 0;
@@ -746,7 +790,9 @@ static int jset_validate(struct bch_fs *c,
                return JOURNAL_ENTRY_NONE;
 
        version = le32_to_cpu(jset->version);
-       if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
+       if (journal_entry_err_on(!bch2_version_compatible(version),
+                       c, version, jset, NULL,
+                       jset_unsupported_version,
                        "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
@@ -757,7 +803,8 @@ static int jset_validate(struct bch_fs *c,
        }
 
        if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
-                                c, jset, NULL,
+                       c, version, jset, NULL,
+                       jset_unknown_csum,
                        "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
@@ -767,7 +814,8 @@ static int jset_validate(struct bch_fs *c,
        /* last_seq is ignored when JSET_NO_FLUSH is true */
        if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
                                 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
-                                c, jset, NULL,
+                                c, version, jset, NULL,
+                                jset_last_seq_newer_than_seq,
                                 "invalid journal entry: last_seq > seq (%llu > %llu)",
                                 le64_to_cpu(jset->last_seq),
                                 le64_to_cpu(jset->seq))) {
@@ -775,7 +823,7 @@ static int jset_validate(struct bch_fs *c,
                return JOURNAL_ENTRY_BAD;
        }
 
-       ret = jset_validate_entries(c, jset, write);
+       ret = jset_validate_entries(c, jset, flags);
 fsck_err:
        return ret;
 }
@@ -788,14 +836,16 @@ static int jset_validate_early(struct bch_fs *c,
 {
        size_t bytes = vstruct_bytes(jset);
        unsigned version;
-       int write = READ;
+       enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
        int ret = 0;
 
        if (le64_to_cpu(jset->magic) != jset_magic(c))
                return JOURNAL_ENTRY_NONE;
 
        version = le32_to_cpu(jset->version);
-       if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
+       if (journal_entry_err_on(!bch2_version_compatible(version),
+                       c, version, jset, NULL,
+                       jset_unsupported_version,
                        "%s sector %llu seq %llu: unknown journal entry version %u.%u",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq),
@@ -810,7 +860,8 @@ static int jset_validate_early(struct bch_fs *c,
                return JOURNAL_ENTRY_REREAD;
 
        if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
-                                c, jset, NULL,
+                       c, version, jset, NULL,
+                       jset_past_bucket_end,
                        "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
                        ca ? ca->name : c->name,
                        sector, le64_to_cpu(jset->seq), bytes))
@@ -879,7 +930,7 @@ reread:
                        ret = submit_bio_wait(bio);
                        kfree(bio);
 
-                       if (bch2_dev_io_err_on(ret, ca,
+                       if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
                                               "journal read error: sector %llu",
                                               offset) ||
                            bch2_meta_read_fault("journal")) {
@@ -935,7 +986,8 @@ reread:
                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
                csum_good = jset_csum_good(c, j);
-               if (!csum_good)
+               if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
+                                      "journal checksum error"))
                        saw_bad = true;
 
                ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
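bch2_dev_io_err_on() now takes an error class, so the journal read path can attribute plain read failures and checksum failures to a device member separately (and the write path, further down, reports BCH_MEMBER_ERROR_write). A toy model of per-class counters under that assumption; the storage layout is illustrative, not the on-disk member format:

    #include <stdio.h>

    /* The three classes visible in this diff; storage is illustrative */
    enum bch_member_error_type {
            BCH_MEMBER_ERROR_read,
            BCH_MEMBER_ERROR_checksum,
            BCH_MEMBER_ERROR_write,
            BCH_MEMBER_ERROR_NR,
    };

    struct dev_errors {
            unsigned long count[BCH_MEMBER_ERROR_NR];
    };

    int main(void)
    {
            struct dev_errors e = { { 0 } };

            e.count[BCH_MEMBER_ERROR_checksum]++;   /* bad journal csum */
            printf("read=%lu csum=%lu write=%lu\n",
                   e.count[BCH_MEMBER_ERROR_read],
                   e.count[BCH_MEMBER_ERROR_checksum],
                   e.count[BCH_MEMBER_ERROR_write]);
            return 0;
    }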
@@ -1027,12 +1079,20 @@ found:
 
        if (ja->bucket_seq[ja->cur_idx] &&
            ja->sectors_free == ca->mi.bucket_size) {
+#if 0
+               /*
+                * Debug code for ZNS support, where we (probably) want to
+                * correlate where we stopped in the journal with the zone
+                * write points:
+                */
                bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
                bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
                for (i = 0; i < 3; i++) {
                        unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
+
                        bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
                }
+#endif
                ja->sectors_free = 0;
        }
 
@@ -1127,7 +1187,7 @@ int bch2_journal_read(struct bch_fs *c,
         * those entries will be blacklisted:
         */
        genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
-               int write = READ;
+               enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
 
                i = *_i;
 
@@ -1149,7 +1209,8 @@ int bch2_journal_read(struct bch_fs *c,
                }
 
                if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
-                                        c, &i->j, NULL,
+                                        c, le32_to_cpu(i->j.version), &i->j, NULL,
+                                        jset_last_seq_newer_than_seq,
                                         "invalid journal entry: last_seq > seq (%llu > %llu)",
                                         le64_to_cpu(i->j.last_seq),
                                         le64_to_cpu(i->j.seq)))
@@ -1166,7 +1227,8 @@ int bch2_journal_read(struct bch_fs *c,
        }
 
        if (!*last_seq) {
-               fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
+               fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
+                        "journal read done, but no entries found after dropping non-flushes");
                return 0;
        }
 
@@ -1192,6 +1254,7 @@ int bch2_journal_read(struct bch_fs *c,
 
                if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
                        fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
+                                   jset_seq_blacklisted,
                                    "found blacklisted journal entry %llu", seq);
                        i->ignore = true;
                }
@@ -1232,7 +1295,8 @@ int bch2_journal_read(struct bch_fs *c,
                        bch2_journal_ptrs_to_text(&buf2, c, i);
 
                        missing_end = seq - 1;
-                       fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+                       fsck_err(c, journal_entries_missing,
+                                "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
                                 "  prev at %s\n"
                                 "  next at %s",
                                 missing_start, missing_end,
@@ -1259,7 +1323,7 @@ int bch2_journal_read(struct bch_fs *c,
                        continue;
 
                for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
-                       struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+                       ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
 
                        if (!i->ptrs[ptr].csum_good)
                                bch_err_dev_offset(ca, i->ptrs[ptr].sector,
@@ -1281,18 +1345,15 @@ int bch2_journal_read(struct bch_fs *c,
 
                bch2_replicas_entry_sort(&replicas.e);
 
-               /*
-                * If we're mounting in degraded mode - if we didn't read all
-                * the devices - this is wrong:
-                */
-
                printbuf_reset(&buf);
                bch2_replicas_entry_to_text(&buf, &replicas.e);
 
                if (!degraded &&
-                   fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
-                               "superblock not marked as containing replicas %s",
-                               buf.buf)) {
+                   !bch2_replicas_marked(c, &replicas.e) &&
+                   (le64_to_cpu(i->j.seq) == *last_seq ||
+                    fsck_err(c, journal_entry_replicas_not_marked,
+                             "superblock not marked as containing replicas for journal entry %llu\n  %s",
+                             le64_to_cpu(i->j.seq), buf.buf))) {
                        ret = bch2_mark_replicas(c, &replicas.e);
                        if (ret)
                                goto err;
@@ -1361,16 +1422,21 @@ static void __journal_write_alloc(struct journal *j,
 }
 
 /**
- * journal_next_bucket - move on to the next journal bucket if possible
+ * journal_write_alloc - decide where to write next journal entry
+ *
+ * @j:         journal object
+ * @w:         journal buf (entry to be written)
+ *
+ * Returns: 0 on success, or -EROFS on failure
  */
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
-                              unsigned sectors)
+static int journal_write_alloc(struct journal *j, struct journal_buf *w)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_devs_mask devs;
        struct journal_device *ja;
        struct bch_dev *ca;
        struct dev_alloc_list devs_sorted;
+       unsigned sectors = vstruct_sectors(w->data, c->block_bits);
        unsigned target = c->opts.metadata_target ?:
                c->opts.foreground_target;
        unsigned i, replicas = 0, replicas_want =
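journal_write_alloc() now derives the entry size itself via vstruct_sectors(w->data, c->block_bits) instead of taking a sectors argument. Assuming vstruct_sectors() rounds the entry's byte size up to whole 512-byte sectors and then to the device block size (a guess consistent with its use here), the arithmetic looks like:

    #include <stdio.h>

    /* Hedged model of vstruct_sectors(): bytes -> 512-byte sectors,
     * rounded up to a multiple of 2^block_bits sectors */
    static unsigned long long sectors_for_bytes(unsigned long long bytes,
                                                unsigned block_bits)
    {
            unsigned long long sectors = (bytes + 511) / 512;
            unsigned long long mask = (1ULL << block_bits) - 1;

            return (sectors + mask) & ~mask;
    }

    int main(void)
    {
            /* a 5000-byte jset on a device with 4K blocks (block_bits = 3) */
            printf("%llu sectors\n", sectors_for_bytes(5000, 3));  /* prints 16 */
            return 0;
    }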
@@ -1459,6 +1525,7 @@ static void journal_write_done(struct closure *cl)
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *w = journal_last_unwritten_buf(j);
+       struct bch_replicas_padded replicas;
        union journal_res_state old, new;
        u64 v, seq;
        int err = 0;
@@ -1470,7 +1537,13 @@ static void journal_write_done(struct closure *cl)
        if (!w->devs_written.nr) {
                bch_err(c, "unable to write journal to sufficient devices");
                err = -EIO;
+       } else {
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+                                        w->devs_written);
+               if (bch2_mark_replicas(c, &replicas.e))
+                       err = -EIO;
        }
+
        if (err)
                bch2_fatal_error(c);
 
@@ -1519,11 +1592,15 @@ static void journal_write_done(struct closure *cl)
 
        bch2_journal_space_available(j);
 
+       track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+                          &j->max_in_flight_start, false);
+
        closure_wake_up(&w->wait);
        journal_wake(j);
 
        if (!journal_state_count(new, new.unwritten_idx) &&
            journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
+               spin_unlock(&j->lock);
                closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
        } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
                   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
@@ -1536,10 +1613,11 @@ static void journal_write_done(struct closure *cl)
                 * might want to be written now:
                 */
 
+               spin_unlock(&j->lock);
                mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
+       } else {
+               spin_unlock(&j->lock);
        }
-
-       spin_unlock(&j->lock);
 }
 
 static void journal_write_endio(struct bio *bio)
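journal_write_done() used to drop j->lock once at the end; it now unlocks in each branch, so the lock is released before closure_call() kicks off the next journal write, which takes the same lock again. A reduced userspace model of that handoff, with a pthread mutex standing in for the spinlock (the re-entry reasoning is an inference from the restructuring, not stated in the diff):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Stand-in for closure_call(... bch2_journal_write ...): takes the
     * lock itself, so it must be invoked with the lock already dropped */
    static void start_next_write(void)
    {
            pthread_mutex_lock(&lock);
            printf("next write started\n");
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            int more_work = 1;

            pthread_mutex_lock(&lock);
            /* ... update journal state under the lock ... */
            if (more_work) {
                    pthread_mutex_unlock(&lock);    /* drop before re-entry */
                    start_next_write();
            } else {
                    pthread_mutex_unlock(&lock);
            }
            return 0;
    }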
@@ -1549,7 +1627,8 @@ static void journal_write_endio(struct bio *bio)
        struct journal_buf *w = journal_last_unwritten_buf(j);
        unsigned long flags;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
+                              "error writing journal entry %llu: %s",
                               le64_to_cpu(w->data->seq),
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("journal")) {
@@ -1607,12 +1686,17 @@ static void do_journal_write(struct closure *cl)
        }
 
        continue_at(cl, journal_write_done, c->io_complete_wq);
-       return;
 }
 
-static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
+static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 {
-       struct jset_entry *i, *next, *prev = NULL;
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct jset_entry *start, *end, *i, *next, *prev = NULL;
+       struct jset *jset = w->data;
+       unsigned sectors, bytes, u64s;
+       bool validate_before_checksum = false;
+       unsigned long btree_roots_have = 0;
+       int ret;
 
        /*
         * Simple compaction, dropping empty jset_entries (from journal
@@ -1629,8 +1713,20 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
                if (!u64s)
                        continue;
 
-               if (i->type == BCH_JSET_ENTRY_btree_root)
+               /*
+                * New btree roots are set by journalling them; when the journal
+                * entry gets written we have to propagate them to
+                * c->btree_roots
+                *
+                * But, every journal entry we write has to contain all the
+                * btree roots (at least for now); so after we copy btree roots
+                * to c->btree_roots we have to get any missing btree roots and
+                * add them to this journal entry:
+                */
+               if (i->type == BCH_JSET_ENTRY_btree_root) {
                        bch2_journal_entry_to_btree_root(c, i);
+                       __set_bit(i->btree_id, &btree_roots_have);
+               }
 
                /* Can we merge with previous entry? */
                if (prev &&
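The compaction pass now records which btree roots were already journalled by setting their bit in btree_roots_have, so that bch2_btree_roots_to_journal_entries() (called later in write_prep) only appends the roots still missing. A standalone model of that bookkeeping, with an invented btree count:

    #include <stdio.h>

    int main(void)
    {
            unsigned long roots_have = 0;
            unsigned nr_btrees = 8;         /* invented */
            unsigned id;

            /* roots 2 and 5 were present in this journal entry already */
            roots_have |= 1UL << 2;
            roots_have |= 1UL << 5;

            for (id = 0; id < nr_btrees; id++)
                    if (!(roots_have & (1UL << id)))
                            printf("appending missing root for btree %u\n", id);
            return 0;
    }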
@@ -1654,85 +1750,10 @@ static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset
 
        prev = prev ? vstruct_next(prev) : jset->start;
        jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
-}
-
-void bch2_journal_write(struct closure *cl)
-{
-       struct journal *j = container_of(cl, struct journal, io);
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bch_dev *ca;
-       struct journal_buf *w = journal_last_unwritten_buf(j);
-       struct bch_replicas_padded replicas;
-       struct jset_entry *start, *end;
-       struct jset *jset;
-       struct bio *bio;
-       struct printbuf journal_debug_buf = PRINTBUF;
-       bool validate_before_checksum = false;
-       unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
-       int ret;
-
-       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
-
-       journal_buf_realloc(j, w);
-       jset = w->data;
-
-       j->write_start_time = local_clock();
-
-       spin_lock(&j->lock);
-
-       /*
-        * If the journal is in an error state - we did an emergency shutdown -
-        * we prefer to continue doing journal writes. We just mark them as
-        * noflush so they'll never be used, but they'll still be visible by the
-        * list_journal tool - this helps in debugging.
-        *
-        * There's a caveat: the first journal write after marking the
-        * superblock dirty must always be a flush write, because on startup
-        * from a clean shutdown we didn't necessarily read the journal and the
-        * new journal write might overwrite whatever was in the journal
-        * previously - we can't leave the journal without any flush writes in
-        * it.
-        *
-        * So if we're in an error state, and we're still starting up, we don't
-        * write anything at all.
-        */
-       if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
-           (bch2_journal_error(j) ||
-            w->noflush ||
-            (!w->must_flush &&
-             (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
-             test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
-               w->noflush = true;
-               SET_JSET_NO_FLUSH(jset, true);
-               jset->last_seq  = 0;
-               w->last_seq     = 0;
-
-               j->nr_noflush_writes++;
-       } else if (!bch2_journal_error(j)) {
-               j->last_flush_write = jiffies;
-               j->nr_flush_writes++;
-               clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
-       } else {
-               spin_unlock(&j->lock);
-               goto err;
-       }
-       spin_unlock(&j->lock);
-
-       /*
-        * New btree roots are set by journalling them; when the journal entry
-        * gets written we have to propagate them to c->btree_roots
-        *
-        * But, every journal entry we write has to contain all the btree roots
-        * (at least for now); so after we copy btree roots to c->btree_roots we
-        * have to get any missing btree roots and add them to this journal
-        * entry:
-        */
-
-       bch2_journal_entries_postprocess(c, jset);
 
        start = end = vstruct_last(jset);
 
-       end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+       end     = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
 
        bch2_journal_super_entries_add_common(c, &end,
                                le64_to_cpu(jset->seq));
@@ -1748,7 +1769,7 @@ void bch2_journal_write(struct closure *cl)
                bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
                                    vstruct_bytes(jset), w->sectors << 9,
                                    u64s, w->u64s_reserved, j->entry_u64s_reserved);
-               goto err;
+               return -EINVAL;
        }
 
        jset->magic             = cpu_to_le64(jset_magic(c));
@@ -1767,37 +1788,119 @@ void bch2_journal_write(struct closure *cl)
                validate_before_checksum = true;
 
        if (validate_before_checksum &&
-           jset_validate(c, NULL, jset, 0, WRITE))
-               goto err;
+           (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+               return ret;
 
        ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
                    jset->encrypted_start,
                    vstruct_end(jset) - (void *) jset->encrypted_start);
        if (bch2_fs_fatal_err_on(ret, c,
                        "error decrypting journal entry: %i", ret))
-               goto err;
+               return ret;
 
        jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
                                  journal_nonce(jset), jset);
 
        if (!validate_before_checksum &&
-           jset_validate(c, NULL, jset, 0, WRITE))
-               goto err;
+           (ret = jset_validate(c, NULL, jset, 0, WRITE)))
+               return ret;
 
        memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+       return 0;
+}
+
+static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       int error = bch2_journal_error(j);
+
+       /*
+        * If the journal is in an error state - we did an emergency shutdown -
+        * we prefer to continue doing journal writes. We just mark them as
+        * noflush so they'll never be used, but they'll still be visible to the
+        * list_journal tool - this helps in debugging.
+        *
+        * There's a caveat: the first journal write after marking the
+        * superblock dirty must always be a flush write, because on startup
+        * from a clean shutdown we didn't necessarily read the journal and the
+        * new journal write might overwrite whatever was in the journal
+        * previously - we can't leave the journal without any flush writes in
+        * it.
+        *
+        * So if we're in an error state, and we're still starting up, we don't
+        * write anything at all.
+        */
+       if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags))
+               return -EIO;
+
+       if (error ||
+           w->noflush ||
+           (!w->must_flush &&
+            (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+            test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+               w->noflush = true;
+               SET_JSET_NO_FLUSH(w->data, true);
+               w->data->last_seq       = 0;
+               w->last_seq             = 0;
+
+               j->nr_noflush_writes++;
+       } else {
+               j->last_flush_write = jiffies;
+               j->nr_flush_writes++;
+               clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+       }
+
+       return 0;
+}
+
+void bch2_journal_write(struct closure *cl)
+{
+       struct journal *j = container_of(cl, struct journal, io);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       struct journal_buf *w = journal_last_unwritten_buf(j);
+       struct bch_replicas_padded replicas;
+       struct bio *bio;
+       struct printbuf journal_debug_buf = PRINTBUF;
+       unsigned i, nr_rw_members = 0;
+       int ret;
+
+       BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+
+       j->write_start_time = local_clock();
 
-retry_alloc:
        spin_lock(&j->lock);
-       ret = journal_write_alloc(j, w, sectors);
+       ret = bch2_journal_write_pick_flush(j, w);
+       spin_unlock(&j->lock);
+       if (ret)
+               goto err;
+
+       journal_buf_realloc(j, w);
+
+       ret = bch2_journal_write_prep(j, w);
+       if (ret)
+               goto err;
+
+       j->entry_bytes_written += vstruct_bytes(w->data);
+
+       while (1) {
+               spin_lock(&j->lock);
+               ret = journal_write_alloc(j, w);
+               if (!ret || !j->can_discard)
+                       break;
 
-       if (ret && j->can_discard) {
                spin_unlock(&j->lock);
                bch2_journal_do_discards(j);
-               goto retry_alloc;
        }
 
-       if (ret)
+       if (ret) {
                __bch2_journal_debug_to_text(&journal_debug_buf, j);
+               spin_unlock(&j->lock);
+               bch_err(c, "Unable to allocate journal write:\n%s",
+                       journal_debug_buf.buf);
+               printbuf_exit(&journal_debug_buf);
+               goto err;
+       }
 
        /*
         * write is allocated, no longer need to account for it in
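The monolithic bch2_journal_write() is split into bch2_journal_write_pick_flush() (run under j->lock) and bch2_journal_write_prep() (run without it), followed by the allocate-or-discard loop above. A hedged outline of the resulting control flow, with all bodies stubbed:

    #include <stdio.h>

    static int pick_flush(void)  { return 0; }  /* -EIO when shut down */
    static int write_prep(void)  { return 0; }  /* compact, checksum, pad */
    static int write_alloc(void) { return 0; }  /* 0, or -EROFS when full */

    int main(void)
    {
            int ret;

            if ((ret = pick_flush()) ||
                (ret = write_prep()) ||
                (ret = write_alloc()))
                    fprintf(stderr, "journal write failed: %d\n", ret);
            else
                    printf("journal write submitted\n");
            return ret ? 1 : 0;
    }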
@@ -1812,13 +1915,6 @@ retry_alloc:
        bch2_journal_space_available(j);
        spin_unlock(&j->lock);
 
-       if (ret) {
-               bch_err(c, "Unable to allocate journal write:\n%s",
-                       journal_debug_buf.buf);
-               printbuf_exit(&journal_debug_buf);
-               goto err;
-       }
-
        w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
 
        if (c->opts.nochanges)
@@ -1840,7 +1936,7 @@ retry_alloc:
        if (ret)
                goto err;
 
-       if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
+       if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
                for_each_rw_member(ca, c, i) {
                        percpu_ref_get(&ca->io_ref);
 
index 8801e98104bd8aaa9671b41f89ee4e904169388d..a88d097b13f1294a5ca1f3c30ebba5282ef56da3 100644 (file)
@@ -50,7 +50,8 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
                jset_entry_for_each_key(entry, k)
 
 int bch2_journal_entry_validate(struct bch_fs *, struct jset *,
-                               struct jset_entry *, unsigned, int, int);
+                               struct jset_entry *, unsigned, int,
+                               enum bkey_invalid_flags);
 void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
                                struct jset_entry *);
 
index 8de83e10375187803730214e3bee4c3cd425256c..8fa05bedb7dff8f36084fafc37106fe9915579a0 100644 (file)
@@ -3,13 +3,14 @@
 #include "bcachefs.h"
 #include "btree_key_cache.h"
 #include "btree_update.h"
+#include "buckets.h"
 #include "errcode.h"
 #include "error.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "replicas.h"
-#include "super.h"
+#include "sb-members.h"
 #include "trace.h"
 
 #include <linux/kthread.h>
@@ -49,16 +50,25 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
        return available;
 }
 
-static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
+static inline void journal_set_watermark(struct journal *j)
 {
-       union journal_preres_state old, new;
-       u64 v = atomic64_read(&j->prereserved.counter);
-
-       do {
-               old.v = new.v = v;
-               new.remaining = u64s_remaining;
-       } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
-                                      old.v, new.v)) != old.v);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       bool low_on_space = j->space[journal_space_clean].total * 4 <=
+               j->space[journal_space_total].total;
+       bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
+       unsigned watermark = low_on_space || low_on_pin
+               ? BCH_WATERMARK_reclaim
+               : BCH_WATERMARK_stripe;
+
+       if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
+                              &j->low_on_space_start, low_on_space) ||
+           track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
+                              &j->low_on_pin_start, low_on_pin))
+               trace_and_count(c, journal_full, c);
+
+       swap(watermark, j->watermark);
+       if (watermark > j->watermark)
+               journal_wake(j);
 }
 
 static struct journal_space
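journal_set_watermark() replaces the old prereserved bookkeeping with two simple thresholds: the journal counts as low on space when clean space is at most a quarter of the total, and low on pins when the pin FIFO is more than three-quarters full; either condition raises the watermark to BCH_WATERMARK_reclaim. A standalone check of those thresholds, with invented figures:

    #include <stdbool.h>
    #include <stdio.h>

    int main(void)
    {
            /* invented example figures */
            unsigned clean_total = 200, journal_total = 1000;   /* sectors */
            unsigned pin_free = 100, pin_size = 512;            /* fifo slots */

            bool low_on_space = clean_total * 4 <= journal_total; /* <= 25% clean */
            bool low_on_pin   = pin_free < pin_size / 4;          /* > 75% used */

            printf("watermark = %s\n",
                   low_on_space || low_on_pin ? "reclaim" : "stripe");
            return 0;
    }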
@@ -161,7 +171,6 @@ void bch2_journal_space_available(struct journal *j)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        unsigned clean, clean_ondisk, total;
-       s64 u64s_remaining = 0;
        unsigned max_entry_size  = min(j->buf[0].buf_size >> 9,
                                       j->buf[1].buf_size >> 9);
        unsigned i, nr_online = 0, nr_devs_want;
@@ -221,16 +230,10 @@ void bch2_journal_space_available(struct journal *j)
        else
                clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
 
-       u64s_remaining  = (u64) clean << 6;
-       u64s_remaining -= (u64) total << 3;
-       u64s_remaining = max(0LL, u64s_remaining);
-       u64s_remaining /= 4;
-       u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
+       journal_set_watermark(j);
 out:
        j->cur_entry_sectors    = !ret ? j->space[journal_space_discarded].next_entry : 0;
        j->cur_entry_error      = ret;
-       journal_set_remaining(j, u64s_remaining);
-       journal_set_watermark(j);
 
        if (!ret)
                journal_wake(j);
@@ -289,9 +292,8 @@ void bch2_journal_do_discards(struct journal *j)
  * entry, holding it open to ensure it gets replayed during recovery:
  */
 
-static void bch2_journal_reclaim_fast(struct journal *j)
+void bch2_journal_reclaim_fast(struct journal *j)
 {
-       struct journal_entry_pin_list temp;
        bool popped = false;
 
        lockdep_assert_held(&j->lock);
@@ -302,7 +304,7 @@ static void bch2_journal_reclaim_fast(struct journal *j)
         */
        while (!fifo_empty(&j->pin) &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
-               fifo_pop(&j->pin, temp);
+               j->pin.front++;
                popped = true;
        }
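bch2_journal_reclaim_fast() no longer copies each drained element out with fifo_pop(); since the pin lists being skipped are empty, advancing j->pin.front is enough. A toy ring-buffer version of that fast path (layout invented, mirroring the front/mask indexing used by the kernel fifo):

    #include <stdio.h>

    struct fifo { unsigned front, back, mask; int count[8]; };

    int main(void)
    {
            struct fifo f = { .front = 0, .back = 3, .mask = 7,
                              .count = { 0, 0, 5 } };  /* two drained pin lists */

            /* "pop" without copying: step front past zero-refcount entries */
            while (f.front != f.back && !f.count[f.front & f.mask])
                    f.front++;

            printf("front=%u (stopped at first pinned entry)\n", f.front);
            return 0;
    }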
 
@@ -310,19 +312,16 @@ static void bch2_journal_reclaim_fast(struct journal *j)
                bch2_journal_space_available(j);
 }
 
-void __bch2_journal_pin_put(struct journal *j, u64 seq)
+bool __bch2_journal_pin_put(struct journal *j, u64 seq)
 {
        struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
 
-       if (atomic_dec_and_test(&pin_list->count))
-               bch2_journal_reclaim_fast(j);
+       return atomic_dec_and_test(&pin_list->count);
 }
 
 void bch2_journal_pin_put(struct journal *j, u64 seq)
 {
-       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
-
-       if (atomic_dec_and_test(&pin_list->count)) {
+       if (__bch2_journal_pin_put(j, seq)) {
                spin_lock(&j->lock);
                bch2_journal_reclaim_fast(j);
                spin_unlock(&j->lock);
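__bch2_journal_pin_put() is reduced to the lock-free half of the operation, an atomic dec-and-test that reports whether the count hit zero, while bch2_journal_pin_put() takes j->lock only on that zero edge. A hedged sketch of the same split using C11 atomics in place of the kernel's atomic_t:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_int pin_count = 2;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static bool __pin_put(void)   /* lockless: true iff count hit zero */
    {
            return atomic_fetch_sub(&pin_count, 1) == 1;
    }

    static void pin_put(void)
    {
            if (__pin_put()) {    /* take the lock only on the 0 edge */
                    pthread_mutex_lock(&lock);
                    printf("running reclaim_fast\n");
                    pthread_mutex_unlock(&lock);
            }
    }

    int main(void)
    {
            pin_put();    /* 2 -> 1: nothing to do */
            pin_put();    /* 1 -> 0: reclaim under the lock */
            return 0;
    }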
@@ -345,7 +344,7 @@ static inline bool __journal_pin_drop(struct journal *j,
        list_del_init(&pin->list);
 
        /*
-        * Unpinning a journal entry make make journal_next_bucket() succeed, if
+        * Unpinning a journal entry may make journal_next_bucket() succeed, if
         * writing a new last_seq will now make another bucket available:
         */
        return atomic_dec_and_test(&pin_list->count) &&
@@ -372,15 +371,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
                return JOURNAL_PIN_other;
 }
 
-void bch2_journal_pin_set(struct journal *j, u64 seq,
+static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
                          struct journal_entry_pin *pin,
-                         journal_pin_flush_fn flush_fn)
+                         journal_pin_flush_fn flush_fn,
+                         enum journal_pin_type type)
+{
+       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+       /*
+        * flush_fn is how we identify journal pins in debugfs, so must always
+        * exist, even if it doesn't do anything:
+        */
+       BUG_ON(!flush_fn);
+
+       atomic_inc(&pin_list->count);
+       pin->seq        = seq;
+       pin->flush      = flush_fn;
+       list_add(&pin->list, &pin_list->list[type]);
+}
+
+void bch2_journal_pin_copy(struct journal *j,
+                          struct journal_entry_pin *dst,
+                          struct journal_entry_pin *src,
+                          journal_pin_flush_fn flush_fn)
 {
-       struct journal_entry_pin_list *pin_list;
        bool reclaim;
 
        spin_lock(&j->lock);
 
+       u64 seq = READ_ONCE(src->seq);
+
        if (seq < journal_last_seq(j)) {
                /*
                 * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
@@ -392,18 +412,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
                return;
        }
 
-       pin_list = journal_seq_pin(j, seq);
+       reclaim = __journal_pin_drop(j, dst);
 
-       reclaim = __journal_pin_drop(j, pin);
+       bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
 
-       atomic_inc(&pin_list->count);
-       pin->seq        = seq;
-       pin->flush      = flush_fn;
+       if (reclaim)
+               bch2_journal_reclaim_fast(j);
+       spin_unlock(&j->lock);
 
-       if (flush_fn)
-               list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
-       else
-               list_add(&pin->list, &pin_list->flushed);
+       /*
+        * If the journal is currently full, we might want to call flush_fn
+        * immediately:
+        */
+       journal_wake(j);
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+                         struct journal_entry_pin *pin,
+                         journal_pin_flush_fn flush_fn)
+{
+       bool reclaim;
+
+       spin_lock(&j->lock);
+
+       BUG_ON(seq < journal_last_seq(j));
+
+       reclaim = __journal_pin_drop(j, pin);
+
+       bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
 
        if (reclaim)
                bch2_journal_reclaim_fast(j);
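The out-of-line bch2_journal_pin_copy() snapshots src->seq once with READ_ONCE() and bails out if that sequence already fell behind journal_last_seq(), guarding against a concurrent bch2_journal_pin_drop() on the source pin. A reduced model of the check (sequence numbers invented; READ_ONCE is a simplified form of the kernel's macro for scalars):

    #include <stdio.h>

    #define READ_ONCE(x) (*(volatile typeof(x) *)&(x))

    static unsigned long long journal_last_seq = 100;

    static void pin_copy(unsigned long long *src_seq)
    {
            /* snapshot once: src may be dropped concurrently */
            unsigned long long seq = READ_ONCE(*src_seq);

            if (seq < journal_last_seq) {
                    printf("seq %llu already reclaimed, nothing to pin\n", seq);
                    return;
            }
            printf("pinning seq %llu\n", seq);
    }

    int main(void)
    {
            unsigned long long stale = 42, live = 123;

            pin_copy(&stale);
            pin_copy(&live);
            return 0;
    }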
@@ -418,6 +454,8 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
 
 /**
  * bch2_journal_pin_flush: ensure journal pin callback is no longer running
+ * @j:         journal object
+ * @pin:       pin to flush
  */
 void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
 {
@@ -556,11 +594,6 @@ static u64 journal_seq_to_flush(struct journal *j)
                /* Try to keep the journal at most half full: */
                nr_buckets = ja->nr / 2;
 
-               /* And include pre-reservations: */
-               nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
-                                          (ca->mi.bucket_size << 6) -
-                                          journal_entry_overhead(j));
-
                nr_buckets = min(nr_buckets, ja->nr);
 
                bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
@@ -578,7 +611,11 @@ static u64 journal_seq_to_flush(struct journal *j)
 }
 
 /**
- * bch2_journal_reclaim - free up journal buckets
+ * __bch2_journal_reclaim - free up journal buckets
+ * @j:         journal object
+ * @direct:    direct or background reclaim?
+ * @kicked:    requested to run since we last ran?
+ * Returns:    0 on success, or -EIO if the journal has been shut down
  *
  * Background journal reclaim writes out btree nodes. It should be run
  * early enough so that we never completely run out of journal buckets.
@@ -635,10 +672,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                               msecs_to_jiffies(c->opts.journal_reclaim_delay)))
                        min_nr = 1;
 
-               if (j->prereserved.reserved * 4 > j->prereserved.remaining)
-                       min_nr = 1;
-
-               if (fifo_free(&j->pin) <= 32)
+               if (j->watermark != BCH_WATERMARK_stripe)
                        min_nr = 1;
 
                if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
@@ -649,8 +683,6 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
                trace_and_count(c, journal_reclaim_start, c,
                                direct, kicked,
                                min_nr, min_key_cache,
-                               j->prereserved.reserved,
-                               j->prereserved.remaining,
                                atomic_read(&c->btree_cache.dirty),
                                c->btree_cache.used,
                                atomic_long_read(&c->btree_key_cache.nr_dirty),
@@ -757,7 +789,7 @@ int bch2_journal_reclaim_start(struct journal *j)
                           "bch-reclaim/%s", c->name);
        ret = PTR_ERR_OR_ZERO(p);
        if (ret) {
-               bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating journal reclaim thread");
                return ret;
        }
 
@@ -802,6 +834,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
 bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
 {
+       /* TODO: instrument this with time_stats */
        bool did_work = false;
 
        if (!test_bit(JOURNAL_STARTED, &j->flags))
index 0fd1af120db551746fc5cac54000c8616914a4f3..7b15d682a0f51d28c47f7d881edb1b08ca24d10c 100644 (file)
@@ -31,7 +31,8 @@ journal_seq_pin(struct journal *j, u64 seq)
        return &j->pin.data[seq & j->pin.mask];
 }
 
-void __bch2_journal_pin_put(struct journal *, u64);
+void bch2_journal_reclaim_fast(struct journal *);
+bool __bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_put(struct journal *, u64);
 void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
 
@@ -46,17 +47,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
                bch2_journal_pin_set(j, seq, pin, flush_fn);
 }
 
-static inline void bch2_journal_pin_copy(struct journal *j,
-                                        struct journal_entry_pin *dst,
-                                        struct journal_entry_pin *src,
-                                        journal_pin_flush_fn flush_fn)
-{
-       /* Guard against racing with journal_pin_drop(src): */
-       u64 seq = READ_ONCE(src->seq);
-
-       if (seq)
-               bch2_journal_pin_add(j, seq, dst, flush_fn);
-}
+void bch2_journal_pin_copy(struct journal *,
+                          struct journal_entry_pin *,
+                          struct journal_entry_pin *,
+                          journal_pin_flush_fn);
 
 static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
                                           struct journal_entry_pin *pin,
index cc41bff86d6bbc049be49f3f05e751af9dd528d5..ae4fb8c3a2bc26fe937c5bc88f8b5b78143e91b0 100644 (file)
@@ -21,7 +21,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb,
                                    struct printbuf *err)
 {
        struct bch_sb_field_journal *journal = field_to_type(f, journal);
-       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+       struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
        int ret = -BCH_ERR_invalid_sb_journal;
        unsigned nr;
        unsigned i;
@@ -45,15 +45,15 @@ static int bch2_sb_journal_validate(struct bch_sb *sb,
                goto err;
        }
 
-       if (b[0] < le16_to_cpu(m->first_bucket)) {
+       if (b[0] < le16_to_cpu(m.first_bucket)) {
                prt_printf(err, "journal bucket %llu before first bucket %u",
-                      b[0], le16_to_cpu(m->first_bucket));
+                      b[0], le16_to_cpu(m.first_bucket));
                goto err;
        }
 
-       if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+       if (b[nr - 1] >= le64_to_cpu(m.nbuckets)) {
                prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
-                      b[nr - 1], le64_to_cpu(m->nbuckets));
+                      b[nr - 1], le64_to_cpu(m.nbuckets));
                goto err;
        }
 
@@ -104,7 +104,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
                                    struct printbuf *err)
 {
        struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
-       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+       struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
        int ret = -BCH_ERR_invalid_sb_journal;
        unsigned nr;
        unsigned i;
@@ -130,15 +130,15 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
                goto err;
        }
 
-       if (b[0].start < le16_to_cpu(m->first_bucket)) {
+       if (b[0].start < le16_to_cpu(m.first_bucket)) {
                prt_printf(err, "journal bucket %llu before first bucket %u",
-                      b[0].start, le16_to_cpu(m->first_bucket));
+                      b[0].start, le16_to_cpu(m.first_bucket));
                goto err;
        }
 
-       if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) {
+       if (b[nr - 1].end > le64_to_cpu(m.nbuckets)) {
                prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)",
-                      b[nr - 1].end - 1, le64_to_cpu(m->nbuckets));
+                      b[nr - 1].end - 1, le64_to_cpu(m.nbuckets));
                goto err;
        }
 
@@ -194,7 +194,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca,
                if (buckets[i] + 1 != buckets[i + 1])
                        nr_compacted++;
 
-       j = bch2_sb_resize_journal_v2(&ca->disk_sb,
+       j = bch2_sb_field_resize(&ca->disk_sb, journal_v2,
                         (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64));
        if (!j)
                return -BCH_ERR_ENOSPC_sb_journal;
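
Two API migrations run through this file: superblock member fields are read through the by-value accessor bch2_sb_member_get() instead of a pointer into the members array, and the per-type resize wrapper gives way to the generic bch2_sb_field_resize() macro. Before/after for the first, taken from the validate hunks above:

	/* before: a pointer into the superblock's member array */
	struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
	if (b[0] < le16_to_cpu(m->first_bucket)) ...

	/* after: a copy of the entry, independent of the on-disk array layout */
	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
	if (b[0] < le16_to_cpu(m.first_bucket)) ...
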
index d6b9f2cdf8e7df2664abd4f30df9d67559e0d926..f9d9aa95bf3a64640d3d1e6012fc319ca7aad05e 100644 (file)
@@ -58,8 +58,8 @@ blacklist_entry_try_merge(struct bch_fs *c,
                        &bl->start[i + 1],
                        sizeof(bl->start[0]) * (nr - i));
 
-               bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
-                                                       sb_blacklist_u64s(nr));
+               bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+                                         sb_blacklist_u64s(nr));
                BUG_ON(!bl);
        }
 
@@ -79,7 +79,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
        int ret = 0;
 
        mutex_lock(&c->sb_lock);
-       bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+       bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
        nr = blacklist_nr_entries(bl);
 
        for (i = 0; i < nr; i++) {
@@ -100,8 +100,8 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
                }
        }
 
-       bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
-                                       sb_blacklist_u64s(nr + 1));
+       bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
+                                 sb_blacklist_u64s(nr + 1));
        if (!bl) {
                ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist;
                goto out;
@@ -158,7 +158,7 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
 int bch2_blacklist_table_initialize(struct bch_fs *c)
 {
        struct bch_sb_field_journal_seq_blacklist *bl =
-               bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+               bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
        struct journal_seq_blacklist_table *t;
        unsigned i, nr = blacklist_nr_entries(bl);
 
@@ -250,20 +250,18 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
        struct journal_seq_blacklist_table *t;
        struct bch_sb_field_journal_seq_blacklist *bl;
        struct journal_seq_blacklist_entry *src, *dst;
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        unsigned i, nr, new_nr;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < BTREE_ID_NR; i++) {
                struct btree_iter iter;
                struct btree *b;
 
-               bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
+               bch2_trans_node_iter_init(trans, &iter, i, POS_MIN,
                                          0, 0, BTREE_ITER_PREFETCH);
 retry:
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                b = bch2_btree_iter_peek_node(&iter);
 
@@ -275,15 +273,15 @@ retry:
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        if (ret)
                return;
 
        mutex_lock(&c->sb_lock);
-       bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
+       bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
        if (!bl)
                goto out;
 
@@ -308,7 +306,7 @@ retry:
        bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
 
        if (new_nr != nr) {
-               bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
+               bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
                                new_nr ? sb_blacklist_u64s(new_nr) : 0);
                BUG_ON(new_nr && !bl);
 
index 42504e16acb6ccf261a6699b6d468cba7d26a776..2427cce64fed93388214c3de8b6446875eaf01b6 100644 (file)
@@ -76,14 +76,6 @@ struct journal_res {
        u64                     seq;
 };
 
-/*
- * For reserving space in the journal prior to getting a reservation on a
- * particular journal entry:
- */
-struct journal_preres {
-       unsigned                u64s;
-};
-
 union journal_res_state {
        struct {
                atomic64_t      counter;
@@ -104,22 +96,6 @@ union journal_res_state {
        };
 };
 
-union journal_preres_state {
-       struct {
-               atomic64_t      counter;
-       };
-
-       struct {
-               u64             v;
-       };
-
-       struct {
-               u64             waiting:1,
-                               reserved:31,
-                               remaining:32;
-       };
-};
-
 /* bytes: */
 #define JOURNAL_ENTRY_SIZE_MIN         (64U << 10) /* 64k */
 #define JOURNAL_ENTRY_SIZE_MAX         (4U  << 20) /* 4M */
@@ -180,8 +156,6 @@ struct journal {
        union journal_res_state reservations;
        enum bch_watermark      watermark;
 
-       union journal_preres_state prereserved;
-
        } __aligned(SMP_CACHE_BYTES);
 
        unsigned long           flags;
@@ -288,15 +262,18 @@ struct journal {
 
        unsigned long           last_flush_write;
 
-       u64                     res_get_blocked_start;
        u64                     write_start_time;
 
        u64                     nr_flush_writes;
        u64                     nr_noflush_writes;
+       u64                     entry_bytes_written;
+
+       u64                     low_on_space_start;
+       u64                     low_on_pin_start;
+       u64                     max_in_flight_start;
 
        struct bch2_time_stats  *flush_write_time;
        struct bch2_time_stats  *noflush_write_time;
-       struct bch2_time_stats  *blocked_time;
        struct bch2_time_stats  *flush_seq_time;
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
index 3e8b8f2f38a31fbc17f4f341442754792fdd9ecc..e6d081c0592c81bb1db26db6c3d08aafce8bc7d9 100644 (file)
 #include "recovery.h"
 
 /* KEY_TYPE_lru is obsolete: */
-int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_lru_invalid(struct bch_fs *c, struct bkey_s_c k,
                     enum bkey_invalid_flags flags,
                     struct printbuf *err)
 {
-       if (!lru_pos_time(k.k->p)) {
-               prt_printf(err, "lru entry at time=0");
-               return -BCH_ERR_invalid_bkey;
-
-       }
+       int ret = 0;
 
-       return 0;
+       bkey_fsck_err_on(!lru_pos_time(k.k->p), c, err,
+                        lru_entry_at_time_0,
+                        "lru entry at time=0");
+fsck_err:
+       return ret;
 }
 
 void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
@@ -95,6 +95,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
        int ret;
 
        if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c,
+                       lru_entry_to_invalid_bucket,
                        "lru key points to nonexistent device:bucket %llu:%llu",
                        alloc_pos.inode, alloc_pos.offset))
                return bch2_btree_delete_at(trans, lru_iter, 0);
@@ -125,7 +126,8 @@ static int bch2_check_lru_key(struct btree_trans *trans,
                }
 
                if (c->opts.reconstruct_alloc ||
-                   fsck_err(c, "incorrect lru entry: lru %s time %llu\n"
+                   fsck_err(c, lru_entry_bad,
+                            "incorrect lru entry: lru %s time %llu\n"
                             "  %s\n"
                             "  for %s",
                             bch2_lru_types[type],
@@ -151,10 +153,10 @@ int bch2_check_lrus(struct bch_fs *c)
        int ret = 0;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                                BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-                       bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos)));
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+                       bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
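
The bch2_lru_invalid() conversion above shows the validation pattern this release appears to move key checks to: errors go through bkey_fsck_err_on(), which takes a named error identifier (lru_entry_at_time_0 here) and bails out to a fsck_err label. The general shape, sketched with hypothetical names:

	int bch2_foo_invalid(struct bch_fs *c, struct bkey_s_c k,
			     enum bkey_invalid_flags flags,
			     struct printbuf *err)
	{
		int ret = 0;

		bkey_fsck_err_on(/* invalid condition */, c, err,
				 foo_bad_field,	/* hypothetical error id */
				 "message describing the problem");
	fsck_err:
		return ret;
	}
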
index be66bf9ad80911006cff4b6d5ea139141511e3c0..429dca816df5c5049c85e31ea20eb1e92ea694cf 100644 (file)
@@ -48,7 +48,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l)
        return BCH_LRU_read;
 }
 
-int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_lru_invalid(struct bch_fs *, struct bkey_s_c,
                     enum bkey_invalid_flags, struct printbuf *);
 void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
index 81c8cdbac28597c9b88e140f47f357fd07b4ac83..8e5688d0a8ca6af79b9b98c11efdef88c67645e1 100644 (file)
@@ -10,7 +10,7 @@
 #include "buckets.h"
 #include "errcode.h"
 #include "extents.h"
-#include "io.h"
+#include "io_write.h"
 #include "journal.h"
 #include "keylist.h"
 #include "migrate.h"
@@ -78,34 +78,32 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
 
 static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        enum btree_id id;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
        for (id = 0; id < BTREE_ID_NR; id++) {
                if (!btree_type_has_ptrs(id))
                        continue;
 
-               ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN,
+               ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
                                BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL,
-                       bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags));
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                       bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
                if (ret)
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        return ret;
 }
 
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct closure cl;
        struct btree *b;
@@ -117,16 +115,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        if (flags & BCH_FORCE_IF_METADATA_LOST)
                return -EINVAL;
 
+       trans = bch2_trans_get(c);
        bch2_bkey_buf_init(&k);
-       bch2_trans_init(&trans, c, 0, 0);
        closure_init_stack(&cl);
 
        for (id = 0; id < BTREE_ID_NR; id++) {
-               bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+               bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
                                          BTREE_ITER_PREFETCH);
 retry:
                ret = 0;
-               while (bch2_trans_begin(&trans),
+               while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
@@ -141,15 +139,14 @@ retry:
                                break;
                        }
 
-                       ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false);
+                       ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
                                ret = 0;
                                continue;
                        }
 
                        if (ret) {
-                               bch_err(c, "Error updating btree node key: %s",
-                                       bch2_err_str(ret));
+                               bch_err_msg(c, ret, "updating btree node key");
                                break;
                        }
 next:
@@ -158,7 +155,7 @@ next:
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
 
                if (ret)
                        goto err;
@@ -167,8 +164,8 @@ next:
        bch2_btree_interior_updates_flush(c);
        ret = 0;
 err:
-       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&k, c);
+       bch2_trans_put(trans);
 
        BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
index 05272673901d2ba93cccd69c0b1e9febf4042803..4f7d1758d8a97588a2e73a7397ea9880a356b414 100644 (file)
 #include "errcode.h"
 #include "error.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
 #include "move.h"
 #include "replicas.h"
+#include "snapshot.h"
 #include "super-io.h"
 #include "trace.h"
 
@@ -58,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c
        }
 }
 
-static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
-{
-       mutex_lock(&c->data_progress_lock);
-       list_add(&stats->list, &c->data_progress_list);
-       mutex_unlock(&c->data_progress_lock);
-}
-
-static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
-{
-       mutex_lock(&c->data_progress_lock);
-       list_del(&stats->list);
-       mutex_unlock(&c->data_progress_lock);
-}
-
 struct moving_io {
        struct list_head                read_list;
        struct list_head                io_list;
@@ -155,35 +143,31 @@ static void move_read_endio(struct bio *bio)
        closure_put(&ctxt->cl);
 }
 
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
-                                       struct btree_trans *trans)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
 {
        struct moving_io *io;
 
-       if (trans)
-               bch2_trans_unlock(trans);
-
        while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
+               bch2_trans_unlock_long(ctxt->trans);
                list_del(&io->read_list);
                move_write(io);
        }
 }
 
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
-                                      struct btree_trans *trans)
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
 {
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
 
-       move_ctxt_wait_event(ctxt, trans,
+       move_ctxt_wait_event(ctxt,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
 }
 
 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
 {
-       struct bch_fs *c = ctxt->c;
+       struct bch_fs *c = ctxt->trans->c;
 
-       move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+       move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
        closure_sync(&ctxt->cl);
 
        EBUG_ON(atomic_read(&ctxt->write_sectors));
@@ -191,16 +175,12 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
        EBUG_ON(atomic_read(&ctxt->read_sectors));
        EBUG_ON(atomic_read(&ctxt->read_ios));
 
-       if (ctxt->stats) {
-               progress_list_del(c, ctxt->stats);
-               trace_move_data(c,
-                               atomic64_read(&ctxt->stats->sectors_moved),
-                               atomic64_read(&ctxt->stats->keys_moved));
-       }
-
        mutex_lock(&c->moving_context_lock);
        list_del(&ctxt->list);
        mutex_unlock(&c->moving_context_lock);
+
+       bch2_trans_put(ctxt->trans);
+       memset(ctxt, 0, sizeof(*ctxt));
 }
 
 void bch2_moving_ctxt_init(struct moving_context *ctxt,
@@ -212,7 +192,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
 {
        memset(ctxt, 0, sizeof(*ctxt));
 
-       ctxt->c         = c;
+       ctxt->trans     = bch2_trans_get(c);
        ctxt->fn        = (void *) _RET_IP_;
        ctxt->rate      = rate;
        ctxt->stats     = stats;
@@ -229,16 +209,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
        mutex_lock(&c->moving_context_lock);
        list_add(&ctxt->list, &c->moving_context_list);
        mutex_unlock(&c->moving_context_lock);
+}
 
-       if (stats) {
-               progress_list_add(c, stats);
-               stats->data_type = BCH_DATA_user;
-       }
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+       trace_move_data(c, stats);
 }
 
 void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
 {
        memset(stats, 0, sizeof(*stats));
+       stats->data_type = BCH_DATA_user;
        scnprintf(stats->name, sizeof(stats->name), "%s", name);
 }
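
bch2_move_stats_exit() is new: stats objects are now an explicit init/exit pair, with the exit side emitting the reworked move_data tracepoint. Expected usage, matching the bch2_data_job() hunks further down:

	struct bch_move_stats stats;

	bch2_move_stats_init(&stats, "rereplicate");
	/* ... run the data job ... */
	bch2_move_stats_exit(&stats, c);	/* fires trace_move_data() */
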
 
@@ -282,18 +263,17 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
 
        return bch2_trans_relock(trans) ?:
                bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
-               bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+               bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
 }
 
-static int bch2_move_extent(struct btree_trans *trans,
-                           struct btree_iter *iter,
-                           struct moving_context *ctxt,
-                           struct move_bucket_in_flight *bucket_in_flight,
-                           struct bch_io_opts io_opts,
-                           enum btree_id btree_id,
-                           struct bkey_s_c k,
-                           struct data_update_opts data_opts)
+int bch2_move_extent(struct moving_context *ctxt,
+                    struct move_bucket_in_flight *bucket_in_flight,
+                    struct btree_iter *iter,
+                    struct bkey_s_c k,
+                    struct bch_io_opts io_opts,
+                    struct data_update_opts data_opts)
 {
+       struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct moving_io *io;
@@ -302,6 +282,8 @@ static int bch2_move_extent(struct btree_trans *trans,
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;
 
+       if (ctxt->stats)
+               ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
        trace_move_extent2(c, k);
 
        bch2_data_update_opts_normalize(k, &data_opts);
@@ -354,7 +336,7 @@ static int bch2_move_extent(struct btree_trans *trans,
        io->rbio.bio.bi_end_io          = move_read_endio;
 
        ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
-                                   io_opts, data_opts, btree_id, k);
+                                   io_opts, data_opts, iter->btree_id, k);
        if (ret && ret != -BCH_ERR_unwritten_extent_update)
                goto err_free_pages;
 
@@ -366,9 +348,11 @@ static int bch2_move_extent(struct btree_trans *trans,
 
        BUG_ON(ret);
 
-       io->write.ctxt = ctxt;
        io->write.op.end_io = move_write_done;
 
+       if (ctxt->rate)
+               bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
        if (ctxt->stats) {
                atomic64_inc(&ctxt->stats->keys_moved);
                atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@@ -398,7 +382,7 @@ static int bch2_move_extent(struct btree_trans *trans,
        closure_get(&ctxt->cl);
        bch2_read_extent(trans, &io->rbio,
                         bkey_start_pos(k.k),
-                        btree_id, k, 0,
+                        iter->btree_id, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
@@ -412,45 +396,96 @@ err:
        return ret;
 }
 
-static int lookup_inode(struct btree_trans *trans, struct bpos pos,
-                       struct bch_inode_unpacked *inode)
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+                         struct per_snapshot_io_opts *io_opts,
+                         struct bkey_s_c extent_k)
+{
+       struct bch_fs *c = trans->c;
+       u32 restart_count = trans->restart_count;
+       int ret = 0;
+
+       if (io_opts->cur_inum != extent_k.k->p.inode) {
+               struct btree_iter iter;
+               struct bkey_s_c k;
+
+               io_opts->d.nr = 0;
+
+               for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+                                  BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+                       if (k.k->p.offset != extent_k.k->p.inode)
+                               break;
+
+                       if (!bkey_is_inode(k.k))
+                               continue;
+
+                       struct bch_inode_unpacked inode;
+                       BUG_ON(bch2_inode_unpack(k, &inode));
+
+                       struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+                       bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
+
+                       ret = darray_push(&io_opts->d, e);
+                       if (ret)
+                               break;
+               }
+               bch2_trans_iter_exit(trans, &iter);
+               io_opts->cur_inum = extent_k.k->p.inode;
+       }
+
+       ret = ret ?: trans_was_restarted(trans, restart_count);
+       if (ret)
+               return ERR_PTR(ret);
+
+       if (extent_k.k->p.snapshot) {
+               struct snapshot_io_opts_entry *i;
+               darray_for_each(io_opts->d, i)
+                       if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+                               return &i->io_opts;
+       }
+
+       return &io_opts->fs_io_opts;
+}
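
bch2_move_get_io_opts() caches the IO options of every snapshot version of one inode in io_opts->d, refills that cache when the walk crosses into a new inode, and returns the entry whose snapshot is an ancestor of the extent's snapshot, falling back to the filesystem-wide defaults. Its calling pattern, as bch2_move_data_btree() below uses it:

	struct per_snapshot_io_opts snapshot_io_opts;
	struct bch_io_opts *io_opts;

	per_snapshot_io_opts_init(&snapshot_io_opts, c);

	io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
	ret = PTR_ERR_OR_ZERO(io_opts);	/* ERR_PTR on transaction restart */

	per_snapshot_io_opts_exit(&snapshot_io_opts);
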
+
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+                             struct bch_io_opts *io_opts,
+                             struct bkey_s_c extent_k)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       k = bch2_btree_iter_peek(&iter);
+       /* Keys in the reflink btree have inode 0: use the fs-wide options */
+       if (!extent_k.k->p.inode) {
+               *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+               return 0;
+       }
+
+       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+                              SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+                              BTREE_ITER_CACHED);
        ret = bkey_err(k);
-       if (ret)
-               goto err;
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               return ret;
 
-       if (!k.k || !bkey_eq(k.k->p, pos)) {
-               ret = -BCH_ERR_ENOENT_inode;
-               goto err;
+       if (!ret && bkey_is_inode(k.k)) {
+               struct bch_inode_unpacked inode;
+               bch2_inode_unpack(k, &inode);
+               bch2_inode_opts_get(io_opts, trans->c, &inode);
+       } else {
+               *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
        }
 
-       ret = bkey_is_inode(k.k) ? 0 : -EIO;
-       if (ret)
-               goto err;
-
-       ret = bch2_inode_unpack(k, inode);
-       if (ret)
-               goto err;
-err:
        bch2_trans_iter_exit(trans, &iter);
-       return ret;
+       return 0;
 }
 
-static int move_ratelimit(struct btree_trans *trans,
-                         struct moving_context *ctxt)
+int bch2_move_ratelimit(struct moving_context *ctxt)
 {
-       struct bch_fs *c = trans->c;
+       struct bch_fs *c = ctxt->trans->c;
        u64 delay;
 
-       if (ctxt->wait_on_copygc) {
-               bch2_trans_unlock(trans);
+       if (ctxt->wait_on_copygc && !c->copygc_running) {
+               bch2_trans_unlock_long(ctxt->trans);
                wait_event_killable(c->copygc_running_wq,
                                    !c->copygc_running ||
                                    kthread_should_stop());
@@ -459,8 +494,12 @@ static int move_ratelimit(struct btree_trans *trans,
        do {
                delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
 
                if (delay) {
-                       bch2_trans_unlock(trans);
+                       if (delay > HZ / 10)
+                               bch2_trans_unlock_long(ctxt->trans);
+                       else
+                               bch2_trans_unlock(ctxt->trans);
                        set_current_state(TASK_INTERRUPTIBLE);
                }
 
@@ -473,7 +512,7 @@ static int move_ratelimit(struct btree_trans *trans,
                        schedule_timeout(delay);
 
                if (unlikely(freezing(current))) {
-                       move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+                       move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
                        try_to_freeze();
                }
        } while (delay);
@@ -482,7 +521,7 @@ static int move_ratelimit(struct btree_trans *trans,
         * XXX: these limits really ought to be per device, SSDs and hard drives
         * will want different limits
         */
-       move_ctxt_wait_event(ctxt, trans,
+       move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
@@ -491,64 +530,39 @@ static int move_ratelimit(struct btree_trans *trans,
        return 0;
 }
 
-static int move_get_io_opts(struct btree_trans *trans,
-                           struct bch_io_opts *io_opts,
-                           struct bkey_s_c k, u64 *cur_inum)
-{
-       struct bch_inode_unpacked inode;
-       int ret;
-
-       if (*cur_inum == k.k->p.inode)
-               return 0;
-
-       ret = lookup_inode(trans,
-                          SPOS(0, k.k->p.inode, k.k->p.snapshot),
-                          &inode);
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               return ret;
-
-       if (!ret)
-               bch2_inode_opts_get(io_opts, trans->c, &inode);
-       else
-               *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
-       *cur_inum = k.k->p.inode;
-       return 0;
-}
-
-static int __bch2_move_data(struct moving_context *ctxt,
-                           struct bpos start,
-                           struct bpos end,
-                           move_pred_fn pred, void *arg,
-                           enum btree_id btree_id)
+static int bch2_move_data_btree(struct moving_context *ctxt,
+                               struct bpos start,
+                               struct bpos end,
+                               move_pred_fn pred, void *arg,
+                               enum btree_id btree_id)
 {
-       struct bch_fs *c = ctxt->c;
-       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs *c = trans->c;
+       struct per_snapshot_io_opts snapshot_io_opts;
+       struct bch_io_opts *io_opts;
        struct bkey_buf sk;
-       struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
-       u64 cur_inum = U64_MAX;
        int ret = 0, ret2;
 
+       per_snapshot_io_opts_init(&snapshot_io_opts, c);
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
 
        if (ctxt->stats) {
                ctxt->stats->data_type  = BCH_DATA_user;
-               ctxt->stats->btree_id   = btree_id;
-               ctxt->stats->pos        = start;
+               ctxt->stats->pos        = BBPOS(btree_id, start);
        }
 
-       bch2_trans_iter_init(&trans, &iter, btree_id, start,
+       bch2_trans_iter_init(trans, &iter, btree_id, start,
                             BTREE_ITER_PREFETCH|
                             BTREE_ITER_ALL_SNAPSHOTS);
 
        if (ctxt->rate)
                bch2_ratelimit_reset(ctxt->rate);
 
-       while (!move_ratelimit(&trans, ctxt)) {
-               bch2_trans_begin(&trans);
+       while (!bch2_move_ratelimit(ctxt)) {
+               bch2_trans_begin(trans);
 
                k = bch2_btree_iter_peek(&iter);
                if (!k.k)
@@ -564,17 +578,18 @@ static int __bch2_move_data(struct moving_context *ctxt,
                        break;
 
                if (ctxt->stats)
-                       ctxt->stats->pos = iter.pos;
+                       ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
 
                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;
 
-               ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum);
+               io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+               ret = PTR_ERR_OR_ZERO(io_opts);
                if (ret)
                        continue;
 
                memset(&data_opts, 0, sizeof(data_opts));
-               if (!pred(c, arg, k, &io_opts, &data_opts))
+               if (!pred(c, arg, k, io_opts, &data_opts))
                        goto next;
 
                /*
@@ -584,24 +599,20 @@ static int __bch2_move_data(struct moving_context *ctxt,
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);
 
-               ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL,
-                                       io_opts, btree_id, k, data_opts);
+               ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
                if (ret2) {
                        if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
                                continue;
 
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt, &trans);
+                               bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
 
                        /* XXX signal failure */
                        goto next;
                }
-
-               if (ctxt->rate)
-                       bch2_ratelimit_increment(ctxt->rate, k.k->size);
 next:
                if (ctxt->stats)
                        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
@@ -609,60 +620,69 @@ next_nondata:
                bch2_btree_iter_advance(&iter);
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
        bch2_bkey_buf_exit(&sk, c);
+       per_snapshot_io_opts_exit(&snapshot_io_opts);
 
        return ret;
 }
 
-int bch2_move_data(struct bch_fs *c,
-                  enum btree_id start_btree_id, struct bpos start_pos,
-                  enum btree_id end_btree_id,   struct bpos end_pos,
-                  struct bch_ratelimit *rate,
-                  struct bch_move_stats *stats,
-                  struct write_point_specifier wp,
-                  bool wait_on_copygc,
-                  move_pred_fn pred, void *arg)
+int __bch2_move_data(struct moving_context *ctxt,
+                    struct bbpos start,
+                    struct bbpos end,
+                    move_pred_fn pred, void *arg)
 {
-       struct moving_context ctxt;
+       struct bch_fs *c = ctxt->trans->c;
        enum btree_id id;
-       int ret;
-
-       bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+       int ret = 0;
 
-       for (id = start_btree_id;
-            id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+       for (id = start.btree;
+            id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             id++) {
-               stats->btree_id = id;
-
-               if (id != BTREE_ID_extents &&
-                   id != BTREE_ID_reflink)
-                       continue;
+               ctxt->stats->pos = BBPOS(id, POS_MIN);
 
-               if (!bch2_btree_id_root(c, id)->b)
+               if (!btree_type_has_ptrs(id) ||
+                   !bch2_btree_id_root(c, id)->b)
                        continue;
 
-               ret = __bch2_move_data(&ctxt,
-                                      id == start_btree_id ? start_pos : POS_MIN,
-                                      id == end_btree_id   ? end_pos   : POS_MAX,
+               ret = bch2_move_data_btree(ctxt,
+                                      id == start.btree ? start.pos : POS_MIN,
+                                      id == end.btree   ? end.pos   : POS_MAX,
                                       pred, arg, id);
                if (ret)
                        break;
        }
 
+       return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+                  struct bbpos start,
+                  struct bbpos end,
+                  struct bch_ratelimit *rate,
+                  struct bch_move_stats *stats,
+                  struct write_point_specifier wp,
+                  bool wait_on_copygc,
+                  move_pred_fn pred, void *arg)
+{
+
+       struct moving_context ctxt;
+       int ret;
+
+       bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+       ret = __bch2_move_data(&ctxt, start, end, pred, arg);
        bch2_moving_ctxt_exit(&ctxt);
 
        return ret;
 }
 
-int __bch2_evacuate_bucket(struct btree_trans *trans,
-                          struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
                           struct move_bucket_in_flight *bucket_in_flight,
                           struct bpos bucket, int gen,
                           struct data_update_opts _data_opts)
 {
-       struct bch_fs *c = ctxt->c;
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs *c = trans->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_iter iter;
        struct bkey_buf sk;
@@ -673,7 +693,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
        u64 fragmentation;
-       u64 cur_inum = U64_MAX;
        struct bpos bp_pos = POS_MIN;
        int ret = 0;
 
@@ -708,7 +727,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
                goto err;
        }
 
-       while (!(ret = move_ratelimit(trans, ctxt))) {
+       while (!(ret = bch2_move_ratelimit(ctxt))) {
                bch2_trans_begin(trans);
 
                ret = bch2_get_next_backpointer(trans, bucket, gen,
@@ -723,7 +742,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
 
                if (!bp.level) {
                        const struct bch_extent_ptr *ptr;
-                       struct bkey_s_c k;
                        unsigned i = 0;
 
                        k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
@@ -738,7 +756,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);
 
-                       ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+                       ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
                        if (ret) {
                                bch2_trans_iter_exit(trans, &iter);
                                continue;
@@ -759,23 +777,20 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
                                i++;
                        }
 
-                       ret = bch2_move_extent(trans, &iter, ctxt,
-                                       bucket_in_flight,
-                                       io_opts, bp.btree_id, k, data_opts);
+                       ret = bch2_move_extent(ctxt, bucket_in_flight,
+                                              &iter, k, io_opts, data_opts);
                        bch2_trans_iter_exit(trans, &iter);
 
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt, trans);
+                               bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
                        if (ret)
                                goto err;
 
-                       if (ctxt->rate)
-                               bch2_ratelimit_increment(ctxt->rate, k.k->size);
                        if (ctxt->stats)
                                atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
                } else {
@@ -826,15 +841,12 @@ int bch2_evacuate_bucket(struct bch_fs *c,
                         struct write_point_specifier wp,
                         bool wait_on_copygc)
 {
-       struct btree_trans trans;
        struct moving_context ctxt;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-       ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts);
+       ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
        bch2_moving_ctxt_exit(&ctxt);
-       bch2_trans_exit(&trans);
 
        return ret;
 }
@@ -851,31 +863,34 @@ static int bch2_move_btree(struct bch_fs *c,
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct btree_trans trans;
+       struct moving_context ctxt;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        enum btree_id id;
        struct data_update_opts data_opts;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       progress_list_add(c, stats);
+       bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+                             writepoint_ptr(&c->btree_write_point),
+                             true);
+       trans = ctxt.trans;
 
        stats->data_type = BCH_DATA_btree;
 
        for (id = start_btree_id;
             id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
             id++) {
-               stats->btree_id = id;
+               stats->pos = BBPOS(id, POS_MIN);
 
                if (!bch2_btree_id_root(c, id)->b)
                        continue;
 
-               bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+               bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
                                          BTREE_ITER_PREFETCH);
 retry:
                ret = 0;
-               while (bch2_trans_begin(&trans),
+               while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (kthread && kthread_should_stop())
@@ -885,12 +900,12 @@ retry:
                             bpos_cmp(b->key.k.p, end_pos)) > 0)
                                break;
 
-                       stats->pos = iter.pos;
+                       stats->pos = BBPOS(iter.btree_id, iter.pos);
 
                        if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;
 
-                       ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
+                       ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret;
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret)
@@ -901,20 +916,16 @@ next:
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        goto retry;
 
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
 
                if (kthread && kthread_should_stop())
                        break;
        }
 
-       bch2_trans_exit(&trans);
-
-       if (ret)
-               bch_err_fn(c, ret);
-
+       bch_err_fn(c, ret);
+       bch2_moving_ctxt_exit(&ctxt);
        bch2_btree_interior_updates_flush(c);
 
-       progress_list_del(c, stats);
        return ret;
 }
 
@@ -1035,8 +1046,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
                mutex_unlock(&c->sb_lock);
        }
 
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1059,14 +1069,16 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_replicas_gc2(c) ?: ret;
 
                ret = bch2_move_data(c,
-                                    op.start_btree,    op.start_pos,
-                                    op.end_btree,      op.end_pos,
+                                    (struct bbpos) { op.start_btree,   op.start_pos },
+                                    (struct bbpos) { op.end_btree,     op.end_pos },
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
+
+               bch2_move_stats_exit(stats, c);
                break;
        case BCH_DATA_OP_MIGRATE:
                if (op.migrate.dev >= c->sb.nr_devices)
@@ -1083,18 +1095,21 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_replicas_gc2(c) ?: ret;
 
                ret = bch2_move_data(c,
-                                    op.start_btree,    op.start_pos,
-                                    op.end_btree,      op.end_pos,
+                                    (struct bbpos) { op.start_btree,   op.start_pos },
+                                    (struct bbpos) { op.end_btree,     op.end_pos },
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
+
+               bch2_move_stats_exit(stats, c);
                break;
        case BCH_DATA_OP_REWRITE_OLD_NODES:
                bch2_move_stats_init(stats, "rewrite_old_nodes");
                ret = bch2_scan_old_btree_nodes(c, stats);
+               bch2_move_stats_exit(stats, c);
                break;
        default:
                ret = -EINVAL;
@@ -1103,46 +1118,64 @@ int bch2_data_job(struct bch_fs *c,
        return ret;
 }
 
-void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c)
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
 {
-       struct bch_move_stats *stats;
-
-       mutex_lock(&c->data_progress_lock);
-       list_for_each_entry(stats, &c->data_progress_list, list) {
-               prt_printf(out, "%s: data type %s btree_id %s position: ",
-                      stats->name,
-                      bch2_data_types[stats->data_type],
-                      bch2_btree_ids[stats->btree_id]);
-               bch2_bpos_to_text(out, stats->pos);
-               prt_printf(out, "%s", "\n");
-       }
-       mutex_unlock(&c->data_progress_lock);
+       prt_printf(out, "%s: data type=%s pos=",
+                  stats->name,
+                  bch2_data_types[stats->data_type]);
+       bch2_bbpos_to_text(out, stats->pos);
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       prt_str(out, "keys moved:  ");
+       prt_u64(out, atomic64_read(&stats->keys_moved));
+       prt_newline(out);
+
+       prt_str(out, "keys raced:  ");
+       prt_u64(out, atomic64_read(&stats->keys_raced));
+       prt_newline(out);
+
+       prt_str(out, "bytes seen:  ");
+       prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+       prt_newline(out);
+
+       prt_str(out, "bytes moved: ");
+       prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
+       prt_newline(out);
+
+       prt_str(out, "bytes raced: ");
+       prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
+       prt_newline(out);
+
+       printbuf_indent_sub(out, 2);
 }
 
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt)
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
 {
        struct moving_io *io;
 
-       prt_printf(out, "%ps:", ctxt->fn);
-       prt_newline(out);
+       bch2_move_stats_to_text(out, ctxt->stats);
        printbuf_indent_add(out, 2);
 
-       prt_printf(out, "reads: %u sectors %u",
+       prt_printf(out, "reads: ios %u/%u sectors %u/%u",
                   atomic_read(&ctxt->read_ios),
-                  atomic_read(&ctxt->read_sectors));
+                  c->opts.move_ios_in_flight,
+                  atomic_read(&ctxt->read_sectors),
+                  c->opts.move_bytes_in_flight >> 9);
        prt_newline(out);
 
-       prt_printf(out, "writes: %u sectors %u",
+       prt_printf(out, "writes: ios %u/%u sectors %u/%u",
                   atomic_read(&ctxt->write_ios),
-                  atomic_read(&ctxt->write_sectors));
+                  c->opts.move_ios_in_flight,
+                  atomic_read(&ctxt->write_sectors),
+                  c->opts.move_bytes_in_flight >> 9);
        prt_newline(out);
 
        printbuf_indent_add(out, 2);
 
        mutex_lock(&ctxt->lock);
-       list_for_each_entry(io, &ctxt->ios, io_list) {
+       list_for_each_entry(io, &ctxt->ios, io_list)
                bch2_write_op_to_text(out, &io->write.op);
-       }
        mutex_unlock(&ctxt->lock);
 
        printbuf_indent_sub(out, 4);
@@ -1154,7 +1187,7 @@ void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c)
 
        mutex_lock(&c->moving_context_lock);
        list_for_each_entry(ctxt, &c->moving_context_list, list)
-               bch2_moving_ctxt_to_text(out, ctxt);
+               bch2_moving_ctxt_to_text(out, c, ctxt);
        mutex_unlock(&c->moving_context_lock);
 }
 
@@ -1162,7 +1195,4 @@ void bch2_fs_move_init(struct bch_fs *c)
 {
        INIT_LIST_HEAD(&c->moving_context_list);
        mutex_init(&c->moving_context_lock);
-
-       INIT_LIST_HEAD(&c->data_progress_list);
-       mutex_init(&c->data_progress_lock);
 }
index 547ee7b72c1617177056a3eb32061e345ed4351a..07cf9d42643b4fe537b6db513285efc1f65bd366 100644 (file)
@@ -2,6 +2,8 @@
 #ifndef _BCACHEFS_MOVE_H
 #define _BCACHEFS_MOVE_H
 
+#include "bbpos.h"
+#include "bcachefs_ioctl.h"
 #include "btree_iter.h"
 #include "buckets.h"
 #include "data_update.h"
@@ -10,7 +12,7 @@
 struct bch_read_bio;
 
 struct moving_context {
-       struct bch_fs           *c;
+       struct btree_trans      *trans;
        struct list_head        list;
        void                    *fn;
 
@@ -36,13 +38,14 @@ struct moving_context {
        wait_queue_head_t       wait;
 };
 
-#define move_ctxt_wait_event(_ctxt, _trans, _cond)                     \
+#define move_ctxt_wait_event(_ctxt, _cond)                             \
 do {                                                                   \
        bool cond_finished = false;                                     \
-       bch2_moving_ctxt_do_pending_writes(_ctxt, _trans);              \
+       bch2_moving_ctxt_do_pending_writes(_ctxt);                      \
                                                                        \
        if (_cond)                                                      \
                break;                                                  \
+       bch2_trans_unlock_long((_ctxt)->trans);                         \
        __wait_event((_ctxt)->wait,                                     \
                     bch2_moving_ctxt_next_pending_write(_ctxt) ||      \
                     (cond_finished = (_cond)));                        \
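
move_ctxt_wait_event() loses its trans argument since the context now owns its transaction, and it releases btree locks (bch2_trans_unlock_long()) before sleeping. A representative caller, from bch2_moving_ctxt_exit() earlier in this diff:

	/* wait for all reads in flight on this context to drain: */
	move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
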
@@ -58,22 +61,60 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
                           struct bch_ratelimit *, struct bch_move_stats *,
                           struct write_point_specifier, bool);
 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
-                                       struct btree_trans *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+       u32                     snapshot;
+       struct bch_io_opts      io_opts;
+};
+
+struct per_snapshot_io_opts {
+       u64                     cur_inum;
+       struct bch_io_opts      fs_io_opts;
+       DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+       memset(io_opts, 0, sizeof(*io_opts));
+       io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+       darray_exit(&io_opts->d);
+}
+
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+                               struct per_snapshot_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
 
 int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
 
+int bch2_move_extent(struct moving_context *,
+                    struct move_bucket_in_flight *,
+                    struct btree_iter *,
+                    struct bkey_s_c,
+                    struct bch_io_opts,
+                    struct data_update_opts);
+
+int __bch2_move_data(struct moving_context *,
+                    struct bbpos,
+                    struct bbpos,
+                    move_pred_fn, void *);
 int bch2_move_data(struct bch_fs *,
-                  enum btree_id, struct bpos,
-                  enum btree_id, struct bpos,
+                  struct bbpos start,
+                  struct bbpos end,
                   struct bch_ratelimit *,
                   struct bch_move_stats *,
                   struct write_point_specifier,
                   bool,
                   move_pred_fn, void *);
 
-int __bch2_evacuate_bucket(struct btree_trans *,
-                          struct moving_context *,
+int __bch2_evacuate_bucket(struct moving_context *,
                           struct move_bucket_in_flight *,
                           struct bpos, int,
                           struct data_update_opts);
@@ -87,8 +128,10 @@ int bch2_data_job(struct bch_fs *,
                  struct bch_move_stats *,
                  struct bch_ioctl_data);
 
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
-void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *);
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, char *);
+
 void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
 
 void bch2_fs_move_init(struct bch_fs *);
index baf1f8570b3fe05f1733ae9be1cfa28c9e3db933..e22841ef31e475fdfa11d8dcc7d48adb8d333897 100644 (file)
@@ -2,17 +2,17 @@
 #ifndef _BCACHEFS_MOVE_TYPES_H
 #define _BCACHEFS_MOVE_TYPES_H
 
+#include "bbpos_types.h"
+
 struct bch_move_stats {
        enum bch_data_type      data_type;
-       enum btree_id           btree_id;
-       struct bpos             pos;
-       struct list_head        list;
+       struct bbpos            pos;
        char                    name[32];
 
        atomic64_t              keys_moved;
        atomic64_t              keys_raced;
-       atomic64_t              sectors_moved;
        atomic64_t              sectors_seen;
+       atomic64_t              sectors_moved;
        atomic64_t              sectors_raced;
 };
 
index 5242f20bb680fc2c0d9fc8ebd54188db87dfa8ab..0a0576326c5b2d433fcd4aace513379972f57152 100644 (file)
 #include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
-#include "disk_groups.h"
 #include "errcode.h"
 #include "error.h"
-#include "extents.h"
-#include "eytzinger.h"
-#include "io.h"
-#include "keylist.h"
 #include "lru.h"
 #include "move.h"
 #include "movinggc.h"
-#include "super-io.h"
 #include "trace.h"
 
-#include <linux/bsearch.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <linux/sched/task.h>
-#include <linux/sort.h>
 #include <linux/wait.h>
 
 struct buckets_in_flight {
@@ -109,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
        return ret;
 }
 
-static void move_buckets_wait(struct btree_trans *trans,
-                             struct moving_context *ctxt,
+static void move_buckets_wait(struct moving_context *ctxt,
                              struct buckets_in_flight *list,
                              bool flush)
 {
@@ -119,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans,
 
        while ((i = list->first)) {
                if (flush)
-                       move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
+                       move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
 
                if (atomic_read(&i->count))
                        break;
@@ -137,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans,
                kfree(i);
        }
 
-       bch2_trans_unlock(trans);
+       bch2_trans_unlock_long(ctxt->trans);
 }
 
 static bool bucket_in_flight(struct buckets_in_flight *list,
@@ -148,19 +139,19 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
 
 typedef DARRAY(struct move_bucket) move_buckets;
 
-static int bch2_copygc_get_buckets(struct btree_trans *trans,
-                       struct moving_context *ctxt,
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
                        struct buckets_in_flight *buckets_in_flight,
                        move_buckets *buckets)
 {
+       struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
-       size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4);
+       size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
        size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
        int ret;
 
-       move_buckets_wait(trans, ctxt, buckets_in_flight, false);
+       move_buckets_wait(ctxt, buckets_in_flight, false);
 
        ret = bch2_btree_write_buffer_flush(trans);
        if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
@@ -172,7 +163,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
                                  lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
                                  0, k, ({
                struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) };
-               int ret = 0;
+               int ret2 = 0;
 
                saw++;
 
@@ -181,11 +172,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
                else if (bucket_in_flight(buckets_in_flight, b.k))
                        in_flight++;
                else {
-                       ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
-                       if (ret >= 0)
+                       ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
+                       if (ret2 >= 0)
                                sectors += b.sectors;
                }
-               ret;
+               ret2;
        }));
 
        pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i",
@@ -196,10 +187,11 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
 }
 
 noinline
-static int bch2_copygc(struct btree_trans *trans,
-                      struct moving_context *ctxt,
-                      struct buckets_in_flight *buckets_in_flight)
+static int bch2_copygc(struct moving_context *ctxt,
+                      struct buckets_in_flight *buckets_in_flight,
+                      bool *did_work)
 {
+       struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct data_update_opts data_opts = {
                .btree_insert_flags = BCH_WATERMARK_copygc,
@@ -210,7 +202,7 @@ static int bch2_copygc(struct btree_trans *trans,
        u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
        int ret = 0;
 
-       ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
+       ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
        if (ret)
                goto err;
 
@@ -220,17 +212,21 @@ static int bch2_copygc(struct btree_trans *trans,
 
                f = move_bucket_in_flight_add(buckets_in_flight, *i);
                ret = PTR_ERR_OR_ZERO(f);
-               if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */
+               if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */
+                       ret = 0;
                        continue;
+               }
                if (ret == -ENOMEM) { /* flush IO, continue later */
                        ret = 0;
                        break;
                }
 
-               ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
+               ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
                                             f->bucket.k.gen, data_opts);
                if (ret)
                        goto err;
+
+               *did_work = true;
        }
 err:
        darray_exit(&buckets);
@@ -240,7 +236,7 @@ err:
                ret = 0;
 
        if (ret < 0 && !bch2_err_matches(ret, EROFS))
-               bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "from bch2_move_data()");
 
        moved = atomic64_read(&ctxt->stats->sectors_moved) - moved;
        trace_and_count(c, copygc, c, moved, 0, 0, 0);
@@ -306,25 +302,24 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
 static int bch2_copygc_thread(void *arg)
 {
        struct bch_fs *c = arg;
-       struct btree_trans trans;
        struct moving_context ctxt;
        struct bch_move_stats move_stats;
        struct io_clock *clock = &c->io_clock[WRITE];
-       struct buckets_in_flight move_buckets;
+       struct buckets_in_flight *buckets;
        u64 last, wait;
        int ret = 0;
 
-       memset(&move_buckets, 0, sizeof(move_buckets));
-
-       ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params);
+       buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL);
+       if (!buckets)
+               return -ENOMEM;
+       ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
        if (ret) {
-               bch_err(c, "error allocating copygc buckets in flight: %s",
-                       bch2_err_str(ret));
+               kfree(buckets);
+               bch_err_msg(c, ret, "allocating copygc buckets in flight");
                return ret;
        }
 
        set_freezable();
-       bch2_trans_init(&trans, c, 0, 0);
 
        bch2_move_stats_init(&move_stats, "copygc");
        bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@@ -332,16 +327,18 @@ static int bch2_copygc_thread(void *arg)
                              false);
 
        while (!ret && !kthread_should_stop()) {
-               bch2_trans_unlock(&trans);
+               bool did_work = false;
+
+               bch2_trans_unlock_long(ctxt.trans);
                cond_resched();
 
                if (!c->copy_gc_enabled) {
-                       move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+                       move_buckets_wait(&ctxt, buckets, true);
                        kthread_wait_freezable(c->copy_gc_enabled);
                }
 
                if (unlikely(freezing(current))) {
-                       move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+                       move_buckets_wait(&ctxt, buckets, true);
                        __refrigerator(false);
                        continue;
                }
@@ -352,7 +349,7 @@ static int bch2_copygc_thread(void *arg)
                if (wait > clock->max_slop) {
                        c->copygc_wait_at = last;
                        c->copygc_wait = last + wait;
-                       move_buckets_wait(&trans, &ctxt, &move_buckets, true);
+                       move_buckets_wait(&ctxt, buckets, true);
                        trace_and_count(c, copygc_wait, c, wait, last + wait);
                        bch2_kthread_io_clock_wait(clock, last + wait,
                                        MAX_SCHEDULE_TIMEOUT);
@@ -362,16 +359,29 @@ static int bch2_copygc_thread(void *arg)
                c->copygc_wait = 0;
 
                c->copygc_running = true;
-               ret = bch2_copygc(&trans, &ctxt, &move_buckets);
+               ret = bch2_copygc(&ctxt, buckets, &did_work);
                c->copygc_running = false;
 
                wake_up(&c->copygc_running_wq);
+
+               if (!wait && !did_work) {
+                       u64 min_member_capacity = bch2_min_rw_member_capacity(c);
+
+                       if (min_member_capacity == U64_MAX)
+                               min_member_capacity = 128 * 2048;
+
+                       bch2_trans_unlock_long(ctxt.trans);
+                       bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
+                                       MAX_SCHEDULE_TIMEOUT);
+               }
        }
 
-       move_buckets_wait(&trans, &ctxt, &move_buckets, true);
-       rhashtable_destroy(&move_buckets.table);
-       bch2_trans_exit(&trans);
+       move_buckets_wait(&ctxt, buckets, true);
+
+       rhashtable_destroy(&buckets->table);
+       kfree(buckets);
        bch2_moving_ctxt_exit(&ctxt);
+       bch2_move_stats_exit(&move_stats, c);
 
        return 0;
 }
@@ -402,7 +412,7 @@ int bch2_copygc_start(struct bch_fs *c)
        t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
        ret = PTR_ERR_OR_ZERO(t);
        if (ret) {
-               bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating copygc thread");
                return ret;
        }
 
index 396357cd8f2fe715c278bfe486799e6fe039e335..3c21981a4a1c09f9c70596876ef71fdef72cddcb 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey_methods.h"
 #include "nocow_locking.h"
 #include "util.h"
 
@@ -29,9 +30,10 @@ void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos buc
 
        for (i = 0; i < ARRAY_SIZE(l->b); i++)
                if (l->b[i] == dev_bucket) {
-                       BUG_ON(sign(atomic_read(&l->l[i])) != lock_val);
+                       int v = atomic_sub_return(lock_val, &l->l[i]);
 
-                       if (!atomic_sub_return(lock_val, &l->l[i]))
+                       BUG_ON(v && sign(v) != lock_val);
+                       if (!v)
                                closure_wake_up(&l->wait);
                        return;
                }
@@ -64,6 +66,11 @@ got_entry:
        if (lock_val > 0 ? v < 0 : v > 0)
                goto fail;
 take_lock:
+       v = atomic_read(&l->l[i]);
+       /* Overflow? */
+       if (v && sign(v + lock_val) != sign(v))
+               goto fail;
+
        atomic_add(lock_val, &l->l[i]);
        spin_unlock(&l->lock);
        return true;
@@ -83,6 +90,7 @@ void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t,
 }
 
 void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t)
+
 {
        unsigned i, nr_zero = 0;
        struct nocow_lock_bucket *l;
@@ -102,9 +110,13 @@ void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_tab
                        prt_printf(out, "(%u empty entries)\n", nr_zero);
                nr_zero = 0;
 
-               for (i = 0; i < ARRAY_SIZE(l->l); i++)
-                       if (atomic_read(&l->l[i]))
-                               prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i]));
+               for (i = 0; i < ARRAY_SIZE(l->l); i++) {
+                       int v = atomic_read(&l->l[i]);
+                       if (v) {
+                               bch2_bpos_to_text(out, u64_to_bucket(l->b[i]));
+                               prt_printf(out, ": %s %u ", v < 0 ? "copy" : "update", abs(v));
+                       }
+               }
                prt_newline(out);
        }
 
@@ -112,12 +124,21 @@ void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_tab
                prt_printf(out, "(%u empty entries)\n", nr_zero);
 }
 
+void bch2_fs_nocow_locking_exit(struct bch_fs *c)
+{
+       struct bucket_nocow_lock_table *t = &c->nocow_locks;
+
+       for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+               for (unsigned j = 0; j < ARRAY_SIZE(l->l); j++)
+                       BUG_ON(atomic_read(&l->l[j]));
+}
+
 int bch2_fs_nocow_locking_init(struct bch_fs *c)
 {
-       unsigned i;
+       struct bucket_nocow_lock_table *t = &c->nocow_locks;
 
-       for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++)
-               spin_lock_init(&c->nocow_locks.l[i].lock);
+       for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++)
+               spin_lock_init(&l->lock);
 
        return 0;
 }
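
The unlock and trylock changes above tighten the signed-counter convention these locks use: positive counts are in-place updates, negative counts are copies, as the reworked to_text output now labels them. A simplified sketch of the scheme, omitting the spinlock and bucket hashing the real code does (sign() is the bcachefs helper returning -1/0/1; lock_val is assumed to be +1 for an update lock, -1 for a copy lock):

    static bool nocow_trylock_sketch(atomic_t *l, int lock_val)
    {
            int v = atomic_read(l);

            if (lock_val > 0 ? v < 0 : v > 0)        /* held by the other class */
                    return false;
            if (v && sign(v + lock_val) != sign(v))  /* would overflow the counter */
                    return false;
            atomic_add(lock_val, l);
            return true;
    }

On unlock, a single atomic_sub_return() now replaces the old read-then-subtract pair, and the post-subtraction value may legitimately be zero (last holder dropping the lock), hence the `v &&` guard in the new BUG_ON.
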
index ff8e4af52edcd95fefc0b3164870280ff58fd2b7..f9d6a426a960c4949902c72f2019bd76bd308676 100644 (file)
@@ -44,6 +44,7 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t,
 
 void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *);
 
+void bch2_fs_nocow_locking_exit(struct bch_fs *);
 int bch2_fs_nocow_locking_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_NOCOW_LOCKING_H */
index 960bb247f3a0ab8b84c64e8f439738ffdf76b3b6..8dd4046cca41ef23b061f4aeac1892f82a504d65 100644 (file)
@@ -37,9 +37,8 @@ const char * const bch2_sb_compat[] = {
        NULL
 };
 
-const char * const bch2_btree_ids[] = {
+const char * const __bch2_btree_ids[] = {
        BCH_BTREE_IDS()
-       "interior btree node",
        NULL
 };
 
@@ -266,14 +265,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
                if (err)
                        prt_printf(err, "%s: too small (min %llu)",
                               opt->attr.name, opt->min);
-               return -ERANGE;
+               return -BCH_ERR_ERANGE_option_too_small;
        }
 
        if (opt->max && v >= opt->max) {
                if (err)
                        prt_printf(err, "%s: too big (max %llu)",
                               opt->attr.name, opt->max);
-               return -ERANGE;
+               return -BCH_ERR_ERANGE_option_too_big;
        }
 
        if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
@@ -290,6 +289,9 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
                return -EINVAL;
        }
 
+       if (opt->fn.validate)
+               return opt->fn.validate(v, err);
+
        return 0;
 }
 
@@ -471,8 +473,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
                        val = "0";
                }
 
+               /* Unknown options are ignored: */
                if (id < 0)
-                       goto bad_opt;
+                       continue;
 
                if (!(bch2_opt_table[id].flags & OPT_MOUNT))
                        goto bad_opt;
index 8a9db110d64fccee8a23e94b461efd09881a6a09..8526f177450a56900c907a2e4cba3950fe5f9e00 100644 (file)
@@ -15,7 +15,7 @@ extern const char * const bch2_fsck_fix_opts[];
 extern const char * const bch2_version_upgrade_opts[];
 extern const char * const bch2_sb_features[];
 extern const char * const bch2_sb_compat[];
-extern const char * const bch2_btree_ids[];
+extern const char * const __bch2_btree_ids[];
 extern const char * const bch2_csum_types[];
 extern const char * const bch2_csum_opts[];
 extern const char * const bch2_compression_types[];
@@ -73,6 +73,7 @@ enum opt_type {
 struct bch_opt_fn {
        int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
        void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+       int (*validate)(u64, struct printbuf *);
 };
 
 /**
@@ -469,7 +470,7 @@ struct bch_opts {
 #undef x
 };
 
-static const struct bch_opts bch2_opts_default = {
+static const __maybe_unused struct bch_opts bch2_opts_default = {
 #define x(_name, _bits, _mode, _type, _sb_opt, _default, ...)          \
        ._name##_defined = true,                                        \
        ._name = _default,                                              \
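
The new validate hook in struct bch_opt_fn (called from bch2_opt_validate() after the generic min/max and sector-alignment checks, per the opts.c hunk above) lets an option reject values those bounds cannot express. A hypothetical callback matching the int (*validate)(u64, struct printbuf *) signature:

    static int opt_power_of_two_validate(u64 v, struct printbuf *err)
    {
            if (v & (v - 1)) {              /* zero and powers of two pass */
                    prt_printf(err, "must be a power of two");
                    return -EINVAL;
            }
            return 0;
    }

Whether -EINVAL or a private BCH_ERR_* code is the idiomatic return here would follow whatever the in-tree validate implementations do.
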
index c41daa1806821198ad4f640f6dc508841718b51b..5e653eb81d54f8fdfcca37038eeaf5a1febdb8e7 100644 (file)
@@ -81,8 +81,10 @@ void bch2_prt_printf(struct printbuf *out, const char *fmt, ...)
 }
 
 /**
- * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null
- * terminated
+ * bch2_printbuf_str() - returns printbuf's buf as a C string, guaranteed to be
+ * null terminated
+ * @buf:       printbuf to terminate
+ * Returns:    Printbuf contents, as a nul terminated C string
  */
 const char *bch2_printbuf_str(const struct printbuf *buf)
 {
@@ -97,8 +99,9 @@ const char *bch2_printbuf_str(const struct printbuf *buf)
 }
 
 /**
- * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it
+ * bch2_printbuf_exit() - exit a printbuf, freeing memory it owns and poisoning it
  * against accidental use.
+ * @buf:       printbuf to exit
  */
 void bch2_printbuf_exit(struct printbuf *buf)
 {
@@ -120,7 +123,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *buf)
 }
 
 /*
- * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop
+ * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop
  *
  * @buf: printbuf to control
 * @spaces: number of spaces from previous tabstop
@@ -144,7 +147,7 @@ int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces)
 }
 
 /**
- * printbuf_indent_add - add to the current indent level
+ * bch2_printbuf_indent_add() - add to the current indent level
  *
  * @buf: printbuf to control
  * @spaces: number of spaces to add to the current indent level
@@ -164,7 +167,7 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces)
 }
 
 /**
- * printbuf_indent_sub - subtract from the current indent level
+ * bch2_printbuf_indent_sub() - subtract from the current indent level
  *
  * @buf: printbuf to control
  * @spaces: number of spaces to subtract from the current indent level
@@ -227,9 +230,8 @@ static void __prt_tab(struct printbuf *out)
 }
 
 /**
- * prt_tab - Advance printbuf to the next tabstop
- *
- * @buf: printbuf to control
+ * bch2_prt_tab() - Advance printbuf to the next tabstop
+ * @out:       printbuf to control
  *
  * Advance output to the next tabstop by printing spaces.
  */
@@ -267,7 +269,7 @@ static void __prt_tab_rjust(struct printbuf *buf)
 }
 
 /**
- * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
+ * bch2_prt_tab_rjust - Advance printbuf to the next tabstop, right justifying
  * previous output
  *
  * @buf: printbuf to control
@@ -284,11 +286,11 @@ void bch2_prt_tab_rjust(struct printbuf *buf)
 }
 
 /**
- * prt_bytes_indented - Print an array of chars, handling embedded control characters
+ * bch2_prt_bytes_indented() - Print an array of chars, handling embedded control characters
  *
- * @out: printbuf to output to
- * @str: string to print
- * @count: number of bytes to print
+ * @out:       output printbuf
+ * @str:       string to print
+ * @count:     number of bytes to print
  *
 * The following control characters are handled as follows:
  *   \n: prt_newline   newline that obeys current indent level
@@ -335,32 +337,38 @@ void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned cou
 }
 
 /**
- * prt_human_readable_u64 - Print out a u64 in human readable units
+ * bch2_prt_human_readable_u64() - Print out a u64 in human readable units
+ * @out:       output printbuf
+ * @v:         integer to print
  *
- * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
  */
-void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v)
+void bch2_prt_human_readable_u64(struct printbuf *out, u64 v)
 {
-       bch2_printbuf_make_room(buf, 10);
-       buf->pos += string_get_size(v, 1, !buf->si_units,
-                                   buf->buf + buf->pos,
-                                   printbuf_remaining_size(buf));
+       bch2_printbuf_make_room(out, 10);
+       out->pos += string_get_size(v, 1, !out->si_units,
+                                   out->buf + out->pos,
+                                   printbuf_remaining_size(out));
 }
 
 /**
- * prt_human_readable_s64 - Print out a s64 in human readable units
+ * bch2_prt_human_readable_s64() - Print out a s64 in human readable units
+ * @out:       output printbuf
+ * @v:         integer to print
  *
- * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units
+ * Units of 2^10 (default) or 10^3 are controlled via @out->si_units
  */
-void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v)
+void bch2_prt_human_readable_s64(struct printbuf *out, s64 v)
 {
        if (v < 0)
-               prt_char(buf, '-');
-       bch2_prt_human_readable_u64(buf, abs(v));
+               prt_char(out, '-');
+       bch2_prt_human_readable_u64(out, abs(v));
 }
 
 /**
- * prt_units_u64 - Print out a u64 according to printbuf unit options
+ * bch2_prt_units_u64() - Print out a u64 according to printbuf unit options
+ * @out:       output printbuf
+ * @v:         integer to print
  *
 * Units are either raw (default), or human readable units (controlled via
  * @buf->human_readable_units)
@@ -374,7 +382,9 @@ void bch2_prt_units_u64(struct printbuf *out, u64 v)
 }
 
 /**
- * prt_units_s64 - Print out a s64 according to printbuf unit options
+ * bch2_prt_units_s64() - Print out a s64 according to printbuf unit options
+ * @out:       output printbuf
+ * @v:         integer to print
  *
 * Units are either raw (default), or human readable units (controlled via
  * @buf->human_readable_units)
@@ -405,11 +415,11 @@ void bch2_prt_bitflags(struct printbuf *out,
        while (list[nr])
                nr++;
 
-       while (flags && (bit = __ffs(flags)) < nr) {
+       while (flags && (bit = __ffs64(flags)) < nr) {
                if (!first)
                        bch2_prt_printf(out, ",");
                first = false;
                bch2_prt_printf(out, "%s", list[bit]);
-               flags ^= 1 << bit;
+               flags ^= BIT_ULL(bit);
        }
 }
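
The bch2_prt_bitflags() fix above is a classic 64-bit correctness bug worth spelling out: flags is a 64-bit mask, but the old `1 << bit` is an int expression, and __ffs() operates on unsigned long (only 32 bits on 32-bit builds). A standalone sketch:

    u64 flags = BIT_ULL(40);
    unsigned bit;

    /*
     * old: bit = __ffs(flags); flags ^= 1 << bit;
     *   1 << 40 is an int shift: undefined behaviour. On x86 the
     *   shift count is masked to 5 bits, so bit 8 gets toggled and
     *   bit 40 is never cleared -- the while loop spins forever.
     */
    bit = __ffs64(flags);       /* 40 */
    flags ^= BIT_ULL(bit);      /* 1ULL << 40: flag cleared, loop terminates */
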
index 4f0654ff816f35e6c83e0ade9c2c37e6f839ca30..a54647c36b8501b7099c81fd5c4e9a6cba410787 100644 (file)
@@ -5,7 +5,7 @@
 #include "error.h"
 #include "inode.h"
 #include "quota.h"
-#include "subvolume.h"
+#include "snapshot.h"
 #include "super-io.h"
 
 static const char * const bch2_quota_types[] = {
@@ -59,17 +59,18 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = {
        .to_text        = bch2_sb_quota_to_text,
 };
 
-int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_quota_invalid(struct bch_fs *c, struct bkey_s_c k,
                       enum bkey_invalid_flags flags,
                       struct printbuf *err)
 {
-       if (k.k->p.inode >= QTYP_NR) {
-               prt_printf(err, "invalid quota type (%llu >= %u)",
-                      k.k->p.inode, QTYP_NR);
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
 
-       return 0;
+       bkey_fsck_err_on(k.k->p.inode >= QTYP_NR, c, err,
+                        quota_type_invalid,
+                        "invalid quota type (%llu >= %u)",
+                        k.k->p.inode, QTYP_NR);
+fsck_err:
+       return ret;
 }
 
 void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
@@ -513,12 +514,12 @@ void bch2_fs_quota_init(struct bch_fs *c)
 
 static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb)
 {
-       struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb);
+       struct bch_sb_field_quota *sb_quota = bch2_sb_field_get(sb->sb, quota);
 
        if (sb_quota)
                return sb_quota;
 
-       sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64));
+       sb_quota = bch2_sb_field_resize(sb, quota, sizeof(*sb_quota) / sizeof(u64));
        if (sb_quota) {
                unsigned qtype, qc;
 
@@ -536,7 +537,7 @@ static void bch2_sb_quota_read(struct bch_fs *c)
        struct bch_sb_field_quota *sb_quota;
        unsigned i, j;
 
-       sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+       sb_quota = bch2_sb_field_get(c->disk_sb.sb, quota);
        if (!sb_quota)
                return;
 
@@ -572,7 +573,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans,
        if (!s_t.master_subvol)
                goto advance;
 
-       ret = bch2_inode_find_by_inum_trans(trans,
+       ret = bch2_inode_find_by_inum_nowarn_trans(trans,
                                (subvol_inum) {
                                        le32_to_cpu(s_t.master_subvol),
                                        k.k->p.offset,
@@ -599,7 +600,7 @@ advance:
 int bch2_fs_quota_read(struct bch_fs *c)
 {
        struct bch_sb_field_quota *sb_quota;
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
@@ -614,16 +615,16 @@ int bch2_fs_quota_read(struct bch_fs *c)
        bch2_sb_quota_read(c);
        mutex_unlock(&c->sb_lock);
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
-       ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas,
+       ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
                        POS_MIN, BTREE_ITER_PREFETCH, k,
                __bch2_quota_set(c, k, NULL)) ?:
-             for_each_btree_key2(&trans, iter, BTREE_ID_inodes,
+             for_each_btree_key2(trans, iter, BTREE_ID_inodes,
                        POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-               bch2_fs_quota_read_inode(&trans, &iter, k));
+               bch2_fs_quota_read_inode(trans, &iter, k));
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                bch_err_fn(c, ret);
@@ -786,7 +787,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
 {
        struct bch_fs *c = sb->s_fs_info;
        struct bch_sb_field_quota *sb_quota;
-       struct bch_memquota_type *q;
        int ret = 0;
 
        if (0) {
@@ -810,8 +810,6 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
            ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
                return -EINVAL;
 
-       q = &c->quotas[type];
-
        mutex_lock(&c->sb_lock);
        sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
        if (!sb_quota) {
@@ -959,7 +957,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
        new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-                           bch2_set_quota_trans(&trans, &new_quota, qdq)) ?:
+                           bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
                __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
 
        return bch2_err_class(ret);
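
The bch2_quota_invalid() conversion above shows the bkey_fsck_err_on() shape that .key_invalid implementations are moving to: the macro takes the condition, a named fsck error, and a format string, and on failure records the error, sets ret, and jumps to the fsck_err label. A skeletal sketch of the pattern (the check, limit, and error name below are hypothetical; the macro's exact expansion lives in the error-handling headers):

    int bch2_example_invalid(struct bch_fs *c, struct bkey_s_c k,
                             enum bkey_invalid_flags flags,
                             struct printbuf *err)
    {
            int ret = 0;

            bkey_fsck_err_on(k.k->p.inode >= EXAMPLE_LIMIT, c, err,
                             example_field_invalid,     /* hypothetical */
                             "invalid field (%llu >= %u)",
                             k.k->p.inode, EXAMPLE_LIMIT);
    fsck_err:
            return ret;
    }

Note also that the bch_fs pointer loses its const in the signature, presumably because recording an fsck error mutates filesystem state.
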
index 2f463874a3628238fb62c17f341d698f4fc3f3e6..884f601f41c425b711f14ea05be3c8fc26f12158 100644 (file)
@@ -8,7 +8,7 @@
 enum bkey_invalid_flags;
 extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
 
-int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_quota_invalid(struct bch_fs *, struct bkey_s_c,
                       enum bkey_invalid_flags, struct printbuf *);
 void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
index c3d577236ce2b1f22932eed06484062cc6e9e678..db2139c0545d789c95297fd68ea9292a26a9cdaa 100644 (file)
@@ -1,17 +1,21 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
 #include "compress.h"
 #include "disk_groups.h"
 #include "errcode.h"
-#include "extents.h"
-#include "io.h"
+#include "error.h"
+#include "inode.h"
 #include "move.h"
 #include "rebalance.h"
+#include "subvolume.h"
 #include "super-io.h"
 #include "trace.h"
 
 #include <linux/kthread.h>
 #include <linux/sched/cputime.h>
 
-/*
- * Check if an extent should be moved:
- * returns -1 if it should not be moved, or
- * device of pointer that should be moved, if known, or INT_MAX if unknown
- */
-static bool rebalance_pred(struct bch_fs *c, void *arg,
-                          struct bkey_s_c k,
-                          struct bch_io_opts *io_opts,
-                          struct data_update_opts *data_opts)
+#define REBALANCE_WORK_SCAN_OFFSET     (U64_MAX - 1)
+
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+       BCH_REBALANCE_STATES()
+       NULL
+#undef x
+};
+
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
 {
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       unsigned i;
-
-       data_opts->rewrite_ptrs         = 0;
-       data_opts->target               = io_opts->background_target;
-       data_opts->extra_replicas       = 0;
-       data_opts->btree_insert_flags   = 0;
-
-       if (io_opts->background_compression &&
-           !bch2_bkey_is_incompressible(k)) {
-               const union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
-
-               i = 0;
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       if (!p.ptr.cached &&
-                           p.crc.compression_type !=
-                           bch2_compression_opt_to_type(io_opts->background_compression))
-                               data_opts->rewrite_ptrs |= 1U << i;
-                       i++;
-               }
-       }
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_cookie *cookie;
+       u64 v;
+       int ret;
 
-       if (io_opts->background_target) {
-               const struct bch_extent_ptr *ptr;
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+                            SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       v = k.k->type == KEY_TYPE_cookie
+               ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+               : 0;
+
+       cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+       ret = PTR_ERR_OR_ZERO(cookie);
+       if (ret)
+               goto err;
+
+       bkey_cookie_init(&cookie->k_i);
+       cookie->k.p = iter.pos;
+       cookie->v.cookie = cpu_to_le64(v + 1);
+
+       ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
 
-               i = 0;
-               bkey_for_each_ptr(ptrs, ptr) {
-                       if (!ptr->cached &&
-                           !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
-                           bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
-                               data_opts->rewrite_ptrs |= 1U << i;
-                       i++;
-               }
-       }
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
+{
+       int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+                           __bch2_set_rebalance_needs_scan(trans, inum));
+       rebalance_wakeup(c);
+       return ret;
+}
 
-       return data_opts->rewrite_ptrs != 0;
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+       return bch2_set_rebalance_needs_scan(c, 0);
 }
 
-void bch2_rebalance_add_key(struct bch_fs *c,
-                           struct bkey_s_c k,
-                           struct bch_io_opts *io_opts)
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
 {
-       struct data_update_opts update_opts = { 0 };
-       struct bkey_ptrs_c ptrs;
-       const struct bch_extent_ptr *ptr;
-       unsigned i;
-
-       if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
-               return;
-
-       i = 0;
-       ptrs = bch2_bkey_ptrs_c(k);
-       bkey_for_each_ptr(ptrs, ptr) {
-               if ((1U << i) && update_opts.rewrite_ptrs)
-                       if (atomic64_add_return(k.k->size,
-                                       &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
-                           k.k->size)
-                               rebalance_wakeup(c);
-               i++;
-       }
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 v;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+                            SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       v = k.k->type == KEY_TYPE_cookie
+               ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+               : 0;
+
+       if (v == cookie)
+               ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
 }
 
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+                                           struct btree_iter *work_iter)
 {
-       if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
-           sectors)
-               rebalance_wakeup(c);
+       return !kthread_should_stop()
+               ? bch2_btree_iter_peek(work_iter)
+               : bkey_s_c_null;
 }
 
-struct rebalance_work {
-       int             dev_most_full_idx;
-       unsigned        dev_most_full_percent;
-       u64             dev_most_full_work;
-       u64             dev_most_full_capacity;
-       u64             total_work;
-};
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+                                          struct btree_iter *iter,
+                                          struct bkey_s_c k)
+{
+       struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+       int ret = PTR_ERR_OR_ZERO(n);
+       if (ret)
+               return ret;
+
+       extent_entry_drop(bkey_i_to_s(n),
+                         (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+       return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+}
 
-static void rebalance_work_accumulate(struct rebalance_work *w,
-               u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+                       struct bpos work_pos,
+                       struct btree_iter *extent_iter,
+                       struct data_update_opts *data_opts)
 {
-       unsigned percent_full;
-       u64 work = dev_work + unknown_dev;
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+
+       bch2_trans_iter_exit(trans, extent_iter);
+       bch2_trans_iter_init(trans, extent_iter,
+                            work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+                            work_pos,
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       k = bch2_btree_iter_peek_slot(extent_iter);
+       if (bkey_err(k))
+               return k;
+
+       const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+       if (!r) {
+               /* raced due to btree write buffer, nothing to do */
+               return bkey_s_c_null;
+       }
 
-       if (work < dev_work || work < unknown_dev)
-               work = U64_MAX;
-       work = min(work, capacity);
+       memset(data_opts, 0, sizeof(*data_opts));
 
-       percent_full = div64_u64(work * 100, capacity);
+       data_opts->rewrite_ptrs         =
+               bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+       data_opts->target               = r->target;
 
-       if (percent_full >= w->dev_most_full_percent) {
-               w->dev_most_full_idx            = idx;
-               w->dev_most_full_percent        = percent_full;
-               w->dev_most_full_work           = work;
-               w->dev_most_full_capacity       = capacity;
+       if (!data_opts->rewrite_ptrs) {
+               /*
+                * device we would want to write to offline? devices in target
+                * changed?
+                *
+                * We'll now need a full scan before this extent is picked up
+                * again:
+                */
+               int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+               if (ret)
+                       return bkey_s_c_err(ret);
+               return bkey_s_c_null;
        }
 
-       if (w->total_work + dev_work >= w->total_work &&
-           w->total_work + dev_work >= dev_work)
-               w->total_work += dev_work;
+       return k;
 }
 
-static struct rebalance_work rebalance_work(struct bch_fs *c)
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+                              struct bpos work_pos,
+                              struct btree_iter *extent_iter)
 {
-       struct bch_dev *ca;
-       struct rebalance_work ret = { .dev_most_full_idx = -1 };
-       u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
-       unsigned i;
-
-       for_each_online_member(ca, c, i)
-               rebalance_work_accumulate(&ret,
-                       atomic64_read(&ca->rebalance_work),
-                       unknown_dev,
-                       bucket_to_sector(ca, ca->mi.nbuckets -
-                                        ca->mi.first_bucket),
-                       i);
-
-       rebalance_work_accumulate(&ret,
-               unknown_dev, 0, c->capacity, -1);
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs *c = trans->c;
+       struct bch_fs_rebalance *r = &trans->c->rebalance;
+       struct data_update_opts data_opts;
+       struct bch_io_opts io_opts;
+       struct bkey_s_c k;
+       struct bkey_buf sk;
+       int ret;
+
+       ctxt->stats = &r->work_stats;
+       r->state = BCH_REBALANCE_working;
+
+       bch2_bkey_buf_init(&sk);
+
+       ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+                                                extent_iter, &data_opts));
+       if (ret || !k.k)
+               goto out;
+
+       ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+       if (ret)
+               goto out;
 
+       atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+       /*
+        * The iterator gets unlocked by __bch2_read_extent - need to
+        * save a copy of @k elsewhere:
+        */
+       bch2_bkey_buf_reassemble(&sk, c, k);
+       k = bkey_i_to_s_c(sk.k);
+
+       ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+       if (ret) {
+               if (bch2_err_matches(ret, ENOMEM)) {
+                       /* memory allocation failure, wait for some IO to finish */
+                       bch2_move_ctxt_wait_for_io(ctxt);
+                       ret = -BCH_ERR_transaction_restart_nested;
+               }
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       goto out;
+
+               /* skip it and continue, XXX signal failure */
+               ret = 0;
+       }
+out:
+       bch2_bkey_buf_exit(&sk, c);
        return ret;
 }
 
-static void rebalance_work_reset(struct bch_fs *c)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+                          struct bkey_s_c k,
+                          struct bch_io_opts *io_opts,
+                          struct data_update_opts *data_opts)
 {
-       struct bch_dev *ca;
-       unsigned i;
+       unsigned target, compression;
 
-       for_each_online_member(ca, c, i)
-               atomic64_set(&ca->rebalance_work, 0);
+       if (k.k->p.inode) {
+               target          = io_opts->background_target;
+               compression     = io_opts->background_compression ?: io_opts->compression;
+       } else {
+               const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
 
-       atomic64_set(&c->rebalance.work_unknown_dev, 0);
+               target          = r ? r->target : io_opts->background_target;
+               compression     = r ? r->compression :
+                       (io_opts->background_compression ?: io_opts->compression);
+       }
+
+       data_opts->rewrite_ptrs         = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+       data_opts->target               = target;
+       return data_opts->rewrite_ptrs != 0;
 }
 
-static unsigned long curr_cputime(void)
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
 {
-       u64 utime, stime;
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs_rebalance *r = &trans->c->rebalance;
+       int ret;
+
+       bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+       ctxt->stats = &r->scan_stats;
+
+       if (!inum) {
+               r->scan_start   = BBPOS_MIN;
+               r->scan_end     = BBPOS_MAX;
+       } else {
+               r->scan_start   = BBPOS(BTREE_ID_extents, POS(inum, 0));
+               r->scan_end     = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+       }
+
+       r->state = BCH_REBALANCE_scanning;
+
+       ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+               commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+                         bch2_clear_rebalance_needs_scan(trans, inum, cookie));
 
-       task_cputime_adjusted(current, &utime, &stime);
-       return nsecs_to_jiffies(utime + stime);
+       bch2_move_stats_exit(&r->scan_stats, trans->c);
+       return ret;
 }
 
-static int bch2_rebalance_thread(void *arg)
+static void rebalance_wait(struct bch_fs *c)
 {
-       struct bch_fs *c = arg;
        struct bch_fs_rebalance *r = &c->rebalance;
        struct io_clock *clock = &c->io_clock[WRITE];
-       struct rebalance_work w, p;
-       struct bch_move_stats move_stats;
-       unsigned long start, prev_start;
-       unsigned long prev_run_time, prev_run_cputime;
-       unsigned long cputime, prev_cputime;
-       u64 io_start;
-       long throttle;
+       u64 now = atomic64_read(&clock->now);
+       u64 min_member_capacity = bch2_min_rw_member_capacity(c);
 
-       set_freezable();
+       if (min_member_capacity == U64_MAX)
+               min_member_capacity = 128 * 2048;
+
+       r->wait_iotime_end              = now + (min_member_capacity >> 6);
 
-       io_start        = atomic64_read(&clock->now);
-       p               = rebalance_work(c);
-       prev_start      = jiffies;
-       prev_cputime    = curr_cputime();
+       if (r->state != BCH_REBALANCE_waiting) {
+               r->wait_iotime_start    = now;
+               r->wait_wallclock_start = ktime_get_real_ns();
+               r->state                = BCH_REBALANCE_waiting;
+       }
 
-       bch2_move_stats_init(&move_stats, "rebalance");
-       while (!kthread_wait_freezable(r->enabled)) {
-               cond_resched();
+       bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
 
-               start                   = jiffies;
-               cputime                 = curr_cputime();
+static int do_rebalance(struct moving_context *ctxt)
+{
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs *c = trans->c;
+       struct bch_fs_rebalance *r = &c->rebalance;
+       struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+       struct bkey_s_c k;
+       int ret = 0;
 
-               prev_run_time           = start - prev_start;
-               prev_run_cputime        = cputime - prev_cputime;
+       bch2_move_stats_init(&r->work_stats, "rebalance_work");
+       bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
 
-               w                       = rebalance_work(c);
-               BUG_ON(!w.dev_most_full_capacity);
+       bch2_trans_iter_init(trans, &rebalance_work_iter,
+                            BTREE_ID_rebalance_work, POS_MIN,
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
-               if (!w.total_work) {
-                       r->state = REBALANCE_WAITING;
-                       kthread_wait_freezable(rebalance_work(c).total_work);
+       while (!bch2_move_ratelimit(ctxt) &&
+              !kthread_wait_freezable(r->enabled)) {
+               bch2_trans_begin(trans);
+
+               ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
-               }
+               if (ret || !k.k)
+                       break;
 
-               /*
-                * If there isn't much work to do, throttle cpu usage:
-                */
-               throttle = prev_run_cputime * 100 /
-                       max(1U, w.dev_most_full_percent) -
-                       prev_run_time;
-
-               if (w.dev_most_full_percent < 20 && throttle > 0) {
-                       r->throttled_until_iotime = io_start +
-                               div_u64(w.dev_most_full_capacity *
-                                       (20 - w.dev_most_full_percent),
-                                       50);
-
-                       if (atomic64_read(&clock->now) + clock->max_slop <
-                           r->throttled_until_iotime) {
-                               r->throttled_until_cputime = start + throttle;
-                               r->state = REBALANCE_THROTTLED;
-
-                               bch2_kthread_io_clock_wait(clock,
-                                       r->throttled_until_iotime,
-                                       throttle);
-                               continue;
-                       }
-               }
+               ret = k.k->type == KEY_TYPE_cookie
+                       ? do_rebalance_scan(ctxt, k.k->p.inode,
+                                           le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+                       : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
 
-               /* minimum 1 mb/sec: */
-               r->pd.rate.rate =
-                       max_t(u64, 1 << 11,
-                             r->pd.rate.rate *
-                             max(p.dev_most_full_percent, 1U) /
-                             max(w.dev_most_full_percent, 1U));
-
-               io_start        = atomic64_read(&clock->now);
-               p               = w;
-               prev_start      = start;
-               prev_cputime    = cputime;
-
-               r->state = REBALANCE_RUNNING;
-               memset(&move_stats, 0, sizeof(move_stats));
-               rebalance_work_reset(c);
-
-               bch2_move_data(c,
-                              0,               POS_MIN,
-                              BTREE_ID_NR,     POS_MAX,
-                              /* ratelimiting disabled for now */
-                              NULL, /*  &r->pd.rate, */
-                              &move_stats,
-                              writepoint_ptr(&c->rebalance_write_point),
-                              true,
-                              rebalance_pred, NULL);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       continue;
+               if (ret)
+                       break;
+
+               bch2_btree_iter_advance(&rebalance_work_iter);
        }
 
-       return 0;
+       bch2_trans_iter_exit(trans, &extent_iter);
+       bch2_trans_iter_exit(trans, &rebalance_work_iter);
+       bch2_move_stats_exit(&r->scan_stats, c);
+
+       if (!ret &&
+           !kthread_should_stop() &&
+           !atomic64_read(&r->work_stats.sectors_seen) &&
+           !atomic64_read(&r->scan_stats.sectors_seen)) {
+               bch2_trans_unlock_long(trans);
+               rebalance_wait(c);
+       }
+
+       if (!bch2_err_matches(ret, EROFS))
+               bch_err_fn(c, ret);
+       return ret;
 }
 
-void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
+static int bch2_rebalance_thread(void *arg)
 {
+       struct bch_fs *c = arg;
        struct bch_fs_rebalance *r = &c->rebalance;
-       struct rebalance_work w = rebalance_work(c);
+       struct moving_context ctxt;
+       int ret;
 
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 20);
+       set_freezable();
 
-       prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
-       prt_tab(out);
+       bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+                             writepoint_ptr(&c->rebalance_write_point),
+                             true);
 
-       prt_human_readable_u64(out, w.dev_most_full_work << 9);
-       prt_printf(out, "/");
-       prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
-       prt_newline(out);
+       while (!kthread_should_stop() &&
+              !(ret = do_rebalance(&ctxt)))
+               ;
 
-       prt_printf(out, "total work:");
-       prt_tab(out);
+       bch2_moving_ctxt_exit(&ctxt);
 
-       prt_human_readable_u64(out, w.total_work << 9);
-       prt_printf(out, "/");
-       prt_human_readable_u64(out, c->capacity << 9);
-       prt_newline(out);
+       return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       struct bch_fs_rebalance *r = &c->rebalance;
 
-       prt_printf(out, "rate:");
-       prt_tab(out);
-       prt_printf(out, "%u", r->pd.rate.rate);
+       prt_str(out, bch2_rebalance_state_strs[r->state]);
        prt_newline(out);
+       printbuf_indent_add(out, 2);
 
        switch (r->state) {
-       case REBALANCE_WAITING:
-               prt_printf(out, "waiting");
+       case BCH_REBALANCE_waiting: {
+               u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+               prt_str(out, "io wait duration:  ");
+               bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+               prt_newline(out);
+
+               prt_str(out, "io wait remaining: ");
+               bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+               prt_newline(out);
+
+               prt_str(out, "duration waited:   ");
+               bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+               prt_newline(out);
                break;
-       case REBALANCE_THROTTLED:
-               prt_printf(out, "throttled for %lu sec or ",
-                      (r->throttled_until_cputime - jiffies) / HZ);
-               prt_human_readable_u64(out,
-                           (r->throttled_until_iotime -
-                            atomic64_read(&c->io_clock[WRITE].now)) << 9);
-               prt_printf(out, " io");
+       }
+       case BCH_REBALANCE_working:
+               bch2_move_stats_to_text(out, &r->work_stats);
                break;
-       case REBALANCE_RUNNING:
-               prt_printf(out, "running");
+       case BCH_REBALANCE_scanning:
+               bch2_move_stats_to_text(out, &r->scan_stats);
                break;
        }
        prt_newline(out);
+       printbuf_indent_sub(out, 2);
 }
 
 void bch2_rebalance_stop(struct bch_fs *c)
@@ -346,7 +448,7 @@ int bch2_rebalance_start(struct bch_fs *c)
        p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
        ret = PTR_ERR_OR_ZERO(p);
        if (ret) {
-               bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "creating rebalance thread");
                return ret;
        }
 
@@ -359,6 +461,4 @@ int bch2_rebalance_start(struct bch_fs *c)
 void bch2_fs_rebalance_init(struct bch_fs *c)
 {
        bch2_pd_controller_init(&c->rebalance.pd);
-
-       atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
 }
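
The rebalance rewrite above replaces the old device-fullness heuristics with a persistent work queue: extents needing work are tracked in the new BTREE_ID_rebalance_work btree, and full-inode scans (inum 0 meaning the whole filesystem) are requested via a KEY_TYPE_cookie at (inum, REBALANCE_WORK_SCAN_OFFSET). The cookie value makes requests race-free; a trace of the handshake implemented by __bch2_set_rebalance_needs_scan() and bch2_clear_rebalance_needs_scan():

    requester                          rebalance thread
    ---------                          ----------------
    cookie = v                         reads cookie (v), starts scan
    cookie = v + 1   (new request
                      arrives mid-scan)
                                       scan finishes; deletes the key
                                       only if the cookie still reads v
                                       -> it reads v + 1, so the key
                                          stays and the request is
                                          picked up on a later pass

A request that races with an in-flight scan is therefore never lost, at the cost of one redundant scan.
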
index 7ade0bb81cce8d1ac0a12819a44f3eb2d2e8f79d..28a52638f16cc113848cf3925758e12d510dc247 100644 (file)
@@ -4,6 +4,9 @@
 
 #include "rebalance_types.h"
 
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
 static inline void rebalance_wakeup(struct bch_fs *c)
 {
        struct task_struct *p;
@@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c)
        rcu_read_unlock();
 }
 
-void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
-                           struct bch_io_opts *);
-void bch2_rebalance_add_work(struct bch_fs *, u64);
-
-void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
 
 void bch2_rebalance_stop(struct bch_fs *);
 int bch2_rebalance_start(struct bch_fs *);
index 7462a92e95985d91cdc454485d045659240dd0fc..0fffb536c1d0c1b65d1a2a68730cab49f7535db2 100644 (file)
@@ -2,25 +2,36 @@
 #ifndef _BCACHEFS_REBALANCE_TYPES_H
 #define _BCACHEFS_REBALANCE_TYPES_H
 
+#include "bbpos_types.h"
 #include "move_types.h"
 
-enum rebalance_state {
-       REBALANCE_WAITING,
-       REBALANCE_THROTTLED,
-       REBALANCE_RUNNING,
+#define BCH_REBALANCE_STATES()         \
+       x(waiting)                      \
+       x(working)                      \
+       x(scanning)
+
+enum bch_rebalance_states {
+#define x(t)   BCH_REBALANCE_##t,
+       BCH_REBALANCE_STATES()
+#undef x
 };
 
 struct bch_fs_rebalance {
-       struct task_struct __rcu *thread;
+       struct task_struct __rcu        *thread;
        struct bch_pd_controller pd;
 
-       atomic64_t              work_unknown_dev;
+       enum bch_rebalance_states       state;
+       u64                             wait_iotime_start;
+       u64                             wait_iotime_end;
+       u64                             wait_wallclock_start;
+
+       struct bch_move_stats           work_stats;
 
-       enum rebalance_state    state;
-       u64                     throttled_until_iotime;
-       unsigned long           throttled_until_cputime;
+       struct bbpos                    scan_start;
+       struct bbpos                    scan_end;
+       struct bch_move_stats           scan_stats;
 
-       unsigned                enabled:1;
+       unsigned                        enabled:1;
 };
 
 #endif /* _BCACHEFS_REBALANCE_TYPES_H */
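
For reference, the two expansions of the BCH_REBALANCE_STATES() x-macro, which keep the enum here and the string table in rebalance.c in sync by construction:

    enum bch_rebalance_states {        /* #define x(t) BCH_REBALANCE_##t, */
            BCH_REBALANCE_waiting,
            BCH_REBALANCE_working,
            BCH_REBALANCE_scanning,
    };

    static const char * const bch2_rebalance_state_strs[] = {
            "waiting", "working", "scanning", NULL      /* #define x(t) #t, */
    };
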
index 55a233c2c7cc7b7f8122b66c3cd32fc057658c8d..83fc121ff3c44d3012a1d5770536549cbc479971 100644 (file)
@@ -5,6 +5,7 @@
 #include "bkey_buf.h"
 #include "alloc_background.h"
 #include "btree_gc.h"
+#include "btree_journal_iter.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "lru.h"
+#include "logged_ops.h"
 #include "move.h"
 #include "quota.h"
+#include "rebalance.h"
 #include "recovery.h"
 #include "replicas.h"
+#include "sb-clean.h"
+#include "snapshot.h"
 #include "subvolume.h"
 #include "super-io.h"
 
 
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
+static bool btree_id_is_alloc(enum btree_id id)
+{
+       switch (id) {
+       case BTREE_ID_alloc:
+       case BTREE_ID_backpointers:
+       case BTREE_ID_need_discard:
+       case BTREE_ID_freespace:
+       case BTREE_ID_bucket_gens:
+               return true;
+       default:
+               return false;
+       }
+}
+
 /* for -o reconstruct_alloc: */
 static void drop_alloc_keys(struct journal_keys *keys)
 {
        size_t src, dst;
 
        for (src = 0, dst = 0; src < keys->nr; src++)
-               if (keys->d[src].btree_id != BTREE_ID_alloc)
+               if (!btree_id_is_alloc(keys->d[src].btree_id))
                        keys->d[dst++] = keys->d[src];
 
        keys->nr = dst;
@@ -57,524 +76,6 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
                        bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
 }
 
-/* iterate over keys read from the journal: */
-
-static int __journal_key_cmp(enum btree_id     l_btree_id,
-                            unsigned           l_level,
-                            struct bpos        l_pos,
-                            const struct journal_key *r)
-{
-       return (cmp_int(l_btree_id,     r->btree_id) ?:
-               cmp_int(l_level,        r->level) ?:
-               bpos_cmp(l_pos, r->k->k.p));
-}
-
-static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
-{
-       return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
-}
-
-static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
-{
-       size_t gap_size = keys->size - keys->nr;
-
-       if (idx >= keys->gap)
-               idx += gap_size;
-       return idx;
-}
-
-static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
-{
-       return keys->d + idx_to_pos(keys, idx);
-}
-
-static size_t __bch2_journal_key_search(struct journal_keys *keys,
-                                       enum btree_id id, unsigned level,
-                                       struct bpos pos)
-{
-       size_t l = 0, r = keys->nr, m;
-
-       while (l < r) {
-               m = l + ((r - l) >> 1);
-               if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0)
-                       l = m + 1;
-               else
-                       r = m;
-       }
-
-       BUG_ON(l < keys->nr &&
-              __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0);
-
-       BUG_ON(l &&
-              __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0);
-
-       return l;
-}
-
-static size_t bch2_journal_key_search(struct journal_keys *keys,
-                                     enum btree_id id, unsigned level,
-                                     struct bpos pos)
-{
-       return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
-}
-
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
-                                          unsigned level, struct bpos pos,
-                                          struct bpos end_pos, size_t *idx)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       unsigned iters = 0;
-       struct journal_key *k;
-search:
-       if (!*idx)
-               *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
-
-       while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
-               if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
-                       return NULL;
-
-               if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
-                   !k->overwritten)
-                       return k->k;
-
-               (*idx)++;
-               iters++;
-               if (iters == 10) {
-                       *idx = 0;
-                       goto search;
-               }
-       }
-
-       return NULL;
-}
-
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id,
-                                          unsigned level, struct bpos pos)
-{
-       size_t idx = 0;
-
-       return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
-}
-
-static void journal_iters_fix(struct bch_fs *c)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       /* The key we just inserted is immediately before the gap: */
-       size_t gap_end = keys->gap + (keys->size - keys->nr);
-       struct btree_and_journal_iter *iter;
-
-       /*
-        * If an iterator points one after the key we just inserted, decrement
-        * the iterator so it points at the key we just inserted - if the
-        * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
-        * handle that:
-        */
-       list_for_each_entry(iter, &c->journal_iters, journal.list)
-               if (iter->journal.idx == gap_end)
-                       iter->journal.idx = keys->gap - 1;
-}
-
-static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       struct journal_iter *iter;
-       size_t gap_size = keys->size - keys->nr;
-
-       list_for_each_entry(iter, &c->journal_iters, list) {
-               if (iter->idx > old_gap)
-                       iter->idx -= gap_size;
-               if (iter->idx >= new_gap)
-                       iter->idx += gap_size;
-       }
-}
-
-int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
-                                unsigned level, struct bkey_i *k)
-{
-       struct journal_key n = {
-               .btree_id       = id,
-               .level          = level,
-               .k              = k,
-               .allocated      = true,
-               /*
-                * Ensure these keys are done last by journal replay, to unblock
-                * journal reclaim:
-                */
-               .journal_seq    = U32_MAX,
-       };
-       struct journal_keys *keys = &c->journal_keys;
-       size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
-
-       BUG_ON(test_bit(BCH_FS_RW, &c->flags));
-
-       if (idx < keys->size &&
-           journal_key_cmp(&n, &keys->d[idx]) == 0) {
-               if (keys->d[idx].allocated)
-                       kfree(keys->d[idx].k);
-               keys->d[idx] = n;
-               return 0;
-       }
-
-       if (idx > keys->gap)
-               idx -= keys->size - keys->nr;
-
-       if (keys->nr == keys->size) {
-               struct journal_keys new_keys = {
-                       .nr                     = keys->nr,
-                       .size                   = max_t(size_t, keys->size, 8) * 2,
-               };
-
-               new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
-               if (!new_keys.d) {
-                       bch_err(c, "%s: error allocating new key array (size %zu)",
-                               __func__, new_keys.size);
-                       return -BCH_ERR_ENOMEM_journal_key_insert;
-               }
-
-               /* Since @keys was full, there was no gap: */
-               memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
-               kvfree(keys->d);
-               *keys = new_keys;
-
-               /* And now the gap is at the end: */
-               keys->gap = keys->nr;
-       }
-
-       journal_iters_move_gap(c, keys->gap, idx);
-
-       move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
-       keys->gap = idx;
-
-       keys->nr++;
-       keys->d[keys->gap++] = n;
-
-       journal_iters_fix(c);
-
-       return 0;
-}
-
-/*
- * Can only be used from the recovery thread while we're still RO - can't be
- * used once we've got RW, as journal_keys is at that point used by multiple
- * threads:
- */
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
-                           unsigned level, struct bkey_i *k)
-{
-       struct bkey_i *n;
-       int ret;
-
-       n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
-       if (!n)
-               return -BCH_ERR_ENOMEM_journal_key_insert;
-
-       bkey_copy(n, k);
-       ret = bch2_journal_key_insert_take(c, id, level, n);
-       if (ret)
-               kfree(n);
-       return ret;
-}
-
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
-                           unsigned level, struct bpos pos)
-{
-       struct bkey_i whiteout;
-
-       bkey_init(&whiteout.k);
-       whiteout.k.p = pos;
-
-       return bch2_journal_key_insert(c, id, level, &whiteout);
-}
-
-void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
-                                 unsigned level, struct bpos pos)
-{
-       struct journal_keys *keys = &c->journal_keys;
-       size_t idx = bch2_journal_key_search(keys, btree, level, pos);
-
-       if (idx < keys->size &&
-           keys->d[idx].btree_id       == btree &&
-           keys->d[idx].level          == level &&
-           bpos_eq(keys->d[idx].k->k.p, pos))
-               keys->d[idx].overwritten = true;
-}
-
-static void bch2_journal_iter_advance(struct journal_iter *iter)
-{
-       if (iter->idx < iter->keys->size) {
-               iter->idx++;
-               if (iter->idx == iter->keys->gap)
-                       iter->idx += iter->keys->size - iter->keys->nr;
-       }
-}
-
-static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
-{
-       struct journal_key *k = iter->keys->d + iter->idx;
-
-       while (k < iter->keys->d + iter->keys->size &&
-              k->btree_id      == iter->btree_id &&
-              k->level         == iter->level) {
-               if (!k->overwritten)
-                       return bkey_i_to_s_c(k->k);
-
-               bch2_journal_iter_advance(iter);
-               k = iter->keys->d + iter->idx;
-       }
-
-       return bkey_s_c_null;
-}
-
-static void bch2_journal_iter_exit(struct journal_iter *iter)
-{
-       list_del(&iter->list);
-}
-
-static void bch2_journal_iter_init(struct bch_fs *c,
-                                  struct journal_iter *iter,
-                                  enum btree_id id, unsigned level,
-                                  struct bpos pos)
-{
-       iter->btree_id  = id;
-       iter->level     = level;
-       iter->keys      = &c->journal_keys;
-       iter->idx       = bch2_journal_key_search(&c->journal_keys, id, level, pos);
-}
-
-static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
-{
-       return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
-                                               iter->b, &iter->unpacked);
-}
-
-static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
-{
-       bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
-}
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
-{
-       if (bpos_eq(iter->pos, SPOS_MAX))
-               iter->at_end = true;
-       else
-               iter->pos = bpos_successor(iter->pos);
-}
-
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
-{
-       struct bkey_s_c btree_k, journal_k, ret;
-again:
-       if (iter->at_end)
-               return bkey_s_c_null;
-
-       while ((btree_k = bch2_journal_iter_peek_btree(iter)).k &&
-              bpos_lt(btree_k.k->p, iter->pos))
-               bch2_journal_iter_advance_btree(iter);
-
-       while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
-              bpos_lt(journal_k.k->p, iter->pos))
-               bch2_journal_iter_advance(&iter->journal);
-
-       ret = journal_k.k &&
-               (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
-               ? journal_k
-               : btree_k;
-
-       if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key))
-               ret = bkey_s_c_null;
-
-       if (ret.k) {
-               iter->pos = ret.k->p;
-               if (bkey_deleted(ret.k)) {
-                       bch2_btree_and_journal_iter_advance(iter);
-                       goto again;
-               }
-       } else {
-               iter->pos = SPOS_MAX;
-               iter->at_end = true;
-       }
-
-       return ret;
-}
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
-{
-       bch2_journal_iter_exit(&iter->journal);
-}
-
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                                 struct bch_fs *c,
-                                                 struct btree *b,
-                                                 struct btree_node_iter node_iter,
-                                                 struct bpos pos)
-{
-       memset(iter, 0, sizeof(*iter));
-
-       iter->b = b;
-       iter->node_iter = node_iter;
-       bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
-       INIT_LIST_HEAD(&iter->journal.list);
-       iter->pos = b->data->min_key;
-       iter->at_end = false;
-}
-
-/*
- * this version is used by btree_gc before filesystem has gone RW and
- * multithreaded, so uses the journal_iters list:
- */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                               struct bch_fs *c,
-                                               struct btree *b)
-{
-       struct btree_node_iter node_iter;
-
-       bch2_btree_node_iter_init_from_start(&node_iter, b);
-       __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
-       list_add(&iter->journal.list, &c->journal_iters);
-}
-
-/* sort and dedup all keys in the journal: */
-
-void bch2_journal_entries_free(struct bch_fs *c)
-{
-       struct journal_replay **i;
-       struct genradix_iter iter;
-
-       genradix_for_each(&c->journal_entries, iter, i)
-               if (*i)
-                       kvpfree(*i, offsetof(struct journal_replay, j) +
-                               vstruct_bytes(&(*i)->j));
-       genradix_free(&c->journal_entries);
-}
-
-/*
- * When keys compare equal, oldest compares first:
- */
-static int journal_sort_key_cmp(const void *_l, const void *_r)
-{
-       const struct journal_key *l = _l;
-       const struct journal_key *r = _r;
-
-       return  journal_key_cmp(l, r) ?:
-               cmp_int(l->journal_seq, r->journal_seq) ?:
-               cmp_int(l->journal_offset, r->journal_offset);
-}
-
-void bch2_journal_keys_free(struct journal_keys *keys)
-{
-       struct journal_key *i;
-
-       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
-       keys->gap = keys->nr;
-
-       for (i = keys->d; i < keys->d + keys->nr; i++)
-               if (i->allocated)
-                       kfree(i->k);
-
-       kvfree(keys->d);
-       keys->d = NULL;
-       keys->nr = keys->gap = keys->size = 0;
-}
-
-static void __journal_keys_sort(struct journal_keys *keys)
-{
-       struct journal_key *src, *dst;
-
-       sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
-
-       src = dst = keys->d;
-       while (src < keys->d + keys->nr) {
-               while (src + 1 < keys->d + keys->nr &&
-                      src[0].btree_id  == src[1].btree_id &&
-                      src[0].level     == src[1].level &&
-                      bpos_eq(src[0].k->k.p, src[1].k->k.p))
-                       src++;
-
-               *dst++ = *src++;
-       }
-
-       keys->nr = dst - keys->d;
-}
-
-static int journal_keys_sort(struct bch_fs *c)
-{
-       struct genradix_iter iter;
-       struct journal_replay *i, **_i;
-       struct jset_entry *entry;
-       struct bkey_i *k;
-       struct journal_keys *keys = &c->journal_keys;
-       size_t nr_keys = 0, nr_read = 0;
-
-       genradix_for_each(&c->journal_entries, iter, _i) {
-               i = *_i;
-
-               if (!i || i->ignore)
-                       continue;
-
-               for_each_jset_key(k, entry, &i->j)
-                       nr_keys++;
-       }
-
-       if (!nr_keys)
-               return 0;
-
-       keys->size = roundup_pow_of_two(nr_keys);
-
-       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
-       if (!keys->d) {
-               bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
-                       nr_keys);
-
-               do {
-                       keys->size >>= 1;
-                       keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
-               } while (!keys->d && keys->size > nr_keys / 8);
-
-               if (!keys->d) {
-                       bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
-                               keys->size);
-                       return -BCH_ERR_ENOMEM_journal_keys_sort;
-               }
-       }
-
-       genradix_for_each(&c->journal_entries, iter, _i) {
-               i = *_i;
-
-               if (!i || i->ignore)
-                       continue;
-
-               cond_resched();
-
-               for_each_jset_key(k, entry, &i->j) {
-                       if (keys->nr == keys->size) {
-                               __journal_keys_sort(keys);
-
-                               if (keys->nr > keys->size * 7 / 8) {
-                                       bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
-                                               keys->nr, keys->size, nr_read, nr_keys);
-                                       return -BCH_ERR_ENOMEM_journal_keys_sort;
-                               }
-                       }
-
-                       keys->d[keys->nr++] = (struct journal_key) {
-                               .btree_id       = entry->btree_id,
-                               .level          = entry->level,
-                               .k              = k,
-                               .journal_seq    = le64_to_cpu(i->j.seq),
-                               .journal_offset = k->_data - i->j._data,
-                       };
-
-                       nr_read++;
-               }
-       }
-
-       __journal_keys_sort(keys);
-       keys->gap = keys->nr;
-
-       bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
-       return 0;
-}
-
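The journal-keys helpers deleted above (idx_to_pos(), bch2_journal_key_insert_take(), the btree_and_journal_iter code, journal_keys_sort()) are not dropped from the tree: they match the new btree_journal_iter.h include added earlier in this file, i.e. the code is being moved out of recovery.c rather than removed. What they implement is a gap buffer: the sorted key array keeps its spare capacity as a movable hole at keys->gap, so an insertion at the gap position costs O(1) and only the open iterators need fixing up. A minimal standalone sketch of the index translation, under the same invariants (nr live entries, hole of size size - nr starting at gap):

	#include <stddef.h>

	/*
	 * Live entries occupy [0, gap) and [gap + gap_size, size);
	 * translate a logical index to an array position by skipping the hole:
	 */
	static inline size_t gap_buf_idx_to_pos(size_t idx, size_t gap,
						size_t size, size_t nr)
	{
		size_t gap_size = size - nr;

		return idx < gap ? idx : idx + gap_size;
	}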
 /* journal replay: */
 
 static void replay_now_at(struct journal *j, u64 seq)
@@ -597,6 +98,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
        unsigned update_flags = BTREE_TRIGGER_NORUN;
        int ret;
 
+       if (k->overwritten)
+               return 0;
+
+       trans->journal_res.seq = k->journal_seq;
+
        /*
         * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
         * keep the key cache coherent with the underlying btree. Nothing
@@ -638,27 +144,14 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
 static int bch2_journal_replay(struct bch_fs *c)
 {
        struct journal_keys *keys = &c->journal_keys;
-       struct journal_key **keys_sorted, *k;
+       DARRAY(struct journal_key *) keys_sorted = { 0 };
+       struct journal_key **kp;
        struct journal *j = &c->journal;
        u64 start_seq   = c->journal_replay_seq_start;
        u64 end_seq     = c->journal_replay_seq_end;
-       size_t i;
+       struct btree_trans *trans = bch2_trans_get(c);
        int ret;
 
-       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
-       keys->gap = keys->nr;
-
-       keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
-       if (!keys_sorted)
-               return -BCH_ERR_ENOMEM_journal_replay;
-
-       for (i = 0; i < keys->nr; i++)
-               keys_sorted[i] = &keys->d[i];
-
-       sort(keys_sorted, keys->nr,
-            sizeof(keys_sorted[0]),
-            journal_sort_seq_cmp, NULL);
-
        if (keys->nr) {
                ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
                                           keys->nr, start_seq, end_seq);
@@ -666,27 +159,65 @@ static int bch2_journal_replay(struct bch_fs *c)
                        goto err;
        }
 
-       for (i = 0; i < keys->nr; i++) {
-               k = keys_sorted[i];
+       /*
+        * First, attempt to replay keys in sorted order. This is more
+        * efficient - better locality of btree access - but some might fail if
+        * that would cause a journal deadlock.
+        */
+       for (size_t i = 0; i < keys->nr; i++) {
+               cond_resched();
+
+               struct journal_key *k = keys->d + i;
 
+               ret = commit_do(trans, NULL, NULL,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               BCH_TRANS_COMMIT_journal_reclaim|
+                               (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+                            bch2_journal_replay_key(trans, k));
+               BUG_ON(!ret && !k->overwritten);
+               if (ret) {
+                       ret = darray_push(&keys_sorted, k);
+                       if (ret)
+                               goto err;
+               }
+       }
+
+       /*
+        * Now, replay any remaining keys in the order in which they appear in
+        * the journal, unpinning those journal entries as we go:
+        */
+       sort(keys_sorted.data, keys_sorted.nr,
+            sizeof(keys_sorted.data[0]),
+            journal_sort_seq_cmp, NULL);
+
+       darray_for_each(keys_sorted, kp) {
                cond_resched();
 
+               struct journal_key *k = *kp;
+
                replay_now_at(j, k->journal_seq);
 
-               ret = bch2_trans_do(c, NULL, NULL,
-                                   BTREE_INSERT_LAZY_RW|
-                                   BTREE_INSERT_NOFAIL|
-                                   (!k->allocated
-                                    ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
-                                    : 0),
-                            bch2_journal_replay_key(&trans, k));
-               if (ret) {
-                       bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
-                               bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret));
+               ret = commit_do(trans, NULL, NULL,
+                               BCH_TRANS_COMMIT_no_enospc|
+                               (!k->allocated
+                                ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
+                                : 0),
+                            bch2_journal_replay_key(trans, k));
+               bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
+                           bch2_btree_id_str(k->btree_id), k->level);
+               if (ret)
                        goto err;
-               }
+
+               BUG_ON(!k->overwritten);
        }
 
+       /*
+        * We need to put our btree_trans before calling flush_all_pins(), since
+        * that will use a btree_trans internally.
+        */
+       bch2_trans_put(trans);
+       trans = NULL;
+
        replay_now_at(j, j->replay_journal_seq_end);
        j->replay_journal_seq = 0;
 
@@ -697,10 +228,10 @@ static int bch2_journal_replay(struct bch_fs *c)
        if (keys->nr && !ret)
                bch2_journal_log_msg(c, "journal replay finished");
 err:
-       kvfree(keys_sorted);
-
-       if (ret)
-               bch_err_fn(c, ret);
+       if (trans)
+               bch2_trans_put(trans);
+       darray_exit(&keys_sorted);
+       bch_err_fn(c, ret);
        return ret;
 }
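The rewritten bch2_journal_replay() above is a two-phase loop. Phase one walks the keys in btree-key order, which gives better locality of btree access, and defers any key whose commit fails (typically because it would block journal reclaim) into a darray, bcachefs's small dynamic-array helper. Phase two sorts the deferred keys by journal sequence and replays them in that order, unpinning journal entries as it goes, at which point every key must succeed. Schematically, with replay_one_key() as a hypothetical stand-in for the commit_do()/bch2_journal_replay_key() calls:

	DARRAY(struct journal_key *) deferred = { 0 };
	struct journal_key **kp;

	/* phase 1: btree order; failures are deferred, not fatal */
	for (size_t i = 0; i < keys->nr; i++)
		if (replay_one_key(trans, keys->d + i))
			darray_push(&deferred, keys->d + i);

	/* phase 2: journal order, so journal reclaim can make progress */
	sort(deferred.data, deferred.nr, sizeof(deferred.data[0]),
	     journal_sort_seq_cmp, NULL);
	darray_for_each(deferred, kp)
		replay_one_key(trans, *kp);	/* must succeed now */

	darray_exit(&deferred);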
 
@@ -725,7 +256,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
 
                if (entry->u64s) {
                        r->level = entry->level;
-                       bkey_copy(&r->key, &entry->start[0]);
+                       bkey_copy(&r->key, (struct bkey_i *) entry->start);
                        r->error = 0;
                } else {
                        r->error = -EIO;
@@ -846,148 +377,6 @@ static int journal_replay_early(struct bch_fs *c,
 
 /* sb clean section: */
 
-static struct bkey_i *btree_root_find(struct bch_fs *c,
-                                     struct bch_sb_field_clean *clean,
-                                     struct jset *j,
-                                     enum btree_id id, unsigned *level)
-{
-       struct bkey_i *k;
-       struct jset_entry *entry, *start, *end;
-
-       if (clean) {
-               start = clean->start;
-               end = vstruct_end(&clean->field);
-       } else {
-               start = j->start;
-               end = vstruct_last(j);
-       }
-
-       for (entry = start; entry < end; entry = vstruct_next(entry))
-               if (entry->type == BCH_JSET_ENTRY_btree_root &&
-                   entry->btree_id == id)
-                       goto found;
-
-       return NULL;
-found:
-       if (!entry->u64s)
-               return ERR_PTR(-EINVAL);
-
-       k = entry->start;
-       *level = entry->level;
-       return k;
-}
-
-static int verify_superblock_clean(struct bch_fs *c,
-                                  struct bch_sb_field_clean **cleanp,
-                                  struct jset *j)
-{
-       unsigned i;
-       struct bch_sb_field_clean *clean = *cleanp;
-       struct printbuf buf1 = PRINTBUF;
-       struct printbuf buf2 = PRINTBUF;
-       int ret = 0;
-
-       if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
-                       "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
-                       le64_to_cpu(clean->journal_seq),
-                       le64_to_cpu(j->seq))) {
-               kfree(clean);
-               *cleanp = NULL;
-               return 0;
-       }
-
-       for (i = 0; i < BTREE_ID_NR; i++) {
-               struct bkey_i *k1, *k2;
-               unsigned l1 = 0, l2 = 0;
-
-               k1 = btree_root_find(c, clean, NULL, i, &l1);
-               k2 = btree_root_find(c, NULL, j, i, &l2);
-
-               if (!k1 && !k2)
-                       continue;
-
-               printbuf_reset(&buf1);
-               printbuf_reset(&buf2);
-
-               if (k1)
-                       bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1));
-               else
-                       prt_printf(&buf1, "(none)");
-
-               if (k2)
-                       bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2));
-               else
-                       prt_printf(&buf2, "(none)");
-
-               mustfix_fsck_err_on(!k1 || !k2 ||
-                                   IS_ERR(k1) ||
-                                   IS_ERR(k2) ||
-                                   k1->k.u64s != k2->k.u64s ||
-                                   memcmp(k1, k2, bkey_bytes(&k1->k)) ||
-                                   l1 != l2, c,
-                       "superblock btree root %u doesn't match journal after clean shutdown\n"
-                       "sb:      l=%u %s\n"
-                       "journal: l=%u %s\n", i,
-                       l1, buf1.buf,
-                       l2, buf2.buf);
-       }
-fsck_err:
-       printbuf_exit(&buf2);
-       printbuf_exit(&buf1);
-       return ret;
-}
-
-static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
-{
-       struct bch_sb_field_clean *clean, *sb_clean;
-       int ret;
-
-       mutex_lock(&c->sb_lock);
-       sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
-
-       if (fsck_err_on(!sb_clean, c,
-                       "superblock marked clean but clean section not present")) {
-               SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-               c->sb.clean = false;
-               mutex_unlock(&c->sb_lock);
-               return NULL;
-       }
-
-       clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
-                       GFP_KERNEL);
-       if (!clean) {
-               mutex_unlock(&c->sb_lock);
-               return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean);
-       }
-
-       ret = bch2_sb_clean_validate_late(c, clean, READ);
-       if (ret) {
-               mutex_unlock(&c->sb_lock);
-               return ERR_PTR(ret);
-       }
-
-       mutex_unlock(&c->sb_lock);
-
-       return clean;
-fsck_err:
-       mutex_unlock(&c->sb_lock);
-       return ERR_PTR(ret);
-}
-
-static bool btree_id_is_alloc(enum btree_id id)
-{
-       switch (id) {
-       case BTREE_ID_alloc:
-       case BTREE_ID_backpointers:
-       case BTREE_ID_need_discard:
-       case BTREE_ID_freespace:
-       case BTREE_ID_bucket_gens:
-               return true;
-       default:
-               return false;
-       }
-}
-
 static int read_btree_roots(struct bch_fs *c)
 {
        unsigned i;
@@ -1006,23 +395,25 @@ static int read_btree_roots(struct bch_fs *c)
                }
 
                if (r->error) {
-                       __fsck_err(c, btree_id_is_alloc(i)
+                       __fsck_err(c,
+                                  btree_id_is_alloc(i)
                                   ? FSCK_CAN_IGNORE : 0,
+                                  btree_root_bkey_invalid,
                                   "invalid btree root %s",
-                                  bch2_btree_ids[i]);
+                                  bch2_btree_id_str(i));
                        if (i == BTREE_ID_alloc)
                                c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                }
 
                ret = bch2_btree_root_read(c, i, &r->key, r->level);
                if (ret) {
-                       __fsck_err(c,
-                                  btree_id_is_alloc(i)
-                                  ? FSCK_CAN_IGNORE : 0,
-                                  "error reading btree root %s",
-                                  bch2_btree_ids[i]);
+                       fsck_err(c,
+                                btree_root_read_error,
+                                "error reading btree root %s",
+                                bch2_btree_id_str(i));
                        if (btree_id_is_alloc(i))
                                c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+                       ret = 0;
                }
        }
 
@@ -1065,15 +456,9 @@ static int bch2_initialize_subvolumes(struct bch_fs *c)
        root_volume.v.snapshot  = cpu_to_le32(U32_MAX);
        root_volume.v.inode     = cpu_to_le64(BCACHEFS_ROOT_INO);
 
-       ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,
-                                 &root_tree.k_i,
-                                 NULL, NULL, 0) ?:
-               bch2_btree_insert(c, BTREE_ID_snapshots,
-                                 &root_snapshot.k_i,
-                                 NULL, NULL, 0) ?:
-               bch2_btree_insert(c, BTREE_ID_subvolumes,
-                                 &root_volume.k_i,
-                                 NULL, NULL, 0);
+       ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,   &root_tree.k_i, NULL, 0) ?:
+               bch2_btree_insert(c, BTREE_ID_snapshots,        &root_snapshot.k_i, NULL, 0) ?:
+               bch2_btree_insert(c, BTREE_ID_subvolumes,       &root_volume.k_i, NULL, 0);
        if (ret)
                bch_err_fn(c, ret);
        return ret;
@@ -1113,16 +498,57 @@ err:
 noinline_for_stack
 static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
 {
-       int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
-                               __bch2_fs_upgrade_for_subvolumes(&trans));
+       int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+                               __bch2_fs_upgrade_for_subvolumes(trans));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
 }
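The one-character change from __bch2_fs_upgrade_for_subvolumes(&trans) to (trans) reflects a conversion that runs through this whole release: callers no longer stack-allocate a struct btree_trans and pair bch2_trans_init()/bch2_trans_exit(); they get a pointer from bch2_trans_get() and release it with bch2_trans_put(), and bch2_trans_do() now hands its body that pointer as trans. A plausible post-conversion expansion of the macro, reconstructed from the call sites in this diff rather than copied from btree_update.h:

	#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)	\
	({								\
		struct btree_trans *trans = bch2_trans_get(_c);		\
		int _ret = commit_do(trans, _disk_res, _journal_seq,	\
				     _flags, _do);			\
		bch2_trans_put(trans);					\
		_ret;							\
	})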
 
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, _when)  #_fn,
+       BCH_RECOVERY_PASSES()
+#undef x
+       NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+       return bch2_gc(c, true, c->opts.norecovery);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+       struct journal_keys *keys = &c->journal_keys;
+
+       /*
+        * After we go RW, the journal keys buffer can't be modified (except for
+        * setting journal_key->overwritten): it will be accessed by multiple
+        * threads.
+        */
+       move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+       keys->gap = keys->nr;
+
+       set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+       if (keys->nr)
+               return bch2_fs_read_write_early(c);
+       return 0;
+}
+
+struct recovery_pass_fn {
+       int             (*fn)(struct bch_fs *);
+       unsigned        when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _when)  { .fn = bch2_##_fn, .when = _when },
+       BCH_RECOVERY_PASSES()
+#undef x
+};
+
 static void check_version_upgrade(struct bch_fs *c)
 {
-       unsigned latest_compatible = bch2_version_compatible(c->sb.version);
+       unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
        unsigned latest_version = bcachefs_metadata_version_current;
        unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
        unsigned new_version = 0;
@@ -1172,7 +598,12 @@ static void check_version_upgrade(struct bch_fs *c)
 
                recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
                if (recovery_passes) {
-                       prt_str(&buf, "fsck required");
+                       if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
+                               prt_str(&buf, "fsck required");
+                       else {
+                               prt_str(&buf, "running recovery passes: ");
+                               prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
+                       }
 
                        c->recovery_passes_explicit |= recovery_passes;
                        c->opts.fix_errors = FSCK_FIX_yes;
@@ -1188,42 +619,19 @@ static void check_version_upgrade(struct bch_fs *c)
        }
 }
 
-static int bch2_check_allocations(struct bch_fs *c)
-{
-       return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
-       set_bit(BCH_FS_MAY_GO_RW, &c->flags);
-       return 0;
-}
-
-struct recovery_pass_fn {
-       int             (*fn)(struct bch_fs *);
-       const char      *name;
-       unsigned        when;
-};
-
-static struct recovery_pass_fn recovery_passes[] = {
-#define x(_fn, _when)  { .fn = bch2_##_fn, .name = #_fn, .when = _when },
-       BCH_RECOVERY_PASSES()
-#undef x
-};
-
 u64 bch2_fsck_recovery_passes(void)
 {
        u64 ret = 0;
 
-       for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++)
-               if (recovery_passes[i].when & PASS_FSCK)
+       for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+               if (recovery_pass_fns[i].when & PASS_FSCK)
                        ret |= BIT_ULL(i);
        return ret;
 }
 
 static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
 {
-       struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass;
+       struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
 
        if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
                return false;
@@ -1245,15 +653,18 @@ static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
        c->curr_recovery_pass = pass;
 
        if (should_run_recovery_pass(c, pass)) {
-               struct recovery_pass_fn *p = recovery_passes + pass;
+               struct recovery_pass_fn *p = recovery_pass_fns + pass;
 
                if (!(p->when & PASS_SILENT))
-                       printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name);
+                       printk(KERN_INFO bch2_log_msg(c, "%s..."),
+                              bch2_recovery_passes[pass]);
                ret = p->fn(c);
                if (ret)
                        return ret;
                if (!(p->when & PASS_SILENT))
                        printk(KERN_CONT " done\n");
+
+               c->recovery_passes_complete |= BIT_ULL(pass);
        }
 
        return 0;
@@ -1263,7 +674,7 @@ static int bch2_run_recovery_passes(struct bch_fs *c)
 {
        int ret = 0;
 
-       while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) {
+       while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
                ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
                if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
                        continue;
@@ -1279,21 +690,21 @@ int bch2_fs_recovery(struct bch_fs *c)
 {
        struct bch_sb_field_clean *clean = NULL;
        struct jset *last_journal_entry = NULL;
-       u64 last_seq, blacklist_seq, journal_seq;
+       u64 last_seq = 0, blacklist_seq, journal_seq;
        bool write_sb = false;
        int ret = 0;
 
-       if (c->sb.clean)
-               clean = read_superblock_clean(c);
-       ret = PTR_ERR_OR_ZERO(clean);
-       if (ret)
-               goto err;
+       if (c->sb.clean) {
+               clean = bch2_read_superblock_clean(c);
+               ret = PTR_ERR_OR_ZERO(clean);
+               if (ret)
+                       goto err;
 
-       if (c->sb.clean)
                bch_info(c, "recovering from clean shutdown, journal seq %llu",
                         le64_to_cpu(clean->journal_seq));
-       else
+       } else {
                bch_info(c, "recovering from unclean shutdown");
+       }
 
        if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
                bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
@@ -1308,12 +719,6 @@ int bch2_fs_recovery(struct bch_fs *c)
                goto err;
        }
 
-       if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
-               bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
-               ret = -EINVAL;
-               goto err;
-       }
-
        if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery))
                check_version_upgrade(c);
 
@@ -1354,6 +759,7 @@ int bch2_fs_recovery(struct bch_fs *c)
                if (mustfix_fsck_err_on(c->sb.clean &&
                                        last_journal_entry &&
                                        !journal_entry_empty(last_journal_entry), c,
+                               clean_but_journal_not_empty,
                                "filesystem marked clean but journal not empty")) {
                        c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
                        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
@@ -1361,7 +767,9 @@ int bch2_fs_recovery(struct bch_fs *c)
                }
 
                if (!last_journal_entry) {
-                       fsck_err_on(!c->sb.clean, c, "no journal entries found");
+                       fsck_err_on(!c->sb.clean, c,
+                                   dirty_but_no_journal_entries,
+                                   "no journal entries found");
                        if (clean)
                                goto use_clean;
 
@@ -1369,16 +777,23 @@ int bch2_fs_recovery(struct bch_fs *c)
                                if (*i) {
                                        last_journal_entry = &(*i)->j;
                                        (*i)->ignore = false;
+                                       /*
+                                        * This was probably a NO_FLUSH entry,
+                                        * so last_seq was garbage - but we know
+                                        * we're only using a single journal
+                                        * entry, so set it here:
+                                        */
+                                       (*i)->j.last_seq = (*i)->j.seq;
                                        break;
                                }
                }
 
-               ret = journal_keys_sort(c);
+               ret = bch2_journal_keys_sort(c);
                if (ret)
                        goto err;
 
                if (c->sb.clean && last_journal_entry) {
-                       ret = verify_superblock_clean(c, &clean,
+                       ret = bch2_verify_superblock_clean(c, &clean,
                                                      last_journal_entry);
                        if (ret)
                                goto err;
@@ -1395,7 +810,7 @@ use_clean:
        }
 
        c->journal_replay_seq_start     = last_seq;
-       c->journal_replay_seq_end       = blacklist_seq - 1;;
+       c->journal_replay_seq_end       = blacklist_seq - 1;
 
        if (c->opts.reconstruct_alloc) {
                c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
@@ -1463,6 +878,8 @@ use_clean:
            test_bit(BCH_FS_ERRORS_FIXED, &c->flags) &&
            !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) &&
            !test_bit(BCH_FS_ERROR, &c->flags)) {
+               bch2_flush_fsck_errs(c);
+
                bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
                clear_bit(BCH_FS_ERRORS_FIXED, &c->flags);
 
@@ -1513,7 +930,6 @@ use_clean:
        mutex_unlock(&c->sb_lock);
 
        if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
-           !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
            c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
                struct bch_move_stats stats;
 
@@ -1543,7 +959,7 @@ out:
        }
        kfree(clean);
 
-       if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
+       if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
                bch2_fs_read_write_early(c);
                bch2_delete_dead_snapshots_async(c);
        }
@@ -1581,23 +997,19 @@ int bch2_fs_initialize(struct bch_fs *c)
        }
        mutex_unlock(&c->sb_lock);
 
-       c->curr_recovery_pass = ARRAY_SIZE(recovery_passes);
+       c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
        set_bit(BCH_FS_MAY_GO_RW, &c->flags);
        set_bit(BCH_FS_FSCK_DONE, &c->flags);
 
        for (i = 0; i < BTREE_ID_NR; i++)
                bch2_btree_root_alloc(c, i);
 
-       for_each_online_member(ca, c, i)
+       for_each_member_device(ca, c, i)
                bch2_dev_usage_init(ca);
 
-       for_each_online_member(ca, c, i) {
-               ret = bch2_dev_journal_alloc(ca);
-               if (ret) {
-                       percpu_ref_put(&ca->io_ref);
-                       goto err;
-               }
-       }
+       ret = bch2_fs_journal_alloc(c);
+       if (ret)
+               goto err;
 
        /*
         * journal_res_get() will crash if called before this has
@@ -1615,15 +1027,13 @@ int bch2_fs_initialize(struct bch_fs *c)
         * btree updates
         */
        bch_verbose(c, "marking superblocks");
-       for_each_member_device(ca, c, i) {
-               ret = bch2_trans_mark_dev_sb(c, ca);
-               if (ret) {
-                       percpu_ref_put(&ca->ref);
-                       goto err;
-               }
+       ret = bch2_trans_mark_dev_sbs(c);
+       bch_err_msg(c, ret, "marking superblocks");
+       if (ret)
+               goto err;
 
+       for_each_online_member(ca, c, i)
                ca->new_fs_bucket_idx = 0;
-       }
 
        ret = bch2_fs_freespace_init(c);
        if (ret)
@@ -1645,9 +1055,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch2_inode_pack(&packed_inode, &root_inode);
        packed_inode.inode.k.p.snapshot = U32_MAX;
 
-       ret = bch2_btree_insert(c, BTREE_ID_inodes,
-                               &packed_inode.inode.k_i,
-                               NULL, NULL, 0);
+       ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
        if (ret) {
                bch_err_msg(c, ret, "creating root directory");
                goto err;
@@ -1656,7 +1064,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch2_inode_init_early(c, &lostfound_inode);
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_create_trans(&trans,
+               bch2_create_trans(trans,
                                  BCACHEFS_ROOT_SUBVOL_INUM,
                                  &root_inode, &lostfound_inode,
                                  &lostfound,
index f8e796c0f8c86880f6f9b63017f4c0ef4f879ecc..852d30567da9c4079c2f42a71b00e2d5de2c03e0 100644 (file)
@@ -2,55 +2,28 @@
 #ifndef _BCACHEFS_RECOVERY_H
 #define _BCACHEFS_RECOVERY_H
 
-struct journal_iter {
-       struct list_head        list;
-       enum btree_id           btree_id;
-       unsigned                level;
-       size_t                  idx;
-       struct journal_keys     *keys;
-};
+extern const char * const bch2_recovery_passes[];
 
 /*
- * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ * For when we need to rewind recovery passes and run a pass we skipped:
  */
-
-struct btree_and_journal_iter {
-       struct btree            *b;
-       struct btree_node_iter  node_iter;
-       struct bkey             unpacked;
-
-       struct journal_iter     journal;
-       struct bpos             pos;
-       bool                    at_end;
-};
-
-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
-                               unsigned, struct bpos, struct bpos, size_t *);
-struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
-                                          unsigned, struct bpos);
-
-int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
-                                unsigned, struct bkey_i *);
-int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
-                           unsigned, struct bkey_i *);
-int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
-                           unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
-                                 unsigned, struct bpos);
-
-void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
-struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
-
-void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-                               struct bch_fs *, struct btree *,
-                               struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-                                               struct bch_fs *,
-                                               struct btree *);
-
-void bch2_journal_keys_free(struct journal_keys *);
-void bch2_journal_entries_free(struct bch_fs *);
+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+                                                 enum bch_recovery_pass pass)
+{
+       bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+                bch2_recovery_passes[pass], pass,
+                bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+       c->recovery_passes_explicit |= BIT_ULL(pass);
+
+       if (c->curr_recovery_pass >= pass) {
+               c->curr_recovery_pass = pass;
+               c->recovery_passes_complete &= (1ULL << pass) - 1;
+               return -BCH_ERR_restart_recovery;
+       } else {
+               return 0;
+       }
+}
 
 u64 bch2_fsck_recovery_passes(void);
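bch2_run_explicit_recovery_pass() implements the rewind: when the requested pass is at or before the current one, it moves curr_recovery_pass back, clears the completion bits for that pass and everything after it, and returns -BCH_ERR_restart_recovery, which bch2_run_recovery_passes() treats as "continue the loop from the new position". A worked illustration of the bitmask (rewind_completed() is an illustrative stand-in, not a real helper):

	static inline u64 rewind_completed(u64 complete, unsigned pass)
	{
		/* keep only passes earlier than 'pass' marked complete */
		return complete & ((1ULL << pass) - 1);
	}

	/*
	 * rewind_completed(0x3ff, 4) == 0x00f: passes 0-3 stay complete,
	 * passes 4-9 will run again once recovery restarts at pass 4.
	 */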
 
index abf1f834ec7a86434b42b3d46ba43fe5a7102242..515e3d62c2ac9ec481694985ddaa7b1722760d6f 100644 (file)
@@ -14,6 +14,8 @@
        x(snapshots_read,               PASS_ALWAYS)                                            \
        x(check_topology,               0)                                                      \
        x(check_allocations,            PASS_FSCK)                                              \
+       x(trans_mark_dev_sbs,           PASS_ALWAYS|PASS_SILENT)                                \
+       x(fs_journal_alloc,             PASS_ALWAYS|PASS_SILENT)                                \
        x(set_may_go_rw,                PASS_ALWAYS|PASS_SILENT)                                \
        x(journal_replay,               PASS_ALWAYS)                                            \
        x(check_alloc_info,             PASS_FSCK)                                              \
        x(check_snapshot_trees,         PASS_FSCK)                                              \
        x(check_snapshots,              PASS_FSCK)                                              \
        x(check_subvols,                PASS_FSCK)                                              \
-       x(delete_dead_snapshots,        PASS_FSCK|PASS_UNCLEAN)                                 \
+       x(delete_dead_snapshots,        PASS_FSCK)                                              \
        x(fs_upgrade_for_subvolumes,    0)                                                      \
+       x(resume_logged_ops,            PASS_ALWAYS)                                            \
        x(check_inodes,                 PASS_FSCK)                                              \
        x(check_extents,                PASS_FSCK)                                              \
+       x(check_indirect_extents,       PASS_FSCK)                                              \
        x(check_dirents,                PASS_FSCK)                                              \
        x(check_xattrs,                 PASS_FSCK)                                              \
        x(check_root,                   PASS_FSCK)                                              \
@@ -38,6 +42,7 @@
        x(check_nlinks,                 PASS_FSCK)                                              \
        x(delete_dead_inodes,           PASS_FSCK|PASS_UNCLEAN)                                 \
        x(fix_reflink_p,                0)                                                      \
+       x(set_fs_needs_rebalance,       0)                                                      \
 
 enum bch_recovery_pass {
 #define x(n, when)     BCH_RECOVERY_PASS_##n,
index 39f711d5069e9f0f483fa9cfb0ad1d2ffced92be..07ddf3e85ee454577f4ef6354e25ab7a89671161 100644 (file)
@@ -5,9 +5,12 @@
 #include "buckets.h"
 #include "extents.h"
 #include "inode.h"
-#include "io.h"
+#include "io_misc.h"
+#include "io_write.h"
+#include "rebalance.h"
 #include "reflink.h"
 #include "subvolume.h"
+#include "super-io.h"
 
 #include <linux/sched/signal.h>
 
@@ -25,7 +28,7 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k)
 
 /* reflink pointers */
 
-int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
                           enum bkey_invalid_flags flags,
                           struct printbuf *err)
 {
@@ -72,7 +75,7 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 
 /* indirect extents */
 
-int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
                           enum bkey_invalid_flags flags,
                           struct printbuf *err)
 {
@@ -89,6 +92,9 @@ void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
        bch2_bkey_ptrs_to_text(out, c, k);
 }
 
+#if 0
+Currently disabled, needs to be debugged:
+
 bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
 {
        struct bkey_s_reflink_v   l = bkey_s_to_reflink_v(_l);
@@ -96,29 +102,31 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 
        return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r);
 }
+#endif
+
+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+{
+       if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+               new->k.type = KEY_TYPE_deleted;
+               new->k.size = 0;
+               set_bkey_val_u64s(&new->k, 0);
+               *flags &= ~BTREE_TRIGGER_INSERT;
+       }
+}
 
 int bch2_trans_mark_reflink_v(struct btree_trans *trans,
                              enum btree_id btree_id, unsigned level,
                              struct bkey_s_c old, struct bkey_i *new,
                              unsigned flags)
 {
-       if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
-               struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
-
-               if (!r->v.refcount) {
-                       r->k.type = KEY_TYPE_deleted;
-                       r->k.size = 0;
-                       set_bkey_val_u64s(&r->k, 0);
-                       return 0;
-               }
-       }
+       check_indirect_extent_deleting(new, &flags);
 
        return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
 }
 
 /* indirect inline data */
 
-int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_indirect_inline_data_invalid(struct bch_fs *c, struct bkey_s_c k,
                                      enum bkey_invalid_flags flags,
                                      struct printbuf *err)
 {
@@ -126,7 +134,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 void bch2_indirect_inline_data_to_text(struct printbuf *out,
-                                       struct bch_fs *c, struct bkey_s_c k)
+                                      struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
        unsigned datalen = bkey_inline_data_bytes(k.k);
@@ -141,16 +149,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
                              struct bkey_s_c old, struct bkey_i *new,
                              unsigned flags)
 {
-       if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
-               struct bkey_i_indirect_inline_data *r =
-                       bkey_i_to_indirect_inline_data(new);
-
-               if (!r->v.refcount) {
-                       r->k.type = KEY_TYPE_deleted;
-                       r->k.size = 0;
-                       set_bkey_val_u64s(&r->k, 0);
-               }
-       }
+       check_indirect_extent_deleting(new, &flags);
 
        return 0;
 }
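Both trans_mark hooks above now funnel through check_indirect_extent_deleting(), which can serve reflink_v and indirect_inline_data alike because bkey_refcount() dispatches on the key type. A sketch of that dispatch, matching how reflink.h handles the two refcounted key types (reproduced from memory, so treat the exact shape as an assumption):

	static inline __le64 *bkey_refcount(struct bkey_i *k)
	{
		switch (k->k.type) {
		case KEY_TYPE_reflink_v:
			return &bkey_i_to_reflink_v(k)->v.refcount;
		case KEY_TYPE_indirect_inline_data:
			return &bkey_i_to_indirect_inline_data(k)->v.refcount;
		default:
			return NULL;	/* not a refcounted key type */
		}
	}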
@@ -247,15 +246,16 @@ s64 bch2_remap_range(struct bch_fs *c,
                     u64 remap_sectors,
                     u64 new_i_size, s64 *i_sectors_delta)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter dst_iter, src_iter;
        struct bkey_s_c src_k;
        struct bkey_buf new_dst, new_src;
        struct bpos dst_start = POS(dst_inum.inum, dst_offset);
        struct bpos src_start = POS(src_inum.inum, src_offset);
        struct bpos dst_end = dst_start, src_end = src_start;
+       struct bch_io_opts opts;
        struct bpos src_want;
-       u64 dst_done;
+       u64 dst_done = 0;
        u32 dst_snapshot, src_snapshot;
        int ret = 0, ret2 = 0;
 
@@ -269,11 +269,15 @@ s64 bch2_remap_range(struct bch_fs *c,
 
        bch2_bkey_buf_init(&new_dst);
        bch2_bkey_buf_init(&new_src);
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
+       trans = bch2_trans_get(c);
 
-       bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start,
+       ret = bch2_inum_opts_get(trans, src_inum, &opts);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
                             BTREE_ITER_INTENT);
-       bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
+       bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
                             BTREE_ITER_INTENT);
 
        while ((ret == 0 ||
@@ -281,21 +285,21 @@ s64 bch2_remap_range(struct bch_fs *c,
               bkey_lt(dst_iter.pos, dst_end)) {
                struct disk_reservation disk_res = { 0 };
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
 
-               ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+               ret = bch2_subvolume_get_snapshot(trans, src_inum.subvol,
                                                  &src_snapshot);
                if (ret)
                        continue;
 
                bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
 
-               ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+               ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol,
                                                  &dst_snapshot);
                if (ret)
                        continue;
@@ -312,7 +316,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        continue;
 
                if (bkey_lt(src_want, src_iter.pos)) {
-                       ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+                       ret = bch2_fpunch_at(trans, &dst_iter, dst_inum,
                                        min(dst_end.offset,
                                            dst_iter.pos.offset +
                                            src_iter.pos.offset - src_want.offset),
@@ -326,7 +330,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        bch2_bkey_buf_reassemble(&new_src, c, src_k);
                        src_k = bkey_i_to_s_c(new_src.k);
 
-                       ret = bch2_make_extent_indirect(&trans, &src_iter,
+                       ret = bch2_make_extent_indirect(trans, &src_iter,
                                                new_src.k);
                        if (ret)
                                continue;
@@ -354,14 +358,17 @@ s64 bch2_remap_range(struct bch_fs *c,
                                min(src_k.k->p.offset - src_want.offset,
                                    dst_end.offset - dst_iter.pos.offset));
 
-               ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
-                                        new_dst.k, &disk_res,
-                                        new_i_size, i_sectors_delta,
-                                        true);
+               ret =   bch2_bkey_set_needs_rebalance(c, new_dst.k,
+                                       opts.background_target,
+                                       opts.background_compression) ?:
+                       bch2_extent_update(trans, dst_inum, &dst_iter,
+                                       new_dst.k, &disk_res,
+                                       new_i_size, i_sectors_delta,
+                                       true);
                bch2_disk_reservation_put(c, &disk_res);
        }
-       bch2_trans_iter_exit(&trans, &dst_iter);
-       bch2_trans_iter_exit(&trans, &src_iter);
+       bch2_trans_iter_exit(trans, &dst_iter);
+       bch2_trans_iter_exit(trans, &src_iter);
 
        BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end));
        BUG_ON(bkey_gt(dst_iter.pos, dst_end));
@@ -373,23 +380,23 @@ s64 bch2_remap_range(struct bch_fs *c,
                struct bch_inode_unpacked inode_u;
                struct btree_iter inode_iter = { NULL };
 
-               bch2_trans_begin(&trans);
+               bch2_trans_begin(trans);
 
-               ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
+               ret2 = bch2_inode_peek(trans, &inode_iter, &inode_u,
                                       dst_inum, BTREE_ITER_INTENT);
 
                if (!ret2 &&
                    inode_u.bi_size < new_i_size) {
                        inode_u.bi_size = new_i_size;
-                       ret2  = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
-                               bch2_trans_commit(&trans, NULL, NULL,
-                                                 BTREE_INSERT_NOFAIL);
+                       ret2  = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
+                               bch2_trans_commit(trans, NULL, NULL,
+                                                 BCH_TRANS_COMMIT_no_enospc);
                }
 
-               bch2_trans_iter_exit(&trans, &inode_iter);
+               bch2_trans_iter_exit(trans, &inode_iter);
        } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-
-       bch2_trans_exit(&trans);
+err:
+       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&new_src, c);
        bch2_bkey_buf_exit(&new_dst, c);
 
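Beyond the rebalance-options plumbing, this hunk converts bch2_remap_range() from an on-stack btree_trans initialized in place to a transaction handed out by the core. The lifecycle change is mechanical and lifted straight from the lines above:

/* Old pattern (removed): caller-owned transaction on the stack */
struct btree_trans trans;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
/* ... helpers take &trans ... */
bch2_trans_exit(&trans);

/* New pattern (added): transaction obtained from, and returned to, the fs */
struct btree_trans *trans = bch2_trans_get(c);
/* ... helpers take trans directly ... */
bch2_trans_put(trans);

Note the new err: label: bch2_inum_opts_get() can now fail before the main loop, and the early-exit path still reaches bch2_trans_put(), so the transaction is released on both paths.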
index fe52538efb522940cd6cf7b853bc6d5d4afe3c94..8ccf3f9c4939eed45d9d9dc231bf5632506de836 100644 (file)
@@ -4,7 +4,7 @@
 
 enum bkey_invalid_flags;
 
-int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
                           enum bkey_invalid_flags, struct printbuf *);
 void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
@@ -19,7 +19,7 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
        .min_val_size   = 16,                                   \
 })
 
-int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
                           enum bkey_invalid_flags, struct printbuf *);
 void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
@@ -35,7 +35,7 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
        .min_val_size   = 8,                                    \
 })
 
-int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
                                      enum bkey_invalid_flags, struct printbuf *);
 void bch2_indirect_inline_data_to_text(struct printbuf *,
                                struct bch_fs *, struct bkey_s_c);
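
Throughout this header (and in reflink.c above) the bkey validate hooks drop the const qualifier from struct bch_fs *. A one-screen illustration of why a const has to go once a hook needs to mutate filesystem state; all names here are hypothetical, not bcachefs API:

/* Illustrative only: a const pointer cannot feed a mutating helper. */
struct demo_fs { int error_count; };

static void demo_count_error(struct demo_fs *fs)
{
	fs->error_count++;
}

static int demo_validate(struct demo_fs *fs) /* was: const struct demo_fs * */
{
	demo_count_error(fs); /* would not typecheck through a const pointer */
	return 0;
}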
index 5b591c59bc3eadaf86ffd83360609950c9eaad5a..1c3ae13bfced1d8ce9eeee118cb6e9fe1552e7a5 100644 (file)
@@ -429,7 +429,7 @@ out:
 
        return ret;
 err:
-       bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret));
+       bch_err_msg(c, ret, "adding replicas entry");
        goto out;
 }
 
@@ -462,18 +462,13 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 {
        lockdep_assert_held(&c->replicas_gc_lock);
 
-       if (ret)
-               goto err;
-
        mutex_lock(&c->sb_lock);
        percpu_down_write(&c->mark_lock);
 
-       ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
-       if (ret)
-               goto err;
+       ret =   ret ?:
+               bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
+               replicas_table_update(c, &c->replicas_gc);
 
-       ret = replicas_table_update(c, &c->replicas_gc);
-err:
        kfree(c->replicas_gc.entries);
        c->replicas_gc.entries = NULL;
 
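The rewrite above collapses a "ret = ...; if (ret) goto err;" ladder into GNU C's binary ?: operator, which evaluates its right operand only when the left one is zero; the kfree() cleanup below then runs unconditionally and the error label disappears. A standalone toy of the chaining (step functions are hypothetical; builds with gcc or clang, which both support the extension):

#include <stdio.h>

static int step1(void) { return 0; }   /* succeeds */
static int step2(void) { return -5; }  /* fails */
static int step3(void) { printf("never runs\n"); return 0; }

int main(void)
{
	/* a ?: b yields a when a is nonzero, otherwise evaluates b */
	int ret = step1() ?: step2() ?: step3();

	/* unconditional cleanup would go here */
	printf("ret = %d\n", ret);	/* prints ret = -5 */
	return 0;
}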
@@ -579,12 +574,9 @@ retry:
 
        bch2_cpu_replicas_sort(&new);
 
-       ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
-       if (ret)
-               goto err;
+       ret =   bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
+               replicas_table_update(c, &new);
 
-       ret = replicas_table_update(c, &new);
-err:
        kfree(new.entries);
 
        percpu_up_write(&c->mark_lock);
@@ -700,9 +692,9 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
        struct bch_replicas_cpu new_r = { 0, 0, NULL };
        int ret = 0;
 
-       if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb)))
+       if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
                ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
-       else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb)))
+       else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
                ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
        if (ret)
                return ret;
@@ -732,13 +724,13 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
        for_each_cpu_replicas_entry(r, src)
                bytes += replicas_entry_bytes(src) - 1;
 
-       sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
+       sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -BCH_ERR_ENOSPC_sb_replicas;
 
        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
-       sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);
+       sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);
 
        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
@@ -777,13 +769,13 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
        if (!need_v1)
                return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
 
-       sb_r = bch2_sb_resize_replicas(&c->disk_sb,
+       sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -BCH_ERR_ENOSPC_sb_replicas;
 
        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
-       sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
+       sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);
 
        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
@@ -805,7 +797,6 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
                                      struct bch_sb *sb,
                                      struct printbuf *err)
 {
-       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
        unsigned i, j;
 
        sort_cmp_size(cpu_r->entries,
@@ -837,7 +828,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
                }
 
                for (j = 0; j < e->nr_devs; j++)
-                       if (!bch2_dev_exists(sb, mi, e->devs[j])) {
+                       if (!bch2_dev_exists(sb, e->devs[j])) {
                                prt_printf(err, "invalid device %u in entry ", e->devs[j]);
                                bch2_replicas_entry_to_text(err, e);
                                return -BCH_ERR_invalid_sb_replicas;
@@ -999,8 +990,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
        struct bch_sb_field_replicas_v0 *replicas_v0;
        unsigned i, data_has = 0;
 
-       replicas = bch2_sb_get_replicas(sb);
-       replicas_v0 = bch2_sb_get_replicas_v0(sb);
+       replicas = bch2_sb_field_get(sb, replicas);
+       replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
 
        if (replicas) {
                struct bch_replicas_entry *r;
index ae21a8cca1b49d4d9bbfe2b38a330c78b9abc023..89fdb7c21134ebbb6c145a88ed5b1943ab54588a 100644 (file)
 #include <crypto/hash.h>
 #include <crypto/sha2.h>
 
+typedef unsigned __bitwise bch_str_hash_flags_t;
+
+enum bch_str_hash_flags {
+       __BCH_HASH_SET_MUST_CREATE,
+       __BCH_HASH_SET_MUST_REPLACE,
+};
+
+#define BCH_HASH_SET_MUST_CREATE       (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE      (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
+
 static inline enum bch_str_hash_type
 bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
 {
@@ -246,7 +256,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
                           const struct bch_hash_info *info,
                           subvol_inum inum, u32 snapshot,
                           struct bkey_i *insert,
-                          int flags,
+                          bch_str_hash_flags_t str_hash_flags,
                           int update_flags)
 {
        struct btree_iter iter, slot = { NULL };
@@ -269,7 +279,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
                }
 
                if (!slot.path &&
-                   !(flags & BCH_HASH_SET_MUST_REPLACE))
+                   !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
                        bch2_trans_copy_iter(&slot, &iter);
 
                if (k.k->type != KEY_TYPE_hash_whiteout)
@@ -287,16 +297,16 @@ found:
        found = true;
 not_found:
 
-       if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+       if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
                ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
-       } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+       } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
                ret = -EEXIST;
        } else {
                if (!found && slot.path)
                        swap(iter, slot);
 
                insert->k.p = iter.pos;
-               ret = bch2_trans_update(trans, &iter, insert, 0);
+               ret = bch2_trans_update(trans, &iter, insert, update_flags);
        }
 
        goto out;
@@ -307,7 +317,8 @@ int bch2_hash_set(struct btree_trans *trans,
                  const struct bch_hash_desc desc,
                  const struct bch_hash_info *info,
                  subvol_inum inum,
-                 struct bkey_i *insert, int flags)
+                 struct bkey_i *insert,
+                 bch_str_hash_flags_t str_hash_flags)
 {
        u32 snapshot;
        int ret;
@@ -319,7 +330,7 @@ int bch2_hash_set(struct btree_trans *trans,
        insert->k.p.inode = inum.inum;
 
        return bch2_hash_set_snapshot(trans, desc, info, inum,
-                                     snapshot, insert, flags, 0);
+                                     snapshot, insert, str_hash_flags, 0);
 }
 
 static __always_inline
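
The additions above give the string-hash flags their own sparse __bitwise type: passing a bare int, or the unrelated update_flags argument, where a bch_str_hash_flags_t belongs now draws a warning under sparse, while the __force casts compile away in an ordinary build. Fittingly, the same hunk fixes exactly that kind of mix-up: bch2_hash_set_snapshot() used to ignore update_flags and pass 0 to bch2_trans_update(). A self-contained illustration of the idiom (demo names are hypothetical):

/* Toy version of the __bitwise typed-flags idiom; check with sparse. */
#ifdef __CHECKER__
#define __bitwise	__attribute__((bitwise))
#define __force		__attribute__((force))
#else
#define __bitwise
#define __force
#endif
#define BIT(n)		(1U << (n))

typedef unsigned __bitwise demo_flags_t;

#define DEMO_MUST_CREATE	((__force demo_flags_t) BIT(0))
#define DEMO_MUST_REPLACE	((__force demo_flags_t) BIT(1))

static int demo_hash_set(demo_flags_t flags)
{
	return !!(flags & DEMO_MUST_REPLACE);
}

/*
 * demo_hash_set(DEMO_MUST_CREATE) is fine; demo_hash_set(2), or passing
 * some unrelated plain int, warns under sparse, which is the point.
 */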
index 736afb626a8945b75d4bc3300fb15fb60d594de8..1cbf9e3a09ecf67bbcc59951ce4df45e48f87c1c 100644 (file)
 #include "errcode.h"
 #include "error.h"
 #include "fs.h"
+#include "snapshot.h"
 #include "subvolume.h"
 
 #include <linux/random.h>
 
 static int bch2_subvolume_delete(struct btree_trans *, u32);
 
-static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
-{
-       const struct snapshot_t *s = __snapshot_t(t, id);
-
-       if (s->skip[2] <= ancestor)
-               return s->skip[2];
-       if (s->skip[1] <= ancestor)
-               return s->skip[1];
-       if (s->skip[0] <= ancestor)
-               return s->skip[0];
-       return s->parent;
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
-       struct snapshot_table *t;
-       bool ret;
-
-       EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
-
-       rcu_read_lock();
-       t = rcu_dereference(c->snapshots);
-
-       while (id && id < ancestor - IS_ANCESTOR_BITMAP)
-               id = get_ancestor_below(t, id, ancestor);
-
-       ret = id && id < ancestor
-               ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor)
-               : id == ancestor;
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
-{
-       struct snapshot_table *t;
-
-       rcu_read_lock();
-       t = rcu_dereference(c->snapshots);
-
-       while (id && id < ancestor)
-               id = __snapshot_t(t, id)->parent;
-       rcu_read_unlock();
-
-       return id == ancestor;
-}
-
-static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
-{
-       u32 depth;
-
-       rcu_read_lock();
-       depth = parent ? snapshot_t(c, parent)->depth + 1 : 0;
-       rcu_read_unlock();
-
-       return depth;
-}
-
-static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
-{
-       size_t idx = U32_MAX - id;
-       size_t new_size;
-       struct snapshot_table *new, *old;
-
-       new_size = max(16UL, roundup_pow_of_two(idx + 1));
-
-       new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
-       if (!new)
-               return NULL;
-
-       old = rcu_dereference_protected(c->snapshots, true);
-       if (old)
-               memcpy(new->s,
-                      rcu_dereference_protected(c->snapshots, true)->s,
-                      sizeof(new->s[0]) * c->snapshot_table_size);
-
-       rcu_assign_pointer(c->snapshots, new);
-       c->snapshot_table_size = new_size;
-       if (old)
-               kvfree_rcu(old);
-
-       return &rcu_dereference_protected(c->snapshots, true)->s[idx];
-}
-
-static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
-{
-       size_t idx = U32_MAX - id;
-
-       lockdep_assert_held(&c->snapshot_table_lock);
-
-       if (likely(idx < c->snapshot_table_size))
-               return &rcu_dereference_protected(c->snapshots, true)->s[idx];
-
-       return __snapshot_t_mut(c, id);
-}
-
-/* Snapshot tree: */
-
-void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
-                               struct bkey_s_c k)
-{
-       struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
-
-       prt_printf(out, "subvol %u root snapshot %u",
-                  le32_to_cpu(t.v->master_subvol),
-                  le32_to_cpu(t.v->root_snapshot));
-}
-
-int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                              enum bkey_invalid_flags flags,
-                              struct printbuf *err)
-{
-       if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
-           bkey_lt(k.k->p, POS(0, 1))) {
-               prt_printf(err, "bad pos");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
-}
-
-int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
-                             struct bch_snapshot_tree *s)
-{
-       int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
-                                         BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
-
-       if (bch2_err_matches(ret, ENOENT))
-               ret = -BCH_ERR_ENOENT_snapshot_tree;
-       return ret;
-}
-
-static struct bkey_i_snapshot_tree *
-__snapshot_tree_create(struct btree_trans *trans)
-{
-       struct btree_iter iter;
-       int ret = bch2_bkey_get_empty_slot(trans, &iter,
-                       BTREE_ID_snapshot_trees, POS(0, U32_MAX));
-       struct bkey_i_snapshot_tree *s_t;
-
-       if (ret == -BCH_ERR_ENOSPC_btree_slot)
-               ret = -BCH_ERR_ENOSPC_snapshot_tree;
-       if (ret)
-               return ERR_PTR(ret);
-
-       s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
-       ret = PTR_ERR_OR_ZERO(s_t);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret ? ERR_PTR(ret) : s_t;
-}
-
-static int snapshot_tree_create(struct btree_trans *trans,
-                               u32 root_id, u32 subvol_id, u32 *tree_id)
-{
-       struct bkey_i_snapshot_tree *n_tree =
-               __snapshot_tree_create(trans);
-
-       if (IS_ERR(n_tree))
-               return PTR_ERR(n_tree);
-
-       n_tree->v.master_subvol = cpu_to_le32(subvol_id);
-       n_tree->v.root_snapshot = cpu_to_le32(root_id);
-       *tree_id = n_tree->k.p.offset;
-       return 0;
-}
-
-/* Snapshot nodes: */
-
-void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
-                          struct bkey_s_c k)
-{
-       struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
-
-       prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
-              BCH_SNAPSHOT_SUBVOL(s.v),
-              BCH_SNAPSHOT_DELETED(s.v),
-              le32_to_cpu(s.v->parent),
-              le32_to_cpu(s.v->children[0]),
-              le32_to_cpu(s.v->children[1]),
-              le32_to_cpu(s.v->subvol),
-              le32_to_cpu(s.v->tree));
-
-       if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
-               prt_printf(out, " depth %u skiplist %u %u %u",
-                          le32_to_cpu(s.v->depth),
-                          le32_to_cpu(s.v->skip[0]),
-                          le32_to_cpu(s.v->skip[1]),
-                          le32_to_cpu(s.v->skip[2]));
-}
-
-int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                         enum bkey_invalid_flags flags,
-                         struct printbuf *err)
-{
-       struct bkey_s_c_snapshot s;
-       u32 i, id;
-
-       if (bkey_gt(k.k->p, POS(0, U32_MAX)) ||
-           bkey_lt(k.k->p, POS(0, 1))) {
-               prt_printf(err, "bad pos");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       s = bkey_s_c_to_snapshot(k);
-
-       id = le32_to_cpu(s.v->parent);
-       if (id && id <= k.k->p.offset) {
-               prt_printf(err, "bad parent node (%u <= %llu)",
-                      id, k.k->p.offset);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) {
-               prt_printf(err, "children not normalized");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (s.v->children[0] &&
-           s.v->children[0] == s.v->children[1]) {
-               prt_printf(err, "duplicate child nodes");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       for (i = 0; i < 2; i++) {
-               id = le32_to_cpu(s.v->children[i]);
-
-               if (id >= k.k->p.offset) {
-                       prt_printf(err, "bad child node (%u >= %llu)",
-                              id, k.k->p.offset);
-                       return -BCH_ERR_invalid_bkey;
-               }
-       }
-
-       if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
-               if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
-                   le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) {
-                       prt_printf(err, "skiplist not normalized");
-                       return -BCH_ERR_invalid_bkey;
-               }
-
-               for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
-                       id = le32_to_cpu(s.v->skip[i]);
-
-                       if (!id != !s.v->parent ||
-                           (s.v->parent &&
-                            id <= k.k->p.offset)) {
-                       prt_printf(err, "bad skiplist node %u", id);
-                               return -BCH_ERR_invalid_bkey;
-                       }
-               }
-       }
-
-       return 0;
-}
-
-int bch2_mark_snapshot(struct btree_trans *trans,
-                      enum btree_id btree, unsigned level,
-                      struct bkey_s_c old, struct bkey_s_c new,
-                      unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct snapshot_t *t;
-       u32 id = new.k->p.offset;
-       int ret = 0;
-
-       mutex_lock(&c->snapshot_table_lock);
-
-       t = snapshot_t_mut(c, id);
-       if (!t) {
-               ret = -BCH_ERR_ENOMEM_mark_snapshot;
-               goto err;
-       }
-
-       if (new.k->type == KEY_TYPE_snapshot) {
-               struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
-               u32 parent = id;
-
-               t->parent       = le32_to_cpu(s.v->parent);
-               t->children[0]  = le32_to_cpu(s.v->children[0]);
-               t->children[1]  = le32_to_cpu(s.v->children[1]);
-               t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
-               t->tree         = le32_to_cpu(s.v->tree);
-
-               if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
-                       t->depth        = le32_to_cpu(s.v->depth);
-                       t->skip[0]      = le32_to_cpu(s.v->skip[0]);
-                       t->skip[1]      = le32_to_cpu(s.v->skip[1]);
-                       t->skip[2]      = le32_to_cpu(s.v->skip[2]);
-               } else {
-                       t->depth        = 0;
-                       t->skip[0]      = 0;
-                       t->skip[1]      = 0;
-                       t->skip[2]      = 0;
-               }
-
-               while ((parent = bch2_snapshot_parent_early(c, parent)) &&
-                      parent - id - 1 < IS_ANCESTOR_BITMAP)
-                       __set_bit(parent - id - 1, t->is_ancestor);
-
-               if (BCH_SNAPSHOT_DELETED(s.v)) {
-                       set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
-                       c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots);
-               }
-       } else {
-               memset(t, 0, sizeof(*t));
-       }
-err:
-       mutex_unlock(&c->snapshot_table_lock);
-       return ret;
-}
-
-static int snapshot_lookup(struct btree_trans *trans, u32 id,
-                          struct bch_snapshot *s)
-{
-       return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
-                                      BTREE_ITER_WITH_UPDATES, snapshot, s);
-}
-
-static int snapshot_live(struct btree_trans *trans, u32 id)
-{
-       struct bch_snapshot v;
-       int ret;
-
-       if (!id)
-               return 0;
-
-       ret = snapshot_lookup(trans, id, &v);
-       if (bch2_err_matches(ret, ENOENT))
-               bch_err(trans->c, "snapshot node %u not found", id);
-       if (ret)
-               return ret;
-
-       return !BCH_SNAPSHOT_DELETED(&v);
-}
-
-static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       unsigned i, nr_live = 0, live_idx = 0;
-       struct bkey_s_c_snapshot snap;
-       u32 id = k.k->p.offset, child[2];
-
-       if (k.k->type != KEY_TYPE_snapshot)
-               return 0;
-
-       snap = bkey_s_c_to_snapshot(k);
-
-       child[0] = le32_to_cpu(snap.v->children[0]);
-       child[1] = le32_to_cpu(snap.v->children[1]);
-
-       for (i = 0; i < 2; i++) {
-               int ret = snapshot_live(trans, child[i]);
-
-               if (ret < 0)
-                       return ret;
-
-               if (ret)
-                       live_idx = i;
-               nr_live += ret;
-       }
-
-       mutex_lock(&c->snapshot_table_lock);
-
-       snapshot_t_mut(c, id)->equiv = nr_live == 1
-               ? snapshot_t_mut(c, child[live_idx])->equiv
-               : id;
-
-       mutex_unlock(&c->snapshot_table_lock);
-
-       return 0;
-}
-
-/* fsck: */
-
-static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
-{
-       return snapshot_t(c, id)->children[child];
-}
-
-static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
-{
-       return bch2_snapshot_child(c, id, 0);
-}
-
-static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
-{
-       return bch2_snapshot_child(c, id, 1);
-}
-
-static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
-{
-       u32 n, parent;
-
-       n = bch2_snapshot_left_child(c, id);
-       if (n)
-               return n;
-
-       while ((parent = bch2_snapshot_parent(c, id))) {
-               n = bch2_snapshot_right_child(c, parent);
-               if (n && n != id)
-                       return n;
-               id = parent;
-       }
-
-       return 0;
-}
-
-static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
-{
-       u32 id = snapshot_root;
-       u32 subvol = 0, s;
-
-       while (id) {
-               s = snapshot_t(c, id)->subvol;
-
-               if (s && (!subvol || s < subvol))
-                       subvol = s;
-
-               id = bch2_snapshot_tree_next(c, id);
-       }
-
-       return subvol;
-}
-
-static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
-                                           u32 snapshot_root, u32 *subvol_id)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_s_c_subvolume s;
-       bool found = false;
-       int ret;
-
-       for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
-                                    0, k, ret) {
-               if (k.k->type != KEY_TYPE_subvolume)
-                       continue;
-
-               s = bkey_s_c_to_subvolume(k);
-               if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
-                       continue;
-               if (!BCH_SUBVOLUME_SNAP(s.v)) {
-                       *subvol_id = s.k->p.offset;
-                       found = true;
-                       break;
-               }
-       }
-
-       bch2_trans_iter_exit(trans, &iter);
-
-       if (!ret && !found) {
-               struct bkey_i_subvolume *s;
-
-               *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
-
-               s = bch2_bkey_get_mut_typed(trans, &iter,
-                                           BTREE_ID_subvolumes, POS(0, *subvol_id),
-                                           0, subvolume);
-               ret = PTR_ERR_OR_ZERO(s);
-               if (ret)
-                       return ret;
-
-               SET_BCH_SUBVOLUME_SNAP(&s->v, false);
-       }
-
-       return ret;
-}
-
-static int check_snapshot_tree(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c_snapshot_tree st;
-       struct bch_snapshot s;
-       struct bch_subvolume subvol;
-       struct printbuf buf = PRINTBUF;
-       u32 root_id;
-       int ret;
-
-       if (k.k->type != KEY_TYPE_snapshot_tree)
-               return 0;
-
-       st = bkey_s_c_to_snapshot_tree(k);
-       root_id = le32_to_cpu(st.v->root_snapshot);
-
-       ret = snapshot_lookup(trans, root_id, &s);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               goto err;
-
-       if (fsck_err_on(ret ||
-                       root_id != bch2_snapshot_root(c, root_id) ||
-                       st.k->p.offset != le32_to_cpu(s.tree),
-                       c,
-                       "snapshot tree points to missing/incorrect snapshot:\n  %s",
-                       (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
-               ret = bch2_btree_delete_at(trans, iter, 0);
-               goto err;
-       }
-
-       ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
-                                false, 0, &subvol);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               goto err;
-
-       if (fsck_err_on(ret, c,
-                       "snapshot tree points to missing subvolume:\n  %s",
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
-           fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
-                                               le32_to_cpu(subvol.snapshot),
-                                               root_id), c,
-                       "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
-           fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c,
-                       "snapshot tree points to snapshot subvolume:\n  %s",
-                       (printbuf_reset(&buf),
-                        bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
-               struct bkey_i_snapshot_tree *u;
-               u32 subvol_id;
-
-               ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
-               if (ret)
-                       goto err;
-
-               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto err;
-
-               u->v.master_subvol = cpu_to_le32(subvol_id);
-               st = snapshot_tree_i_to_s_c(u);
-       }
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-/*
- * For each snapshot_tree, make sure it points to the root of a snapshot tree
- * and that snapshot entry points back to it, or delete it.
- *
- * And, make sure it points to a subvolume within that snapshot tree, or correct
- * it to point to the oldest subvolume within that snapshot tree.
- */
-int bch2_check_snapshot_trees(struct bch_fs *c)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
-                       BTREE_ID_snapshot_trees, POS_MIN,
-                       BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_snapshot_tree(&trans, &iter, k)));
-
-       if (ret)
-               bch_err(c, "error %i checking snapshot trees", ret);
-       return ret;
-}
-
-/*
- * Look up snapshot tree for @tree_id and find root,
- * make sure @snap_id is a descendant:
- */
-static int snapshot_tree_ptr_good(struct btree_trans *trans,
-                                 u32 snap_id, u32 tree_id)
-{
-       struct bch_snapshot_tree s_t;
-       int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
-
-       if (bch2_err_matches(ret, ENOENT))
-               return 0;
-       if (ret)
-               return ret;
-
-       return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
-}
-
-static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id)
-{
-       const struct snapshot_t *s;
-
-       if (!id)
-               return 0;
-
-       rcu_read_lock();
-       s = snapshot_t(c, id);
-       if (s->parent)
-               id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
-       rcu_read_unlock();
-
-       return id;
-}
-
-static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s)
-{
-       struct bch_snapshot a;
-       unsigned i;
-       int ret;
-
-       for (i = 0; i < 3; i++) {
-               if (!s.parent != !s.skip[i])
-                       return false;
-
-               if (!s.parent)
-                       continue;
-
-               ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a);
-               if (bch2_err_matches(ret, ENOENT))
-                       return false;
-               if (ret)
-                       return ret;
-
-               if (a.tree != s.tree)
-                       return false;
-       }
-
-       return true;
-}
-
-/*
- * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
- * its snapshot_tree pointer is correct (allocate new one if necessary), then
- * update this node's pointer to root node's pointer:
- */
-static int snapshot_tree_ptr_repair(struct btree_trans *trans,
-                                   struct btree_iter *iter,
-                                   struct bkey_s_c k,
-                                   struct bch_snapshot *s)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter root_iter;
-       struct bch_snapshot_tree s_t;
-       struct bkey_s_c_snapshot root;
-       struct bkey_i_snapshot *u;
-       u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
-       int ret;
-
-       root = bch2_bkey_get_iter_typed(trans, &root_iter,
-                              BTREE_ID_snapshots, POS(0, root_id),
-                              BTREE_ITER_WITH_UPDATES, snapshot);
-       ret = bkey_err(root);
-       if (ret)
-               goto err;
-
-       tree_id = le32_to_cpu(root.v->tree);
-
-       ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
-       if (ret && !bch2_err_matches(ret, ENOENT))
-               return ret;
-
-       if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
-               u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
-               ret =   PTR_ERR_OR_ZERO(u) ?:
-                       snapshot_tree_create(trans, root_id,
-                               bch2_snapshot_tree_oldest_subvol(c, root_id),
-                               &tree_id);
-               if (ret)
-                       goto err;
-
-               u->v.tree = cpu_to_le32(tree_id);
-               if (k.k->p.offset == root_id)
-                       *s = u->v;
-       }
-
-       if (k.k->p.offset != root_id) {
-               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto err;
-
-               u->v.tree = cpu_to_le32(tree_id);
-               *s = u->v;
-       }
-err:
-       bch2_trans_iter_exit(trans, &root_iter);
-       return ret;
-}
-
-static int cmp_le32(__le32 l, __le32 r)
-{
-       return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
-}
-
-static int check_snapshot(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         struct bkey_s_c k)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_snapshot s;
-       struct bch_subvolume subvol;
-       struct bch_snapshot v;
-       struct bkey_i_snapshot *u;
-       u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
-       u32 real_depth;
-       struct printbuf buf = PRINTBUF;
-       bool should_have_subvol;
-       u32 i, id;
-       int ret = 0;
-
-       if (k.k->type != KEY_TYPE_snapshot)
-               return 0;
-
-       memset(&s, 0, sizeof(s));
-       memcpy(&s, k.v, bkey_val_bytes(k.k));
-
-       id = le32_to_cpu(s.parent);
-       if (id) {
-               ret = snapshot_lookup(trans, id, &v);
-               if (bch2_err_matches(ret, ENOENT))
-                       bch_err(c, "snapshot with nonexistent parent:\n  %s",
-                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-               if (ret)
-                       goto err;
-
-               if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
-                   le32_to_cpu(v.children[1]) != k.k->p.offset) {
-                       bch_err(c, "snapshot parent %u missing pointer to child %llu",
-                               id, k.k->p.offset);
-                       ret = -EINVAL;
-                       goto err;
-               }
-       }
-
-       for (i = 0; i < 2 && s.children[i]; i++) {
-               id = le32_to_cpu(s.children[i]);
-
-               ret = snapshot_lookup(trans, id, &v);
-               if (bch2_err_matches(ret, ENOENT))
-                       bch_err(c, "snapshot node %llu has nonexistent child %u",
-                               k.k->p.offset, id);
-               if (ret)
-                       goto err;
-
-               if (le32_to_cpu(v.parent) != k.k->p.offset) {
-                       bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
-                               id, le32_to_cpu(v.parent), k.k->p.offset);
-                       ret = -EINVAL;
-                       goto err;
-               }
-       }
-
-       should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
-               !BCH_SNAPSHOT_DELETED(&s);
-
-       if (should_have_subvol) {
-               id = le32_to_cpu(s.subvol);
-               ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
-               if (bch2_err_matches(ret, ENOENT))
-                       bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
-                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-               if (ret)
-                       goto err;
-
-               if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
-                       bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
-                               k.k->p.offset);
-                       ret = -EINVAL;
-                       goto err;
-               }
-       } else {
-               if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n  %s",
-                               (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-                       u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
-                       ret = PTR_ERR_OR_ZERO(u);
-                       if (ret)
-                               goto err;
-
-                       u->v.subvol = 0;
-                       s = u->v;
-               }
-       }
-
-       ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
-       if (ret < 0)
-               goto err;
-
-       if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n  %s",
-                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
-               ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
-               if (ret)
-                       goto err;
-       }
-       ret = 0;
-
-       real_depth = bch2_snapshot_depth(c, parent_id);
-
-       if (le32_to_cpu(s.depth) != real_depth &&
-           (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
-            fsck_err(c, "snapshot with incorrect depth field, should be %u:\n  %s",
-                     real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
-               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto err;
-
-               u->v.depth = cpu_to_le32(real_depth);
-               s = u->v;
-       }
-
-       ret = snapshot_skiplist_good(trans, s);
-       if (ret < 0)
-               goto err;
-
-       if (!ret &&
-           (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
-            fsck_err(c, "snapshot with bad skiplist field:\n  %s",
-                     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
-               u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
-               ret = PTR_ERR_OR_ZERO(u);
-               if (ret)
-                       goto err;
-
-               for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
-                       u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id));
-
-               bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
-               s = u->v;
-       }
-       ret = 0;
-err:
-fsck_err:
-       printbuf_exit(&buf);
-       return ret;
-}
-
-int bch2_check_snapshots(struct bch_fs *c)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret;
-
-       /*
-        * We iterate backwards as checking/fixing the depth field requires that
-        * the parent's depth already be correct:
-        */
-       ret = bch2_trans_run(c,
-               for_each_btree_key_reverse_commit(&trans, iter,
-                       BTREE_ID_snapshots, POS_MAX,
-                       BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_snapshot(&trans, &iter, k)));
-       if (ret)
-               bch_err_fn(c, ret);
-       return ret;
-}
-
 static int check_subvol(struct btree_trans *trans,
                        struct btree_iter *iter,
                        struct bkey_s_c k)
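
The large hunk above strips subvolume.c of the entire snapshot machinery; together with the #include "snapshot.h" added at the top of the file, this code has evidently moved to dedicated snapshot files rather than vanished. For readers tracing the deleted __bch2_snapshot_is_ancestor()/get_ancestor_below() pair near the top of the hunk: snapshot IDs grow toward the root, so the test hops upward through a node's skip pointers until it is within bitmap range of the candidate ancestor, then answers with a single bit test. A standalone toy model of that scheme; the sizes, names, and linear-chain setup are all illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SKIP_DEPTH    3
#define ANCESTOR_BITS 64	/* stands in for IS_ANCESTOR_BITMAP */
#define ROOT_ID       900

static struct snap_node {
	uint32_t parent;		/* 0 = none; parents have larger ids */
	uint32_t skip[SKIP_DEPTH];	/* ascending hops to farther ancestors */
	uint64_t is_ancestor;		/* bit n set => id + n + 1 is an ancestor */
} tbl[ROOT_ID + 1];

/* Highest known ancestor of @id not above @ancestor: farthest skip first. */
static uint32_t ancestor_below(uint32_t id, uint32_t ancestor)
{
	const struct snap_node *s = &tbl[id];

	for (int i = SKIP_DEPTH - 1; i >= 0; i--)
		if (s->skip[i] && s->skip[i] <= ancestor)
			return s->skip[i];
	return s->parent;
}

static bool is_ancestor(uint32_t id, uint32_t ancestor)
{
	/* hop upward until the per-node bitmap can answer in O(1) */
	while (id && id + ANCESTOR_BITS < ancestor)
		id = ancestor_below(id, ancestor);

	return id && id < ancestor
		? (tbl[id].is_ancestor >> (ancestor - id - 1)) & 1
		: id == ancestor;
}

int main(void)
{
	/* A single chain 1 -> 2 -> ... -> ROOT_ID, so every larger id is an
	 * ancestor; skip[] samples hops of 8, 128 and 2048, clamped to root. */
	for (uint32_t id = 1; id <= ROOT_ID; id++) {
		tbl[id].parent = id < ROOT_ID ? id + 1 : 0;
		for (int i = 0; tbl[id].parent && i < SKIP_DEPTH; i++) {
			uint32_t hop = id + (8u << (4 * i));
			tbl[id].skip[i] = hop < ROOT_ID ? hop : ROOT_ID;
		}
		for (unsigned n = 0; n < ANCESTOR_BITS; n++)
			if (id + n + 1 <= ROOT_ID)
				tbl[id].is_ancestor |= UINT64_C(1) << n;
	}

	printf("%d %d\n", is_ancestor(5, 890), is_ancestor(5, 950)); /* 1 0 */
	return 0;
}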
@@ -881,7 +28,7 @@ static int check_subvol(struct btree_trans *trans,
 
        subvol = bkey_s_c_to_subvolume(k);
        snapid = le32_to_cpu(subvol.v->snapshot);
-       ret = snapshot_lookup(trans, snapid, &snapshot);
+       ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
 
        if (bch2_err_matches(ret, ENOENT))
                bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
@@ -894,8 +41,7 @@ static int check_subvol(struct btree_trans *trans,
 
                ret = bch2_subvolume_delete(trans, iter->pos.offset);
                if (ret)
-                       bch_err(c, "error deleting subvolume %llu: %s",
-                               iter->pos.offset, bch2_err_str(ret));
+                       bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
                return ret ?: -BCH_ERR_transaction_restart_nested;
        }
 
@@ -916,7 +62,8 @@ static int check_subvol(struct btree_trans *trans,
                if (ret)
                        return ret;
 
-               if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c,
+               if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset,
+                               c, subvol_not_master_and_not_snapshot,
                                "subvolume %llu is not set as snapshot but is not master subvolume",
                                k.k->p.offset)) {
                        struct bkey_i_subvolume *s =
@@ -940,485 +87,30 @@ int bch2_check_subvols(struct bch_fs *c)
        int ret;
 
        ret = bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter,
+               for_each_btree_key_commit(trans, iter,
                        BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
-                       NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
-               check_subvol(&trans, &iter, k)));
-       if (ret)
-               bch_err_fn(c, ret);
-       return ret;
-}
-
-void bch2_fs_snapshots_exit(struct bch_fs *c)
-{
-       kfree(rcu_dereference_protected(c->snapshots, true));
-}
-
-int bch2_snapshots_read(struct bch_fs *c)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       int ret = 0;
-
-       ret = bch2_trans_run(c,
-               for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k,
-                       bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
-                       bch2_snapshot_set_equiv(&trans, k)));
+                       NULL, NULL, BCH_TRANS_COMMIT_lazy_rw|BCH_TRANS_COMMIT_no_enospc,
+               check_subvol(trans, &iter, k)));
        if (ret)
                bch_err_fn(c, ret);
        return ret;
 }
 
-/*
- * Mark a snapshot as deleted, for future cleanup:
- */
-static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
-{
-       struct btree_iter iter;
-       struct bkey_i_snapshot *s;
-       int ret = 0;
-
-       s = bch2_bkey_get_mut_typed(trans, &iter,
-                                   BTREE_ID_snapshots, POS(0, id),
-                                   0, snapshot);
-       ret = PTR_ERR_OR_ZERO(s);
-       if (unlikely(ret)) {
-               bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
-                                       trans->c, "missing snapshot %u", id);
-               return ret;
-       }
-
-       /* already deleted? */
-       if (BCH_SNAPSHOT_DELETED(&s->v))
-               goto err;
-
-       SET_BCH_SNAPSHOT_DELETED(&s->v, true);
-       SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
-       s->v.subvol = 0;
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
-       struct btree_iter tree_iter = (struct btree_iter) { NULL };
-       struct bkey_s_c_snapshot s;
-       u32 parent_id;
-       unsigned i;
-       int ret = 0;
-
-       s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
-                                    BTREE_ITER_INTENT, snapshot);
-       ret = bkey_err(s);
-       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
-                               "missing snapshot %u", id);
-
-       if (ret)
-               goto err;
-
-       BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
-       parent_id = le32_to_cpu(s.v->parent);
-
-       if (parent_id) {
-               struct bkey_i_snapshot *parent;
-
-               parent = bch2_bkey_get_mut_typed(trans, &p_iter,
-                                    BTREE_ID_snapshots, POS(0, parent_id),
-                                    0, snapshot);
-               ret = PTR_ERR_OR_ZERO(parent);
-               if (unlikely(ret)) {
-                       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
-                                               "missing snapshot %u", parent_id);
-                       goto err;
-               }
-
-               for (i = 0; i < 2; i++)
-                       if (le32_to_cpu(parent->v.children[i]) == id)
-                               break;
-
-               if (i == 2)
-                       bch_err(c, "snapshot %u missing child pointer to %u",
-                               parent_id, id);
-               else
-                       parent->v.children[i] = 0;
-
-               if (le32_to_cpu(parent->v.children[0]) <
-                   le32_to_cpu(parent->v.children[1]))
-                       swap(parent->v.children[0],
-                            parent->v.children[1]);
-       } else {
-               /*
-                * We're deleting the root of a snapshot tree: update the
-                * snapshot_tree entry to point to the new root, or delete it if
-                * this is the last snapshot ID in this tree:
-                */
-               struct bkey_i_snapshot_tree *s_t;
-
-               BUG_ON(s.v->children[1]);
-
-               s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
-                               BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
-                               0, snapshot_tree);
-               ret = PTR_ERR_OR_ZERO(s_t);
-               if (ret)
-                       goto err;
-
-               if (s.v->children[0]) {
-                       s_t->v.root_snapshot = s.v->children[0];
-               } else {
-                       s_t->k.type = KEY_TYPE_deleted;
-                       set_bkey_val_u64s(&s_t->k, 0);
-               }
-       }
-
-       ret = bch2_btree_delete_at(trans, &iter, 0);
-err:
-       bch2_trans_iter_exit(trans, &tree_iter);
-       bch2_trans_iter_exit(trans, &p_iter);
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
-                         u32 *new_snapids,
-                         u32 *snapshot_subvols,
-                         unsigned nr_snapids)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter iter;
-       struct bkey_i_snapshot *n;
-       struct bkey_s_c k;
-       unsigned i, j;
-       u32 depth = bch2_snapshot_depth(c, parent);
-       int ret;
-
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
-                            POS_MIN, BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek(&iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       for (i = 0; i < nr_snapids; i++) {
-               k = bch2_btree_iter_prev_slot(&iter);
-               ret = bkey_err(k);
-               if (ret)
-                       goto err;
-
-               if (!k.k || !k.k->p.offset) {
-                       ret = -BCH_ERR_ENOSPC_snapshot_create;
-                       goto err;
-               }
-
-               n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
-               ret = PTR_ERR_OR_ZERO(n);
-               if (ret)
-                       goto err;
-
-               n->v.flags      = 0;
-               n->v.parent     = cpu_to_le32(parent);
-               n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
-               n->v.tree       = cpu_to_le32(tree);
-               n->v.depth      = cpu_to_le32(depth);
-
-               for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
-                       n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent));
-
-               bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
-               SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
-
-               ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
-                                        bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
-               if (ret)
-                       goto err;
-
-               new_snapids[i]  = iter.pos.offset;
-       }
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/*
- * Create new snapshot IDs as children of an existing snapshot ID:
- */
-static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
-                             u32 *new_snapids,
-                             u32 *snapshot_subvols,
-                             unsigned nr_snapids)
-{
-       struct btree_iter iter;
-       struct bkey_i_snapshot *n_parent;
-       int ret = 0;
-
-       n_parent = bch2_bkey_get_mut_typed(trans, &iter,
-                       BTREE_ID_snapshots, POS(0, parent),
-                       0, snapshot);
-       ret = PTR_ERR_OR_ZERO(n_parent);
-       if (unlikely(ret)) {
-               if (bch2_err_matches(ret, ENOENT))
-                       bch_err(trans->c, "snapshot %u not found", parent);
-               return ret;
-       }
-
-       if (n_parent->v.children[0] || n_parent->v.children[1]) {
-               bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
-               ret = -EINVAL;
-               goto err;
-       }
-
-       ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
-                            new_snapids, snapshot_subvols, nr_snapids);
-       if (ret)
-               goto err;
-
-       n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
-       n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
-       n_parent->v.subvol = 0;
-       SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
-err:
-       bch2_trans_iter_exit(trans, &iter);
-       return ret;
-}
-
-/*
- * Create a snapshot node that is the root of a new tree:
- */
-static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
-                             u32 *new_snapids,
-                             u32 *snapshot_subvols,
-                             unsigned nr_snapids)
-{
-       struct bkey_i_snapshot_tree *n_tree;
-       int ret;
-
-       n_tree = __snapshot_tree_create(trans);
-       ret =   PTR_ERR_OR_ZERO(n_tree) ?:
-               create_snapids(trans, 0, n_tree->k.p.offset,
-                            new_snapids, snapshot_subvols, nr_snapids);
-       if (ret)
-               return ret;
-
-       n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
-       n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
-       return 0;
-}
-
-int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
-                             u32 *new_snapids,
-                             u32 *snapshot_subvols,
-                             unsigned nr_snapids)
-{
-       BUG_ON((parent == 0) != (nr_snapids == 1));
-       BUG_ON((parent != 0) != (nr_snapids == 2));
-
-       return parent
-               ? bch2_snapshot_node_create_children(trans, parent,
-                               new_snapids, snapshot_subvols, nr_snapids)
-               : bch2_snapshot_node_create_tree(trans,
-                               new_snapids, snapshot_subvols, nr_snapids);
-
-}
-
-static int snapshot_delete_key(struct btree_trans *trans,
-                              struct btree_iter *iter,
-                              struct bkey_s_c k,
-                              snapshot_id_list *deleted,
-                              snapshot_id_list *equiv_seen,
-                              struct bpos *last_pos)
-{
-       struct bch_fs *c = trans->c;
-       u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
-
-       if (!bkey_eq(k.k->p, *last_pos))
-               equiv_seen->nr = 0;
-       *last_pos = k.k->p;
-
-       if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
-           snapshot_list_has_id(equiv_seen, equiv)) {
-               return bch2_btree_delete_at(trans, iter,
-                                           BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-       } else {
-               return snapshot_list_add(c, equiv_seen, equiv);
-       }
-}
-
-static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter,
-                                         struct bkey_s_c k)
-{
-       struct bkey_s_c_snapshot snap;
-       u32 children[2];
-       int ret;
-
-       if (k.k->type != KEY_TYPE_snapshot)
-               return 0;
-
-       snap = bkey_s_c_to_snapshot(k);
-       if (BCH_SNAPSHOT_DELETED(snap.v) ||
-           BCH_SNAPSHOT_SUBVOL(snap.v))
-               return 0;
-
-       children[0] = le32_to_cpu(snap.v->children[0]);
-       children[1] = le32_to_cpu(snap.v->children[1]);
-
-       ret   = snapshot_live(trans, children[0]) ?:
-               snapshot_live(trans, children[1]);
-       if (ret < 0)
-               return ret;
-
-       if (!ret)
-               return bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
-       return 0;
-}
+/* Subvolumes: */
 
-int bch2_delete_dead_snapshots(struct bch_fs *c)
+int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
+                          enum bkey_invalid_flags flags, struct printbuf *err)
 {
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_s_c_snapshot snap;
-       snapshot_id_list deleted = { 0 };
-       u32 i, id;
        int ret = 0;
 
-       if (!test_bit(BCH_FS_STARTED, &c->flags)) {
-               ret = bch2_fs_read_write_early(c);
-               if (ret) {
-                       bch_err(c, "error deleting dead snapshots: error going rw: %s", bch2_err_str(ret));
-                       return ret;
-               }
-       }
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       /*
-        * For every snapshot node: If we have no live children and it's not
-        * pointed to by a subvolume, delete it:
-        */
-       ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots,
-                       POS_MIN, 0, k,
-                       NULL, NULL, 0,
-               bch2_delete_redundant_snapshot(&trans, &iter, k));
-       if (ret) {
-               bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret));
-               goto err;
-       }
-
-       for_each_btree_key2(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k,
-               bch2_snapshot_set_equiv(&trans, k));
-       if (ret) {
-               bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret));
-               goto err;
-       }
-
-       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
-                          POS_MIN, 0, k, ret) {
-               if (k.k->type != KEY_TYPE_snapshot)
-                       continue;
-
-               snap = bkey_s_c_to_snapshot(k);
-               if (BCH_SNAPSHOT_DELETED(snap.v)) {
-                       ret = snapshot_list_add(c, &deleted, k.k->p.offset);
-                       if (ret)
-                               break;
-               }
-       }
-       bch2_trans_iter_exit(&trans, &iter);
-
-       if (ret) {
-               bch_err(c, "error walking snapshots: %s", bch2_err_str(ret));
-               goto err;
-       }
-
-       for (id = 0; id < BTREE_ID_NR; id++) {
-               struct bpos last_pos = POS_MIN;
-               snapshot_id_list equiv_seen = { 0 };
-
-               if (!btree_type_has_snapshots(id))
-                       continue;
-
-               ret = for_each_btree_key_commit(&trans, iter,
-                               id, POS_MIN,
-                               BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL,
-                       snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos));
-
-               darray_exit(&equiv_seen);
-
-               if (ret) {
-                       bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret));
-                       goto err;
-               }
-       }
-
-       for (i = 0; i < deleted.nr; i++) {
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_snapshot_node_delete(&trans, deleted.data[i]));
-               if (ret) {
-                       bch_err(c, "error deleting snapshot %u: %s",
-                               deleted.data[i], bch2_err_str(ret));
-                       goto err;
-               }
-       }
-
-       clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
-err:
-       darray_exit(&deleted);
-       bch2_trans_exit(&trans);
-       if (ret)
-               bch_err_fn(c, ret);
+       bkey_fsck_err_on(bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
+                        bkey_gt(k.k->p, SUBVOL_POS_MAX), c, err,
+                        subvol_pos_bad,
+                        "invalid pos");
+fsck_err:
        return ret;
 }
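
The validator above shows the bkey_fsck_err_on() pattern this release moves key validation onto: on failure the macro prints its message into @err, records the named fsck error (subvol_pos_bad here), sets ret and jumps to the fsck_err label. A minimal sketch of another validator in the same mold, with the condition and error name invented purely for illustration:

        int example_invalid(struct bch_fs *c, struct bkey_s_c k,
                            enum bkey_invalid_flags flags, struct printbuf *err)
        {
                int ret = 0;

                /* hypothetical check and error name; same control flow as above: */
                bkey_fsck_err_on(!k.k->p.offset, c, err,
                                 example_offset_zero,
                                 "offset must be nonzero");
        fsck_err:
                return ret;
        }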
 
-static void bch2_delete_dead_snapshots_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
-
-       if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags))
-               bch2_delete_dead_snapshots(c);
-       bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
-}
-
-void bch2_delete_dead_snapshots_async(struct bch_fs *c)
-{
-       if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
-           !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
-               bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
-}
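
This helper is relocated rather than dropped (snapshot deletion moves to its own file in this release; the declarations survive in the header below). It is a compact instance of the write-reference idiom used for background work: pin the filesystem against going read-only, queue the work item, and drop the pin immediately if queue_work() returns false because the item was already pending; otherwise the work handler, as above, drops it when it finishes. In outline:

        if (bch2_write_ref_tryget(c, ref)) {            /* pin: keep fs writable  */
                if (!queue_work(wq, work))              /* false: already queued, */
                        bch2_write_ref_put(c, ref);     /* so this pin is surplus */
        }
        /* when the work item runs, its handler drops the reference itself */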
-
-static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
-                                          struct btree_trans_commit_hook *h)
-{
-       struct bch_fs *c = trans->c;
-
-       set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags);
-
-       if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots)
-               return 0;
-
-       bch2_delete_dead_snapshots_async(c);
-       return 0;
-}
-
-/* Subvolumes: */
-
-int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          unsigned flags, struct printbuf *err)
-{
-       if (bkey_lt(k.k->p, SUBVOL_POS_MIN) ||
-           bkey_gt(k.k->p, SUBVOL_POS_MAX)) {
-               prt_printf(err, "invalid pos");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
-}
-
 void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
                            struct bkey_s_c k)
 {
@@ -1459,26 +151,27 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
 {
        struct bch_snapshot snap;
 
-       return  snapshot_lookup(trans, snapshot, &snap) ?:
+       return  bch2_snapshot_lookup(trans, snapshot, &snap) ?:
                bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
 }
 
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
                                u32 *snapid)
 {
        struct btree_iter iter;
-       struct bkey_s_c k;
+       struct bkey_s_c_subvolume subvol;
        int ret;
 
-       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
-                              BTREE_ITER_CACHED|
-                              BTREE_ITER_WITH_UPDATES);
-       ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -BCH_ERR_ENOENT_subvolume;
+       subvol = bch2_bkey_get_iter_typed(trans, &iter,
+                                         BTREE_ID_subvolumes, POS(0, subvolid),
+                                         BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES,
+                                         subvolume);
+       ret = bkey_err(subvol);
+       bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+                               "missing subvolume %u", subvolid);
 
        if (likely(!ret))
-               *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
-       else if (bch2_err_matches(ret, ENOENT))
-               bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
+               *snapid = le32_to_cpu(subvol.v->snapshot);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
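
The rewrite swaps an untyped lookup plus manual key-type check for bch2_bkey_get_iter_typed(), which yields a bkey_s_c_subvolume directly and reports a missing or wrong-typed key through bkey_err(). A usage sketch mirroring the call above (variable names ours):

        struct btree_iter iter;
        struct bkey_s_c_subvolume s =
                bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_subvolumes,
                                         POS(0, subvolid), BTREE_ITER_CACHED,
                                         subvolume);
        int ret = bkey_err(s);               /* ENOENT-class error if absent */

        if (!ret)
                snapid = le32_to_cpu(s.v->snapshot);
        bch2_trans_iter_exit(trans, &iter);  /* exit the iter on all paths */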
@@ -1508,7 +201,12 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
 }
 
 /*
- * Scan for subvolumes with parent @subvolid_to_delete, reparent:
+ * Separate from the snapshot tree in the snapshots btree, we record the tree
+ * structure of how snapshot subvolumes were created - the parent subvolume of
+ * each snapshot subvolume.
+ *
+ * When a subvolume is deleted, we scan for child subvolumes and reparent them,
+ * to avoid dangling references:
  */
 static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
 {
@@ -1521,7 +219,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
                                   BTREE_ITER_CACHED, &s)) ?:
                for_each_btree_key_commit(trans, iter,
                                BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
-                               NULL, NULL, BTREE_INSERT_NOFAIL,
+                               NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                        bch2_subvolume_reparent(trans, &iter, k,
                                        subvolid_to_delete, le32_to_cpu(s.parent)));
 }
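
bch2_subvolumes_reparent() leans on the iterate-and-commit idiom that recurs throughout this diff: as we read the macro's contract, for_each_btree_key_commit() evaluates its final expression once per key and commits after each with the given flags, re-running the expression on transaction restart rather than aborting the walk. Schematically (do_one_key() is hypothetical):

        ret = for_each_btree_key_commit(trans, iter,
                        BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                do_one_key(trans, &iter, k));   /* once per key, then commit */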
@@ -1534,7 +232,6 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
        struct btree_iter iter;
        struct bkey_s_c_subvolume subvol;
-       struct btree_trans_commit_hook *h;
        u32 snapid;
        int ret = 0;
 
@@ -1550,22 +247,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 
        snapid = le32_to_cpu(subvol.v->snapshot);
 
-       ret = bch2_btree_delete_at(trans, &iter, 0);
-       if (ret)
-               goto err;
-
-       ret = bch2_snapshot_node_set_deleted(trans, snapid);
-       if (ret)
-               goto err;
-
-       h = bch2_trans_kmalloc(trans, sizeof(*h));
-       ret = PTR_ERR_OR_ZERO(h);
-       if (ret)
-               goto err;
-
-       h->fn = bch2_delete_dead_snapshots_hook;
-       bch2_trans_commit_hook(trans, h);
-err:
+       ret =   bch2_btree_delete_at(trans, &iter, 0) ?:
+               bch2_snapshot_node_set_deleted(trans, snapid);
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
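
The goto-based unwinding collapses onto GNU C's a ?: b operator, a recurring bcachefs idiom for chaining int-returning steps: it yields a when a is nonzero (an error code), and evaluates b only when a is zero. A self-contained sketch:

        /* hypothetical steps; each returns 0 on success or a negative errcode */
        static int step_one(void) { return 0; }
        static int step_two(void) { return 0; }

        static int do_both(void)
        {
                return step_one() ?:    /* nonzero result short-circuits out  */
                       step_two();      /* runs only if step_one() returned 0 */
        }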
@@ -1573,7 +256,7 @@ err:
 static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
 {
        return bch2_subvolumes_reparent(trans, subvolid) ?:
-               commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+               commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
                          __bch2_subvolume_delete(trans, subvolid));
 }
 
@@ -1597,9 +280,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
                bch2_evict_subvolume_inodes(c, &s);
 
                for (id = s.data; id < s.data + s.nr; id++) {
-                       ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id));
+                       ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
                        if (ret) {
-                               bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret));
+                               bch_err_msg(c, ret, "deleting subvolume %u", *id);
                                break;
                        }
                }
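
Note the calling-convention change: callers no longer declare a struct btree_trans on the stack and pass &trans; bch2_trans_run() now owns the transaction and exposes it to the expression as a pointer named trans. Roughly the shape one would expect the macro to take after this series (a sketch, assuming bch2_trans_get()/bch2_trans_put() acquire and release it):

        #define bch2_trans_run(_c, _do)                                 \
        ({                                                              \
                struct btree_trans *trans = bch2_trans_get(_c);         \
                int _ret = (_do);           /* _do refers to 'trans' */ \
                bch2_trans_put(trans);                                  \
                _ret;                                                   \
        })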
index 6905e91a947087fabdcce1525ade64d5cd4ef684..a1003d30ab0a0c613b644c54fba09964d9ec4b29 100644 (file)
 
 enum bkey_invalid_flags;
 
-void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c,
-                              enum bkey_invalid_flags, struct printbuf *);
-
-#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) {       \
-       .key_invalid    = bch2_snapshot_tree_invalid,           \
-       .val_to_text    = bch2_snapshot_tree_to_text,           \
-       .min_val_size   = 8,                                    \
-})
-
-int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *);
-
-void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c,
-                         enum bkey_invalid_flags, struct printbuf *);
-int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
-                      struct bkey_s_c, struct bkey_s_c, unsigned);
-
-#define bch2_bkey_ops_snapshot ((struct bkey_ops) {            \
-       .key_invalid    = bch2_snapshot_invalid,                \
-       .val_to_text    = bch2_snapshot_to_text,                \
-       .atomic_trigger = bch2_mark_snapshot,                   \
-       .min_val_size   = 24,                                   \
-})
-
-static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
-{
-       return &t->s[U32_MAX - id];
-}
-
-static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
-{
-       return __snapshot_t(rcu_dereference(c->snapshots), id);
-}
-
-static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
-{
-       rcu_read_lock();
-       id = snapshot_t(c, id)->tree;
-       rcu_read_unlock();
-
-       return id;
-}
-
-static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
-       return snapshot_t(c, id)->parent;
-}
-
-static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
-{
-       rcu_read_lock();
-       id = __bch2_snapshot_parent_early(c, id);
-       rcu_read_unlock();
-
-       return id;
-}
-
-static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       u32 parent = snapshot_t(c, id)->parent;
-
-       if (parent &&
-           snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
-               panic("id %u depth=%u parent %u depth=%u\n",
-                     id, snapshot_t(c, id)->depth,
-                     parent, snapshot_t(c, parent)->depth);
-
-       return parent;
-#else
-       return snapshot_t(c, id)->parent;
-#endif
-}
-
-static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
-{
-       rcu_read_lock();
-       id = __bch2_snapshot_parent(c, id);
-       rcu_read_unlock();
-
-       return id;
-}
-
-static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
-{
-       rcu_read_lock();
-       while (n--)
-               id = __bch2_snapshot_parent(c, id);
-       rcu_read_unlock();
-
-       return id;
-}
-
-static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
-{
-       u32 parent;
-
-       rcu_read_lock();
-       while ((parent = __bch2_snapshot_parent(c, id)))
-               id = parent;
-       rcu_read_unlock();
-
-       return id;
-}
-
-static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
-{
-       return snapshot_t(c, id)->equiv;
-}
-
-static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
-{
-       rcu_read_lock();
-       id = __bch2_snapshot_equiv(c, id);
-       rcu_read_unlock();
-
-       return id;
-}
-
-static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
-{
-       return id == bch2_snapshot_equiv(c, id);
-}
-
-static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
-{
-       const struct snapshot_t *s;
-       bool ret;
-
-       rcu_read_lock();
-       s = snapshot_t(c, id);
-       ret = s->children[0];
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
-       return !bch2_snapshot_is_internal_node(c, id);
-}
-
-static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
-{
-       const struct snapshot_t *s;
-       u32 parent = __bch2_snapshot_parent(c, id);
-
-       if (!parent)
-               return 0;
-
-       s = snapshot_t(c, __bch2_snapshot_parent(c, id));
-       if (id == s->children[0])
-               return s->children[1];
-       if (id == s->children[1])
-               return s->children[0];
-       return 0;
-}
-
-bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
-
-static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
-{
-       return id == ancestor
-               ? true
-               : __bch2_snapshot_is_ancestor(c, id, ancestor);
-}
-
-static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
-{
-       const struct snapshot_t *t;
-       bool ret;
-
-       rcu_read_lock();
-       t = snapshot_t(c, id);
-       ret = (t->children[0]|t->children[1]) != 0;
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
-{
-       u32 *i;
-
-       darray_for_each(*s, i)
-               if (*i == id)
-                       return true;
-       return false;
-}
-
-static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
-       u32 *i;
-
-       darray_for_each(*s, i)
-               if (bch2_snapshot_is_ancestor(c, id, *i))
-                       return true;
-       return false;
-}
-
-static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
-{
-       int ret;
-
-       BUG_ON(snapshot_list_has_id(s, id));
-       ret = darray_push(s, id);
-       if (ret)
-               bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
-       return ret;
-}
-
-int bch2_check_snapshot_trees(struct bch_fs *);
-int bch2_check_snapshots(struct bch_fs *);
 int bch2_check_subvols(struct bch_fs *);
 
-void bch2_fs_snapshots_exit(struct bch_fs *);
-int bch2_snapshots_read(struct bch_fs *);
-
-int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c,
-                          unsigned, struct printbuf *);
+int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
+                          enum bkey_invalid_flags, struct printbuf *);
 void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_subvolume ((struct bkey_ops) {           \
@@ -238,14 +21,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c)
 
 int bch2_subvolume_get(struct btree_trans *, unsigned,
                       bool, int, struct bch_subvolume *);
-int bch2_snapshot_get_subvol(struct btree_trans *, u32,
-                            struct bch_subvolume *);
 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
 
-/* only exported for tests: */
-int bch2_snapshot_node_create(struct btree_trans *, u32,
-                             u32 *, u32 *, unsigned);
-
 int bch2_delete_dead_snapshots(struct bch_fs *);
 void bch2_delete_dead_snapshots_async(struct bch_fs *);
 
index 86833445af205643b81bd08b3b204005c7bee071..2d2e66a4e4681ee5ba6ba18666d135ab961a2cbf 100644 (file)
@@ -20,7 +20,7 @@ struct snapshot_t {
 };
 
 struct snapshot_table {
-       struct snapshot_t       s[0];
+       DECLARE_FLEX_ARRAY(struct snapshot_t, s);
 };
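
The GCC zero-length array gives way to the kernel's DECLARE_FLEX_ARRAY() helper: standard C forbids a flexible array member as the sole member of a struct, so the helper pairs it with a zero-sized anchor, keeping the declaration legal while letting fortified bounds checking see a true flexible array. Roughly what the wrapped field expands to:

        struct snapshot_table {
                struct {
                        struct { } __empty_s;   /* zero-sized placeholder    */
                        struct snapshot_t s[];  /* C99 flexible array member */
                };
        };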
 
 typedef struct {
index c9a5a7cb97cf842f3afacdc1846464bc444fdd7c..f4cad903f4d69da7776825f50bf561a1980a02a0 100644 (file)
@@ -1,21 +1,20 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
-#include "btree_update_interior.h"
-#include "buckets.h"
 #include "checksum.h"
 #include "counters.h"
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "io.h"
 #include "journal.h"
-#include "journal_io.h"
 #include "journal_sb.h"
 #include "journal_seq_blacklist.h"
 #include "recovery.h"
 #include "replicas.h"
 #include "quota.h"
+#include "sb-clean.h"
+#include "sb-errors.h"
+#include "sb-members.h"
 #include "super-io.h"
 #include "super.h"
 #include "trace.h"
@@ -24,6 +23,9 @@
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 
+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
+};
+
 struct bch2_metadata_version {
        u16             version;
        const char      *name;
@@ -95,7 +97,7 @@ const char * const bch2_sb_fields[] = {
 static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
                                  struct printbuf *);
 
-struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
                                      enum bch_sb_field_type type)
 {
        struct bch_sb_field *f;
@@ -150,7 +152,7 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
 void bch2_sb_field_delete(struct bch_sb_handle *sb,
                          enum bch_sb_field_type type)
 {
-       struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
+       struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
 
        if (f)
                __bch2_sb_field_resize(sb, f, 0);
@@ -162,7 +164,8 @@ void bch2_free_super(struct bch_sb_handle *sb)
 {
        kfree(sb->bio);
        if (!IS_ERR_OR_NULL(sb->bdev))
-               blkdev_put(sb->bdev, sb->mode);
+               blkdev_put(sb->bdev, sb->holder);
+       kfree(sb->holder);
 
        kfree(sb->sb);
        memset(sb, 0, sizeof(*sb));
@@ -183,7 +186,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
        if (sb->sb && sb->buffer_size >= new_buffer_size)
                return 0;
 
-       if (sb->have_layout) {
+       if (sb->sb && sb->have_layout) {
                u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
 
                if (new_bytes > max_bytes) {
@@ -199,8 +202,14 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
        if (dynamic_fault("bcachefs:add:super_realloc"))
                return -BCH_ERR_ENOMEM_sb_realloc_injected;
 
+       new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
+       if (!new_sb)
+               return -BCH_ERR_ENOMEM_sb_buf_realloc;
+
+       sb->sb = new_sb;
+
        if (sb->have_bio) {
-               unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE);
+               unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size);
 
                bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
                if (!bio)
@@ -212,21 +221,16 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
                sb->bio = bio;
        }
 
-       new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
-       if (!new_sb)
-               return -BCH_ERR_ENOMEM_sb_buf_realloc;
-
-       sb->sb = new_sb;
        sb->buffer_size = new_buffer_size;
 
        return 0;
 }
 
-struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
                                          enum bch_sb_field_type type,
                                          unsigned u64s)
 {
-       struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
+       struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type);
        ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
        ssize_t d = -old_u64s + u64s;
 
@@ -243,16 +247,16 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
                /* XXX: we're not checking that offline devices have enough space */
 
                for_each_online_member(ca, c, i) {
-                       struct bch_sb_handle *sb = &ca->disk_sb;
+                       struct bch_sb_handle *dev_sb = &ca->disk_sb;
 
-                       if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+                       if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
                                percpu_ref_put(&ca->ref);
                                return NULL;
                        }
                }
        }
 
-       f = bch2_sb_field_get(sb->sb, type);
+       f = bch2_sb_field_get_id(sb->sb, type);
        f = __bch2_sb_field_resize(sb, f, u64s);
        if (f)
                f->type = cpu_to_le32(type);
@@ -352,7 +356,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
 {
        struct bch_sb *sb = disk_sb->sb;
        struct bch_sb_field *f;
-       struct bch_sb_field_members *mi;
+       struct bch_sb_field_members_v1 *mi;
        enum bch_opt_id opt_id;
        u16 block_size;
        int ret;
@@ -381,7 +385,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
        }
 
        if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) {
-               prt_printf(out, "Bad intenal UUID (got zeroes)");
+               prt_printf(out, "Bad internal UUID (got zeroes)");
                return -BCH_ERR_invalid_sb_uuid;
        }
 
@@ -455,7 +459,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
        }
 
        /* members must be validated first: */
-       mi = bch2_sb_get_members(sb);
+       mi = bch2_sb_field_get(sb, members_v1);
        if (!mi) {
                prt_printf(out, "Invalid superblock: member info area missing");
                return -BCH_ERR_invalid_sb_members_missing;
@@ -466,7 +470,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
                return ret;
 
        vstruct_for_each(sb, f) {
-               if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
+               if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1)
                        continue;
 
                ret = bch2_sb_field_validate(sb, f, out);
@@ -482,7 +486,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
 static void bch2_sb_update(struct bch_fs *c)
 {
        struct bch_sb *src = c->disk_sb.sb;
-       struct bch_sb_field_members *mi = bch2_sb_get_members(src);
        struct bch_dev *ca;
        unsigned i;
 
@@ -508,8 +511,10 @@ static void bch2_sb_update(struct bch_fs *c)
        c->sb.features          = le64_to_cpu(src->features[0]);
        c->sb.compat            = le64_to_cpu(src->compat[0]);
 
-       for_each_member_device(ca, c, i)
-               ca->mi = bch2_mi_to_cpu(mi->members + i);
+       for_each_member_device(ca, c, i) {
+               struct bch_member m = bch2_sb_member_get(src, i);
+               ca->mi = bch2_mi_to_cpu(&m);
+       }
 }
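
Member info is now copied out by value instead of indexed in place: the members_v2 section (see bch2_sb_members_cpy_v2_v1() further down) records a per-entry stride that future versions may grow, so copying min(stride, sizeof(struct bch_member)) bytes keeps older readers working on newer superblocks. A self-contained illustration of that pattern (names ours, not bcachefs APIs):

        #include <string.h>

        struct member { unsigned long long nbuckets; /* layout may grow */ };

        static struct member member_get(const void *table, size_t stride, int i)
        {
                struct member m = {0};  /* unknown trailing fields read as zero */

                memcpy(&m, (const char *)table + (size_t)i * stride,
                       stride < sizeof(m) ? stride : sizeof(m));
                return m;
        }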
 
 static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
@@ -542,18 +547,20 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
                if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
                        continue;
 
-               src_f = bch2_sb_field_get(src, i);
-               dst_f = bch2_sb_field_get(dst, i);
+               src_f = bch2_sb_field_get_id(src, i);
+               dst_f = bch2_sb_field_get_id(dst, i);
 
                d = (src_f ? le32_to_cpu(src_f->u64s) : 0) -
                    (dst_f ? le32_to_cpu(dst_f->u64s) : 0);
                if (d > 0) {
-                       int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d);
+                       int ret = bch2_sb_realloc(dst_handle,
+                                       le32_to_cpu(dst_handle->sb->u64s) + d);
+
                        if (ret)
                                return ret;
 
                        dst = dst_handle->sb;
-                       dst_f = bch2_sb_field_get(dst, i);
+                       dst_f = bch2_sb_field_get_id(dst, i);
                }
 
                dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
@@ -662,27 +669,30 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 retry:
 #endif
        memset(sb, 0, sizeof(*sb));
-       sb->mode        = FMODE_READ;
+       sb->mode        = BLK_OPEN_READ;
        sb->have_bio    = true;
+       sb->holder      = kmalloc(1, GFP_KERNEL);
+       if (!sb->holder)
+               return -ENOMEM;
 
 #ifndef __KERNEL__
        if (opt_get(*opts, direct_io) == false)
-               sb->mode |= FMODE_BUFFERED;
+               sb->mode |= BLK_OPEN_BUFFERED;
 #endif
 
        if (!opt_get(*opts, noexcl))
-               sb->mode |= FMODE_EXCL;
+               sb->mode |= BLK_OPEN_EXCL;
 
        if (!opt_get(*opts, nochanges))
-               sb->mode |= FMODE_WRITE;
+               sb->mode |= BLK_OPEN_WRITE;
 
-       sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+       sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
        if (IS_ERR(sb->bdev) &&
            PTR_ERR(sb->bdev) == -EACCES &&
            opt_get(*opts, read_only)) {
-               sb->mode &= ~FMODE_WRITE;
+               sb->mode &= ~BLK_OPEN_WRITE;
 
-               sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+               sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
                if (!IS_ERR(sb->bdev))
                        opt_set(*opts, nochanges, true);
        }
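
This tracks the Linux 6.5 block-layer API change: blkdev_get_by_path() takes BLK_OPEN_* flags, an opaque holder pointer identifying the claim, and a blk_holder_ops table (empty above, since bcachefs needs no callbacks), and blkdev_put() must be handed the same holder. The pairing, sketched under those 6.5-era signatures (device path hypothetical):

        static const struct blk_holder_ops ops = { };  /* no mark_dead callback */

        void *holder = kmalloc(1, GFP_KERNEL);         /* any unique address */
        struct block_device *bdev = holder
                ? blkdev_get_by_path("/dev/sdX",
                                     BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL,
                                     holder, &ops)
                : ERR_PTR(-ENOMEM);

        if (!IS_ERR(bdev))
                blkdev_put(bdev, holder);              /* same holder releases it */
        kfree(holder);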
@@ -711,7 +721,7 @@ retry:
        if (opt_defined(*opts, sb))
                goto err;
 
-       printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
+       printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
               path, err.buf);
        printbuf_reset(&err);
 
@@ -773,7 +783,7 @@ got_super:
 
        ret = bch2_sb_validate(sb, &err, READ);
        if (ret) {
-               printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
+               printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
                       path, err.buf);
                goto err_no_print;
        }
@@ -781,7 +791,7 @@ out:
        printbuf_exit(&err);
        return ret;
 err:
-       printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
+       printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
               path, err.buf);
 err_no_print:
        bch2_free_super(sb);
@@ -796,7 +806,12 @@ static void write_super_endio(struct bio *bio)
 
        /* XXX: return errors directly */
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca,
+                              bio_data_dir(bio)
+                              ? BCH_MEMBER_ERROR_write
+                              : BCH_MEMBER_ERROR_read,
+                              "superblock %s error: %s",
+                              bio_data_dir(bio) ? "write" : "read",
                               bch2_blk_status_to_str(bio->bi_status)))
                ca->sb_write_error = 1;
 
@@ -883,6 +898,9 @@ int bch2_write_super(struct bch_fs *c)
        SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
 
        bch2_sb_counters_from_cpu(c);
+       bch2_sb_members_from_cpu(c);
+       bch2_sb_members_cpy_v2_v1(&c->disk_sb);
+       bch2_sb_errors_from_cpu(c);
 
        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);
@@ -1005,235 +1023,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
        mutex_unlock(&c->sb_lock);
 }
 
-/* BCH_SB_FIELD_members: */
-
-static int bch2_sb_members_validate(struct bch_sb *sb,
-                                   struct bch_sb_field *f,
-                                   struct printbuf *err)
-{
-       struct bch_sb_field_members *mi = field_to_type(f, members);
-       unsigned i;
-
-       if ((void *) (mi->members + sb->nr_devices) >
-           vstruct_end(&mi->field)) {
-               prt_printf(err, "too many devices for section size");
-               return -BCH_ERR_invalid_sb_members;
-       }
-
-       for (i = 0; i < sb->nr_devices; i++) {
-               struct bch_member *m = mi->members + i;
-
-               if (!bch2_member_exists(m))
-                       continue;
-
-               if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
-                       prt_printf(err, "device %u: too many buckets (got %llu, max %lu)",
-                              i, le64_to_cpu(m->nbuckets), LONG_MAX);
-                       return -BCH_ERR_invalid_sb_members;
-               }
-
-               if (le64_to_cpu(m->nbuckets) -
-                   le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
-                       prt_printf(err, "device %u: not enough buckets (got %llu, max %u)",
-                              i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
-                       return -BCH_ERR_invalid_sb_members;
-               }
-
-               if (le16_to_cpu(m->bucket_size) <
-                   le16_to_cpu(sb->block_size)) {
-                       prt_printf(err, "device %u: bucket size %u smaller than block size %u",
-                              i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
-                       return -BCH_ERR_invalid_sb_members;
-               }
-
-               if (le16_to_cpu(m->bucket_size) <
-                   BCH_SB_BTREE_NODE_SIZE(sb)) {
-                       prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu",
-                              i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
-                       return -BCH_ERR_invalid_sb_members;
-               }
-       }
-
-       return 0;
-}
-
-static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
-                                   struct bch_sb_field *f)
-{
-       struct bch_sb_field_members *mi = field_to_type(f, members);
-       struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb);
-       unsigned i;
-
-       for (i = 0; i < sb->nr_devices; i++) {
-               struct bch_member *m = mi->members + i;
-               unsigned data_have = bch2_sb_dev_has_data(sb, i);
-               u64 bucket_size = le16_to_cpu(m->bucket_size);
-               u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size;
-
-               if (!bch2_member_exists(m))
-                       continue;
-
-               prt_printf(out, "Device:");
-               prt_tab(out);
-               prt_printf(out, "%u", i);
-               prt_newline(out);
-
-               printbuf_indent_add(out, 2);
-
-               prt_printf(out, "UUID:");
-               prt_tab(out);
-               pr_uuid(out, m->uuid.b);
-               prt_newline(out);
-
-               prt_printf(out, "Size:");
-               prt_tab(out);
-               prt_units_u64(out, device_size << 9);
-               prt_newline(out);
-
-               prt_printf(out, "Bucket size:");
-               prt_tab(out);
-               prt_units_u64(out, bucket_size << 9);
-               prt_newline(out);
-
-               prt_printf(out, "First bucket:");
-               prt_tab(out);
-               prt_printf(out, "%u", le16_to_cpu(m->first_bucket));
-               prt_newline(out);
-
-               prt_printf(out, "Buckets:");
-               prt_tab(out);
-               prt_printf(out, "%llu", le64_to_cpu(m->nbuckets));
-               prt_newline(out);
-
-               prt_printf(out, "Last mount:");
-               prt_tab(out);
-               if (m->last_mount)
-                       pr_time(out, le64_to_cpu(m->last_mount));
-               else
-                       prt_printf(out, "(never)");
-               prt_newline(out);
-
-               prt_printf(out, "State:");
-               prt_tab(out);
-               prt_printf(out, "%s",
-                      BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR
-                      ? bch2_member_states[BCH_MEMBER_STATE(m)]
-                      : "unknown");
-               prt_newline(out);
-
-               prt_printf(out, "Label:");
-               prt_tab(out);
-               if (BCH_MEMBER_GROUP(m)) {
-                       unsigned idx = BCH_MEMBER_GROUP(m) - 1;
-
-                       if (idx < disk_groups_nr(gi))
-                               prt_printf(out, "%s (%u)",
-                                      gi->entries[idx].label, idx);
-                       else
-                               prt_printf(out, "(bad disk labels section)");
-               } else {
-                       prt_printf(out, "(none)");
-               }
-               prt_newline(out);
-
-               prt_printf(out, "Data allowed:");
-               prt_tab(out);
-               if (BCH_MEMBER_DATA_ALLOWED(m))
-                       prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m));
-               else
-                       prt_printf(out, "(none)");
-               prt_newline(out);
-
-               prt_printf(out, "Has data:");
-               prt_tab(out);
-               if (data_have)
-                       prt_bitflags(out, bch2_data_types, data_have);
-               else
-                       prt_printf(out, "(none)");
-               prt_newline(out);
-
-               prt_printf(out, "Discard:");
-               prt_tab(out);
-               prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m));
-               prt_newline(out);
-
-               prt_printf(out, "Freespace initialized:");
-               prt_tab(out);
-               prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
-               prt_newline(out);
-
-               printbuf_indent_sub(out, 2);
-       }
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_members = {
-       .validate       = bch2_sb_members_validate,
-       .to_text        = bch2_sb_members_to_text,
-};
-
-/* BCH_SB_FIELD_crypt: */
-
-static int bch2_sb_crypt_validate(struct bch_sb *sb,
-                                 struct bch_sb_field *f,
-                                 struct printbuf *err)
-{
-       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-       if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
-               prt_printf(err, "wrong size (got %zu should be %zu)",
-                      vstruct_bytes(&crypt->field), sizeof(*crypt));
-               return -BCH_ERR_invalid_sb_crypt;
-       }
-
-       if (BCH_CRYPT_KDF_TYPE(crypt)) {
-               prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
-               return -BCH_ERR_invalid_sb_crypt;
-       }
-
-       return 0;
-}
-
-static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb,
-                                 struct bch_sb_field *f)
-{
-       struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-
-       prt_printf(out, "KFD:               %llu", BCH_CRYPT_KDF_TYPE(crypt));
-       prt_newline(out);
-       prt_printf(out, "scrypt n:          %llu", BCH_KDF_SCRYPT_N(crypt));
-       prt_newline(out);
-       prt_printf(out, "scrypt r:          %llu", BCH_KDF_SCRYPT_R(crypt));
-       prt_newline(out);
-       prt_printf(out, "scrypt p:          %llu", BCH_KDF_SCRYPT_P(crypt));
-       prt_newline(out);
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
-       .validate       = bch2_sb_crypt_validate,
-       .to_text        = bch2_sb_crypt_to_text,
-};
-
-/* BCH_SB_FIELD_clean: */
-
-int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
-{
-       struct jset_entry *entry;
-       int ret;
-
-       for (entry = clean->start;
-            entry < (struct jset_entry *) vstruct_end(&clean->field);
-            entry = vstruct_next(entry)) {
-               ret = bch2_journal_entry_validate(c, NULL, entry,
-                                                 le16_to_cpu(c->disk_sb.sb->version),
-                                                 BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
-                                                 write);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
-}
-
 /* Downgrade if superblock is at a higher version than currently supported: */
 void bch2_sb_maybe_downgrade(struct bch_fs *c)
 {
@@ -1260,232 +1049,6 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version)
        c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
 }
 
-int bch2_fs_mark_dirty(struct bch_fs *c)
-{
-       int ret;
-
-       /*
-        * Unconditionally write superblock, to verify it hasn't changed before
-        * we go rw:
-        */
-
-       mutex_lock(&c->sb_lock);
-       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
-       bch2_sb_maybe_downgrade(c);
-       c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
-
-       ret = bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return ret;
-}
-
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
-       struct jset_entry *entry = *end;
-       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
-       memset(entry, 0, u64s * sizeof(u64));
-       /*
-        * The u64s field counts from the start of data, ignoring the shared
-        * fields.
-        */
-       entry->u64s = cpu_to_le16(u64s - 1);
-
-       *end = vstruct_next(*end);
-       return entry;
-}
-
-void bch2_journal_super_entries_add_common(struct bch_fs *c,
-                                          struct jset_entry **end,
-                                          u64 journal_seq)
-{
-       struct bch_dev *ca;
-       unsigned i, dev;
-
-       percpu_down_read(&c->mark_lock);
-
-       if (!journal_seq) {
-               for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-                       bch2_fs_usage_acc_to_base(c, i);
-       } else {
-               bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
-       }
-
-       {
-               struct jset_entry_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u)),
-                                    struct jset_entry_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = BCH_FS_USAGE_inodes;
-               u->v            = cpu_to_le64(c->usage_base->nr_inodes);
-       }
-
-       {
-               struct jset_entry_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u)),
-                                    struct jset_entry_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = BCH_FS_USAGE_key_version;
-               u->v            = cpu_to_le64(atomic64_read(&c->key_version));
-       }
-
-       for (i = 0; i < BCH_REPLICAS_MAX; i++) {
-               struct jset_entry_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u)),
-                                    struct jset_entry_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = BCH_FS_USAGE_reserved;
-               u->entry.level  = i;
-               u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-       }
-
-       for (i = 0; i < c->replicas.nr; i++) {
-               struct bch_replicas_entry *e =
-                       cpu_replicas_entry(&c->replicas, i);
-               struct jset_entry_data_usage *u =
-                       container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
-                                    struct jset_entry_data_usage, entry);
-
-               u->entry.type   = BCH_JSET_ENTRY_data_usage;
-               u->v            = cpu_to_le64(c->usage_base->replicas[i]);
-               unsafe_memcpy(&u->r, e, replicas_entry_bytes(e),
-                             "embedded variable length struct");
-       }
-
-       for_each_member_device(ca, c, dev) {
-               unsigned b = sizeof(struct jset_entry_dev_usage) +
-                       sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
-               struct jset_entry_dev_usage *u =
-                       container_of(jset_entry_init(end, b),
-                                    struct jset_entry_dev_usage, entry);
-
-               u->entry.type = BCH_JSET_ENTRY_dev_usage;
-               u->dev = cpu_to_le32(dev);
-               u->buckets_ec           = cpu_to_le64(ca->usage_base->buckets_ec);
-
-               for (i = 0; i < BCH_DATA_NR; i++) {
-                       u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
-                       u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
-                       u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
-               }
-       }
-
-       percpu_up_read(&c->mark_lock);
-
-       for (i = 0; i < 2; i++) {
-               struct jset_entry_clock *clock =
-                       container_of(jset_entry_init(end, sizeof(*clock)),
-                                    struct jset_entry_clock, entry);
-
-               clock->entry.type = BCH_JSET_ENTRY_clock;
-               clock->rw       = i;
-               clock->time     = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
-       }
-}
-
-void bch2_fs_mark_clean(struct bch_fs *c)
-{
-       struct bch_sb_field_clean *sb_clean;
-       struct jset_entry *entry;
-       unsigned u64s;
-       int ret;
-
-       mutex_lock(&c->sb_lock);
-       if (BCH_SB_CLEAN(c->disk_sb.sb))
-               goto out;
-
-       SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
-
-       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
-       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
-       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
-       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
-
-       u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
-
-       sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
-       if (!sb_clean) {
-               bch_err(c, "error resizing superblock while setting filesystem clean");
-               goto out;
-       }
-
-       sb_clean->flags         = 0;
-       sb_clean->journal_seq   = cpu_to_le64(atomic64_read(&c->journal.seq));
-
-       /* Trying to catch outstanding bug: */
-       BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
-
-       entry = sb_clean->start;
-       bch2_journal_super_entries_add_common(c, &entry, 0);
-       entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
-       BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
-
-       memset(entry, 0,
-              vstruct_end(&sb_clean->field) - (void *) entry);
-
-       /*
-        * this should be in the write path, and we should be validating every
-        * superblock section:
-        */
-       ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE);
-       if (ret) {
-               bch_err(c, "error writing marking filesystem clean: validate error");
-               goto out;
-       }
-
-       bch2_write_super(c);
-out:
-       mutex_unlock(&c->sb_lock);
-}
-
-static int bch2_sb_clean_validate(struct bch_sb *sb,
-                                 struct bch_sb_field *f,
-                                 struct printbuf *err)
-{
-       struct bch_sb_field_clean *clean = field_to_type(f, clean);
-
-       if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
-               prt_printf(err, "wrong size (got %zu should be %zu)",
-                      vstruct_bytes(&clean->field), sizeof(*clean));
-               return -BCH_ERR_invalid_sb_clean;
-       }
-
-       return 0;
-}
-
-static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb,
-                                 struct bch_sb_field *f)
-{
-       struct bch_sb_field_clean *clean = field_to_type(f, clean);
-       struct jset_entry *entry;
-
-       prt_printf(out, "flags:          %x",   le32_to_cpu(clean->flags));
-       prt_newline(out);
-       prt_printf(out, "journal_seq:    %llu", le64_to_cpu(clean->journal_seq));
-       prt_newline(out);
-
-       for (entry = clean->start;
-            entry != vstruct_end(&clean->field);
-            entry = vstruct_next(entry)) {
-               if (entry->type == BCH_JSET_ENTRY_btree_keys &&
-                   !entry->u64s)
-                       continue;
-
-               bch2_journal_entry_to_text(out, NULL, entry);
-               prt_newline(out);
-       }
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
-       .validate       = bch2_sb_clean_validate,
-       .to_text        = bch2_sb_clean_to_text,
-};
-
 static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
 #define x(f, nr)                                       \
        [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
@@ -1572,7 +1135,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
 void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
                     bool print_layout, unsigned fields)
 {
-       struct bch_sb_field_members *mi;
        struct bch_sb_field *f;
        u64 fields_have = 0;
        unsigned nr_devices = 0;
@@ -1580,15 +1142,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
        if (!out->nr_tabstops)
                printbuf_tabstop_push(out, 44);
 
-       mi = bch2_sb_get_members(sb);
-       if (mi) {
-               struct bch_member *m;
-
-               for (m = mi->members;
-                    m < mi->members + sb->nr_devices;
-                    m++)
-                       nr_devices += bch2_member_exists(m);
-       }
+       for (int i = 0; i < sb->nr_devices; i++)
+               nr_devices += bch2_dev_exists(sb, i);
 
        prt_printf(out, "External UUID:");
        prt_tab(out);
@@ -1628,7 +1183,7 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
        prt_printf(out, "Created:");
        prt_tab(out);
        if (sb->time_base_lo)
-               pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
+               bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC));
        else
                prt_printf(out, "(not set)");
        prt_newline(out);
index 904adea6a0da20e9699a920c00e909fca4703ac5..f5abd102bff7502bd2f142dfde8487c82f8aed29 100644 (file)
@@ -6,6 +6,7 @@
 #include "eytzinger.h"
 #include "super_types.h"
 #include "super.h"
+#include "sb-members.h"
 
 #include <asm/byteorder.h>
 
@@ -22,31 +23,24 @@ u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
                                 unsigned,
                                 unsigned);
 
-struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
-struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
-                                         enum bch_sb_field_type, unsigned);
-void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
+static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
+{
+       return le32_to_cpu(f->u64s) * sizeof(u64);
+}
 
 #define field_to_type(_f, _name)                                       \
        container_of_or_null(_f, struct bch_sb_field_##_name, field)
 
-#define x(_name, _nr)                                                  \
-static inline struct bch_sb_field_##_name *                            \
-bch2_sb_get_##_name(struct bch_sb *sb)                                 \
-{                                                                      \
-       return field_to_type(bch2_sb_field_get(sb,                      \
-                               BCH_SB_FIELD_##_name), _name);          \
-}                                                                      \
-                                                                       \
-static inline struct bch_sb_field_##_name *                            \
-bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s)        \
-{                                                                      \
-       return field_to_type(bch2_sb_field_resize(sb,                   \
-                               BCH_SB_FIELD_##_name, u64s), _name);    \
-}
+struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *, enum bch_sb_field_type);
+#define bch2_sb_field_get(_sb, _name)                                  \
+       field_to_type(bch2_sb_field_get_id(_sb, BCH_SB_FIELD_##_name), _name)
+
+struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *,
+                                            enum bch_sb_field_type, unsigned);
+#define bch2_sb_field_resize(_sb, _name, _u64s)                                \
+       field_to_type(bch2_sb_field_resize_id(_sb, BCH_SB_FIELD_##_name, _u64s), _name)
 
-BCH_SB_FIELDS()
-#undef x
+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
 
 extern const char * const bch2_sb_fields[];
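
The per-field helpers the BCH_SB_FIELDS() x-macro used to stamp out (bch2_sb_get_members(), bch2_sb_resize_clean(), ...) collapse into two generic macros that paste the field name into both the enum value and the result type. Usage, as seen elsewhere in this diff:

        struct bch_sb_field_members_v1 *mi =
                bch2_sb_field_get(sb, members_v1);              /* was bch2_sb_get_members() */

        struct bch_sb_field_clean *clean =
                bch2_sb_field_resize(&c->disk_sb, clean, u64s); /* was bch2_sb_resize_clean() */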
 
@@ -58,6 +52,7 @@ struct bch_sb_field_ops {
 static inline __le64 bch2_sb_magic(struct bch_fs *c)
 {
        __le64 ret;
+
        memcpy(&ret, &c->sb.uuid, sizeof(ret));
        return ret;
 }
@@ -88,52 +83,9 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
                __bch2_check_set_feature(c, feat);
 }
 
-/* BCH_SB_FIELD_members: */
-
-static inline bool bch2_member_exists(struct bch_member *m)
-{
-       return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
-}
-
-static inline bool bch2_dev_exists(struct bch_sb *sb,
-                                  struct bch_sb_field_members *mi,
-                                  unsigned dev)
-{
-       return dev < sb->nr_devices &&
-               bch2_member_exists(&mi->members[dev]);
-}
-
-static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
-{
-       return (struct bch_member_cpu) {
-               .nbuckets       = le64_to_cpu(mi->nbuckets),
-               .first_bucket   = le16_to_cpu(mi->first_bucket),
-               .bucket_size    = le16_to_cpu(mi->bucket_size),
-               .group          = BCH_MEMBER_GROUP(mi),
-               .state          = BCH_MEMBER_STATE(mi),
-               .discard        = BCH_MEMBER_DISCARD(mi),
-               .data_allowed   = BCH_MEMBER_DATA_ALLOWED(mi),
-               .durability     = BCH_MEMBER_DURABILITY(mi)
-                       ? BCH_MEMBER_DURABILITY(mi) - 1
-                       : 1,
-               .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
-               .valid          = bch2_member_exists(mi),
-       };
-}
-
-/* BCH_SB_FIELD_clean: */
-
-void bch2_journal_super_entries_add_common(struct bch_fs *,
-                                          struct jset_entry **, u64);
-
-int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int);
-
 void bch2_sb_maybe_downgrade(struct bch_fs *);
 void bch2_sb_upgrade(struct bch_fs *, unsigned);
 
-int bch2_fs_mark_dirty(struct bch_fs *);
-void bch2_fs_mark_clean(struct bch_fs *);
-
 void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
                           struct bch_sb_field *);
 void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
index eee56969c77934a7788b5e8083f1dc69d61cab6b..bb9451082e872ca6752e085ca1c884563821a157 100644 (file)
@@ -13,6 +13,7 @@
 #include "bkey_sort.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
+#include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "error.h"
 #include "fs.h"
 #include "fs-io.h"
+#include "fs-io-buffered.h"
+#include "fs-io-direct.h"
 #include "fsck.h"
 #include "inode.h"
-#include "io.h"
+#include "io_read.h"
+#include "io_write.h"
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
 #include "rebalance.h"
 #include "recovery.h"
 #include "replicas.h"
+#include "sb-clean.h"
+#include "sb-errors.h"
+#include "sb-members.h"
+#include "snapshot.h"
 #include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
@@ -63,6 +71,7 @@
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_DESCRIPTION("bcachefs filesystem");
 
 #define KTYPE(type)                                                    \
 static const struct attribute_group type ## _group = {                 \
@@ -392,6 +401,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 
        bch_info(c, "going read-write");
 
+       ret = bch2_sb_members_v2_init(c);
+       if (ret)
+               goto err;
+
        ret = bch2_fs_mark_dirty(c);
        if (ret)
                goto err;
@@ -416,6 +429,10 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
                return ret;
        }
 
+       ret = bch2_journal_reclaim_start(&c->journal);
+       if (ret)
+               goto err;
+
        if (!early) {
                ret = bch2_fs_read_write_late(c);
                if (ret)
@@ -425,7 +442,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 #ifndef BCH_WRITE_REF_DEBUG
        percpu_ref_reinit(&c->writes);
 #else
-       for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
+       for (i = 0; i < BCH_WRITE_REF_NR; i++) {
                BUG_ON(atomic_long_read(&c->writes[i]));
                atomic_long_inc(&c->writes[i]);
        }
@@ -460,19 +477,23 @@ int bch2_fs_read_write_early(struct bch_fs *c)
 static void __bch2_fs_free(struct bch_fs *c)
 {
        unsigned i;
-       int cpu;
 
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);
 
        bch2_free_pending_node_rewrites(c);
+       bch2_fs_sb_errors_exit(c);
        bch2_fs_counters_exit(c);
        bch2_fs_snapshots_exit(c);
        bch2_fs_quota_exit(c);
+       bch2_fs_fs_io_direct_exit(c);
+       bch2_fs_fs_io_buffered_exit(c);
        bch2_fs_fsio_exit(c);
        bch2_fs_ec_exit(c);
        bch2_fs_encryption_exit(c);
-       bch2_fs_io_exit(c);
+       bch2_fs_nocow_locking_exit(c);
+       bch2_fs_io_write_exit(c);
+       bch2_fs_io_read_exit(c);
        bch2_fs_buckets_waiting_for_journal_exit(c);
        bch2_fs_btree_interior_update_exit(c);
        bch2_fs_btree_iter_exit(c);
@@ -489,12 +510,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        percpu_free_rwsem(&c->mark_lock);
        free_percpu(c->online_reserved);
 
-       if (c->btree_paths_bufs)
-               for_each_possible_cpu(cpu)
-                       kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
-
        darray_exit(&c->btree_roots_extra);
-       free_percpu(c->btree_paths_bufs);
        free_percpu(c->pcpu);
        mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
@@ -568,13 +584,6 @@ void __bch2_fs_stop(struct bch_fs *c)
                cancel_work_sync(&ca->io_error_work);
 
        cancel_work_sync(&c->read_only_work);
-
-       for (i = 0; i < c->sb.nr_devices; i++) {
-               struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
-
-               if (ca)
-                       bch2_free_super(&ca->disk_sb);
-       }
 }
 
 void bch2_fs_free(struct bch_fs *c)
@@ -588,9 +597,14 @@ void bch2_fs_free(struct bch_fs *c)
        closure_sync(&c->cl);
        closure_debug_destroy(&c->cl);
 
-       for (i = 0; i < c->sb.nr_devices; i++)
-               if (c->devs[i])
-                       bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
+       for (i = 0; i < c->sb.nr_devices; i++) {
+               struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
+
+               if (ca) {
+                       bch2_free_super(&ca->disk_sb);
+                       bch2_dev_free(ca);
+               }
+       }
 
        bch_verbose(c, "shutdown complete");
 
@@ -627,7 +641,9 @@ static int bch2_fs_online(struct bch_fs *c)
        ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
            kobject_add(&c->internal, &c->kobj, "internal") ?:
            kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
            kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+#endif
            kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
            bch2_opts_create_sysfs_files(&c->opts_dir);
        if (ret) {
@@ -655,7 +671,6 @@ err:
 
 static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 {
-       struct bch_sb_field_members *mi;
        struct bch_fs *c;
        struct printbuf name = PRINTBUF;
        unsigned i, iter_size;
@@ -702,6 +717,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        bch2_fs_quota_init(c);
        bch2_fs_ec_init_early(c);
        bch2_fs_move_init(c);
+       bch2_fs_sb_errors_init_early(c);
 
        INIT_LIST_HEAD(&c->list);
 
@@ -709,6 +725,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        mutex_init(&c->bio_bounce_pages_lock);
        mutex_init(&c->snapshot_table_lock);
+       init_rwsem(&c->snapshot_create_lock);
 
        spin_lock_init(&c->btree_write_error_lock);
 
@@ -717,8 +734,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        INIT_LIST_HEAD(&c->journal_iters);
 
-       INIT_LIST_HEAD(&c->fsck_errors);
-       mutex_init(&c->fsck_error_lock);
+       INIT_LIST_HEAD(&c->fsck_error_msgs);
+       mutex_init(&c->fsck_error_msgs_lock);
 
        seqcount_init(&c->gc_pos_lock);
 
@@ -735,7 +752,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        c->journal.flush_write_time     = &c->times[BCH_TIME_journal_flush_write];
        c->journal.noflush_write_time   = &c->times[BCH_TIME_journal_noflush_write];
-       c->journal.blocked_time         = &c->times[BCH_TIME_blocked_journal];
        c->journal.flush_seq_time       = &c->times[BCH_TIME_journal_flush_seq];
 
        bch2_fs_btree_cache_init_early(&c->btree_cache);
@@ -780,6 +796,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
        if (c->opts.inodes_use_key_cache)
                c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+       c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;
 
        c->block_bits           = ilog2(block_sectors(c));
        c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
@@ -817,7 +834,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
            !(c->online_reserved = alloc_percpu(u64)) ||
-           !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
@@ -828,6 +844,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        }
 
        ret = bch2_fs_counters_init(c) ?:
+           bch2_fs_sb_errors_init(c) ?:
            bch2_io_clock_init(&c->io_clock[READ]) ?:
            bch2_io_clock_init(&c->io_clock[WRITE]) ?:
            bch2_fs_journal_init(&c->journal) ?:
@@ -839,18 +856,20 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_buckets_waiting_for_journal_init(c) ?:
            bch2_fs_btree_write_buffer_init(c) ?:
            bch2_fs_subvolumes_init(c) ?:
-           bch2_fs_io_init(c) ?:
+           bch2_fs_io_read_init(c) ?:
+           bch2_fs_io_write_init(c) ?:
            bch2_fs_nocow_locking_init(c) ?:
            bch2_fs_encryption_init(c) ?:
            bch2_fs_compress_init(c) ?:
            bch2_fs_ec_init(c) ?:
-           bch2_fs_fsio_init(c);
+           bch2_fs_fsio_init(c) ?:
+           bch2_fs_fs_io_buffered_init(c) ?:
+           bch2_fs_fs_io_direct_init(c);
        if (ret)
                goto err;
 
-       mi = bch2_sb_get_members(c->disk_sb.sb);
        for (i = 0; i < c->sb.nr_devices; i++)
-               if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
+               if (bch2_dev_exists(c->disk_sb.sb, i) &&
                    bch2_dev_alloc(c, i)) {
                        ret = -EEXIST;
                        goto err;
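/*
 * [Sketch, not part of the patch.] The init chain above leans on GNU C's
 * binary "?:" operator: "a ?: b" yields a when a is nonzero, else b, so a
 * chain of int-returning setup functions evaluates left to right and stops
 * at the first nonzero error code. Standalone demo (gcc/clang extension):
 */
#include <stdio.h>

static int step1(void) { return 0; }	/* succeeds */
static int step2(void) { return -5; }	/* fails */
static int step3(void) { puts("never runs"); return 0; }

int main(void)
{
	int ret = step1() ?:
		  step2() ?:
		  step3();		/* skipped: step2 already failed */

	printf("ret = %d\n", ret);	/* prints ret = -5 */
	return 0;
}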
@@ -915,7 +934,6 @@ static void print_mount_opts(struct bch_fs *c)
 
 int bch2_fs_start(struct bch_fs *c)
 {
-       struct bch_sb_field_members *mi;
        struct bch_dev *ca;
        time64_t now = ktime_get_real_seconds();
        unsigned i;
@@ -929,12 +947,14 @@ int bch2_fs_start(struct bch_fs *c)
 
        mutex_lock(&c->sb_lock);
 
-       for_each_online_member(ca, c, i)
-               bch2_sb_from_fs(c, ca);
+       ret = bch2_sb_members_v2_init(c);
+       if (ret) {
+               mutex_unlock(&c->sb_lock);
+               goto err;
+       }
 
-       mi = bch2_sb_get_members(c->disk_sb.sb);
        for_each_online_member(ca, c, i)
-               mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
+               bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
 
        mutex_unlock(&c->sb_lock);
 
@@ -942,12 +962,6 @@ int bch2_fs_start(struct bch_fs *c)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
-               mutex_lock(&c->btree_transaction_stats[i].lock);
-               bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
-               mutex_unlock(&c->btree_transaction_stats[i].lock);
-       }
-
        ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
                ? bch2_fs_recovery(c)
                : bch2_fs_initialize(c);
@@ -981,22 +995,18 @@ out:
        up_write(&c->state_lock);
        return ret;
 err:
-       bch_err(c, "error starting filesystem: %s", bch2_err_str(ret));
+       bch_err_msg(c, ret, "starting filesystem");
        goto out;
 }
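/*
 * [Sketch, not part of the patch.] Many hunks in this commit replace
 * bch_err(c, "...: %s", bch2_err_str(ret)) with bch_err_msg(c, ret, "..."),
 * which decodes and appends the error itself and does nothing when
 * ret == 0. A simplified stand-in showing why the call can even sit
 * unconditionally above an if (ret), as the tests.c hunks below do:
 */
#include <stdio.h>

#define err_msg(ret, msg)						\
do {									\
	if (ret)							\
		fprintf(stderr, "%s: error %d\n", (msg), (ret));	\
} while (0)

static int start_fs(void) { return -22; }

int main(void)
{
	int ret = start_fs();
	err_msg(ret, "starting filesystem");	/* silent when ret == 0 */
	return ret ? 1 : 0;
}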
 
 static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
 {
-       struct bch_sb_field_members *sb_mi;
-
-       sb_mi = bch2_sb_get_members(sb);
-       if (!sb_mi)
-               return -BCH_ERR_member_info_missing;
+       struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);
 
        if (le16_to_cpu(sb->block_size) != block_sectors(c))
                return -BCH_ERR_mismatched_block_size;
 
-       if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
+       if (le16_to_cpu(m.bucket_size) <
            BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
                return -BCH_ERR_bucket_size_too_small;
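/*
 * [Sketch, not part of the patch.] bch2_sb_member_get() above returns a
 * struct bch_member by value rather than a pointer into the superblock
 * buffer, so callers hold a private copy that cannot dangle if the
 * superblock is later resized and reallocated. Simplified model
 * (types invented):
 */
#include <stdint.h>

struct member { uint16_t bucket_size; uint8_t uuid[16]; };
struct sb     { unsigned nr_devices; struct member members[8]; };

static struct member sb_member_get(const struct sb *sb, unsigned idx)
{
	struct member m = {0};

	if (idx < sb->nr_devices)
		m = sb->members[idx];	/* copy out instead of aliasing */
	return m;
}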
 
@@ -1007,12 +1017,11 @@ static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
 {
        struct bch_sb *newest =
                le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
-       struct bch_sb_field_members *mi = bch2_sb_get_members(newest);
 
        if (!uuid_equal(&fs->uuid, &sb->uuid))
                return -BCH_ERR_device_not_a_member_of_filesystem;
 
-       if (!bch2_dev_exists(newest, mi, sb->dev_idx))
+       if (!bch2_dev_exists(newest, sb->dev_idx))
                return -BCH_ERR_device_has_been_removed;
 
        if (fs->block_size != sb->block_size)
@@ -1127,6 +1136,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
                                        struct bch_member *member)
 {
        struct bch_dev *ca;
+       unsigned i;
 
        ca = kzalloc(sizeof(*ca), GFP_KERNEL);
        if (!ca)
@@ -1144,6 +1154,10 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
        bch2_time_stats_init(&ca->io_latency[WRITE]);
 
        ca->mi = bch2_mi_to_cpu(member);
+
+       for (i = 0; i < ARRAY_SIZE(member->errors); i++)
+               atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));
+
        ca->uuid = member->uuid;
 
        ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
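/*
 * [Sketch, not part of the patch.] The new loop above seeds in-memory
 * per-device error counters from the little-endian values persisted in the
 * member record, so the IO paths can bump them cheaply and sysfs (the
 * io_errors / io_errors_reset attributes added later in this commit) can
 * report or clear them. Simplified model with invented types; the
 * le64_to_cpu conversion is elided:
 */
#include <stdatomic.h>
#include <stdint.h>

#define NR_ERROR_TYPES 3

struct member_disk { uint64_t errors[NR_ERROR_TYPES]; /* __le64 on disk */ };
struct dev_mem     { atomic_uint_fast64_t errors[NR_ERROR_TYPES]; };

static void dev_load_errors(struct dev_mem *ca, const struct member_disk *m)
{
	for (unsigned i = 0; i < NR_ERROR_TYPES; i++)
		atomic_store(&ca->errors[i], m->errors[i]);
}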
@@ -1182,15 +1196,14 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
 
 static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 {
-       struct bch_member *member =
-               bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
+       struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
        struct bch_dev *ca = NULL;
        int ret = 0;
 
        if (bch2_fs_init_fault("dev_alloc"))
                goto err;
 
-       ca = __bch2_dev_alloc(c, member);
+       ca = __bch2_dev_alloc(c, &member);
        if (!ca)
                goto err;
 
@@ -1228,8 +1241,6 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 
        /* Commit: */
        ca->disk_sb = *sb;
-       if (sb->mode & FMODE_EXCL)
-               ca->disk_sb.bdev->bd_holder = ca;
        memset(sb, 0, sizeof(*sb));
 
        ca->dev = ca->disk_sb.bdev->bd_dev;
@@ -1327,7 +1338,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
 
 static bool bch2_fs_may_start(struct bch_fs *c)
 {
-       struct bch_sb_field_members *mi;
        struct bch_dev *ca;
        unsigned i, flags = 0;
 
@@ -1340,10 +1350,9 @@ static bool bch2_fs_may_start(struct bch_fs *c)
        if (!c->opts.degraded &&
            !c->opts.very_degraded) {
                mutex_lock(&c->sb_lock);
-               mi = bch2_sb_get_members(c->disk_sb.sb);
 
                for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
-                       if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
+                       if (!bch2_dev_exists(c->disk_sb.sb, i))
                                continue;
 
                        ca = bch_dev_locked(c, i);
@@ -1383,7 +1392,7 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
                         enum bch_member_state new_state, int flags)
 {
-       struct bch_sb_field_members *mi;
+       struct bch_member *m;
        int ret = 0;
 
        if (ca->mi.state == new_state)
@@ -1398,8 +1407,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        bch_notice(ca, "%s", bch2_member_states[new_state]);
 
        mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb.sb);
-       SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
+       m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+       SET_BCH_MEMBER_STATE(m, new_state);
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
@@ -1448,14 +1457,14 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
                bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
                                        BTREE_TRIGGER_NORUN, NULL);
        if (ret)
-               bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "removing dev alloc info");
 
        return ret;
 }
 
 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 {
-       struct bch_sb_field_members *mi;
+       struct bch_member *m;
        unsigned dev_idx = ca->dev_idx, data;
        int ret;
 
@@ -1477,31 +1486,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 
        ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
        if (ret) {
-               bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "dropping data");
                goto err;
        }
 
        ret = bch2_dev_remove_alloc(c, ca);
        if (ret) {
-               bch_err(ca, "Remove failed, error deleting alloc info");
+               bch_err_msg(ca, ret, "deleting alloc info");
                goto err;
        }
 
        ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
        if (ret) {
-               bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "flushing journal");
                goto err;
        }
 
        ret = bch2_journal_flush(&c->journal);
        if (ret) {
-               bch_err(ca, "Remove failed, journal error");
+               bch_err(ca, "journal error");
                goto err;
        }
 
        ret = bch2_replicas_gc2(c);
        if (ret) {
-               bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "in replicas_gc2()");
                goto err;
        }
 
@@ -1543,8 +1552,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
         * this device must be gone:
         */
        mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb.sb);
-       memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
+       m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
+       memset(&m->uuid, 0, sizeof(m->uuid));
 
        bch2_write_super(c);
 
@@ -1567,7 +1576,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        struct bch_opts opts = bch2_opts_empty();
        struct bch_sb_handle sb;
        struct bch_dev *ca = NULL;
-       struct bch_sb_field_members *mi;
+       struct bch_sb_field_members_v2 *mi;
        struct bch_member dev_mi;
        unsigned dev_idx, nr_devices, u64s;
        struct printbuf errbuf = PRINTBUF;
@@ -1576,14 +1585,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
        ret = bch2_read_super(path, &opts, &sb);
        if (ret) {
-               bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret));
+               bch_err_msg(c, ret, "reading super");
                goto err;
        }
 
-       dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+       dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
 
        if (BCH_MEMBER_GROUP(&dev_mi)) {
-               bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+               bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
                if (label.allocation_failure) {
                        ret = -ENOMEM;
                        goto err;
@@ -1592,13 +1601,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
        ret = bch2_dev_may_add(sb.sb, c);
        if (ret) {
-               bch_err(c, "device add error: %s", bch2_err_str(ret));
+               bch_err_fn(c, ret);
                goto err;
        }
 
        ca = __bch2_dev_alloc(c, &dev_mi);
        if (!ca) {
-               bch2_free_super(&sb);
                ret = -ENOMEM;
                goto err;
        }
@@ -1606,14 +1614,12 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        bch2_dev_usage_init(ca);
 
        ret = __bch2_dev_attach_bdev(ca, &sb);
-       if (ret) {
-               bch2_dev_free(ca);
+       if (ret)
                goto err;
-       }
 
        ret = bch2_dev_journal_alloc(ca);
        if (ret) {
-               bch_err(c, "device add error: journal alloc failed");
+               bch_err_msg(c, ret, "allocating journal");
                goto err;
        }
 
@@ -1622,48 +1628,40 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 
        ret = bch2_sb_from_fs(c, ca);
        if (ret) {
-               bch_err(c, "device add error: new device superblock too small");
-               goto err_unlock;
-       }
-
-       mi = bch2_sb_get_members(ca->disk_sb.sb);
-
-       if (!bch2_sb_resize_members(&ca->disk_sb,
-                               le32_to_cpu(mi->field.u64s) +
-                               sizeof(dev_mi) / sizeof(u64))) {
-               bch_err(c, "device add error: new device superblock too small");
-               ret = -BCH_ERR_ENOSPC_sb_members;
+               bch_err_msg(c, ret, "setting up new superblock");
                goto err_unlock;
        }
 
        if (dynamic_fault("bcachefs:add:no_slot"))
                goto no_slot;
 
-       mi = bch2_sb_get_members(c->disk_sb.sb);
        for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
-               if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
+               if (!bch2_dev_exists(c->disk_sb.sb, dev_idx))
                        goto have_slot;
 no_slot:
-       bch_err(c, "device add error: already have maximum number of devices");
        ret = -BCH_ERR_ENOSPC_sb_members;
+       bch_err_msg(c, ret, "setting up new superblock");
        goto err_unlock;
 
 have_slot:
        nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
-       u64s = (sizeof(struct bch_sb_field_members) +
-               sizeof(struct bch_member) * nr_devices) / sizeof(u64);
 
-       mi = bch2_sb_resize_members(&c->disk_sb, u64s);
+       mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+       u64s = DIV_ROUND_UP(sizeof(struct bch_sb_field_members_v2) +
+                           le16_to_cpu(mi->member_bytes) * nr_devices, sizeof(u64));
+
+       mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s);
        if (!mi) {
-               bch_err(c, "device add error: no room in superblock for member info");
                ret = -BCH_ERR_ENOSPC_sb_members;
+               bch_err_msg(c, ret, "setting up new superblock");
                goto err_unlock;
        }
+       struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
 
        /* success: */
 
-       mi->members[dev_idx] = dev_mi;
-       mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds());
+       *m = dev_mi;
+       m->last_mount = cpu_to_le64(ktime_get_real_seconds());
        c->disk_sb.sb->nr_devices       = nr_devices;
 
        ca->disk_sb.sb->dev_idx = dev_idx;
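/*
 * [Sketch, not part of the patch.] The sizing math above reflects the
 * members_v2 layout: records are le16 member_bytes wide (growable, unlike
 * the fixed-size v1 array), so the field size in u64 units is the header
 * plus nr_devices records, rounded up. Worked example with invented sizes:
 */
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned header_bytes = 16, member_bytes = 56, nr_devices = 3;
	unsigned u64s = DIV_ROUND_UP(header_bytes + member_bytes * nr_devices,
				     sizeof(uint64_t));

	printf("resize to %u u64s\n", u64s);	/* (16 + 56*3 + 7) / 8 = 23 */
	return 0;
}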
@@ -1672,7 +1670,7 @@ have_slot:
        if (BCH_MEMBER_GROUP(&dev_mi)) {
                ret = __bch2_dev_group_set(c, ca, label.buf);
                if (ret) {
-                       bch_err(c, "device add error: error setting label");
+                       bch_err_msg(c, ret, "creating new label");
                        goto err_unlock;
                }
        }
@@ -1684,13 +1682,13 @@ have_slot:
 
        ret = bch2_trans_mark_dev_sb(c, ca);
        if (ret) {
-               bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "marking new superblock");
                goto err_late;
        }
 
        ret = bch2_fs_freespace_init(c);
        if (ret) {
-               bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "initializing free space");
                goto err_late;
        }
 
@@ -1723,7 +1721,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 {
        struct bch_opts opts = bch2_opts_empty();
        struct bch_sb_handle sb = { NULL };
-       struct bch_sb_field_members *mi;
        struct bch_dev *ca;
        unsigned dev_idx;
        int ret;
@@ -1740,7 +1737,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
        ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
        if (ret) {
-               bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret));
+               bch_err_msg(c, ret, "bringing %s online", path);
                goto err;
        }
 
@@ -1752,27 +1749,33 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
        ret = bch2_trans_mark_dev_sb(c, ca);
        if (ret) {
-               bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s",
-                       path, bch2_err_str(ret));
+               bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
                goto err;
        }
 
        if (ca->mi.state == BCH_MEMBER_STATE_rw)
                __bch2_dev_read_write(c, ca);
 
-       mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb.sb);
+       if (!ca->mi.freespace_initialized) {
+               ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+               bch_err_msg(ca, ret, "initializing free space");
+               if (ret)
+                       goto err;
+       }
 
-       mi->members[ca->dev_idx].last_mount =
-               cpu_to_le64(ktime_get_real_seconds());
+       if (!ca->journal.nr) {
+               ret = bch2_dev_journal_alloc(ca);
+               bch_err_msg(ca, ret, "allocating journal");
+               if (ret)
+                       goto err;
+       }
 
+       mutex_lock(&c->sb_lock);
+       bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+               cpu_to_le64(ktime_get_real_seconds());
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       ret = bch2_fs_freespace_init(c);
-       if (ret)
-               bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret));
-
        up_write(&c->state_lock);
        return 0;
 err:
@@ -1805,10 +1808,12 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
 
 int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 {
-       struct bch_member *mi;
+       struct bch_member *m;
+       u64 old_nbuckets;
        int ret = 0;
 
        down_write(&c->state_lock);
+       old_nbuckets = ca->mi.nbuckets;
 
        if (nbuckets < ca->mi.nbuckets) {
                bch_err(ca, "Cannot shrink yet");
@@ -1826,7 +1831,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
        ret = bch2_dev_buckets_resize(c, ca, nbuckets);
        if (ret) {
-               bch_err(ca, "Resize error: %s", bch2_err_str(ret));
+               bch_err_msg(ca, ret, "resizing buckets");
                goto err;
        }
 
@@ -1835,12 +1840,24 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                goto err;
 
        mutex_lock(&c->sb_lock);
-       mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
-       mi->nbuckets = cpu_to_le64(nbuckets);
+       m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+       m->nbuckets = cpu_to_le64(nbuckets);
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
+       if (ca->mi.freespace_initialized) {
+               ret = bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
+               if (ret)
+                       goto err;
+
+               /*
+                * XXX: this is all wrong transactionally - we'll be able to do
+                * this correctly after the disk space accounting rewrite
+                */
+               ca->usage_base->d[BCH_DATA_free].buckets += nbuckets - old_nbuckets;
+       }
+
        bch2_recalc_capacity(c);
 err:
        up_write(&c->state_lock);
@@ -1869,10 +1886,9 @@ found:
 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
                            struct bch_opts opts)
 {
-       struct bch_sb_handle *sb = NULL;
+       DARRAY(struct bch_sb_handle) sbs = { 0 };
        struct bch_fs *c = NULL;
-       struct bch_sb_field_members *mi;
-       unsigned i, best_sb = 0;
+       struct bch_sb_handle *sb, *best = NULL;
        struct printbuf errbuf = PRINTBUF;
        int ret = 0;
 
@@ -1884,51 +1900,46 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
                goto err;
        }
 
-       sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
-       if (!sb) {
-               ret = -ENOMEM;
+       ret = darray_make_room(&sbs, nr_devices);
+       if (ret)
                goto err;
-       }
 
-       for (i = 0; i < nr_devices; i++) {
-               ret = bch2_read_super(devices[i], &opts, &sb[i]);
+       for (unsigned i = 0; i < nr_devices; i++) {
+               struct bch_sb_handle sb = { NULL };
+
+               ret = bch2_read_super(devices[i], &opts, &sb);
                if (ret)
                        goto err;
 
+               BUG_ON(darray_push(&sbs, sb));
        }
 
-       for (i = 1; i < nr_devices; i++)
-               if (le64_to_cpu(sb[i].sb->seq) >
-                   le64_to_cpu(sb[best_sb].sb->seq))
-                       best_sb = i;
-
-       mi = bch2_sb_get_members(sb[best_sb].sb);
+       darray_for_each(sbs, sb)
+               if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+                       best = sb;
 
-       i = 0;
-       while (i < nr_devices) {
-               if (i != best_sb &&
-                   !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
-                       pr_info("%pg has been removed, skipping", sb[i].bdev);
-                       bch2_free_super(&sb[i]);
-                       array_remove_item(sb, nr_devices, i);
+       darray_for_each_reverse(sbs, sb) {
+               if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
+                       pr_info("%pg has been removed, skipping", sb->bdev);
+                       bch2_free_super(sb);
+                       darray_remove_item(&sbs, sb);
+                       best -= best > sb;
                        continue;
                }
 
-               ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
+               ret = bch2_dev_in_fs(best->sb, sb->sb);
                if (ret)
                        goto err_print;
-               i++;
        }
 
-       c = bch2_fs_alloc(sb[best_sb].sb, opts);
-       if (IS_ERR(c)) {
-               ret = PTR_ERR(c);
+       c = bch2_fs_alloc(best->sb, opts);
+       ret = PTR_ERR_OR_ZERO(c);
+       if (ret)
                goto err;
-       }
 
        down_write(&c->state_lock);
-       for (i = 0; i < nr_devices; i++) {
-               ret = bch2_dev_attach_bdev(c, &sb[i]);
+       darray_for_each(sbs, sb) {
+               ret = bch2_dev_attach_bdev(c, sb);
                if (ret) {
                        up_write(&c->state_lock);
                        goto err;
@@ -1947,7 +1958,9 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
                        goto err;
        }
 out:
-       kfree(sb);
+       darray_for_each(sbs, sb)
+               bch2_free_super(sb);
+       darray_exit(&sbs);
        printbuf_exit(&errbuf);
        module_put(THIS_MODULE);
        return c;
@@ -1957,9 +1970,6 @@ err_print:
 err:
        if (!IS_ERR_OR_NULL(c))
                bch2_fs_stop(c);
-       if (sb)
-               for (i = 0; i < nr_devices; i++)
-                       bch2_free_super(&sb[i]);
        c = ERR_PTR(ret);
        goto out;
 }
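/*
 * [Sketch, not part of the patch; the real API lives in libbcachefs/darray.h.]
 * bch2_fs_open() above swaps a kcalloc'd array plus manual index juggling
 * for a darray: a growable array with push / remove / for_each helpers.
 * Minimal userspace model of the shape of that API:
 */
#include <stdlib.h>

#define DARRAY(type) struct { type *data; size_t nr, size; }

#define darray_push(d, item)						\
({									\
	int _ret = 0;							\
	if ((d)->nr == (d)->size) {					\
		size_t _new = (d)->size ? (d)->size * 2 : 8;		\
		void *_p = realloc((d)->data, _new * sizeof(*(d)->data));\
		if (_p) { (d)->data = _p; (d)->size = _new; }		\
		else _ret = -1;						\
	}								\
	if (!_ret)							\
		(d)->data[(d)->nr++] = (item);				\
	_ret;								\
})

#define darray_for_each(d, i)						\
	for ((i) = (d).data; (i) < (d).data + (d).nr; (i)++)

/*
 * Note the "best -= best > sb;" line above: removing an element shifts the
 * tail down, so a retained pointer into the array must step back by one
 * when the removed slot sat before it.
 */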
@@ -2000,6 +2010,7 @@ err:
 BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
+__maybe_unused
 static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
 module_param_named(version, bch2_metadata_version, uint, 0400);
 
index 36bcb9ec2b3ad9c07a8a4f6e0f1154d30c38b0ad..bf762df18012b1a1b463724d665551506fc74384 100644 (file)
@@ -8,220 +8,6 @@
 
 #include <linux/math64.h>
 
-static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
-{
-       return div_u64(s, ca->mi.bucket_size);
-}
-
-static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
-{
-       return ((sector_t) b) * ca->mi.bucket_size;
-}
-
-static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
-{
-       u32 remainder;
-
-       div_u64_rem(s, ca->mi.bucket_size, &remainder);
-       return remainder;
-}
-
-static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s,
-                                                u32 *offset)
-{
-       return div_u64_rem(s, ca->mi.bucket_size, offset);
-}
-
-static inline bool bch2_dev_is_online(struct bch_dev *ca)
-{
-       return !percpu_ref_is_zero(&ca->io_ref);
-}
-
-static inline bool bch2_dev_is_readable(struct bch_dev *ca)
-{
-       return bch2_dev_is_online(ca) &&
-               ca->mi.state != BCH_MEMBER_STATE_failed;
-}
-
-static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
-{
-       if (!percpu_ref_tryget(&ca->io_ref))
-               return false;
-
-       if (ca->mi.state == BCH_MEMBER_STATE_rw ||
-           (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
-               return true;
-
-       percpu_ref_put(&ca->io_ref);
-       return false;
-}
-
-static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
-{
-       return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
-}
-
-static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
-                                        unsigned dev)
-{
-       unsigned i;
-
-       for (i = 0; i < devs.nr; i++)
-               if (devs.devs[i] == dev)
-                       return true;
-
-       return false;
-}
-
-static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
-                                         unsigned dev)
-{
-       unsigned i;
-
-       for (i = 0; i < devs->nr; i++)
-               if (devs->devs[i] == dev) {
-                       array_remove_item(devs->devs, devs->nr, i);
-                       return;
-               }
-}
-
-static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
-                                        unsigned dev)
-{
-       if (!bch2_dev_list_has_dev(*devs, dev)) {
-               BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
-               devs->devs[devs->nr++] = dev;
-       }
-}
-
-static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
-{
-       return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
-}
-
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
-                                             const struct bch_devs_mask *mask)
-{
-       struct bch_dev *ca = NULL;
-
-       while ((*iter = mask
-               ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
-               : *iter) < c->sb.nr_devices &&
-              !(ca = rcu_dereference_check(c->devs[*iter],
-                                           lockdep_is_held(&c->state_lock))))
-               (*iter)++;
-
-       return ca;
-}
-
-#define for_each_member_device_rcu(ca, c, iter, mask)                  \
-       for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
-
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
-{
-       struct bch_dev *ca;
-
-       rcu_read_lock();
-       if ((ca = __bch2_next_dev(c, iter, NULL)))
-               percpu_ref_get(&ca->ref);
-       rcu_read_unlock();
-
-       return ca;
-}
-
-/*
- * If you break early, you must drop your ref on the current device
- */
-#define for_each_member_device(ca, c, iter)                            \
-       for ((iter) = 0;                                                \
-            (ca = bch2_get_next_dev(c, &(iter)));                      \
-            percpu_ref_put(&ca->ref), (iter)++)
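/*
 * [Usage sketch, not part of the patch; this block moved to sb-members.h
 * rather than being deleted.] Per the comment above, for_each_member_device()
 * takes a ref on each device it yields, so an early break must drop it.
 * Fragment assuming a struct bch_fs *c and a target_idx to search for:
 */
struct bch_dev *ca;
unsigned i;

for_each_member_device(ca, c, i) {
	if (ca->dev_idx == target_idx) {
		percpu_ref_put(&ca->ref);	/* pairs with the get inside the macro */
		break;
	}
}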
-
-static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
-                                                     unsigned *iter,
-                                                     int state_mask)
-{
-       struct bch_dev *ca;
-
-       rcu_read_lock();
-       while ((ca = __bch2_next_dev(c, iter, NULL)) &&
-              (!((1 << ca->mi.state) & state_mask) ||
-               !percpu_ref_tryget(&ca->io_ref)))
-               (*iter)++;
-       rcu_read_unlock();
-
-       return ca;
-}
-
-#define __for_each_online_member(ca, c, iter, state_mask)              \
-       for ((iter) = 0;                                                \
-            (ca = bch2_get_next_online_dev(c, &(iter), state_mask));   \
-            percpu_ref_put(&ca->io_ref), (iter)++)
-
-#define for_each_online_member(ca, c, iter)                            \
-       __for_each_online_member(ca, c, iter, ~0)
-
-#define for_each_rw_member(ca, c, iter)                                        \
-       __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
-
-#define for_each_readable_member(ca, c, iter)                          \
-       __for_each_online_member(ca, c, iter,                           \
-               (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
-
-/*
- * If a key exists that references a device, the device won't be going away and
- * we can omit rcu_read_lock():
- */
-static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
-{
-       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
-
-       return rcu_dereference_check(c->devs[idx], 1);
-}
-
-static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
-{
-       EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
-
-       return rcu_dereference_protected(c->devs[idx],
-                                        lockdep_is_held(&c->sb_lock) ||
-                                        lockdep_is_held(&c->state_lock));
-}
-
-/* XXX kill, move to struct bch_fs */
-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
-{
-       struct bch_devs_mask devs;
-       struct bch_dev *ca;
-       unsigned i;
-
-       memset(&devs, 0, sizeof(devs));
-       for_each_online_member(ca, c, i)
-               __set_bit(ca->dev_idx, devs.d);
-       return devs;
-}
-
-static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
-{
-       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
-       u64 b_offset    = bucket_to_sector(ca, b);
-       u64 b_end       = bucket_to_sector(ca, b + 1);
-       unsigned i;
-
-       if (!b)
-               return true;
-
-       for (i = 0; i < layout->nr_superblocks; i++) {
-               u64 offset = le64_to_cpu(layout->sb_offset[i]);
-               u64 end = offset + (1 << layout->sb_max_size_bits);
-
-               if (!(offset >= b_end || end <= b_offset))
-                       return true;
-       }
-
-       return false;
-}
-
 struct bch_fs *bch2_dev_to_fs(dev_t);
 struct bch_fs *bch2_uuid_to_fs(__uuid_t);
 
index 89419fc7930d004f5b68cc80a53630ac625003d3..7dda4985b99fe6cfdde52c6df869e3df446d48d0 100644 (file)
@@ -6,8 +6,9 @@ struct bch_sb_handle {
        struct bch_sb           *sb;
        struct block_device     *bdev;
        struct bio              *bio;
+       void                    *holder;
        size_t                  buffer_size;
-       fmode_t                 mode;
+       blk_mode_t              mode;
        unsigned                have_layout:1;
        unsigned                have_bio:1;
        unsigned                fs_sb:1;
@@ -36,16 +37,4 @@ struct bch_member_cpu {
        u8                      valid;
 };
 
-struct bch_disk_group_cpu {
-       bool                            deleted;
-       u16                             parent;
-       struct bch_devs_mask            devs;
-};
-
-struct bch_disk_groups_cpu {
-       struct rcu_head                 rcu;
-       unsigned                        nr;
-       struct bch_disk_group_cpu       entries[];
-};
-
 #endif /* _BCACHEFS_SUPER_TYPES_H */
index 740305e67bdb30291cddb6c61df3687bd618f608..8df45da5a9bf9b8a795623afdedd69ca99589701 100644 (file)
@@ -113,10 +113,6 @@ do {                                                                       \
                prt_human_readable_s64(out, val);                       \
 } while (0)
 
-#define var_printf(_var, fmt)  sysfs_printf(_var, fmt, var(_var))
-#define var_print(_var)                sysfs_print(_var, var(_var))
-#define var_hprint(_var)       sysfs_hprint(_var, var(_var))
-
 #define sysfs_strtoul(file, var)                                       \
 do {                                                                   \
        if (attr == &sysfs_ ## file)                                    \
@@ -139,30 +135,6 @@ do {                                                                       \
        _v;                                                             \
 })
 
-#define strtoul_restrict_or_return(cp, min, max)                       \
-({                                                                     \
-       unsigned long __v = 0;                                          \
-       int _r = strtoul_safe_restrict(cp, __v, min, max);              \
-       if (_r)                                                         \
-               return _r;                                              \
-       __v;                                                            \
-})
-
-#define strtoi_h_or_return(cp)                                         \
-({                                                                     \
-       u64 _v;                                                         \
-       int _r = strtoi_h(cp, &_v);                                     \
-       if (_r)                                                         \
-               return _r;                                              \
-       _v;                                                             \
-})
-
-#define sysfs_hatoi(file, var)                                         \
-do {                                                                   \
-       if (attr == &sysfs_ ## file)                                    \
-               return strtoi_h(buf, &var) ?: (ssize_t) size;           \
-} while (0)
-
 write_attribute(trigger_gc);
 write_attribute(trigger_discards);
 write_attribute(trigger_invalidates);
@@ -177,7 +149,9 @@ read_attribute(bucket_size);
 read_attribute(first_bucket);
 read_attribute(nbuckets);
 rw_attribute(durability);
-read_attribute(iodone);
+read_attribute(io_done);
+read_attribute(io_errors);
+write_attribute(io_errors_reset);
 
 read_attribute(io_latency_read);
 read_attribute(io_latency_write);
@@ -240,7 +214,7 @@ read_attribute(copy_gc_wait);
 
 rw_attribute(rebalance_enabled);
 sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_work);
+read_attribute(rebalance_status);
 rw_attribute(promote_whole_extents);
 
 read_attribute(new_stripes);
@@ -248,7 +222,6 @@ read_attribute(new_stripes);
 read_attribute(io_timers_read);
 read_attribute(io_timers_write);
 
-read_attribute(data_jobs);
 read_attribute(moving_ctxts);
 
 #ifdef CONFIG_BCACHEFS_TESTS
@@ -281,7 +254,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 
 static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        enum btree_id id;
@@ -292,18 +265,18 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
            incompressible_sectors = 0,
            compressed_sectors_compressed = 0,
            compressed_sectors_uncompressed = 0;
-       int ret;
+       int ret = 0;
 
        if (!test_bit(BCH_FS_STARTED, &c->flags))
                return -EPERM;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       trans = bch2_trans_get(c);
 
        for (id = 0; id < BTREE_ID_NR; id++) {
                if (!btree_type_has_ptrs(id))
                        continue;
 
-               for_each_btree_key(&trans, iter, id, POS_MIN,
+               for_each_btree_key(trans, iter, id, POS_MIN,
                                   BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                        const union bch_extent_entry *entry;
@@ -337,10 +310,10 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
                        else if (compressed)
                                nr_compressed_extents++;
                }
-               bch2_trans_iter_exit(&trans, &iter);
+               bch2_trans_iter_exit(trans, &iter);
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                return ret;
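/*
 * [Sketch, not part of the patch.] This hunk is one instance of a pattern
 * applied throughout the commit: btree_trans moves from a caller-owned
 * stack object to a handle obtained from (and returned to) the fs, which
 * lets the core pool and track transaction objects. Shape of the change:
 */

/* before: */
struct btree_trans trans;
bch2_trans_init(&trans, c, 0, 0);
/* ... for_each_btree_key(&trans, ...) ... */
bch2_trans_exit(&trans);

/* after: */
struct btree_trans *trans = bch2_trans_get(c);
/* ... for_each_btree_key(trans, ...) ... */
bch2_trans_put(trans);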
@@ -370,7 +343,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
 
 static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
 {
-       prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
+       prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree));
        bch2_bpos_to_text(out, c->gc_gens_pos);
        prt_printf(out, "\n");
 }
@@ -415,8 +388,8 @@ SHOW(bch2_fs)
        if (attr == &sysfs_copy_gc_wait)
                bch2_copygc_wait_to_text(out, c);
 
-       if (attr == &sysfs_rebalance_work)
-               bch2_rebalance_work_to_text(out, c);
+       if (attr == &sysfs_rebalance_status)
+               bch2_rebalance_status_to_text(out, c);
 
        sysfs_print(promote_whole_extents,      c->promote_whole_extents);
 
@@ -458,9 +431,6 @@ SHOW(bch2_fs)
        if (attr == &sysfs_io_timers_write)
                bch2_io_timers_to_text(out, &c->io_clock[WRITE]);
 
-       if (attr == &sysfs_data_jobs)
-               bch2_data_jobs_to_text(out, c);
-
        if (attr == &sysfs_moving_ctxts)
                bch2_fs_moving_ctxts_to_text(out, c);
 
@@ -678,10 +648,9 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_copy_gc_wait,
 
        &sysfs_rebalance_enabled,
-       &sysfs_rebalance_work,
+       &sysfs_rebalance_status,
        sysfs_pd_controller_files(rebalance),
 
-       &sysfs_data_jobs,
        &sysfs_moving_ctxts,
 
        &sysfs_internal_uuid,
@@ -740,10 +709,8 @@ STORE(bch2_fs_opts_dir)
        bch2_opt_set_by_id(&c->opts, id, v);
 
        if ((id == Opt_background_target ||
-            id == Opt_background_compression) && v) {
-               bch2_rebalance_add_work(c, S64_MAX);
-               rebalance_wakeup(c);
-       }
+            id == Opt_background_compression) && v)
+               bch2_set_rebalance_needs_scan(c, 0);
 
        ret = size;
 err:
@@ -915,7 +882,7 @@ static const char * const bch2_rw[] = {
        NULL
 };
 
-static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
+static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
 {
        int rw, i;
 
@@ -943,13 +910,8 @@ SHOW(bch2_dev)
        sysfs_print(discard,            ca->mi.discard);
 
        if (attr == &sysfs_label) {
-               if (ca->mi.group) {
-                       mutex_lock(&c->sb_lock);
-                       bch2_disk_path_to_text(out, c->disk_sb.sb,
-                                              ca->mi.group - 1);
-                       mutex_unlock(&c->sb_lock);
-               }
-
+               if (ca->mi.group)
+                       bch2_disk_path_to_text(out, c, ca->mi.group - 1);
                prt_char(out, '\n');
        }
 
@@ -963,8 +925,11 @@ SHOW(bch2_dev)
                prt_char(out, '\n');
        }
 
-       if (attr == &sysfs_iodone)
-               dev_iodone_to_text(out, ca);
+       if (attr == &sysfs_io_done)
+               dev_io_done_to_text(out, ca);
+
+       if (attr == &sysfs_io_errors)
+               bch2_dev_io_errors_to_text(out, ca);
 
        sysfs_print(io_latency_read,            atomic64_read(&ca->cur_latency[READ]));
        sysfs_print(io_latency_write,           atomic64_read(&ca->cur_latency[WRITE]));
@@ -995,7 +960,7 @@ STORE(bch2_dev)
                bool v = strtoul_or_return(buf);
 
                mutex_lock(&c->sb_lock);
-               mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+               mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 
                if (v != BCH_MEMBER_DISCARD(mi)) {
                        SET_BCH_MEMBER_DISCARD(mi, v);
@@ -1008,9 +973,9 @@ STORE(bch2_dev)
                u64 v = strtoul_or_return(buf);
 
                mutex_lock(&c->sb_lock);
-               mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+               mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
 
-               if (v != BCH_MEMBER_DURABILITY(mi)) {
+               if (v + 1 != BCH_MEMBER_DURABILITY(mi)) {
                        SET_BCH_MEMBER_DURABILITY(mi, v + 1);
                        bch2_write_super(c);
                }
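/*
 * [Note with a sketch, not part of the patch.] The "v + 1" comparison above
 * matches the on-disk encoding of durability: the member field stores
 * durability + 1 so that 0 can mean "unset" (treated as the default of 1).
 * The old code compared the raw sysfs value against the encoded field, so
 * it rewrote the superblock even when nothing had changed. Decode sketch,
 * assuming that convention:
 */
static unsigned member_durability(unsigned field)
{
	return field ? field - 1 : 1;	/* 0 => unset => default durability 1 */
}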
@@ -1031,6 +996,9 @@ STORE(bch2_dev)
                        return ret;
        }
 
+       if (attr == &sysfs_io_errors_reset)
+               bch2_dev_errors_reset(ca);
+
        return size;
 }
 SYSFS_OPS(bch2_dev);
@@ -1048,7 +1016,9 @@ struct attribute *bch2_dev_files[] = {
        &sysfs_label,
 
        &sysfs_has_data,
-       &sysfs_iodone,
+       &sysfs_io_done,
+       &sysfs_io_errors,
+       &sysfs_io_errors_reset,
 
        &sysfs_io_latency_read,
        &sysfs_io_latency_write,
index 1d4b0a583586348d42d6f39c1b2d364676a5e2a6..2fc9e60c754b4914b9cd5b5b460c99e1f4b7c3a5 100644 (file)
@@ -4,7 +4,7 @@
 #include "bcachefs.h"
 #include "btree_update.h"
 #include "journal_reclaim.h"
-#include "subvolume.h"
+#include "snapshot.h"
 #include "tests.h"
 
 #include "linux/kthread.h"
@@ -31,7 +31,7 @@ static void delete_test_keys(struct bch_fs *c)
 
 static int test_delete(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_i_cookie k;
        int ret;
@@ -39,44 +39,40 @@ static int test_delete(struct bch_fs *c, u64 nr)
        bkey_cookie_init(&k.k_i);
        k.k.p.snapshot = U32_MAX;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
                             BTREE_ITER_INTENT);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(&trans, &iter, &k.k_i, 0));
-       if (ret) {
-               bch_err_msg(c, ret, "update error");
+               bch2_trans_update(trans, &iter, &k.k_i, 0));
+       bch_err_msg(c, ret, "update error");
+       if (ret)
                goto err;
-       }
 
        pr_info("deleting once");
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_btree_delete_at(&trans, &iter, 0));
-       if (ret) {
-               bch_err_msg(c, ret, "delete error (first)");
+               bch2_btree_delete_at(trans, &iter, 0));
+       bch_err_msg(c, ret, "delete error (first)");
+       if (ret)
                goto err;
-       }
 
        pr_info("deleting twice");
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_btree_delete_at(&trans, &iter, 0));
-       if (ret) {
-               bch_err_msg(c, ret, "delete error (second)");
+               bch2_btree_delete_at(trans, &iter, 0));
+       bch_err_msg(c, ret, "delete error (second)");
+       if (ret)
                goto err;
-       }
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
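/*
 * [Conceptual sketch, not the real macro (see btree_update.h).] commit_do()
 * above runs the given btree operations, chains on a bch2_trans_commit(),
 * and retries the whole sequence whenever the transaction had to be
 * restarted, e.g. after a lock cycle. Roughly:
 */
#define commit_do(trans, disk_res, journal_seq, flags, op)		\
({									\
	int _ret;							\
	do {								\
		bch2_trans_begin(trans);				\
		_ret = (op) ?:						\
		       bch2_trans_commit(trans, disk_res,		\
					 journal_seq, flags);		\
	} while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));	\
	_ret;								\
})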
 
 static int test_delete_written(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_i_cookie k;
        int ret;
@@ -84,214 +80,193 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
        bkey_cookie_init(&k.k_i);
        k.k.p.snapshot = U32_MAX;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p,
                             BTREE_ITER_INTENT);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_trans_update(&trans, &iter, &k.k_i, 0));
-       if (ret) {
-               bch_err_msg(c, ret, "update error");
+               bch2_trans_update(trans, &iter, &k.k_i, 0));
+       bch_err_msg(c, ret, "update error");
+       if (ret)
                goto err;
-       }
 
-       bch2_trans_unlock(&trans);
+       bch2_trans_unlock(trans);
        bch2_journal_flush_all_pins(&c->journal);
 
-       ret = commit_do(&trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                bch2_btree_iter_traverse(&iter) ?:
-               bch2_btree_delete_at(&trans, &iter, 0));
-       if (ret) {
-               bch_err_msg(c, ret, "delete error");
+               bch2_btree_delete_at(trans, &iter, 0));
+       bch_err_msg(c, ret, "delete error");
+       if (ret)
                goto err;
-       }
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_iterate(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        delete_test_keys(c);
 
        pr_info("inserting test keys");
 
        for (i = 0; i < nr; i++) {
-               struct bkey_i_cookie k;
+               struct bkey_i_cookie ck;
 
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = i;
-               k.k.p.snapshot = U32_MAX;
+               bkey_cookie_init(&ck.k_i);
+               ck.k.p.offset = i;
+               ck.k.p.snapshot = U32_MAX;
 
-               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
-                                       NULL, NULL, 0);
-               if (ret) {
-                       bch_err_msg(c, ret, "insert error");
+               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+               bch_err_msg(c, ret, "insert error");
+               if (ret)
                        goto err;
-               }
        }
 
        pr_info("iterating forwards");
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k, ({
                BUG_ON(k.k->p.offset != i++);
                0;
        }));
-       if (ret) {
-               bch_err_msg(c, ret, "error iterating forwards");
+       bch_err_msg(c, ret, "error iterating forwards");
+       if (ret)
                goto err;
-       }
 
        BUG_ON(i != nr);
 
        pr_info("iterating backwards");
 
-       ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs,
+       ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
                                         SPOS(0, U64_MAX, U32_MAX), 0, k,
                ({
                        BUG_ON(k.k->p.offset != --i);
                        0;
                }));
-       if (ret) {
-               bch_err_msg(c, ret, "error iterating backwards");
+       bch_err_msg(c, ret, "error iterating backwards");
+       if (ret)
                goto err;
-       }
 
        BUG_ON(i);
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_iterate_extents(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        delete_test_keys(c);
 
        pr_info("inserting test extents");
 
        for (i = 0; i < nr; i += 8) {
-               struct bkey_i_cookie k;
+               struct bkey_i_cookie ck;
 
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = i + 8;
-               k.k.p.snapshot = U32_MAX;
-               k.k.size = 8;
+               bkey_cookie_init(&ck.k_i);
+               ck.k.p.offset = i + 8;
+               ck.k.p.snapshot = U32_MAX;
+               ck.k.size = 8;
 
-               ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-                                       NULL, NULL, 0);
-               if (ret) {
-                       bch_err_msg(c, ret, "insert error");
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+               bch_err_msg(c, ret, "insert error");
+               if (ret)
                        goto err;
-               }
        }
 
        pr_info("iterating forwards");
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k, ({
                BUG_ON(bkey_start_offset(k.k) != i);
                i = k.k->p.offset;
                0;
        }));
-       if (ret) {
-               bch_err_msg(c, ret, "error iterating forwards");
+       bch_err_msg(c, ret, "error iterating forwards");
+       if (ret)
                goto err;
-       }
 
        BUG_ON(i != nr);
 
        pr_info("iterating backwards");
 
-       ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
                                         SPOS(0, U64_MAX, U32_MAX), 0, k,
                ({
                        BUG_ON(k.k->p.offset != i);
                        i = bkey_start_offset(k.k);
                        0;
                }));
-       if (ret) {
-               bch_err_msg(c, ret, "error iterating backwards");
+       bch_err_msg(c, ret, "error iterating backwards");
+       if (ret)
                goto err;
-       }
 
        BUG_ON(i);
 err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_iterate_slots(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        delete_test_keys(c);
 
        pr_info("inserting test keys");
 
        for (i = 0; i < nr; i++) {
-               struct bkey_i_cookie k;
+               struct bkey_i_cookie ck;
 
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = i * 2;
-               k.k.p.snapshot = U32_MAX;
+               bkey_cookie_init(&ck.k_i);
+               ck.k.p.offset = i * 2;
+               ck.k.p.snapshot = U32_MAX;
 
-               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
-                                       NULL, NULL, 0);
-               if (ret) {
-                       bch_err_msg(c, ret, "insert error");
+               ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
+               bch_err_msg(c, ret, "insert error");
+               if (ret)
                        goto err;
-               }
        }
 
        pr_info("iterating forwards");
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k, ({
                BUG_ON(k.k->p.offset != i);
                i += 2;
                0;
        }));
-       if (ret) {
-               bch_err_msg(c, ret, "error iterating forwards");
+       bch_err_msg(c, ret, "error iterating forwards");
+       if (ret)
                goto err;
-       }
 
        BUG_ON(i != nr * 2);
 
@@ -299,7 +274,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  BTREE_ITER_SLOTS, k, ({
                if (i >= nr * 2)
@@ -317,45 +292,41 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
        }
        ret = 0;
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
 static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        delete_test_keys(c);
 
        pr_info("inserting test keys");
 
        for (i = 0; i < nr; i += 16) {
-               struct bkey_i_cookie k;
+               struct bkey_i_cookie ck;
 
-               bkey_cookie_init(&k.k_i);
-               k.k.p.offset = i + 16;
-               k.k.p.snapshot = U32_MAX;
-               k.k.size = 8;
+               bkey_cookie_init(&ck.k_i);
+               ck.k.p.offset = i + 16;
+               ck.k.p.snapshot = U32_MAX;
+               ck.k.size = 8;
 
-               ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-                                       NULL, NULL, 0);
-               if (ret) {
-                       bch_err_msg(c, ret, "insert error");
+               ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
+               bch_err_msg(c, ret, "insert error");
+               if (ret)
                        goto err;
-               }
        }
 
        pr_info("iterating forwards");
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k, ({
                BUG_ON(bkey_start_offset(k.k) != i + 8);
@@ -363,10 +334,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                i += 16;
                0;
        }));
-       if (ret) {
-               bch_err_msg(c, ret, "error iterating forwards");
+       bch_err_msg(c, ret, "error iterating forwards");
+       if (ret)
                goto err;
-       }
 
        BUG_ON(i != nr);
 
@@ -374,7 +344,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents,
+       ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
                                 SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                 BTREE_ITER_SLOTS, k, ({
                if (i == nr)
@@ -386,13 +356,12 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                i = k.k->p.offset;
                0;
        }));
-       if (ret) {
-               bch_err_msg(c, ret, "error iterating forwards by slots");
+       bch_err_msg(c, ret, "error iterating forwards by slots");
+       if (ret)
                goto err;
-       }
        ret = 0;
 err:
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -402,43 +371,41 @@ err:
  */
 static int test_peek_end(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, U32_MAX), 0);
 
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
        BUG_ON(k.k);
 
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
        BUG_ON(k.k);
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return 0;
 }
 
 static int test_peek_end_extents(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             SPOS(0, 0, U32_MAX), 0);
 
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
        BUG_ON(k.k);
 
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
        BUG_ON(k.k);
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return 0;
 }
 
@@ -458,10 +425,8 @@ static int insert_test_extent(struct bch_fs *c,
        k.k_i.k.size = end - start;
        k.k_i.k.version.lo = test_version++;
 
-       ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
-                               NULL, NULL, 0);
-       if (ret)
-               bch_err_fn(c, ret);
+       ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -515,10 +480,9 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
        k.k_i.k.size = len;
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i,
+               bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
                                            BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
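
Throughout this file, bch2_trans_do() and bch2_trans_run() now bind a pointer named trans, which is why the wrapped expressions pass trans rather than &trans after the switch from a stack-allocated btree_trans to bch2_trans_get()/bch2_trans_put(). A sketch of the assumed post-conversion shape:

/* Sketch, assuming the run wrapper now owns the get/put pairing: */
#define bch2_trans_run_sketch(_c, _do)					\
({									\
	struct btree_trans *trans = bch2_trans_get(_c);			\
	int _ret = (_do);						\
	bch2_trans_put(trans);						\
	_ret;								\
})
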
 
@@ -538,7 +502,7 @@ static int test_extent_create_overlapping(struct bch_fs *c, u64 inum)
 /* Test skipping over keys in unrelated snapshots: */
 static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_cookie cookie;
@@ -546,20 +510,19 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
 
        bkey_cookie_init(&cookie.k_i);
        cookie.k.p.snapshot = snapid_hi;
-       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
-                               NULL, NULL, 0);
+       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
        if (ret)
                return ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+       trans = bch2_trans_get(c);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, snapid_lo), 0);
-       lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
+       lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX))));
 
        BUG_ON(k.k->p.snapshot != U32_MAX);
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -572,13 +535,12 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
 
        bkey_cookie_init(&cookie.k_i);
        cookie.k.p.snapshot = U32_MAX;
-       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
-                               NULL, NULL, 0);
+       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0);
        if (ret)
                return ret;
 
        ret = bch2_trans_do(c, NULL, NULL, 0,
-                     bch2_snapshot_node_create(&trans, U32_MAX,
+                     bch2_snapshot_node_create(trans, U32_MAX,
                                                snapids,
                                                snapid_subvols,
                                                2));
@@ -589,12 +551,8 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
                swap(snapids[0], snapids[1]);
 
        ret = test_snapshot_filter(c, snapids[0], snapids[1]);
-       if (ret) {
-               bch_err_msg(c, ret, "from test_snapshot_filter");
-               return ret;
-       }
-
-       return 0;
+       bch_err_msg(c, ret, "from test_snapshot_filter");
+       return ret;
 }
 
 /* perf tests */
@@ -609,38 +567,34 @@ static u64 test_rand(void)
 
 static int rand_insert(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bkey_i_cookie k;
        int ret = 0;
        u64 i;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < nr; i++) {
                bkey_cookie_init(&k.k_i);
                k.k.p.offset = test_rand();
                k.k.p.snapshot = U32_MAX;
 
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0));
                if (ret)
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
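
commit_do() runs the queued update and the transaction commit as one unit, retrying from the top on transaction restarts. A rough model of the control flow (illustrative; the real macro lives in the btree update headers):

/* Rough model of the commit_do() retry loop: */
#define commit_do_sketch(trans, disk_res, journal_seq, flags, do_expr)	\
({									\
	int _ret;							\
	do {								\
		bch2_trans_begin(trans);				\
		_ret = (do_expr) ?:					\
			bch2_trans_commit(trans, disk_res,		\
					  journal_seq, flags);		\
	} while (bch2_err_matches(_ret, BCH_ERR_transaction_restart));	\
	_ret;								\
})
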
 
 static int rand_insert_multi(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct bkey_i_cookie k[8];
        int ret = 0;
        unsigned j;
        u64 i;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
                for (j = 0; j < ARRAY_SIZE(k); j++) {
                        bkey_cookie_init(&k[j].k_i);
@@ -648,46 +602,45 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
                        k[j].k.p.snapshot = U32_MAX;
                }
 
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
-                       __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0));
                if (ret)
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
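
The ?: chain above short-circuits at the first failing insert while keeping all eight updates in one commit; it is the unrolled form of a loop such as this hypothetical helper:

/* Hypothetical equivalent of the unrolled ?: chain: stop at the first
 * failing insert and hand the result back to commit_do(). */
static int rand_insert_batch(struct btree_trans *trans,
			     struct bkey_i_cookie *k, unsigned nr)
{
	int ret = 0;
	unsigned j;

	for (j = 0; j < nr && !ret; j++)
		ret = bch2_btree_insert_trans(trans, BTREE_ID_xattrs,
					      &k[j].k_i, 0);
	return ret;
}

commit_do(trans, NULL, NULL, 0, rand_insert_batch(trans, k, ARRAY_SIZE(k))) would then replace the chain.
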
 
 static int rand_lookup(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
        u64 i;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, U32_MAX), 0);
 
        for (i = 0; i < nr; i++) {
                bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
 
-               lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
+               lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter)));
                ret = bkey_err(k);
                if (ret)
                        break;
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -703,8 +656,7 @@ static int rand_mixed_trans(struct btree_trans *trans,
 
        k = bch2_btree_iter_peek(iter);
        ret = bkey_err(k);
-       if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               bch_err_msg(trans->c, ret, "lookup error");
+       bch_err_msg(trans->c, ret, "lookup error");
        if (ret)
                return ret;
 
@@ -719,26 +671,25 @@ static int rand_mixed_trans(struct btree_trans *trans,
 
 static int rand_mixed(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_i_cookie cookie;
        int ret = 0;
        u64 i, rand;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs,
                             SPOS(0, 0, U32_MAX), 0);
 
        for (i = 0; i < nr; i++) {
                rand = test_rand();
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       rand_mixed_trans(&trans, &iter, &cookie, i, rand));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       rand_mixed_trans(trans, &iter, &cookie, i, rand));
                if (ret)
                        break;
        }
 
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_iter_exit(trans, &iter);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -766,22 +717,20 @@ err:
 
 static int rand_delete(struct bch_fs *c, u64 nr)
 {
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        int ret = 0;
        u64 i;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        for (i = 0; i < nr; i++) {
                struct bpos pos = SPOS(0, test_rand(), U32_MAX);
 
-               ret = commit_do(&trans, NULL, NULL, 0,
-                       __do_delete(&trans, pos));
+               ret = commit_do(trans, NULL, NULL, 0,
+                       __do_delete(trans, pos));
                if (ret)
                        break;
        }
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
        return ret;
 }
 
@@ -794,14 +743,14 @@ static int seq_insert(struct bch_fs *c, u64 nr)
        bkey_cookie_init(&insert.k_i);
 
        return bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
                                        SPOS(0, 0, U32_MAX),
                                        BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k,
                                        NULL, NULL, 0, ({
                        if (iter.pos.offset >= nr)
                                break;
                        insert.k.p = iter.pos;
-                       bch2_trans_update(&trans, &iter, &insert.k_i, 0);
+                       bch2_trans_update(trans, &iter, &insert.k_i, 0);
                })));
 }
 
@@ -811,7 +760,7 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
        struct bkey_s_c k;
 
        return bch2_trans_run(c,
-               for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs,
+               for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
                                  SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
                                  0, k,
                0));
@@ -823,14 +772,14 @@ static int seq_overwrite(struct bch_fs *c, u64 nr)
        struct bkey_s_c k;
 
        return bch2_trans_run(c,
-               for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs,
+               for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
                                        SPOS(0, 0, U32_MAX),
                                        BTREE_ITER_INTENT, k,
                                        NULL, NULL, 0, ({
                        struct bkey_i_cookie u;
 
                        bkey_reassemble(&u.k_i, k);
-                       bch2_trans_update(&trans, &iter, &u.k_i, 0);
+                       bch2_trans_update(trans, &iter, &u.k_i, 0);
                })));
 }
 
index d294b3d71b367cc14886857b6612efac8167ad2e..dc48b52b01b49c4ed7af877921dd7e2b446d75a8 100644 (file)
@@ -7,10 +7,11 @@
 #include "btree_locking.h"
 #include "btree_update_interior.h"
 #include "keylist.h"
+#include "move_types.h"
 #include "opts.h"
+#include "six.h"
 
 #include <linux/blktrace_api.h>
-#include <linux/six.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
index a743ab47796654a471d4cb1e33d0c6302525b0a5..09a530325dd05e43d18f7e81a3d64a3fbd95c6ea 100644 (file)
@@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(btree_node,
        TP_printk("%d,%d %u %s %llu:%llu:%u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->level,
-                 bch2_btree_ids[__entry->btree_id],
+                 bch2_btree_id_str(__entry->btree_id),
                  __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
 );
 
@@ -137,6 +137,25 @@ DEFINE_EVENT(bio, read_promote,
        TP_ARGS(bio)
 );
 
+TRACE_EVENT(read_nopromote,
+       TP_PROTO(struct bch_fs *c, int ret),
+       TP_ARGS(c, ret),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev             )
+               __array(char,           ret, 32         )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = c->dev;
+               strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret));
+       ),
+
+       TP_printk("%d,%d ret %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ret)
+);
+
 DEFINE_EVENT(bio, read_bounce,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
@@ -177,10 +196,9 @@ DEFINE_EVENT(bio, journal_write,
 TRACE_EVENT(journal_reclaim_start,
        TP_PROTO(struct bch_fs *c, bool direct, bool kicked,
                 u64 min_nr, u64 min_key_cache,
-                u64 prereserved, u64 prereserved_total,
                 u64 btree_cache_dirty, u64 btree_cache_total,
                 u64 btree_key_cache_dirty, u64 btree_key_cache_total),
-       TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total,
+       TP_ARGS(c, direct, kicked, min_nr, min_key_cache,
                btree_cache_dirty, btree_cache_total,
                btree_key_cache_dirty, btree_key_cache_total),
 
@@ -190,8 +208,6 @@ TRACE_EVENT(journal_reclaim_start,
                __field(bool,           kicked                  )
                __field(u64,            min_nr                  )
                __field(u64,            min_key_cache           )
-               __field(u64,            prereserved             )
-               __field(u64,            prereserved_total       )
                __field(u64,            btree_cache_dirty       )
                __field(u64,            btree_cache_total       )
                __field(u64,            btree_key_cache_dirty   )
@@ -204,22 +220,18 @@ TRACE_EVENT(journal_reclaim_start,
                __entry->kicked                 = kicked;
                __entry->min_nr                 = min_nr;
                __entry->min_key_cache          = min_key_cache;
-               __entry->prereserved            = prereserved;
-               __entry->prereserved_total      = prereserved_total;
                __entry->btree_cache_dirty      = btree_cache_dirty;
                __entry->btree_cache_total      = btree_cache_total;
                __entry->btree_key_cache_dirty  = btree_key_cache_dirty;
                __entry->btree_key_cache_total  = btree_key_cache_total;
        ),
 
-       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+       TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu btree cache %llu/%llu key cache %llu/%llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->direct,
                  __entry->kicked,
                  __entry->min_nr,
                  __entry->min_key_cache,
-                 __entry->prereserved,
-                 __entry->prereserved_total,
                  __entry->btree_cache_dirty,
                  __entry->btree_cache_total,
                  __entry->btree_key_cache_dirty,
@@ -403,37 +415,55 @@ TRACE_EVENT(btree_path_relock_fail,
                __field(u8,                     level           )
                TRACE_BPOS_entries(pos)
                __array(char,                   node, 24        )
+               __field(u8,                     self_read_count )
+               __field(u8,                     self_intent_count)
+               __field(u8,                     read_count      )
+               __field(u8,                     intent_count    )
                __field(u32,                    iter_lock_seq   )
                __field(u32,                    node_lock_seq   )
        ),
 
        TP_fast_assign(
                struct btree *b = btree_path_node(path, level);
+               struct six_lock_count c;
 
                strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
                __entry->caller_ip              = caller_ip;
                __entry->btree_id               = path->btree_id;
                __entry->level                  = path->level;
                TRACE_BPOS_assign(pos, path->pos);
-               if (IS_ERR(b))
+
+               c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
+               __entry->self_read_count        = c.n[SIX_LOCK_read];
+               __entry->self_intent_count      = c.n[SIX_LOCK_intent];
+
+               if (IS_ERR(b)) {
                        strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node));
-               else
+               } else {
+                       c = six_lock_counts(&path->l[level].b->c.lock);
+                       __entry->read_count     = c.n[SIX_LOCK_read];
+                       __entry->intent_count   = c.n[SIX_LOCK_intent];
                        scnprintf(__entry->node, sizeof(__entry->node), "%px", b);
+               }
                __entry->iter_lock_seq          = path->l[level].lock_seq;
                __entry->node_lock_seq          = is_btree_node(path, level)
                        ? six_lock_seq(&path->l[level].b->c.lock)
                        : 0;
        ),
 
-       TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u",
+       TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u",
                  __entry->trans_fn,
                  (void *) __entry->caller_ip,
-                 bch2_btree_ids[__entry->btree_id],
+                 bch2_btree_id_str(__entry->btree_id),
                  __entry->pos_inode,
                  __entry->pos_offset,
                  __entry->pos_snapshot,
                  __entry->level,
                  __entry->node,
+                 __entry->self_read_count,
+                 __entry->self_intent_count,
+                 __entry->read_count,
+                 __entry->intent_count,
                  __entry->iter_lock_seq,
                  __entry->node_lock_seq)
 );
@@ -475,7 +505,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
                __entry->self_intent_count      = c.n[SIX_LOCK_intent];
                c = six_lock_counts(&path->l[level].b->c.lock);
                __entry->read_count             = c.n[SIX_LOCK_read];
-               __entry->intent_count           = c.n[SIX_LOCK_read];
+               __entry->intent_count           = c.n[SIX_LOCK_intent];
                __entry->iter_lock_seq          = path->l[level].lock_seq;
                __entry->node_lock_seq          = is_btree_node(path, level)
                        ? six_lock_seq(&path->l[level].b->c.lock)
@@ -485,7 +515,7 @@ TRACE_EVENT(btree_path_upgrade_fail,
        TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u",
                  __entry->trans_fn,
                  (void *) __entry->caller_ip,
-                 bch2_btree_ids[__entry->btree_id],
+                 bch2_btree_id_str(__entry->btree_id),
                  __entry->pos_inode,
                  __entry->pos_offset,
                  __entry->pos_snapshot,
@@ -730,25 +760,36 @@ DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
 );
 
 TRACE_EVENT(move_data,
-       TP_PROTO(struct bch_fs *c, u64 sectors_moved,
-                u64 keys_moved),
-       TP_ARGS(c, sectors_moved, keys_moved),
+       TP_PROTO(struct bch_fs *c,
+                struct bch_move_stats *stats),
+       TP_ARGS(c, stats),
 
        TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u64,            sectors_moved   )
+               __field(dev_t,          dev             )
                __field(u64,            keys_moved      )
+               __field(u64,            keys_raced      )
+               __field(u64,            sectors_seen    )
+               __field(u64,            sectors_moved   )
+               __field(u64,            sectors_raced   )
        ),
 
        TP_fast_assign(
-               __entry->dev                    = c->dev;
-               __entry->sectors_moved = sectors_moved;
-               __entry->keys_moved = keys_moved;
+               __entry->dev            = c->dev;
+               __entry->keys_moved     = atomic64_read(&stats->keys_moved);
+               __entry->keys_raced     = atomic64_read(&stats->keys_raced);
+               __entry->sectors_seen   = atomic64_read(&stats->sectors_seen);
+               __entry->sectors_moved  = atomic64_read(&stats->sectors_moved);
+               __entry->sectors_raced  = atomic64_read(&stats->sectors_raced);
        ),
 
-       TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+       TP_printk("%d,%d keys moved %llu raced %llu"
+                 "sectors seen %llu moved %llu raced %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->sectors_moved, __entry->keys_moved)
+                 __entry->keys_moved,
+                 __entry->keys_raced,
+                 __entry->sectors_seen,
+                 __entry->sectors_moved,
+                 __entry->sectors_raced)
 );
 
 TRACE_EVENT(evacuate_bucket,
@@ -975,7 +1016,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
        TP_printk("%s %pS btree %s pos %llu:%llu:%u",
                  __entry->trans_fn,
                  (void *) __entry->caller_ip,
-                 bch2_btree_ids[__entry->btree_id],
+                 bch2_btree_id_str(__entry->btree_id),
                  __entry->pos_inode,
                  __entry->pos_offset,
                  __entry->pos_snapshot)
@@ -995,13 +1036,16 @@ DEFINE_EVENT(transaction_restart_iter,   trans_restart_btree_node_split,
        TP_ARGS(trans, caller_ip, path)
 );
 
+struct get_locks_fail;
+
 TRACE_EVENT(trans_restart_upgrade,
        TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip,
                 struct btree_path *path,
                 unsigned old_locks_want,
-                unsigned new_locks_want),
-       TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
+                unsigned new_locks_want,
+                struct get_locks_fail *f),
+       TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
 
        TP_STRUCT__entry(
                __array(char,                   trans_fn, 32    )
@@ -1009,6 +1053,11 @@ TRACE_EVENT(trans_restart_upgrade,
                __field(u8,                     btree_id        )
                __field(u8,                     old_locks_want  )
                __field(u8,                     new_locks_want  )
+               __field(u8,                     level           )
+               __field(u32,                    path_seq        )
+               __field(u32,                    node_seq        )
+               __field(u32,                    path_alloc_seq  )
+               __field(u32,                    downgrade_seq   )
                TRACE_BPOS_entries(pos)
        ),
 
@@ -1018,18 +1067,28 @@ TRACE_EVENT(trans_restart_upgrade,
                __entry->btree_id               = path->btree_id;
                __entry->old_locks_want         = old_locks_want;
                __entry->new_locks_want         = new_locks_want;
+               __entry->level                  = f->l;
+               __entry->path_seq               = path->l[f->l].lock_seq;
+               __entry->node_seq               = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
+               __entry->path_alloc_seq         = path->alloc_seq;
+               __entry->downgrade_seq          = path->downgrade_seq;
                TRACE_BPOS_assign(pos, path->pos)
        ),
 
-       TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+       TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
                  __entry->trans_fn,
                  (void *) __entry->caller_ip,
-                 bch2_btree_ids[__entry->btree_id],
+                 bch2_btree_id_str(__entry->btree_id),
                  __entry->pos_inode,
                  __entry->pos_offset,
                  __entry->pos_snapshot,
                  __entry->old_locks_want,
-                 __entry->new_locks_want)
+                 __entry->new_locks_want,
+                 __entry->level,
+                 __entry->path_seq,
+                 __entry->node_seq,
+                 __entry->path_alloc_seq,
+                 __entry->downgrade_seq)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
@@ -1182,7 +1241,7 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
        TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u",
                  __entry->trans_fn,
                  (void *) __entry->caller_ip,
-                 bch2_btree_ids[__entry->btree_id],
+                 bch2_btree_id_str(__entry->btree_id),
                  __entry->pos_inode,
                  __entry->pos_offset,
                  __entry->pos_snapshot,
@@ -1190,6 +1249,42 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
                  __entry->new_u64s)
 );
 
+TRACE_EVENT(path_downgrade,
+       TP_PROTO(struct btree_trans *trans,
+                unsigned long caller_ip,
+                struct btree_path *path,
+                unsigned old_locks_want),
+       TP_ARGS(trans, caller_ip, path, old_locks_want),
+
+       TP_STRUCT__entry(
+               __array(char,                   trans_fn, 32    )
+               __field(unsigned long,          caller_ip       )
+               __field(unsigned,               old_locks_want  )
+               __field(unsigned,               new_locks_want  )
+               __field(unsigned,               btree           )
+               TRACE_BPOS_entries(pos)
+       ),
+
+       TP_fast_assign(
+               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip              = caller_ip;
+               __entry->old_locks_want         = old_locks_want;
+               __entry->new_locks_want         = path->locks_want;
+               __entry->btree                  = path->btree_id;
+               TRACE_BPOS_assign(pos, path->pos);
+       ),
+
+       TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
+                 __entry->trans_fn,
+                 (void *) __entry->caller_ip,
+                 __entry->old_locks_want,
+                 __entry->new_locks_want,
+                 bch2_btree_id_str(__entry->btree),
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot)
+);
+
 DEFINE_EVENT(transaction_event,        trans_restart_write_buffer_flush,
        TP_PROTO(struct btree_trans *trans,
                 unsigned long caller_ip),
index ae4f6de3c27435609cd7b7f009c710683b88893c..c3303b02e5de42629f0baab36d81d63bfc5a4a68 100644 (file)
@@ -112,10 +112,10 @@ got_unit:
 
 #define parse_or_ret(cp, _f)                   \
 do {                                           \
-       int ret = _f;                           \
-       if (ret < 0)                            \
-               return ret;                     \
-       cp += ret;                              \
+       int _ret = _f;                          \
+       if (_ret < 0)                           \
+               return _ret;                    \
+       cp += _ret;                             \
 } while (0)
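
The rename to _ret also removes a shadowing hazard: _f is expanded textually inside the macro, so with the old local named ret, a ret mentioned in the caller's argument would have bound to the macro's fresh, uninitialized variable. A hypothetical caller that the old spelling would have broken:

/* Hypothetical: with the old 'int ret' local, the 'ret' in the macro
 * argument below would have named the macro's own uninitialized copy. */
static int parse_two(const char *cp, u64 *a, u64 *b)
{
	int ret = __bch2_strtou64_h(cp, a);

	if (ret < 0)
		return ret;
	cp += ret;
	parse_or_ret(cp, ret == 0 ? -EINVAL : __bch2_strtou64_h(cp, b));
	return 0;
}
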
 
 static int __bch2_strtou64_h(const char *cp, u64 *res)
@@ -216,6 +216,7 @@ u64 bch2_read_flag_list(char *opt, const char * const list[])
 
        while ((p = strsep(&s, ","))) {
                int flag = match_string(list, -1, p);
+
                if (flag < 0) {
                        ret = -1;
                        break;
@@ -268,6 +269,7 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
 
 int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
 {
+#ifdef CONFIG_STACKTRACE
        unsigned nr_entries = 0;
        int ret = 0;
 
@@ -288,6 +290,9 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
        up_read(&task->signal->exec_update_lock);
 
        return ret;
+#else
+       return 0;
+#endif
 }
 
 void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
@@ -310,6 +315,58 @@ int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
        return ret;
 }
 
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+       time_t t = sec;
+       char buf[64];
+       ctime_r(&t, buf);
+       strim(buf);
+       prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+       char buf[64];
+       snprintf(buf, sizeof(buf), "%ptT", &sec);
+       prt_str(out, buf);
+}
+#endif
+
+static const struct time_unit {
+       const char      *name;
+       u64             nsecs;
+} time_units[] = {
+       { "ns",         1                },
+       { "us",         NSEC_PER_USEC    },
+       { "ms",         NSEC_PER_MSEC    },
+       { "s",          NSEC_PER_SEC     },
+       { "m",          (u64) NSEC_PER_SEC * 60},
+       { "h",          (u64) NSEC_PER_SEC * 3600},
+       { "eon",        U64_MAX          },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+       const struct time_unit *u;
+
+       for (u = time_units;
+            u + 1 < time_units + ARRAY_SIZE(time_units) &&
+            ns >= u[1].nsecs << 1;
+            u++)
+               ;
+
+       return u;
+}
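
The `ns >= u[1].nsecs << 1` test means a unit is only abandoned once the value reaches twice the next unit, so short durations keep the finer granularity. Worked examples (the caller's div_u64() truncates):

/*
 * pick_time_units(1500)                        -> "ns"  ("1500 ns")
 * pick_time_units(2500)                        -> "us"  ("2 us")
 * pick_time_units(90 * NSEC_PER_SEC)           -> "s"   ("90 s", < 2 min)
 * pick_time_units(3 * 3600ULL * NSEC_PER_SEC)  -> "h"   ("3 h")
 */
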
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+       const struct time_unit *u = pick_time_units(ns);
+
+       prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
 /* time stats: */
 
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@@ -354,6 +411,7 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
                mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
                stats->max_duration = max(stats->max_duration, duration);
                stats->min_duration = min(stats->min_duration, duration);
+               stats->total_duration += duration;
                bch2_quantiles_update(&stats->quantiles, duration);
        }
 
@@ -367,20 +425,24 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
        }
 }
 
+static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+                                          struct bch2_time_stat_buffer *b)
+{
+       for (struct bch2_time_stat_buffer_entry *i = b->entries;
+            i < b->entries + ARRAY_SIZE(b->entries);
+            i++)
+               bch2_time_stats_update_one(stats, i->start, i->end);
+       b->nr = 0;
+}
+
 static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
                                                  struct bch2_time_stat_buffer *b)
 {
-       struct bch2_time_stat_buffer_entry *i;
        unsigned long flags;
 
        spin_lock_irqsave(&stats->lock, flags);
-       for (i = b->entries;
-            i < b->entries + ARRAY_SIZE(b->entries);
-            i++)
-               bch2_time_stats_update_one(stats, i->start, i->end);
+       __bch2_time_stats_clear_buffer(stats, b);
        spin_unlock_irqrestore(&stats->lock, flags);
-
-       b->nr = 0;
 }
 
 void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
@@ -418,40 +480,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
                preempt_enable();
        }
 }
-#endif
-
-static const struct time_unit {
-       const char      *name;
-       u64             nsecs;
-} time_units[] = {
-       { "ns",         1                },
-       { "us",         NSEC_PER_USEC    },
-       { "ms",         NSEC_PER_MSEC    },
-       { "s",          NSEC_PER_SEC     },
-       { "m",          (u64) NSEC_PER_SEC * 60},
-       { "h",          (u64) NSEC_PER_SEC * 3600},
-       { "eon",        U64_MAX          },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
-       const struct time_unit *u;
-
-       for (u = time_units;
-            u + 1 < time_units + ARRAY_SIZE(time_units) &&
-            ns >= u[1].nsecs << 1;
-            u++)
-               ;
-
-       return u;
-}
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
-       const struct time_unit *u = pick_time_units(ns);
-
-       prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
-}
 
 static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
 {
@@ -462,8 +490,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
        prt_printf(out, "%s", u->name);
 }
 
-#define TABSTOP_SIZE 12
-
 static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
 {
        prt_str(out, name);
@@ -472,12 +498,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
        prt_newline(out);
 }
 
+#define TABSTOP_SIZE 12
+
 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
 {
        const struct time_unit *u;
        s64 f_mean = 0, d_mean = 0;
        u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
        int i;
+
+       if (stats->buffer) {
+               int cpu;
+
+               spin_lock_irq(&stats->lock);
+               for_each_possible_cpu(cpu)
+                       __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+               spin_unlock_irq(&stats->lock);
+       }
+
        /*
         * avoid divide by zero
         */
@@ -523,6 +561,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 
        pr_name_and_units(out, "min:", stats->min_duration);
        pr_name_and_units(out, "max:", stats->max_duration);
+       pr_name_and_units(out, "total:", stats->total_duration);
 
        prt_printf(out, "mean:");
        prt_tab(out);
@@ -580,6 +619,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
                last_q = q;
        }
 }
+#else
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
+#endif
 
 void bch2_time_stats_exit(struct bch2_time_stats *stats)
 {
@@ -600,11 +642,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats)
 
 /**
  * bch2_ratelimit_delay() - return how long to delay until the next time to do
- * some work
- *
- * @d - the struct bch_ratelimit to update
- *
- * Returns the amount of time to delay by, in jiffies
+ *             some work
+ * @d:         the struct bch_ratelimit to update
+ * Returns:    the amount of time to delay by, in jiffies
  */
 u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
 {
@@ -617,9 +657,8 @@ u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
 
 /**
  * bch2_ratelimit_increment() - increment @d by the amount of work done
- *
- * @d - the struct bch_ratelimit to update
- * @done - the amount of work done, in arbitrary units
+ * @d:         the struct bch_ratelimit to update
+ * @done:      the amount of work done, in arbitrary units
  */
 void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
 {
@@ -756,10 +795,10 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size)
        }
 }
 
-int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask)
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
 {
        while (size) {
-               struct page *page = alloc_pages_noprof(gfp_mask, 0);
+               struct page *page = alloc_pages(gfp_mask, 0);
                unsigned len = min_t(size_t, PAGE_SIZE, size);
 
                if (!page)
@@ -797,9 +836,10 @@ void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
        struct bvec_iter iter;
 
        __bio_for_each_segment(bv, dst, iter, dst_iter) {
-               void *dstp = kmap_atomic(bv.bv_page);
+               void *dstp = kmap_local_page(bv.bv_page);
+
                memcpy(dstp + bv.bv_offset, src, bv.bv_len);
-               kunmap_atomic(dstp);
+               kunmap_local(dstp);
 
                src += bv.bv_len;
        }
@@ -811,9 +851,10 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
        struct bvec_iter iter;
 
        __bio_for_each_segment(bv, src, iter, src_iter) {
-               void *srcp = kmap_atomic(bv.bv_page);
+               void *srcp = kmap_local_page(bv.bv_page);
+
                memcpy(dst, srcp + bv.bv_offset, bv.bv_len);
-               kunmap_atomic(srcp);
+               kunmap_local(srcp);
 
                dst += bv.bv_len;
        }
index 5fa29dab393bb6492163a36cfc3ea7c101059335..54e309d94b9bebeb43c0503b145b6ad38b0771c5 100644 (file)
@@ -60,13 +60,12 @@ static inline void vpfree(void *p, size_t size)
                free_pages((unsigned long) p, get_order(size));
 }
 
-static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask)
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
 {
-       return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN,
-                                             get_order(size)) ?:
-               __vmalloc_noprof(size, gfp_mask);
+       return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+                                        get_order(size)) ?:
+               __vmalloc(size, gfp_mask);
 }
-#define vpmalloc(_size, _gfp)  alloc_hooks(vpmalloc_noprof(_size, _gfp))
 
 static inline void kvpfree(void *p, size_t size)
 {
@@ -76,13 +75,12 @@ static inline void kvpfree(void *p, size_t size)
                vpfree(p, size);
 }
 
-static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask)
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
 {
        return size < PAGE_SIZE
-               ? kmalloc_noprof(size, gfp_mask)
-               : vpmalloc_noprof(size, gfp_mask);
+               ? kmalloc(size, gfp_mask)
+               : vpmalloc(size, gfp_mask);
 }
-#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp))
 
 int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
 
@@ -246,26 +244,7 @@ do {                                                                       \
 #define prt_bitflags(...)              bch2_prt_bitflags(__VA_ARGS__)
 
 void bch2_pr_time_units(struct printbuf *, u64);
-
-#ifdef __KERNEL__
-static inline void pr_time(struct printbuf *out, u64 time)
-{
-       prt_printf(out, "%llu", time);
-}
-#else
-#include <time.h>
-static inline void pr_time(struct printbuf *out, u64 _time)
-{
-       char time_str[64];
-       time_t time = _time;
-       struct tm *tm = localtime(&time);
-       size_t err = strftime(time_str, sizeof(time_str), "%c", tm);
-       if (!err)
-               prt_printf(out, "(formatting error)");
-       else
-               prt_printf(out, "%s", time_str);
-}
-#endif
+void bch2_prt_datetime(struct printbuf *, time64_t);
 
 #ifdef __KERNEL__
 static inline void uuid_unparse_lower(u8 *uuid, char *out)
@@ -393,8 +372,9 @@ struct bch2_time_stat_buffer {
 struct bch2_time_stats {
        spinlock_t      lock;
        /* all fields are in nanoseconds */
-       u64             max_duration;
        u64             min_duration;
+       u64             max_duration;
+       u64             total_duration;
        u64             max_freq;
        u64             min_freq;
        u64             last_event;
@@ -409,15 +389,39 @@ struct bch2_time_stats {
 
 #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
 void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-#endif
 
 static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
 {
        __bch2_time_stats_update(stats, start, local_clock());
 }
 
+static inline bool track_event_change(struct bch2_time_stats *stats,
+                                     u64 *start, bool v)
+{
+       if (v != !!*start) {
+               if (!v) {
+                       bch2_time_stats_update(stats, *start);
+                       *start = 0;
+               } else {
+                       *start = local_clock() ?: 1;
+                       return true;
+               }
+       }
+
+       return false;
+}
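
track_event_change() turns a boolean level into duration accounting: on the rising edge it stamps *start (local_clock() forced nonzero) and returns true exactly once, so the caller can emit a tracepoint; on the falling edge it folds the elapsed time into stats. A hypothetical caller:

/* Hypothetical: accumulate how long some condition stays asserted. */
static void update_blocked_stats(struct bch2_time_stats *stats,
				 u64 *since, bool blocked)
{
	if (track_event_change(stats, since, blocked))
		pr_debug("condition asserted");	/* rising edge only */
}
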
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+                                     u64 *start, bool v)
+{
+       bool ret = v && !*start;
+       *start = v;
+       return ret;
+}
+#endif
+
 void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
 
 void bch2_time_stats_exit(struct bch2_time_stats *);
@@ -468,8 +472,10 @@ struct bch_pd_controller {
        s64                     last_change;
        s64                     last_target;
 
-       /* If true, the rate will not increase if bch2_ratelimit_delay()
-        * is not being called often enough. */
+       /*
+        * If true, the rate will not increase if bch2_ratelimit_delay()
+        * is not being called often enough.
+        */
        bool                    backpressure;
 };
 
@@ -532,9 +538,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
 }
 
 void bch2_bio_map(struct bio *bio, void *base, size_t);
-int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t);
-#define bch2_bio_alloc_pages(_bio, _size, _gfp)                                \
-       alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp))
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
 
 static inline sector_t bdev_sectors(struct block_device *bdev)
 {
@@ -607,6 +611,7 @@ static inline void __memcpy_u64s(void *dst, const void *src,
 {
 #ifdef CONFIG_X86_64
        long d0, d1, d2;
+
        asm volatile("rep ; movsq"
                     : "=&c" (d0), "=&D" (d1), "=&S" (d2)
                     : "0" (u64s), "1" (dst), "2" (src)
@@ -683,6 +688,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src,
 
 #ifdef CONFIG_X86_64
        long d0, d1, d2;
+
        asm volatile("std ;\n"
                     "rep ; movsq\n"
                     "cld ;\n"
@@ -775,12 +781,12 @@ static inline void __move_gap(void *array, size_t element_size,
 
 #define bubble_sort(_base, _nr, _cmp)                                  \
 do {                                                                   \
-       ssize_t _i, _end;                                               \
+       ssize_t _i, _last;                                              \
        bool _swapped = true;                                           \
                                                                        \
-       for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+       for (_last = (ssize_t) (_nr) - 1; _last > 0 && _swapped; --_last) {\
                _swapped = false;                                       \
-               for (_i = 0; _i < _end; _i++)                           \
+               for (_i = 0; _i < _last; _i++)                          \
                        if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {   \
                                swap((_base)[_i], (_base)[_i + 1]);     \
                                _swapped = true;                        \
@@ -841,6 +847,11 @@ static inline int u8_cmp(u8 l, u8 r)
        return cmp_int(l, r);
 }
 
+static inline int cmp_le32(__le32 l, __le32 r)
+{
+       return cmp_int(le32_to_cpu(l), le32_to_cpu(r));
+}
+
 #include <linux/uuid.h>
 
 #endif /* _BCACHEFS_UTIL_H */
index ef030fc0244807ee10064a9dab5d83fd9791f45f..cb4f33ed9ab374fbd50bbf74419335564f55df9a 100644 (file)
 
 /**
  * bch2_varint_encode - encode a variable length integer
- * @out - destination to encode to
- * @v  - unsigned integer to encode
- *
- * Returns the size in bytes of the encoded integer - at most 9 bytes
+ * @out:       destination to encode to
+ * @v:         unsigned integer to encode
+ * Returns:    size in bytes of the encoded integer - at most 9 bytes
  */
 int bch2_varint_encode(u8 *out, u64 v)
 {
@@ -40,11 +39,10 @@ int bch2_varint_encode(u8 *out, u64 v)
 
 /**
 * bch2_varint_decode - decode a variable length integer
- * @in - varint to decode
- * @end        - end of buffer to decode from
- * @out        - on success, decoded integer
- *
- * Returns the size in bytes of the decoded integer - or -1 on failure (would
+ * @in:                varint to decode
+ * @end:       end of buffer to decode from
+ * @out:       on success, decoded integer
+ * Returns:    size in bytes of the decoded integer - or -1 on failure (would
  * have read past the end of the buffer)
  */
 int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
@@ -59,6 +57,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
 
        if (likely(bytes < 9)) {
                __le64 v_le = 0;
+
                memcpy(&v_le, in, bytes);
                v = le64_to_cpu(v_le);
                v >>= bytes;
@@ -72,6 +71,9 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
 
 /**
  * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ * @out:       destination to encode to
+ * @v:         unsigned integer to encode
+ * Returns:    size in bytes of the encoded integer - at most 9 bytes
  *
  * This version assumes it's always safe to write 8 bytes to @out, even if the
  * encoded integer would be smaller.
@@ -95,6 +97,11 @@ int bch2_varint_encode_fast(u8 *out, u64 v)
 
 /**
  * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ * @in:                varint to decode
+ * @end:       end of buffer to decode from
+ * @out:       on success, decoded integer
+ * Returns:    size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
  *
  * This version assumes that it is safe to read at most 8 bytes past the end of
  * @end (we still return an error if the varint extends past @end).
index 53a694d71967196ad2784f89da5ea5c3966644a1..a6561b4b36a6e15cf020a82ba2c6741659dbf757 100644 (file)
        (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
 
 #define vstruct_next(_s)                                               \
-       ((typeof(_s))                   ((_s)->_data + __vstruct_u64s(_s)))
+       ((typeof(_s))                   ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 #define vstruct_last(_s)                                               \
-       ((typeof(&(_s)->start[0]))      ((_s)->_data + __vstruct_u64s(_s)))
+       ((typeof(&(_s)->start[0]))      ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 #define vstruct_end(_s)                                                        \
-       ((void *)                       ((_s)->_data + __vstruct_u64s(_s)))
+       ((void *)                       ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
 
 #define vstruct_for_each(_s, _i)                                       \
        for (_i = (_s)->start;                                          \
index 70f78006daf20b18536059a76a87e9a5b1d76f2d..79d982674c180307f5d5a4da42fabaa480878573 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "acl.h"
 #include "bkey_methods.h"
 #include "btree_update.h"
 #include "extents.h"
@@ -69,46 +70,38 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
        .cmp_bkey       = xattr_cmp_bkey,
 };
 
-int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k,
+int bch2_xattr_invalid(struct bch_fs *c, struct bkey_s_c k,
                       enum bkey_invalid_flags flags,
                       struct printbuf *err)
 {
-       const struct xattr_handler *handler;
        struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
+       unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+                                          le16_to_cpu(xattr.v->x_val_len));
+       int ret = 0;
 
-       if (bkey_val_u64s(k.k) <
-           xattr_val_u64s(xattr.v->x_name_len,
-                          le16_to_cpu(xattr.v->x_val_len))) {
-               prt_printf(err, "value too small (%zu < %u)",
-                      bkey_val_u64s(k.k),
-                      xattr_val_u64s(xattr.v->x_name_len,
-                                     le16_to_cpu(xattr.v->x_val_len)));
-               return -BCH_ERR_invalid_bkey;
-       }
+       bkey_fsck_err_on(bkey_val_u64s(k.k) < val_u64s, c, err,
+                        xattr_val_size_too_small,
+                        "value too small (%zu < %u)",
+                        bkey_val_u64s(k.k), val_u64s);
 
        /* XXX why +4 ? */
-       if (bkey_val_u64s(k.k) >
-           xattr_val_u64s(xattr.v->x_name_len,
-                          le16_to_cpu(xattr.v->x_val_len) + 4)) {
-               prt_printf(err, "value too big (%zu > %u)",
-                      bkey_val_u64s(k.k),
-                      xattr_val_u64s(xattr.v->x_name_len,
-                                     le16_to_cpu(xattr.v->x_val_len) + 4));
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       handler = bch2_xattr_type_to_handler(xattr.v->x_type);
-       if (!handler) {
-               prt_printf(err, "invalid type (%u)", xattr.v->x_type);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) {
-               prt_printf(err, "xattr name has invalid characters");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       val_u64s = xattr_val_u64s(xattr.v->x_name_len,
+                                 le16_to_cpu(xattr.v->x_val_len) + 4);
+
+       bkey_fsck_err_on(bkey_val_u64s(k.k) > val_u64s, c, err,
+                        xattr_val_size_too_big,
+                        "value too big (%zu > %u)",
+                        bkey_val_u64s(k.k), val_u64s);
+
+       bkey_fsck_err_on(!bch2_xattr_type_to_handler(xattr.v->x_type), c, err,
+                        xattr_invalid_type,
+                        "invalid type (%u)", xattr.v->x_type);
+
+       bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), c, err,
+                        xattr_name_invalid_chars,
+                        "xattr name has invalid characters");
+fsck_err:
+       return ret;
 }
 
 void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -130,6 +123,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
               xattr.v->x_name,
               le16_to_cpu(xattr.v->x_val_len),
               (char *) xattr_val(xattr.v));
+
+       if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+           xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT) {
+               prt_char(out, ' ');
+               bch2_acl_to_text(out, xattr_val(xattr.v),
+                                le16_to_cpu(xattr.v->x_val_len));
+       }
 }
 
 static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode,
@@ -299,24 +299,22 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
        struct bch_fs *c = dentry->d_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
-       struct btree_trans trans;
+       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
        u64 offset = 0, inum = inode->ei_inode.bi_inum;
        u32 snapshot;
        int ret;
-
-       bch2_trans_init(&trans, c, 0, 0);
 retry:
-       bch2_trans_begin(&trans);
+       bch2_trans_begin(trans);
        iter = (struct btree_iter) { NULL };
 
-       ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+       ret = bch2_subvolume_get_snapshot(trans, inode->ei_subvol, &snapshot);
        if (ret)
                goto err;
 
-       for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs,
+       for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_xattrs,
                           SPOS(inum, offset, snapshot),
                           POS(inum, U64_MAX), 0, k, ret) {
                if (k.k->type != KEY_TYPE_xattr)
@@ -328,12 +326,12 @@ retry:
        }
 
        offset = iter.pos.offset;
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 err:
        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                goto retry;
 
-       bch2_trans_exit(&trans);
+       bch2_trans_put(trans);
 
        if (ret)
                goto out;
@@ -358,7 +356,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        int ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags));
+               bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
 
        return bch2_err_class(ret);
 }
@@ -373,18 +371,14 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
        struct bch_inode_unpacked inode_u;
-       struct btree_trans trans;
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       ret = commit_do(&trans, NULL, NULL, 0,
-                       bch2_xattr_set(&trans, inode_inum(inode), &inode_u,
+       ret = bch2_trans_run(c,
+               commit_do(trans, NULL, NULL, 0,
+                       bch2_xattr_set(trans, inode_inum(inode), &inode_u,
                                       &hash, name, value, size,
-                                      handler->flags, flags));
-       if (!ret)
-               bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
-       bch2_trans_exit(&trans);
+                                      handler->flags, flags)) ?:
+               (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
 
        return bch2_err_class(ret);
 }
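
The ?: chaining above is GCC's conditional with an omitted middle operand: x ?: y yields x when x is non-zero (evaluating it once), else y. Chained over int-returning steps it means "stop at the first error", and a trailing comma expression turns a void call into a 0 (success) value. A minimal sketch with hypothetical step functions:

    int ret = step_one() ?:                /* non-zero error stops the chain */
              step_two() ?:
              (void_side_effect(), 0);     /* comma expression yields 0 */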
@@ -494,7 +488,8 @@ struct inode_opt_set {
        bool                    defined;
 };
 
-static int inode_opt_set_fn(struct bch_inode_info *inode,
+static int inode_opt_set_fn(struct btree_trans *trans,
+                           struct bch_inode_info *inode,
                            struct bch_inode_unpacked *bi,
                            void *p)
 {
@@ -557,6 +552,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
                s.v = v + 1;
                s.defined = true;
        } else {
+               /*
+                * Check if this option was set on the parent - if so, switch
+                * back to inheriting from the parent:
+                *
+                * rename() also has to deal with keeping inherited options up
+                * to date - see bch2_reinherit_attrs()
+                */
+               spin_lock(&dentry->d_lock);
                if (!IS_ROOT(dentry)) {
                        struct bch_inode_info *dir =
                                to_bch_ei(d_inode(dentry->d_parent));
@@ -565,6 +568,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
                } else {
                        s.v = 0;
                }
+               spin_unlock(&dentry->d_lock);
 
                s.defined = false;
        }
@@ -587,7 +591,7 @@ err:
        if (value &&
            (opt_id == Opt_background_compression ||
             opt_id == Opt_background_target))
-               bch2_rebalance_add_work(c, inode->v.i_blocks);
+               bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
 
        return bch2_err_class(ret);
 }
index f5a52e3a6016e7e67db20d01f1c78c18f1b5a228..1337f31a5c492c8401eefefdcd56c4a450d73514 100644 (file)
@@ -6,7 +6,7 @@
 
 extern const struct bch_hash_desc bch2_xattr_hash_desc;
 
-int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c,
+int bch2_xattr_invalid(struct bch_fs *, struct bkey_s_c,
                       enum bkey_invalid_flags, struct printbuf *);
 void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
index ea901a462494b97fb220b60897d140a0295da3ce..5cac6eb9c61904d4ad609dea1e337b165534a5e6 100644 (file)
@@ -162,7 +162,7 @@ sector_t get_capacity(struct gendisk *disk)
        return bytes >> 9;
 }
 
-void blkdev_put(struct block_device *bdev, fmode_t mode)
+void blkdev_put(struct block_device *bdev, void *holder)
 {
        fdatasync(bdev->bd_fd);
        close(bdev->bd_sync_fd);
@@ -170,25 +170,25 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
        free(bdev);
 }
 
-struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
-                                       void *holder)
+struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+                                       void *holder, const struct blk_holder_ops *hop)
 {
        struct block_device *bdev;
        int fd, sync_fd, buffered_fd, flags = 0;
 
-       if ((mode & (FMODE_READ|FMODE_WRITE)) == (FMODE_READ|FMODE_WRITE))
+       if ((mode & (BLK_OPEN_READ|BLK_OPEN_WRITE)) == (BLK_OPEN_READ|BLK_OPEN_WRITE))
                flags = O_RDWR;
-       else if (mode & FMODE_READ)
+       else if (mode & BLK_OPEN_READ)
                flags = O_RDONLY;
-       else if (mode & FMODE_WRITE)
+       else if (mode & BLK_OPEN_WRITE)
                flags = O_WRONLY;
 
-       if (!(mode & FMODE_BUFFERED))
+       if (!(mode & BLK_OPEN_BUFFERED))
                flags |= O_DIRECT;
 
 #if 0
        /* using O_EXCL doesn't work with opening twice for an O_SYNC fd: */
-       if (mode & FMODE_EXCL)
+       if (mode & BLK_OPEN_EXCL)
                flags |= O_EXCL;
 #endif
        buffered_fd = open(path, flags & ~O_DIRECT);
@@ -289,6 +289,8 @@ static void sync_write(struct bio *bio, struct iovec * iov, unsigned i)
        sync_check(bio, ret);
 }
 
+static DECLARE_WAIT_QUEUE_HEAD(aio_events_completed);
+
 static int aio_completion_thread(void *arg)
 {
        struct io_event events[8], *ev;
@@ -303,6 +305,8 @@ static int aio_completion_thread(void *arg)
                        continue;
                if (ret < 0)
                        die("io_getevents() error: %s", strerror(-ret));
+               if (ret)
+                       wake_up(&aio_events_completed);
 
                for (ev = events; ev < events + ret; ev++) {
                        struct bio *bio = (struct bio *) ev->data;
@@ -394,7 +398,10 @@ static void aio_op(struct bio *bio, struct iovec *iov, unsigned i, int opcode)
        }, *iocbp = &iocb;
 
        atomic_inc(&running_requests);
-       ret = io_submit(aio_ctx, 1, &iocbp);
+
+       wait_event(aio_events_completed,
+                  (ret = io_submit(aio_ctx, 1, &iocbp)) != -EAGAIN);
+
        if (ret != 1)
                die("io_submit err: %s", strerror(-ret));
 }
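
io_submit() fails with -EAGAIN when the AIO context's ring is full, so the submit path now sleeps until the completion thread reaps events instead of dying immediately. The two halves of the handshake, using the names from the hunks above:

    /* submitter: park until the ring has room again */
    wait_event(aio_events_completed,
               (ret = io_submit(aio_ctx, 1, &iocbp)) != -EAGAIN);

    /* reaper: io_getevents() returned ret > 0 events */
    if (ret)
            wake_up(&aio_events_completed);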
index 0855e698ced11a7ebdccb983759664acff067618..f86c9eeafb35ad9da21ebddda8a182ea27970ff8 100644 (file)
@@ -21,6 +21,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
        BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
 
        if (!r) {
+               smp_acquire__after_ctrl_dep();
+
+               cl->closure_get_happened = false;
+
                if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
                        atomic_set(&cl->remaining,
                                   CLOSURE_REMAINING_INITIALIZER);
@@ -43,7 +47,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 /* For clearing flags with the same atomic op as a put */
 void closure_sub(struct closure *cl, int v)
 {
-       closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+       closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
 }
 EXPORT_SYMBOL(closure_sub);
 
@@ -52,7 +56,7 @@ EXPORT_SYMBOL(closure_sub);
  */
 void closure_put(struct closure *cl)
 {
-       closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+       closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
 }
 EXPORT_SYMBOL(closure_put);
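
This hunk adopts the standard refcount ordering rule: every put is a release, and only the thread that observes the count hit zero upgrades to acquire (via smp_acquire__after_ctrl_dep()) before running teardown, so all earlier puts happen-before the destructor. The same pattern in isolation, with a hypothetical struct obj:

    void obj_put(struct obj *o)
    {
            if (!atomic_dec_return_release(&o->ref)) {
                    smp_acquire__after_ctrl_dep();  /* pairs with the releases */
                    obj_free(o);            /* safe: all puts ordered before us */
            }
    }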
 
@@ -90,6 +94,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
        if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
                return false;
 
+       cl->closure_get_happened = true;
        closure_set_waiting(cl, _RET_IP_);
        atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
        llist_add(&cl->list, &waitlist->list);
index 0b5715b36a00ee210dfcd6b53c3e53d6e706b196..dfefad40b7a5de06b3d12c8790ab5fd3b77f8b43 100644 (file)
@@ -131,7 +131,21 @@ void run_shrinkers(gfp_t gfp_mask, bool allocation_failed)
 static int shrinker_thread(void *arg)
 {
        while (!kthread_should_stop()) {
-               sleep(1);
+               struct timespec to;
+               int v;
+
+               clock_gettime(CLOCK_MONOTONIC, &to);
+               to.tv_sec += 1;
+               __set_current_state(TASK_INTERRUPTIBLE);
+               errno = 0;
+               while ((v = READ_ONCE(current->state)) != TASK_RUNNING &&
+                      errno != ETIMEDOUT)
+                       futex(&current->state, FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG,
+                             v, &to, NULL, (uint32_t)~0);
+               if (kthread_should_stop())
+                       break;
+               if (v != TASK_RUNNING)
+                       __set_current_state(TASK_RUNNING);
                run_shrinkers(GFP_KERNEL, false);
        }
 
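The shrinker thread previously slept in an uninterruptible sleep(1); it now does a timed futex wait on current->state, so kthread_stop() can wake it immediately. FUTEX_WAIT_BITSET (unlike plain FUTEX_WAIT) takes an absolute timeout, by default against CLOCK_MONOTONIC. A self-contained sketch of that primitive - futex_wait_abs() is a hypothetical helper, not part of the tree:

    #include <linux/futex.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    /* Wait until *uaddr != expected or the absolute CLOCK_MONOTONIC
     * deadline passes (fails with errno == ETIMEDOUT in that case): */
    static int futex_wait_abs(uint32_t *uaddr, uint32_t expected,
                              const struct timespec *abs_timeout)
    {
            return syscall(SYS_futex, uaddr,
                           FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG,
                           expected, abs_timeout, NULL,
                           FUTEX_BITSET_MATCH_ANY);
    }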
diff --git a/linux/six.c b/linux/six.c
deleted file mode 100644 (file)
index 0b9c4bb..0000000
+++ /dev/null
@@ -1,893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-#include <linux/export.h>
-#include <linux/log2.h>
-#include <linux/percpu.h>
-#include <linux/preempt.h>
-#include <linux/rcupdate.h>
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/rt.h>
-#include <linux/six.h>
-#include <linux/slab.h>
-
-#include <trace/events/lock.h>
-
-#ifdef DEBUG
-#define EBUG_ON(cond)                  BUG_ON(cond)
-#else
-#define EBUG_ON(cond)                  do {} while (0)
-#endif
-
-#define six_acquire(l, t, r, ip)       lock_acquire(l, 0, t, r, 1, NULL, ip)
-#define six_release(l, ip)             lock_release(l, ip)
-
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type);
-
-#define SIX_LOCK_HELD_read_OFFSET      0
-#define SIX_LOCK_HELD_read             ~(~0U << 26)
-#define SIX_LOCK_HELD_intent           (1U << 26)
-#define SIX_LOCK_HELD_write            (1U << 27)
-#define SIX_LOCK_WAITING_read          (1U << (28 + SIX_LOCK_read))
-#define SIX_LOCK_WAITING_intent                (1U << (28 + SIX_LOCK_intent))
-#define SIX_LOCK_WAITING_write         (1U << (28 + SIX_LOCK_write))
-#define SIX_LOCK_NOSPIN                        (1U << 31)
-
-struct six_lock_vals {
-       /* Value we add to the lock in order to take the lock: */
-       u32                     lock_val;
-
-       /* If the lock has this value (used as a mask), taking the lock fails: */
-       u32                     lock_fail;
-
-       /* Mask that indicates lock is held for this type: */
-       u32                     held_mask;
-
-       /* Waitlist we wakeup when releasing the lock: */
-       enum six_lock_type      unlock_wakeup;
-};
-
-static const struct six_lock_vals l[] = {
-       [SIX_LOCK_read] = {
-               .lock_val       = 1U << SIX_LOCK_HELD_read_OFFSET,
-               .lock_fail      = SIX_LOCK_HELD_write,
-               .held_mask      = SIX_LOCK_HELD_read,
-               .unlock_wakeup  = SIX_LOCK_write,
-       },
-       [SIX_LOCK_intent] = {
-               .lock_val       = SIX_LOCK_HELD_intent,
-               .lock_fail      = SIX_LOCK_HELD_intent,
-               .held_mask      = SIX_LOCK_HELD_intent,
-               .unlock_wakeup  = SIX_LOCK_intent,
-       },
-       [SIX_LOCK_write] = {
-               .lock_val       = SIX_LOCK_HELD_write,
-               .lock_fail      = SIX_LOCK_HELD_read,
-               .held_mask      = SIX_LOCK_HELD_write,
-               .unlock_wakeup  = SIX_LOCK_read,
-       },
-};
-
-static inline void six_set_bitmask(struct six_lock *lock, u32 mask)
-{
-       if ((atomic_read(&lock->state) & mask) != mask)
-               atomic_or(mask, &lock->state);
-}
-
-static inline void six_clear_bitmask(struct six_lock *lock, u32 mask)
-{
-       if (atomic_read(&lock->state) & mask)
-               atomic_and(~mask, &lock->state);
-}
-
-static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
-                                u32 old, struct task_struct *owner)
-{
-       if (type != SIX_LOCK_intent)
-               return;
-
-       if (!(old & SIX_LOCK_HELD_intent)) {
-               EBUG_ON(lock->owner);
-               lock->owner = owner;
-       } else {
-               EBUG_ON(lock->owner != current);
-       }
-}
-
-static inline unsigned pcpu_read_count(struct six_lock *lock)
-{
-       unsigned read_count = 0;
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               read_count += *per_cpu_ptr(lock->readers, cpu);
-       return read_count;
-}
-
-/*
- * __do_six_trylock() - main trylock routine
- *
- * Returns 1 on success, 0 on failure
- *
- * In percpu reader mode, a failed trylock may cause a spurious trylock failure
- * for another thread taking the competing lock type, and we may have to do a
- * wakeup: when a wakeup is required, we return -1 - wakeup_type.
- */
-static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type,
-                           struct task_struct *task, bool try)
-{
-       int ret;
-       u32 old;
-
-       EBUG_ON(type == SIX_LOCK_write && lock->owner != task);
-       EBUG_ON(type == SIX_LOCK_write &&
-               (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write)));
-
-       /*
-        * Percpu reader mode:
-        *
-        * The basic idea behind this algorithm is that you can implement a lock
-        * between two threads without any atomics, just memory barriers:
-        *
-        * For two threads you'll need two variables, one variable for "thread a
-        * has the lock" and another for "thread b has the lock".
-        *
-        * To take the lock, a thread sets its variable indicating that it holds
-        * the lock, then issues a full memory barrier, then reads from the
-        * other thread's variable to check if the other thread thinks it has
- * the lock. If we raced, we back off and retry/sleep.
-        *
-        * Failure to take the lock may cause a spurious trylock failure in
-        * another thread, because we temporarily set the lock to indicate that
- * we held it. This would be a problem for a thread in six_lock() that
- * calls trylock after adding itself to the waitlist, prior to
- * sleeping.
-        *
-        * Therefore, if we fail to get the lock, and there were waiters of the
-        * type we conflict with, we will have to issue a wakeup.
-        *
-        * Since we may be called under wait_lock (and by the wakeup code
-        * itself), we return that the wakeup has to be done instead of doing it
-        * here.
-        */
-       if (type == SIX_LOCK_read && lock->readers) {
-               preempt_disable();
-               this_cpu_inc(*lock->readers); /* signal that we own lock */
-
-               smp_mb();
-
-               old = atomic_read(&lock->state);
-               ret = !(old & l[type].lock_fail);
-
-               this_cpu_sub(*lock->readers, !ret);
-               preempt_enable();
-
-               if (!ret && (old & SIX_LOCK_WAITING_write))
-                       ret = -1 - SIX_LOCK_write;
-       } else if (type == SIX_LOCK_write && lock->readers) {
-               if (try) {
-                       atomic_add(SIX_LOCK_HELD_write, &lock->state);
-                       smp_mb__after_atomic();
-               }
-
-               ret = !pcpu_read_count(lock);
-
-               if (try && !ret) {
-                       old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state);
-                       if (old & SIX_LOCK_WAITING_read)
-                               ret = -1 - SIX_LOCK_read;
-               }
-       } else {
-               old = atomic_read(&lock->state);
-               do {
-                       ret = !(old & l[type].lock_fail);
-                       if (!ret || (type == SIX_LOCK_write && !try)) {
-                               smp_mb();
-                               break;
-                       }
-               } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val));
-
-               EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask));
-       }
-
-       if (ret > 0)
-               six_set_owner(lock, type, old, task);
-
-       EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 &&
-               (atomic_read(&lock->state) & SIX_LOCK_HELD_write));
-
-       return ret;
-}
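
The percpu-reader comment above describes a Dekker-style store/fence/load protocol. A toy two-thread version, using C11 atomics and hypothetical a_holds/b_holds flags (no waitlist or wakeup handling, which is the part __do_six_trylock() adds on top):

    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic int a_holds, b_holds;

    static bool thread_a_trylock(void)
    {
            atomic_store_explicit(&a_holds, 1, memory_order_relaxed);
            atomic_thread_fence(memory_order_seq_cst);   /* full barrier */
            if (atomic_load_explicit(&b_holds, memory_order_relaxed)) {
                    /* raced with thread b: undo, let caller retry/sleep */
                    atomic_store_explicit(&a_holds, 0, memory_order_relaxed);
                    return false;
            }
            return true;                                 /* we own the lock */
    }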
-
-static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type)
-{
-       struct six_lock_waiter *w, *next;
-       struct task_struct *task;
-       bool saw_one;
-       int ret;
-again:
-       ret = 0;
-       saw_one = false;
-       raw_spin_lock(&lock->wait_lock);
-
-       list_for_each_entry_safe(w, next, &lock->wait_list, list) {
-               if (w->lock_want != lock_type)
-                       continue;
-
-               if (saw_one && lock_type != SIX_LOCK_read)
-                       goto unlock;
-               saw_one = true;
-
-               ret = __do_six_trylock(lock, lock_type, w->task, false);
-               if (ret <= 0)
-                       goto unlock;
-
-               __list_del(w->list.prev, w->list.next);
-               task = w->task;
-               /*
-                * Do no writes to @w besides setting lock_acquired - otherwise
-                * we would need a memory barrier:
-                */
-               barrier();
-               w->lock_acquired = true;
-               wake_up_process(task);
-       }
-
-       six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type);
-unlock:
-       raw_spin_unlock(&lock->wait_lock);
-
-       if (ret < 0) {
-               lock_type = -ret - 1;
-               goto again;
-       }
-}
-
-__always_inline
-static void six_lock_wakeup(struct six_lock *lock, u32 state,
-                           enum six_lock_type lock_type)
-{
-       if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read))
-               return;
-
-       if (!(state & (SIX_LOCK_WAITING_read << lock_type)))
-               return;
-
-       __six_lock_wakeup(lock, lock_type);
-}
-
-__always_inline
-static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try)
-{
-       int ret;
-
-       ret = __do_six_trylock(lock, type, current, try);
-       if (ret < 0)
-               __six_lock_wakeup(lock, -ret - 1);
-
-       return ret > 0;
-}
-
-/**
- * six_trylock_ip - attempt to take a six lock without blocking
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
-       if (!do_six_trylock(lock, type, true))
-               return false;
-
-       if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip);
-       return true;
-}
-EXPORT_SYMBOL_GPL(six_trylock_ip);
-
-/**
- * six_relock_ip - attempt to re-take a lock that was held previously
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @seq:       lock sequence number obtained from six_lock_seq() while lock was
- *             held previously
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * Return: true on success, false on failure.
- */
-bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
-                  unsigned seq, unsigned long ip)
-{
-       if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip))
-               return false;
-
-       if (six_lock_seq(lock) != seq) {
-               six_unlock_ip(lock, type, ip);
-               return false;
-       }
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(six_relock_ip);
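
six_relock_ip() exists for an optimistic pattern: snapshot the sequence number, drop the lock, do work that tolerates staleness, then relock and confirm the sequence is unchanged (it advances on every write unlock, per six_unlock_ip() below). A hedged caller-side sketch, where b is a hypothetical structure embedding a six_lock:

    unsigned seq = six_lock_seq(&b->lock);
    six_unlock_read(&b->lock);

    /* ... work that must be revalidated if the node changed ... */

    if (!six_relock_read(&b->lock, seq)) {
            /* someone took a write lock in between: restart */
    }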
-
-#ifdef CONFIG_LOCK_SPIN_ON_OWNER
-
-static inline bool six_can_spin_on_owner(struct six_lock *lock)
-{
-       struct task_struct *owner;
-       bool ret;
-
-       if (need_resched())
-               return false;
-
-       rcu_read_lock();
-       owner = READ_ONCE(lock->owner);
-       ret = !owner || owner_on_cpu(owner);
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static inline bool six_spin_on_owner(struct six_lock *lock,
-                                    struct task_struct *owner,
-                                    u64 end_time)
-{
-       bool ret = true;
-       unsigned loop = 0;
-
-       rcu_read_lock();
-       while (lock->owner == owner) {
-               /*
-                * Ensure we emit the owner->on_cpu dereference _after_
-                * checking lock->owner still matches owner. If that fails,
-                * owner might point to freed memory. If it still matches,
-                * the rcu_read_lock() ensures the memory stays valid.
-                */
-               barrier();
-
-               if (!owner_on_cpu(owner) || need_resched()) {
-                       ret = false;
-                       break;
-               }
-
-               if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
-                       six_set_bitmask(lock, SIX_LOCK_NOSPIN);
-                       ret = false;
-                       break;
-               }
-
-               cpu_relax();
-       }
-       rcu_read_unlock();
-
-       return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
-       struct task_struct *task = current;
-       u64 end_time;
-
-       if (type == SIX_LOCK_write)
-               return false;
-
-       preempt_disable();
-       if (!six_can_spin_on_owner(lock))
-               goto fail;
-
-       if (!osq_lock(&lock->osq))
-               goto fail;
-
-       end_time = sched_clock() + 10 * NSEC_PER_USEC;
-
-       while (1) {
-               struct task_struct *owner;
-
-               /*
-                * If there's an owner, wait for it to either
-                * release the lock or go to sleep.
-                */
-               owner = READ_ONCE(lock->owner);
-               if (owner && !six_spin_on_owner(lock, owner, end_time))
-                       break;
-
-               if (do_six_trylock(lock, type, false)) {
-                       osq_unlock(&lock->osq);
-                       preempt_enable();
-                       return true;
-               }
-
-               /*
-                * When there's no owner, we might have preempted between the
-                * owner acquiring the lock and setting the owner field. If
-                * we're an RT task, we can live-lock because we won't let
-                * the owner complete.
-                */
-               if (!owner && (need_resched() || rt_task(task)))
-                       break;
-
-               /*
-                * The cpu_relax() call is a compiler barrier which forces
-                * everything in this loop to be re-loaded. We don't need
-                * memory barriers as we'll eventually observe the right
-                * values at the cost of a few extra spins.
-                */
-               cpu_relax();
-       }
-
-       osq_unlock(&lock->osq);
-fail:
-       preempt_enable();
-
-       /*
-        * If we fell out of the spin path because of need_resched(),
-        * reschedule now, before we try-lock again. This avoids getting
-        * scheduled out right after we obtained the lock.
-        */
-       if (need_resched())
-               schedule();
-
-       return false;
-}
-
-#else /* CONFIG_LOCK_SPIN_ON_OWNER */
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
-       return false;
-}
-
-#endif
-
-noinline
-static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
-                            struct six_lock_waiter *wait,
-                            six_lock_should_sleep_fn should_sleep_fn, void *p,
-                            unsigned long ip)
-{
-       int ret = 0;
-
-       if (type == SIX_LOCK_write) {
-               EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-               atomic_add(SIX_LOCK_HELD_write, &lock->state);
-               smp_mb__after_atomic();
-       }
-
-       trace_contention_begin(lock, 0);
-       lock_contended(&lock->dep_map, ip);
-
-       if (six_optimistic_spin(lock, type))
-               goto out;
-
-       wait->task              = current;
-       wait->lock_want         = type;
-       wait->lock_acquired     = false;
-
-       raw_spin_lock(&lock->wait_lock);
-       six_set_bitmask(lock, SIX_LOCK_WAITING_read << type);
-       /*
-        * Retry taking the lock after taking waitlist lock, in case we raced
-        * with an unlock:
-        */
-       ret = __do_six_trylock(lock, type, current, false);
-       if (ret <= 0) {
-               wait->start_time = local_clock();
-
-               if (!list_empty(&lock->wait_list)) {
-                       struct six_lock_waiter *last =
-                               list_last_entry(&lock->wait_list,
-                                       struct six_lock_waiter, list);
-
-                       if (time_before_eq64(wait->start_time, last->start_time))
-                               wait->start_time = last->start_time + 1;
-               }
-
-               list_add_tail(&wait->list, &lock->wait_list);
-       }
-       raw_spin_unlock(&lock->wait_lock);
-
-       if (unlikely(ret > 0)) {
-               ret = 0;
-               goto out;
-       }
-
-       if (unlikely(ret < 0)) {
-               __six_lock_wakeup(lock, -ret - 1);
-               ret = 0;
-       }
-
-       while (1) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
-
-               if (wait->lock_acquired)
-                       break;
-
-               ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
-               if (unlikely(ret)) {
-                       raw_spin_lock(&lock->wait_lock);
-                       if (!wait->lock_acquired)
-                               list_del(&wait->list);
-                       raw_spin_unlock(&lock->wait_lock);
-
-                       if (unlikely(wait->lock_acquired))
-                               do_six_unlock_type(lock, type);
-                       break;
-               }
-
-               schedule();
-       }
-
-       __set_current_state(TASK_RUNNING);
-out:
-       if (ret && type == SIX_LOCK_write) {
-               six_clear_bitmask(lock, SIX_LOCK_HELD_write);
-               six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read);
-       }
-       trace_contention_end(lock, 0);
-
-       return ret;
-}
-
-/**
- * six_lock_ip_waiter - take a lock, with full waitlist interface
- * @lock:      lock to take
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @wait:      pointer to wait object, which will be added to lock's waitlist
- * @should_sleep_fn: callback run after adding to waitlist, immediately prior
- *             to scheduling
- * @p:         passed through to @should_sleep_fn
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * This is the most general six_lock() variant, with parameters to support full
- * cycle detection for deadlock avoidance.
- *
- * The code calling this function must implement tracking of held locks, and the
- * @wait object should be embedded into the struct that tracks held locks -
- * which must also be accessible in a thread-safe way.
- *
- * @should_sleep_fn should invoke the cycle detector; it should walk each
- * lock's waiters, and for each waiter recursively walk their held locks.
- *
- * When this function must block, @wait will be added to @lock's waitlist before
- * calling trylock, and before calling @should_sleep_fn, and @wait will not be
- * removed from the lock waitlist until the lock has been successfully acquired,
- * or we abort.
- *
- * @wait.start_time will be monotonically increasing for any given waitlist, and
- * thus may be used as a loop cursor.
- *
- * Return: 0 on success, or the return code from @should_sleep_fn on failure.
- */
-int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type,
-                      struct six_lock_waiter *wait,
-                      six_lock_should_sleep_fn should_sleep_fn, void *p,
-                      unsigned long ip)
-{
-       int ret;
-
-       wait->start_time = 0;
-
-       if (type != SIX_LOCK_write)
-               six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip);
-
-       ret = do_six_trylock(lock, type, true) ? 0
-               : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip);
-
-       if (ret && type != SIX_LOCK_write)
-               six_release(&lock->dep_map, ip);
-       if (!ret)
-               lock_acquired(&lock->dep_map, ip);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_ip_waiter);
-
-__always_inline
-static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type)
-{
-       u32 state;
-
-       if (type == SIX_LOCK_intent)
-               lock->owner = NULL;
-
-       if (type == SIX_LOCK_read &&
-           lock->readers) {
-               smp_mb(); /* unlock barrier */
-               this_cpu_dec(*lock->readers);
-               smp_mb(); /* between unlocking and checking for waiters */
-               state = atomic_read(&lock->state);
-       } else {
-               u32 v = l[type].lock_val;
-
-               if (type != SIX_LOCK_read)
-                       v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN;
-
-               EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask));
-               state = atomic_sub_return_release(v, &lock->state);
-       }
-
-       six_lock_wakeup(lock, state, l[type].unlock_wakeup);
-}
-
-/**
- * six_unlock_ip - drop a six lock
- * @lock:      lock to unlock
- * @type:      SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write
- * @ip:                ip parameter for lockdep/lockstat, i.e. _THIS_IP_
- *
- * When a lock is held multiple times (because six_lock_increment() was used),
- * this decrements the 'lock held' counter by one.
- *
- * For example:
- * six_lock_read(&foo->lock);                          read count 1
- * six_lock_increment(&foo->lock, SIX_LOCK_read);      read count 2
- * six_lock_unlock(&foo->lock, SIX_LOCK_read);         read count 1
- * six_lock_unlock(&foo->lock, SIX_LOCK_read);         read count 0
- */
-void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip)
-{
-       EBUG_ON(type == SIX_LOCK_write &&
-               !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
-       EBUG_ON((type == SIX_LOCK_write ||
-                type == SIX_LOCK_intent) &&
-               lock->owner != current);
-
-       if (type != SIX_LOCK_write)
-               six_release(&lock->dep_map, ip);
-       else
-               lock->seq++;
-
-       if (type == SIX_LOCK_intent &&
-           lock->intent_lock_recurse) {
-               --lock->intent_lock_recurse;
-               return;
-       }
-
-       do_six_unlock_type(lock, type);
-}
-EXPORT_SYMBOL_GPL(six_unlock_ip);
-
-/**
- * six_lock_downgrade - convert an intent lock to a read lock
- * @lock:      lock to downgrade
- *
- * @lock will have read count incremented and intent count decremented
- */
-void six_lock_downgrade(struct six_lock *lock)
-{
-       six_lock_increment(lock, SIX_LOCK_read);
-       six_unlock_intent(lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_downgrade);
-
-/**
- * six_lock_tryupgrade - attempt to convert read lock to an intent lock
- * @lock:      lock to upgrade
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_lock_tryupgrade(struct six_lock *lock)
-{
-       u32 old = atomic_read(&lock->state), new;
-
-       do {
-               new = old;
-
-               if (new & SIX_LOCK_HELD_intent)
-                       return false;
-
-               if (!lock->readers) {
-                       EBUG_ON(!(new & SIX_LOCK_HELD_read));
-                       new -= l[SIX_LOCK_read].lock_val;
-               }
-
-               new |= SIX_LOCK_HELD_intent;
-       } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new));
-
-       if (lock->readers)
-               this_cpu_dec(*lock->readers);
-
-       six_set_owner(lock, SIX_LOCK_intent, old, current);
-
-       return true;
-}
-EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
-
-/**
- * six_trylock_convert - attempt to convert a held lock from one type to another
- * @lock:      lock to upgrade
- * @from:      SIX_LOCK_read or SIX_LOCK_intent
- * @to:                SIX_LOCK_read or SIX_LOCK_intent
- *
- * On success, @lock will have intent count incremented and read count
- * decremented
- *
- * Return: true on success, false on failure
- */
-bool six_trylock_convert(struct six_lock *lock,
-                        enum six_lock_type from,
-                        enum six_lock_type to)
-{
-       EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
-
-       if (to == from)
-               return true;
-
-       if (to == SIX_LOCK_read) {
-               six_lock_downgrade(lock);
-               return true;
-       } else {
-               return six_lock_tryupgrade(lock);
-       }
-}
-EXPORT_SYMBOL_GPL(six_trylock_convert);
-
-/**
- * six_lock_increment - increase held lock count on a lock that is already held
- * @lock:      lock to increment
- * @type:      SIX_LOCK_read or SIX_LOCK_intent
- *
- * @lock must already be held, with a lock type that is greater than or equal to
- * @type
- *
- * A corresponding six_unlock_type() call will be required for @lock to be fully
- * unlocked.
- */
-void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
-{
-       six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_);
-
-       /* XXX: assert already locked, and that we don't overflow: */
-
-       switch (type) {
-       case SIX_LOCK_read:
-               if (lock->readers) {
-                       this_cpu_inc(*lock->readers);
-               } else {
-                       EBUG_ON(!(atomic_read(&lock->state) &
-                                 (SIX_LOCK_HELD_read|
-                                  SIX_LOCK_HELD_intent)));
-                       atomic_add(l[type].lock_val, &lock->state);
-               }
-               break;
-       case SIX_LOCK_intent:
-               EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent));
-               lock->intent_lock_recurse++;
-               break;
-       case SIX_LOCK_write:
-               BUG();
-               break;
-       }
-}
-EXPORT_SYMBOL_GPL(six_lock_increment);
-
-/**
- * six_lock_wakeup_all - wake up all waiters on @lock
- * @lock:      lock to wake up waiters for
- *
- * Waking up waiters will cause them to re-run should_sleep_fn, which may then
- * abort the lock operation.
- *
- * This function is never needed in a bug-free program; it's only useful in
- * debug code, e.g. to determine if a cycle detector is at fault.
- */
-void six_lock_wakeup_all(struct six_lock *lock)
-{
-       u32 state = atomic_read(&lock->state);
-       struct six_lock_waiter *w;
-
-       six_lock_wakeup(lock, state, SIX_LOCK_read);
-       six_lock_wakeup(lock, state, SIX_LOCK_intent);
-       six_lock_wakeup(lock, state, SIX_LOCK_write);
-
-       raw_spin_lock(&lock->wait_lock);
-       list_for_each_entry(w, &lock->wait_list, list)
-               wake_up_process(w->task);
-       raw_spin_unlock(&lock->wait_lock);
-}
-EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
-
-/**
- * six_lock_counts - return held lock counts, for each lock type
- * @lock:      lock to return counters for
- *
- * Return: the number of times a lock is held for read, intent and write.
- */
-struct six_lock_count six_lock_counts(struct six_lock *lock)
-{
-       struct six_lock_count ret;
-
-       ret.n[SIX_LOCK_read]    = !lock->readers
-               ? atomic_read(&lock->state) & SIX_LOCK_HELD_read
-               : pcpu_read_count(lock);
-       ret.n[SIX_LOCK_intent]  = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) +
-               lock->intent_lock_recurse;
-       ret.n[SIX_LOCK_write]   = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write);
-
-       return ret;
-}
-EXPORT_SYMBOL_GPL(six_lock_counts);
-
-/**
- * six_lock_readers_add - directly manipulate reader count of a lock
- * @lock:      lock to add/subtract readers for
- * @nr:                reader count to add/subtract
- *
- * When an upper layer is implementing lock reentrancy, we may have both read
- * and intent locks on the same lock.
- *
- * When we need to take a write lock, the read locks will cause self-deadlock,
- * because six locks themselves do not track which read locks are held by the
- * current thread and which are held by a different thread - they do no
- * per-thread tracking of held locks.
- *
- * The upper layer that is tracking held locks may however, if trylock() has
- * failed, count up its own read locks, subtract them, take the write lock, and
- * then re-add them.
- *
- * As in any other situation when taking a write lock, @lock must be held for
- * intent one (or more) times, so @lock will never be left unlocked.
- */
-void six_lock_readers_add(struct six_lock *lock, int nr)
-{
-       if (lock->readers) {
-               this_cpu_add(*lock->readers, nr);
-       } else {
-               EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0);
-               /* reader count starts at bit 0 */
-               atomic_add(nr, &lock->state);
-       }
-}
-EXPORT_SYMBOL_GPL(six_lock_readers_add);
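
Concretely, the upgrade dance this comment describes looks like the following hedged sketch, where my_read_locks_held() stands in for the upper layer's own held-lock tracking (in bcachefs proper, the btree locking code plays this role):

    static void take_write_with_own_readers(struct six_lock *lock)
    {
            int readers = my_read_locks_held(lock);

            six_lock_readers_add(lock, -readers);  /* hide our read locks */
            six_lock_write(lock, NULL, NULL);      /* no self-deadlock now */
            six_lock_readers_add(lock, readers);   /* restore for later unlocks */
    }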
-
-/**
- * six_lock_exit - release resources held by a lock prior to freeing
- * @lock:      lock to exit
- *
- * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is
- * required to free the percpu read counts.
- */
-void six_lock_exit(struct six_lock *lock)
-{
-       WARN_ON(lock->readers && pcpu_read_count(lock));
-       WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read);
-
-       free_percpu(lock->readers);
-       lock->readers = NULL;
-}
-EXPORT_SYMBOL_GPL(six_lock_exit);
-
-void __six_lock_init(struct six_lock *lock, const char *name,
-                    struct lock_class_key *key, enum six_lock_init_flags flags)
-{
-       atomic_set(&lock->state, 0);
-       raw_spin_lock_init(&lock->wait_lock);
-       INIT_LIST_HEAD(&lock->wait_list);
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       debug_check_no_locks_freed((void *) lock, sizeof(*lock));
-       lockdep_init_map(&lock->dep_map, name, key, 0);
-#endif
-
-       /*
-        * Don't assume that we have real percpu variables available in
-        * userspace:
-        */
-#ifdef __KERNEL__
-       if (flags & SIX_LOCK_INIT_PCPU) {
-               /*
-                * We don't return an error here on memory allocation failure
-                * since percpu is an optimization, and locks will work with the
-                * same semantics in non-percpu mode: callers can check for
-                * failure if they wish by checking lock->readers, but generally
-                * will not want to treat it as an error.
-                */
-               lock->readers = alloc_percpu(unsigned);
-       }
-#endif
-}
-EXPORT_SYMBOL_GPL(__six_lock_init);
diff --git a/mkfs.bcachefs b/mkfs.bcachefs
deleted file mode 100755 (executable)
index b3631ba..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-SDIR="$(readlink -f "$0")"
-exec "${SDIR%/*}/bcachefs" format "$@"
diff --git a/mount.bcachefs b/mount.bcachefs
deleted file mode 100755 (executable)
index 5900232..0000000
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/sh
-
-SDIR="$(readlink -f "$0")"
-exec "${SDIR%/*}/bcachefs" mount "$@"
index a8f023a88302ee77762a8c8e82741e1b6dd10be6..24d93deeec65dc75fd481c17327a13760f073dd5 100644 (file)
@@ -45,10 +45,13 @@ rm -f %{buildroot}/%{_datadir}/initramfs-tools/scripts/local-premount/bcachefs
 rm -f %{buildroot}/usr/lib/libbcachefs.so
 
 %files
-%{_sbindir}/mount.bcachefs
 %{_sbindir}/bcachefs
+%{_sbindir}/mount.bcachefs
 %{_sbindir}/fsck.bcachefs
 %{_sbindir}/mkfs.bcachefs
+%{_sbindir}/mount.fuse.bcachefs
+%{_sbindir}/fsck.fuse.bcachefs
+%{_sbindir}/mkfs.fuse.bcachefs
 %{_mandir}/man8/bcachefs.8.gz
 
 %changelog
diff --git a/qcow2.c b/qcow2.c
index d01fa9417088198fee5310a705abd0e42ac3b131..3b75fc17a37e1526ac32a46a017cd7617d80fe0c 100644 (file)
--- a/qcow2.c
+++ b/qcow2.c
@@ -72,7 +72,7 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset)
 void qcow2_write_image(int infd, int outfd, ranges *data,
                       unsigned block_size)
 {
-       u64 image_size = get_size(NULL, infd);
+       u64 image_size = get_size(infd);
        unsigned l2_size = block_size / sizeof(u64);
        unsigned l1_size = DIV_ROUND_UP(image_size, (u64) block_size * l2_size);
        struct qcow2_hdr hdr = { 0 };
index c4dd7f58c704203818d43dc7129c8c0485d40de1..d270cb41625d6d066184089efdb762f5b6d7f5bf 100644 (file)
@@ -4,27 +4,67 @@ version = 3
 
 [[package]]
 name = "aho-corasick"
-version = "0.7.20"
+version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
 dependencies = [
  "memchr",
 ]
 
 [[package]]
-name = "android_system_properties"
-version = "0.1.5"
+name = "anstream"
+version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163"
 dependencies = [
- "libc",
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is-terminal",
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea"
+
+[[package]]
+name = "anstyle-parse"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c677ab05e09154296dd37acecd46420c17b9713e8366facafa8fc0885167cf4c"
+dependencies = [
+ "anstyle",
+ "windows-sys",
 ]
 
 [[package]]
 name = "anyhow"
-version = "1.0.68"
+version = "1.0.75"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61"
+checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
 
 [[package]]
 name = "atty"
@@ -53,9 +93,10 @@ dependencies = [
  "byteorder",
  "chrono",
  "clap",
+ "clap_complete",
  "colored",
  "either",
- "errno",
+ "errno 0.2.8",
  "gag",
  "getset",
  "itertools",
@@ -74,7 +115,7 @@ dependencies = [
  "anyhow",
  "bindgen",
  "bitfield",
- "bitflags",
+ "bitflags 1.3.2",
  "byteorder",
  "chrono",
  "colored",
@@ -92,7 +133,7 @@ name = "bindgen"
 version = "0.64.0"
 source = "git+https://evilpiepirate.org/git/rust-bindgen.git#f773267b090bf16b9e8375fcbdcd8ba5e88806a8"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "cexpr",
  "clang-sys",
  "lazy_static",
@@ -103,7 +144,7 @@ dependencies = [
  "regex",
  "rustc-hash",
  "shlex",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -119,22 +160,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
-name = "bumpalo"
-version = "3.12.0"
+name = "bitflags"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
+checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
 
 [[package]]
 name = "byteorder"
-version = "1.4.3"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "cc"
-version = "1.0.79"
+version = "1.0.83"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0"
+dependencies = [
+ "libc",
+]
 
 [[package]]
 name = "cexpr"
@@ -153,24 +197,18 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
 name = "chrono"
-version = "0.4.23"
+version = "0.4.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
+checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
 dependencies = [
- "iana-time-zone",
- "js-sys",
- "num-integer",
  "num-traits",
- "time",
- "wasm-bindgen",
- "winapi",
 ]
 
 [[package]]
 name = "clang-sys"
-version = "1.6.0"
+version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77ed9a53e5d4d9c573ae844bfac6872b159cb1d1585a83b29e7a64b7eef7332a"
+checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
 dependencies = [
  "glob",
  "libc",
@@ -178,118 +216,77 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.1.4"
+version = "4.3.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76"
+checksum = "fb690e81c7840c0d7aade59f242ea3b41b9bc27bcd5997890e7702ae4b32e487"
 dependencies = [
- "bitflags",
+ "clap_builder",
  "clap_derive",
- "clap_lex",
- "is-terminal",
  "once_cell",
- "strsim",
- "termcolor",
- "terminal_size",
 ]
 
 [[package]]
-name = "clap_derive"
-version = "4.1.0"
+name = "clap_builder"
+version = "4.3.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8"
+checksum = "5ed2e96bc16d8d740f6f48d663eddf4b8a0983e79210fd55479b7bcd0a69860e"
 dependencies = [
- "heck",
- "proc-macro-error",
- "proc-macro2",
- "quote",
- "syn",
-]
-
-[[package]]
-name = "clap_lex"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade"
-dependencies = [
- "os_str_bytes",
-]
-
-[[package]]
-name = "codespan-reporting"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
-dependencies = [
- "termcolor",
- "unicode-width",
+ "anstream",
+ "anstyle",
+ "clap_lex",
+ "strsim",
+ "terminal_size",
 ]
 
 [[package]]
-name = "colored"
-version = "2.0.0"
+name = "clap_complete"
+version = "4.4.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd"
+checksum = "bffe91f06a11b4b9420f62103854e90867812cd5d01557f853c5ee8e791b12ae"
 dependencies = [
- "atty",
- "lazy_static",
- "winapi",
+ "clap",
 ]
 
 [[package]]
-name = "core-foundation-sys"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
-
-[[package]]
-name = "cxx"
-version = "1.0.89"
+name = "clap_derive"
+version = "4.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9"
+checksum = "54a9bb5758fc5dfe728d1019941681eccaf0cf8a4189b692a0ee2f2ecf90a050"
 dependencies = [
- "cc",
- "cxxbridge-flags",
- "cxxbridge-macro",
- "link-cplusplus",
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.38",
 ]
 
 [[package]]
-name = "cxx-build"
-version = "1.0.89"
+name = "clap_lex"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d"
-dependencies = [
- "cc",
- "codespan-reporting",
- "once_cell",
- "proc-macro2",
- "quote",
- "scratch",
- "syn",
-]
+checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
 
 [[package]]
-name = "cxxbridge-flags"
-version = "1.0.89"
+name = "colorchoice"
+version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a"
+checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
 
 [[package]]
-name = "cxxbridge-macro"
-version = "1.0.89"
+name = "colored"
+version = "2.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2"
+checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6"
 dependencies = [
- "proc-macro2",
- "quote",
- "syn",
+ "is-terminal",
+ "lazy_static",
+ "windows-sys",
 ]
 
 [[package]]
 name = "either"
-version = "1.8.1"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91"
+checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07"
 
 [[package]]
 name = "errno"
@@ -302,6 +299,16 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "errno"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
+dependencies = [
+ "libc",
+ "windows-sys",
+]
+
 [[package]]
 name = "errno-dragonfly"
 version = "0.1.2"
@@ -314,12 +321,9 @@ dependencies = [
 
 [[package]]
 name = "fastrand"
-version = "1.8.0"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
-dependencies = [
- "instant",
-]
+checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
 
 [[package]]
 name = "filedescriptor"
@@ -351,7 +355,7 @@ dependencies = [
  "proc-macro-error",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -377,65 +381,29 @@ dependencies = [
 
 [[package]]
 name = "hermit-abi"
-version = "0.2.6"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "iana-time-zone"
-version = "0.1.53"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765"
-dependencies = [
- "android_system_properties",
- "core-foundation-sys",
- "iana-time-zone-haiku",
- "js-sys",
- "wasm-bindgen",
- "winapi",
-]
-
-[[package]]
-name = "iana-time-zone-haiku"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca"
-dependencies = [
- "cxx",
- "cxx-build",
-]
-
-[[package]]
-name = "instant"
-version = "0.1.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
-dependencies = [
- "cfg-if",
-]
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
 
 [[package]]
 name = "io-lifetimes"
-version = "1.0.4"
+version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e"
+checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
 dependencies = [
+ "hermit-abi 0.3.3",
  "libc",
  "windows-sys",
 ]
 
 [[package]]
 name = "is-terminal"
-version = "0.4.2"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189"
+checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
 dependencies = [
- "hermit-abi 0.2.6",
- "io-lifetimes",
- "rustix",
+ "hermit-abi 0.3.3",
+ "rustix 0.38.21",
  "windows-sys",
 ]
 
@@ -448,15 +416,6 @@ dependencies = [
  "either",
 ]
 
-[[package]]
-name = "js-sys"
-version = "0.3.61"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730"
-dependencies = [
- "wasm-bindgen",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -471,9 +430,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
 [[package]]
 name = "libc"
-version = "0.2.139"
+version = "0.2.149"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
 
 [[package]]
 name = "libudev-sys"
@@ -486,34 +445,28 @@ dependencies = [
 ]
 
 [[package]]
-name = "link-cplusplus"
-version = "1.0.8"
+name = "linux-raw-sys"
+version = "0.3.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5"
-dependencies = [
- "cc",
-]
+checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519"
 
 [[package]]
 name = "linux-raw-sys"
-version = "0.1.4"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
+checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
 
 [[package]]
 name = "log"
-version = "0.4.17"
+version = "0.4.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
-dependencies = [
- "cfg-if",
-]
+checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
 
 [[package]]
 name = "memchr"
-version = "2.5.0"
+version = "2.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
 
 [[package]]
 name = "memoffset"
@@ -540,36 +493,20 @@ dependencies = [
  "minimal-lexical",
 ]
 
-[[package]]
-name = "num-integer"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
-dependencies = [
- "autocfg",
- "num-traits",
-]
-
 [[package]]
 name = "num-traits"
-version = "0.2.15"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
+checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
 dependencies = [
  "autocfg",
 ]
 
 [[package]]
 name = "once_cell"
-version = "1.17.0"
+version = "1.18.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
-
-[[package]]
-name = "os_str_bytes"
-version = "6.4.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee"
+checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
 
 [[package]]
 name = "parse-display"
@@ -592,15 +529,15 @@ dependencies = [
  "proc-macro2",
  "quote",
  "regex",
- "regex-syntax",
- "syn",
+ "regex-syntax 0.6.29",
+ "syn 1.0.109",
 ]
 
 [[package]]
 name = "paste"
-version = "1.0.11"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba"
+checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
 
 [[package]]
 name = "peeking_take_while"
@@ -610,9 +547,9 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
 
 [[package]]
 name = "pkg-config"
-version = "0.3.26"
+version = "0.3.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
 
 [[package]]
 name = "proc-macro-error"
@@ -623,7 +560,7 @@ dependencies = [
  "proc-macro-error-attr",
  "proc-macro2",
  "quote",
- "syn",
+ "syn 1.0.109",
  "version_check",
 ]
 
@@ -640,56 +577,65 @@ dependencies = [
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.50"
+version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2"
+checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
 name = "quote"
-version = "1.0.23"
+version = "1.0.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
+checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
 dependencies = [
  "proc-macro2",
 ]
 
 [[package]]
 name = "redox_syscall"
-version = "0.2.16"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
 ]
 
 [[package]]
 name = "regex"
-version = "1.7.1"
+version = "1.10.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax 0.8.2",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
 dependencies = [
  "aho-corasick",
  "memchr",
- "regex-syntax",
+ "regex-syntax 0.8.2",
 ]
 
 [[package]]
 name = "regex-syntax"
-version = "0.6.28"
+version = "0.6.29"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
 
 [[package]]
-name = "remove_dir_all"
-version = "0.5.3"
+name = "regex-syntax"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
-dependencies = [
- "winapi",
-]
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
 
 [[package]]
 name = "rpassword"
@@ -709,29 +655,36 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
 
 [[package]]
 name = "rustix"
-version = "0.36.7"
+version = "0.37.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03"
+checksum = "fea8ca367a3a01fe35e6943c400addf443c0f57670e6ec51196f71a4b8762dd2"
 dependencies = [
- "bitflags",
- "errno",
+ "bitflags 1.3.2",
+ "errno 0.3.5",
  "io-lifetimes",
  "libc",
- "linux-raw-sys",
+ "linux-raw-sys 0.3.8",
  "windows-sys",
 ]
 
 [[package]]
-name = "scratch"
-version = "1.0.3"
+name = "rustix"
+version = "0.38.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2"
+checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3"
+dependencies = [
+ "bitflags 2.4.1",
+ "errno 0.3.5",
+ "libc",
+ "linux-raw-sys 0.4.10",
+ "windows-sys",
+]
 
 [[package]]
 name = "shlex"
-version = "1.1.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380"
 
 [[package]]
 name = "strsim"
@@ -741,9 +694,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
 
 [[package]]
 name = "syn"
-version = "1.0.107"
+version = "1.0.109"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -751,67 +704,57 @@ dependencies = [
 ]
 
 [[package]]
-name = "tempfile"
-version = "3.3.0"
+name = "syn"
+version = "2.0.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4"
+checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
 dependencies = [
- "cfg-if",
- "fastrand",
- "libc",
- "redox_syscall",
- "remove_dir_all",
- "winapi",
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
 ]
 
 [[package]]
-name = "termcolor"
-version = "1.2.0"
+name = "tempfile"
+version = "3.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5"
 dependencies = [
- "winapi-util",
+ "cfg-if",
+ "fastrand",
+ "redox_syscall",
+ "rustix 0.38.21",
+ "windows-sys",
 ]
 
 [[package]]
 name = "terminal_size"
-version = "0.2.3"
+version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cb20089a8ba2b69debd491f8d2d023761cbf196e999218c591fa1e7e15a21907"
+checksum = "8e6bf6f19e9f8ed8d4048dc22981458ebcf406d67e94cd422e5ecd73d63b3237"
 dependencies = [
- "rustix",
+ "rustix 0.37.27",
  "windows-sys",
 ]
 
 [[package]]
 name = "thiserror"
-version = "1.0.38"
+version = "1.0.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
+checksum = "f9a7210f5c9a7156bb50aa36aed4c95afb51df0df00713949448cf9e97d382d2"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.38"
+version = "1.0.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
+checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
-]
-
-[[package]]
-name = "time"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
-dependencies = [
- "libc",
- "wasi",
- "winapi",
+ "syn 2.0.38",
 ]
 
 [[package]]
@@ -827,21 +770,21 @@ dependencies = [
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.6"
+version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
 
 [[package]]
-name = "unicode-width"
-version = "0.1.10"
+name = "utf8parse"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
 
 [[package]]
 name = "uuid"
-version = "1.3.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79"
+checksum = "88ad59a7560b41a70d191093a945f0b87bc1deeda46fb237479708a1d6b6cdfc"
 
 [[package]]
 name = "version_check"
@@ -849,66 +792,6 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
 
-[[package]]
-name = "wasi"
-version = "0.10.0+wasi-snapshot-preview1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
-
-[[package]]
-name = "wasm-bindgen"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b"
-dependencies = [
- "cfg-if",
- "wasm-bindgen-macro",
-]
-
-[[package]]
-name = "wasm-bindgen-backend"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9"
-dependencies = [
- "bumpalo",
- "log",
- "once_cell",
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-macro"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5"
-dependencies = [
- "quote",
- "wasm-bindgen-macro-support",
-]
-
-[[package]]
-name = "wasm-bindgen-macro-support"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-backend",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-shared"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d"
-
 [[package]]
 name = "winapi"
 version = "0.3.9"
@@ -925,15 +808,6 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
 
-[[package]]
-name = "winapi-util"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
-dependencies = [
- "winapi",
-]
-
 [[package]]
 name = "winapi-x86_64-pc-windows-gnu"
 version = "0.4.0"
@@ -942,9 +816,18 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
 [[package]]
 name = "windows-sys"
-version = "0.42.0"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
 dependencies = [
  "windows_aarch64_gnullvm",
  "windows_aarch64_msvc",
@@ -957,42 +840,42 @@ dependencies = [
 
 [[package]]
 name = "windows_aarch64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
 
 [[package]]
 name = "windows_aarch64_msvc"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
 
 [[package]]
 name = "windows_i686_gnu"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
 
 [[package]]
 name = "windows_i686_msvc"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
 
 [[package]]
 name = "windows_x86_64_gnu"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
 
 [[package]]
 name = "windows_x86_64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
 
 [[package]]
 name = "windows_x86_64_msvc"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
index 92a3853f746847e405679482d9f5f35444bfd723..e88c05fe2591b69e1d8f986e81c67d4d07d6b985 100644
@@ -3,6 +3,7 @@ name = "bcachefs-rust"
 version = "0.3.1"
 authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>"]
 edition = "2018"
+rust-version = "1.65"
 
 [lib]
 crate-type = ["staticlib"]
@@ -10,9 +11,10 @@ crate-type = ["staticlib"]
 [dependencies]
 atty = "0.2.14"
 log = { version = "0.4", features = ["std"] }
-chrono = "0.4"
+chrono = { version = "0.4", default-features = false }
 colored = "2"
 clap = { version = "4.0.32", features = ["derive", "wrap_help"] }
+clap_complete = "4.4.4"
 anyhow = "1.0"
 libc = "0.2.69"
 udev = "0.7.0"
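
The chrono switch above is what trims the lock file so heavily: with default-features = false, the local-time-zone and wasm machinery (iana-time-zone, js-sys/wasm-bindgen, and the old time 0.1 shim) drops out of the dependency graph. Pure UTC/naive date math still works, as in this minimal sketch (assuming chrono 0.4 without default features):

    use chrono::NaiveDateTime;

    fn main() {
        // No local-timezone probing needed for plain epoch arithmetic.
        let epoch = NaiveDateTime::from_timestamp_opt(0, 0).expect("in range");
        assert_eq!(epoch.timestamp(), 0);
    }
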
index b27419766e7b094d972df929be92d8058db531de..a89ef60cf7cffa07fc863db8bf43dedff90ce14d 100644
@@ -3,30 +3,19 @@
 version = 3
 
 [[package]]
-name = "android_system_properties"
-version = "0.1.5"
+name = "aho-corasick"
+version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
+checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
 dependencies = [
- "libc",
+ "memchr",
 ]
 
 [[package]]
 name = "anyhow"
-version = "1.0.69"
+version = "1.0.75"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800"
-
-[[package]]
-name = "atty"
-version = "0.2.14"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
-dependencies = [
- "hermit-abi",
- "libc",
- "winapi",
-]
+checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
 
 [[package]]
 name = "autocfg"
@@ -41,7 +30,7 @@ dependencies = [
  "anyhow",
  "bindgen",
  "bitfield",
- "bitflags",
+ "bitflags 1.3.2",
  "byteorder",
  "chrono",
  "colored",
@@ -59,7 +48,7 @@ name = "bindgen"
 version = "0.64.0"
 source = "git+https://evilpiepirate.org/git/rust-bindgen.git#f773267b090bf16b9e8375fcbdcd8ba5e88806a8"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
  "cexpr",
  "clang-sys",
  "lazy_static",
@@ -70,7 +59,7 @@ dependencies = [
  "regex",
  "rustc-hash",
  "shlex",
- "syn",
+ "syn 1.0.109",
 ]
 
 [[package]]
@@ -86,22 +75,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 
 [[package]]
-name = "bumpalo"
-version = "3.12.0"
+name = "bitflags"
+version = "2.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
+checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
 
 [[package]]
 name = "byteorder"
-version = "1.4.3"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
-
-[[package]]
-name = "cc"
-version = "1.0.79"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "cexpr"
@@ -120,129 +103,49 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
 [[package]]
 name = "chrono"
-version = "0.4.23"
+version = "0.4.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f"
+checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
 dependencies = [
- "iana-time-zone",
- "js-sys",
- "num-integer",
  "num-traits",
- "time",
- "wasm-bindgen",
- "winapi",
 ]
 
 [[package]]
 name = "clang-sys"
-version = "1.6.0"
+version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "77ed9a53e5d4d9c573ae844bfac6872b159cb1d1585a83b29e7a64b7eef7332a"
+checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
 dependencies = [
  "glob",
  "libc",
 ]
 
-[[package]]
-name = "codespan-reporting"
-version = "0.11.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e"
-dependencies = [
- "termcolor",
- "unicode-width",
-]
-
 [[package]]
 name = "colored"
-version = "2.0.0"
+version = "2.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd"
+checksum = "2674ec482fbc38012cf31e6c42ba0177b431a0cb6f15fe40efa5aab1bda516f6"
 dependencies = [
- "atty",
+ "is-terminal",
  "lazy_static",
- "winapi",
-]
-
-[[package]]
-name = "core-foundation-sys"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
-
-[[package]]
-name = "cxx"
-version = "1.0.91"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62"
-dependencies = [
- "cc",
- "cxxbridge-flags",
- "cxxbridge-macro",
- "link-cplusplus",
-]
-
-[[package]]
-name = "cxx-build"
-version = "1.0.91"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690"
-dependencies = [
- "cc",
- "codespan-reporting",
- "once_cell",
- "proc-macro2",
- "quote",
- "scratch",
- "syn",
-]
-
-[[package]]
-name = "cxxbridge-flags"
-version = "1.0.91"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf"
-
-[[package]]
-name = "cxxbridge-macro"
-version = "1.0.91"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
+ "windows-sys",
 ]
 
 [[package]]
 name = "errno"
-version = "0.2.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
-dependencies = [
- "errno-dragonfly",
- "libc",
- "winapi",
-]
-
-[[package]]
-name = "errno-dragonfly"
-version = "0.1.2"
+version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
+checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
 dependencies = [
- "cc",
  "libc",
+ "windows-sys",
 ]
 
 [[package]]
 name = "fastrand"
-version = "1.9.0"
+version = "2.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be"
-dependencies = [
- "instant",
-]
+checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5"
 
 [[package]]
 name = "filedescriptor"
@@ -273,63 +176,19 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
 [[package]]
 name = "hermit-abi"
-version = "0.1.19"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "iana-time-zone"
-version = "0.1.53"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765"
-dependencies = [
- "android_system_properties",
- "core-foundation-sys",
- "iana-time-zone-haiku",
- "js-sys",
- "wasm-bindgen",
- "winapi",
-]
-
-[[package]]
-name = "iana-time-zone-haiku"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca"
-dependencies = [
- "cxx",
- "cxx-build",
-]
-
-[[package]]
-name = "instant"
-version = "0.1.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
-dependencies = [
- "cfg-if",
-]
-
-[[package]]
-name = "io-lifetimes"
-version = "1.0.5"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3"
-dependencies = [
- "libc",
- "windows-sys 0.45.0",
-]
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
 
 [[package]]
-name = "js-sys"
-version = "0.3.61"
+name = "is-terminal"
+version = "0.4.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730"
+checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
 dependencies = [
- "wasm-bindgen",
+ "hermit-abi",
+ "rustix",
+ "windows-sys",
 ]
 
 [[package]]
@@ -346,9 +205,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
 [[package]]
 name = "libc"
-version = "0.2.139"
+version = "0.2.149"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
+checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
 
 [[package]]
 name = "libudev-sys"
@@ -360,35 +219,17 @@ dependencies = [
  "pkg-config",
 ]
 
-[[package]]
-name = "link-cplusplus"
-version = "1.0.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5"
-dependencies = [
- "cc",
-]
-
 [[package]]
 name = "linux-raw-sys"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
-
-[[package]]
-name = "log"
-version = "0.4.17"
+version = "0.4.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
-dependencies = [
- "cfg-if",
-]
+checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
 
 [[package]]
 name = "memchr"
-version = "2.5.0"
+version = "2.6.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
+checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
 
 [[package]]
 name = "memoffset"
@@ -415,36 +256,20 @@ dependencies = [
  "minimal-lexical",
 ]
 
-[[package]]
-name = "num-integer"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
-dependencies = [
- "autocfg",
- "num-traits",
-]
-
 [[package]]
 name = "num-traits"
-version = "0.2.15"
+version = "0.2.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
+checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
 dependencies = [
  "autocfg",
 ]
 
-[[package]]
-name = "once_cell"
-version = "1.17.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
-
 [[package]]
 name = "paste"
-version = "1.0.11"
+version = "1.0.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba"
+checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c"
 
 [[package]]
 name = "peeking_take_while"
@@ -454,51 +279,65 @@ checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
 
 [[package]]
 name = "pkg-config"
-version = "0.3.26"
+version = "0.3.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
+checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964"
 
 [[package]]
 name = "proc-macro2"
-version = "1.0.51"
+version = "1.0.69"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6"
+checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da"
 dependencies = [
  "unicode-ident",
 ]
 
 [[package]]
 name = "quote"
-version = "1.0.23"
+version = "1.0.33"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
+checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae"
 dependencies = [
  "proc-macro2",
 ]
 
 [[package]]
 name = "redox_syscall"
-version = "0.2.16"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
 dependencies = [
- "bitflags",
+ "bitflags 1.3.2",
 ]
 
 [[package]]
 name = "regex"
-version = "1.7.1"
+version = "1.10.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
+checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
 dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
+dependencies = [
+ "aho-corasick",
+ "memchr",
  "regex-syntax",
 ]
 
 [[package]]
 name = "regex-syntax"
-version = "0.6.28"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
+checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
 
 [[package]]
 name = "rustc-hash"
@@ -508,29 +347,22 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
 
 [[package]]
 name = "rustix"
-version = "0.36.8"
+version = "0.38.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644"
+checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3"
 dependencies = [
- "bitflags",
+ "bitflags 2.4.1",
  "errno",
- "io-lifetimes",
  "libc",
  "linux-raw-sys",
- "windows-sys 0.45.0",
+ "windows-sys",
 ]
 
-[[package]]
-name = "scratch"
-version = "1.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2"
-
 [[package]]
 name = "shlex"
-version = "1.1.0"
+version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380"
 
 [[package]]
 name = "syn"
@@ -544,56 +376,47 @@ dependencies = [
 ]
 
 [[package]]
-name = "tempfile"
-version = "3.4.0"
+name = "syn"
+version = "2.0.38"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95"
+checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b"
 dependencies = [
- "cfg-if",
- "fastrand",
- "redox_syscall",
- "rustix",
- "windows-sys 0.42.0",
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
 ]
 
 [[package]]
-name = "termcolor"
-version = "1.2.0"
+name = "tempfile"
+version = "3.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
+checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5"
 dependencies = [
- "winapi-util",
+ "cfg-if",
+ "fastrand",
+ "redox_syscall",
+ "rustix",
+ "windows-sys",
 ]
 
 [[package]]
 name = "thiserror"
-version = "1.0.38"
+version = "1.0.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0"
+checksum = "f9a7210f5c9a7156bb50aa36aed4c95afb51df0df00713949448cf9e97d382d2"
 dependencies = [
  "thiserror-impl",
 ]
 
 [[package]]
 name = "thiserror-impl"
-version = "1.0.38"
+version = "1.0.50"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f"
+checksum = "266b2e40bc00e5a6c09c3584011e08b06f123c00362c92b975ba9843aaaa14b8"
 dependencies = [
  "proc-macro2",
  "quote",
- "syn",
-]
-
-[[package]]
-name = "time"
-version = "0.1.45"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a"
-dependencies = [
- "libc",
- "wasi",
- "winapi",
+ "syn 2.0.38",
 ]
 
 [[package]]
@@ -609,81 +432,15 @@ dependencies = [
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.6"
+version = "1.0.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
-
-[[package]]
-name = "unicode-width"
-version = "0.1.10"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b"
+checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
 
 [[package]]
 name = "uuid"
-version = "1.3.0"
+version = "1.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79"
-
-[[package]]
-name = "wasi"
-version = "0.10.0+wasi-snapshot-preview1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f"
-
-[[package]]
-name = "wasm-bindgen"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b"
-dependencies = [
- "cfg-if",
- "wasm-bindgen-macro",
-]
-
-[[package]]
-name = "wasm-bindgen-backend"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9"
-dependencies = [
- "bumpalo",
- "log",
- "once_cell",
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-macro"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5"
-dependencies = [
- "quote",
- "wasm-bindgen-macro-support",
-]
-
-[[package]]
-name = "wasm-bindgen-macro-support"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6"
-dependencies = [
- "proc-macro2",
- "quote",
- "syn",
- "wasm-bindgen-backend",
- "wasm-bindgen-shared",
-]
-
-[[package]]
-name = "wasm-bindgen-shared"
-version = "0.2.84"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d"
+checksum = "88ad59a7560b41a70d191093a945f0b87bc1deeda46fb237479708a1d6b6cdfc"
 
 [[package]]
 name = "winapi"
@@ -701,15 +458,6 @@ version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
 
-[[package]]
-name = "winapi-util"
-version = "0.1.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
-dependencies = [
- "winapi",
-]
-
 [[package]]
 name = "winapi-x86_64-pc-windows-gnu"
 version = "0.4.0"
@@ -718,33 +466,18 @@ checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
 
 [[package]]
 name = "windows-sys"
-version = "0.42.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
-dependencies = [
- "windows_aarch64_gnullvm",
- "windows_aarch64_msvc",
- "windows_i686_gnu",
- "windows_i686_msvc",
- "windows_x86_64_gnu",
- "windows_x86_64_gnullvm",
- "windows_x86_64_msvc",
-]
-
-[[package]]
-name = "windows-sys"
-version = "0.45.0"
+version = "0.48.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
 dependencies = [
  "windows-targets",
 ]
 
 [[package]]
 name = "windows-targets"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7"
+checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c"
 dependencies = [
  "windows_aarch64_gnullvm",
  "windows_aarch64_msvc",
@@ -757,42 +490,42 @@ dependencies = [
 
 [[package]]
 name = "windows_aarch64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
+checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8"
 
 [[package]]
 name = "windows_aarch64_msvc"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
+checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc"
 
 [[package]]
 name = "windows_i686_gnu"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
+checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e"
 
 [[package]]
 name = "windows_i686_msvc"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
+checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406"
 
 [[package]]
 name = "windows_x86_64_gnu"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
+checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e"
 
 [[package]]
 name = "windows_x86_64_gnullvm"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
+checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc"
 
 [[package]]
 name = "windows_x86_64_msvc"
-version = "0.42.1"
+version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
+checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538"
index cb341eef5a12d193eea68402bef5dd558f503cc1..da6654c240c2f2bfb9f06eda403928b6ead45941 100644
@@ -9,7 +9,7 @@ crate-type = ["lib"]
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-chrono = "0.4"
+chrono = { version = "0.4", default-features = false }
 colored = "2"
 anyhow = "1.0"
 udev = "0.7.0"
@@ -18,7 +18,7 @@ bitfield = "0.14.0"
 memoffset = "0.8.0"
 byteorder = "1.3"
 libc = "0.2.69"
-gag = "1.0.0"
+gag = { version = "1.0.0", default-features = false }
 bitflags = "1.3.2"
 paste = "1.0.11"
 
index 92ec3cefc38a030647dd619add8a65d00a5848ab..819b337ec5e8ddf4c0b5afed76e3f61d46dea3fd 100644
@@ -60,6 +60,7 @@ fn main() {
         .allowlist_var("KEY_SPEC_.*")
         .allowlist_var("Fix753_FMODE_.*")
         .allowlist_var("bch.*")
+        .allowlist_var("__bch2.*")
         .allowlist_var("__BTREE_ITER.*")
         .allowlist_var("BTREE_ITER.*")
         .blocklist_item("bch2_bkey_ops")
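
The extra allowlist_var line keeps the renamed __bch2_* symbol tables visible to the generated Rust bindings. For context, a rough sketch of the bindgen builder idiom used here (the header path is a placeholder, not the project's real wrapper):

    fn main() {
        bindgen::Builder::default()
            .header("wrapper.h")               // hypothetical header
            .allowlist_var("bch.*")
            .allowlist_var("__bch2.*")         // new: __bch2_btree_ids etc.
            .generate()
            .expect("unable to generate bindings")
            .write_to_file("bindings.rs")
            .expect("unable to write bindings");
    }
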
index fa8dbdeb432965605aed3b41c9dcf991ddd85afb..8e897c08b0029d936d5f07db557ced157fc8781e 100644
@@ -43,7 +43,7 @@ impl PartialEq for bch_sb {
 impl bch_sb {
     pub fn crypt(&self) -> Option<&bch_sb_field_crypt> {
         unsafe {
-            let ptr = bch2_sb_field_get(
+            let ptr = bch2_sb_field_get_id(
                 self as *const _ as *mut _,
                 bch_sb_field_type::BCH_SB_FIELD_crypt,
             ) as *const u8;
index 64697ea65863c8f8d804b4c5dfbc6c2c1f49850a..d483083914b40f59fb5c5bcc251e237ba6f041d6 100644
@@ -47,6 +47,8 @@ pub enum BkeyValC<'a> {
     inode_v3(&'a c::bch_inode_v3),
     bucket_gens(&'a c::bch_bucket_gens),
     snapshot_tree(&'a c::bch_snapshot_tree),
+    logged_op_truncate(&'a c::bch_logged_op_truncate),
+    logged_op_finsert(&'a c::bch_logged_op_finsert),
 }
 
 impl<'a, 'b> BkeySC<'a> {
@@ -96,6 +98,8 @@ impl<'a, 'b> BkeySC<'a> {
             KEY_TYPE_inode_v3               => inode_v3(unsafe { transmute(self.v) }),
             KEY_TYPE_bucket_gens            => bucket_gens(unsafe { transmute(self.v) }),
             KEY_TYPE_snapshot_tree          => snapshot_tree(unsafe { transmute(self.v) }),
+            KEY_TYPE_logged_op_truncate     => logged_op_truncate(unsafe { transmute(self.v) }),
+            KEY_TYPE_logged_op_finsert      => logged_op_finsert(unsafe { transmute(self.v) }),
             KEY_TYPE_MAX                    => unreachable!(),
         }
     }
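
The two logged_op_* variants follow the existing pattern: BkeySC::v() turns a C tag-plus-union pair into a borrowing Rust enum. A self-contained analogue of that projection (types and tags here are illustrative, not bch_bindgen's):

    #[repr(C)]
    union Payload {
        truncate: u64,
        finsert: i64,
    }

    #[repr(C)]
    struct CKey {
        tag: u8,
        payload: Payload,
    }

    enum ValC<'a> {
        Truncate(&'a u64),
        Finsert(&'a i64),
    }

    impl CKey {
        fn v(&self) -> ValC<'_> {
            // Safe only because the tag says which union field is live.
            unsafe {
                match self.tag {
                    0 => ValC::Truncate(&self.payload.truncate),
                    _ => ValC::Finsert(&self.payload.finsert),
                }
            }
        }
    }
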
index 32b4e7439ef08f725afa7bfa8a1444e1c9c1c386..f738a46689766681f67a12de7ee5d23b3838da2e 100644
@@ -11,24 +11,21 @@ use std::ptr;
 use bitflags::bitflags;
 
 pub struct BtreeTrans<'f> {
-    raw:    c::btree_trans,
+    raw:    *mut c::btree_trans,
     fs:     PhantomData<&'f Fs>
 }
 
 impl<'f> BtreeTrans<'f> {
     pub fn new(fs: &'f Fs) -> BtreeTrans {
         unsafe {
-            let mut trans: MaybeUninit<c::btree_trans> = MaybeUninit::uninit();
-
-            c::__bch2_trans_init(&mut (*trans.as_mut_ptr()), fs.raw, 0);
-            BtreeTrans { raw: trans.assume_init(), fs: PhantomData }
+            BtreeTrans { raw: &mut *c::__bch2_trans_get(fs.raw, 0), fs: PhantomData }
         }
     }
 }
 
 impl<'f> Drop for BtreeTrans<'f> {
     fn drop(&mut self) {
-        unsafe { c::bch2_trans_exit(&mut self.raw) }
+        unsafe { c::bch2_trans_put(&mut *self.raw) }
     }             
 }
 
@@ -64,9 +61,9 @@ impl<'t> BtreeIter<'t> {
             let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
 
             c::bch2_trans_iter_init_outlined(
-                ptr::addr_of!(trans.raw).cast_mut(),
+                trans.raw,
                 iter.as_mut_ptr(),
-                btree as u32,
+                btree,
                 pos,
                 flags.bits as u32);
 
@@ -123,7 +120,7 @@ impl<'t> BtreeNodeIter<'t> {
         unsafe {
             let mut iter: MaybeUninit<c::btree_iter> = MaybeUninit::uninit();
             c::bch2_trans_node_iter_init(
-                ptr::addr_of!(trans.raw).cast_mut(),
+                trans.raw,
                 iter.as_mut_ptr(),
                 btree,
                 pos,
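
BtreeTrans now stores the pointer handed back by __bch2_trans_get() rather than embedding the struct, and Drop releases it via bch2_trans_put(). A minimal stand-alone sketch of this raw-pointer-plus-Drop ownership pattern, with Box standing in for the C allocator:

    struct Trans {
        raw: *mut u64, // stands in for *mut c::btree_trans
    }

    impl Trans {
        fn new() -> Trans {
            // stands in for c::__bch2_trans_get(fs.raw, 0)
            Trans { raw: Box::into_raw(Box::new(0)) }
        }
    }

    impl Drop for Trans {
        fn drop(&mut self) {
            // stands in for c::bch2_trans_put(&mut *self.raw)
            unsafe { drop(Box::from_raw(self.raw)) }
        }
    }

    fn main() {
        let _t = Trans::new(); // released exactly once at end of scope
    }
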
index 73aeef64541f777f04304776fca307e9ba90f307..4c549442ad1a6c8953a69912d581645e11f55087 100644
@@ -62,7 +62,7 @@ use std::fmt;
 
 impl fmt::Display for c::btree_id {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let s = unsafe { CStr::from_ptr(*c::bch2_btree_ids.get_unchecked(*self as usize)) };
+        let s = unsafe { CStr::from_ptr(c::bch2_btree_id_str(*self)) };
         let s = s.to_str().unwrap();
         write!(f, "{}", s)
     }
@@ -92,7 +92,7 @@ impl FromStr for c::btree_id {
         let s = CString::new(s).unwrap();
         let p = s.as_ptr();
 
-        let v = unsafe {c::match_string(c::bch2_btree_ids[..].as_ptr(), (-(1 as isize)) as usize, p)};
+        let v = unsafe {c::match_string(c::__bch2_btree_ids[..].as_ptr(), (-(1 as isize)) as usize, p)};
         if v >= 0 {
             Ok(unsafe { std::mem::transmute(v) })
         } else {
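
Display and FromStr now go through bch2_btree_id_str() and __bch2_btree_ids instead of indexing the old name table. A self-contained sketch of formatting through an FFI-style C string, with a local helper standing in for the real binding:

    use std::ffi::{c_char, CStr};
    use std::fmt;

    fn btree_id_str(_id: u32) -> *const c_char {
        // stands in for c::bch2_btree_id_str(id)
        b"extents\0".as_ptr() as *const c_char
    }

    struct BtreeId(u32);

    impl fmt::Display for BtreeId {
        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
            let s = unsafe { CStr::from_ptr(btree_id_str(self.0)) };
            write!(f, "{}", s.to_str().unwrap_or("?"))
        }
    }

    fn main() {
        println!("{}", BtreeId(0)); // prints "extents"
    }
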
index e7bcfcfb22574f6ea61a1c4980334ba31e624e31..e68de6640e2ad1f0614edc316f2e35c3d73408f3 100644
@@ -13,8 +13,8 @@
 #include "../include/linux/blkdev.h"
 
 
-#define MARK_FIX_753(req_name) const fmode_t Fix753_##req_name = req_name;
+#define MARK_FIX_753(req_name) const blk_mode_t Fix753_##req_name = req_name;
 
-MARK_FIX_753(FMODE_READ);
-MARK_FIX_753(FMODE_WRITE);
-MARK_FIX_753(FMODE_EXCL);
\ No newline at end of file
+MARK_FIX_753(BLK_OPEN_READ);
+MARK_FIX_753(BLK_OPEN_WRITE);
+MARK_FIX_753(BLK_OPEN_EXCL);
index 3f86b8cd427db4f18b287d8c973d3a07cc912cd6..a0d0591eafd934f29c1ea6436d3650e3c3499610 100644
@@ -8,9 +8,9 @@ use bch_bindgen::btree::BtreeTrans;
 use bch_bindgen::btree::BtreeIter;
 use bch_bindgen::btree::BtreeNodeIter;
 use bch_bindgen::btree::BtreeIterFlags;
-use clap::Parser;
-use std::ffi::{CStr, OsStr, c_int, c_char};
-use std::os::unix::ffi::OsStrExt;
+use clap::{Args, Parser};
+use std::ffi::{c_int, c_char};
+use crate::transform_c_args;
 
 fn list_keys(fs: &Fs, opt: Cli) -> anyhow::Result<()> {
     let trans = BtreeTrans::new(fs);
@@ -84,7 +84,7 @@ fn list_nodes_ondisk(fs: &Fs, opt: Cli) -> anyhow::Result<()> {
     Ok(())
 }
 
-#[derive(Clone, clap::ValueEnum)]
+#[derive(Clone, clap::ValueEnum, Debug)]
 enum Mode {
     Keys,
     Formats,
@@ -92,8 +92,9 @@ enum Mode {
     NodesOndisk,
 }
 
-#[derive(Parser)]
-struct Cli {
+/// List filesystem metadata in textual form
+#[derive(Parser, Debug)]
+pub struct Cli {
     /// Btree to list from
     #[arg(short, long, default_value_t=bcachefs::btree_id::BTREE_ID_extents)]
     btree:      bcachefs::btree_id,
@@ -120,7 +121,7 @@ struct Cli {
     /// Force color on/off. Default: autodetect tty
     #[arg(short, long, action = clap::ArgAction::Set, default_value_t=atty::is(Stream::Stdout))]
     colorize:   bool,
-   
+
     /// Verbose mode
     #[arg(short, long)]
     verbose:    bool,
@@ -157,12 +158,9 @@ fn cmd_list_inner(opt: Cli) -> anyhow::Result<()> {
 }
 
 #[no_mangle]
+#[allow(clippy::not_unsafe_ptr_arg_deref)]
 pub extern "C" fn cmd_list(argc: c_int, argv: *const *const c_char) {
-    let argv: Vec<_> = (0..argc)
-        .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) })
-        .map(|i| OsStr::from_bytes(i.to_bytes()))
-        .collect();
-
+    transform_c_args!(argv, argc, argv);
     let opt = Cli::parse_from(argv);
     colored::control::set_override(opt.colorize);
     if let Err(e) = cmd_list_inner(opt) {
index 0150ffd5ab1bf9581a03f6ae138c684681493a3f..17a289ca98c8eb866db4e195ff4e77263cbb54a6 100644
@@ -1,11 +1,11 @@
 use atty::Stream;
 use bch_bindgen::{bcachefs, bcachefs::bch_sb_handle};
 use log::{info, debug, error, LevelFilter};
-use clap::Parser;
+use clap::{Parser, Subcommand};
 use uuid::Uuid;
 use std::path::PathBuf;
-use crate::key;
-use crate::key::KeyLoc;
+use crate::{key, transform_c_args};
+use crate::key::KeyLocation;
 use crate::logger::SimpleLogger;
 use std::ffi::{CStr, CString, OsStr, c_int, c_char, c_void};
 use std::os::unix::ffi::OsStrExt;
@@ -14,7 +14,7 @@ fn mount_inner(
     src: String,
     target: impl AsRef<std::path::Path>,
     fstype: &str,
-    mountflags: u64,
+    mountflags: libc::c_ulong,
     data: Option<String>,
 ) -> anyhow::Result<()> {
 
@@ -45,7 +45,7 @@ fn mount_inner(
 
 /// Parse a comma-separated mount options and split out mountflags and filesystem
 /// specific options.
-fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, u64) {
+fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, libc::c_ulong) {
     use either::Either::*;
     debug!("parsing mount options: {}", options.as_ref());
     let (opts, flags) = options
@@ -129,7 +129,7 @@ fn get_devices_by_uuid(uuid: Uuid) -> anyhow::Result<Vec<(PathBuf, bch_sb_handle
 /// Mount a bcachefs filesystem by its UUID.
 #[derive(Parser, Debug)]
 #[command(author, version, about, long_about = None)]
-struct Cli {
+pub struct Cli {
     /// Where the password would be loaded from.
     ///
     /// Possible values are:
@@ -137,7 +137,7 @@ struct Cli {
     /// "wait" - wait for password to become available before mounting;
     /// "ask" -  prompt the user for password;
     #[arg(short, long, default_value = "ask", verbatim_doc_comment)]
-    key_location:   KeyLoc,
+    key_location:   KeyLocation,
 
     /// Device, or UUID=<UUID>
     dev:            String,
@@ -145,7 +145,7 @@ struct Cli {
     /// Where the filesystem should be mounted. If not set, then the filesystem
     /// won't actually be mounted. But all steps preceeding mounting the
     /// filesystem (e.g. asking for passphrase) will still be performed.
-    mountpoint:     std::path::PathBuf,
+    mountpoint:     Option<std::path::PathBuf>,
 
     /// Mount options
     #[arg(short, default_value = "")]
@@ -199,34 +199,35 @@ fn cmd_mount_inner(opt: Cli) -> anyhow::Result<()> {
     if sbs.len() == 0 {
         Err(anyhow::anyhow!("No device found from specified parameters"))?;
     } else if unsafe { bcachefs::bch2_sb_is_encrypted(sbs[0].sb) } {
-        let key = opt
-            .key_location
-            .0
-            .ok_or_else(|| anyhow::anyhow!("no keyoption specified for locked filesystem"))?;
-
-        key::prepare_key(&sbs[0], key)?;
+        key::prepare_key(&sbs[0], opt.key_location)?;
     }
 
-    info!(
-        "mounting with params: device: {}, target: {}, options: {}",
-        devs,
-        &opt.mountpoint.to_string_lossy(),
-        &opt.options
-    );
+    if let Some(mountpoint) = opt.mountpoint {
+        info!(
+            "mounting with params: device: {}, target: {}, options: {}",
+            devs,
+            mountpoint.to_string_lossy(),
+            &opt.options
+        );
+
+        mount(devs, mountpoint, &opt.options)?;
+    } else {
+        info!(
+            "would mount with params: device: {}, options: {}",
+            devs,
+            &opt.options
+        );
+    }
 
-    mount(devs, &opt.mountpoint, &opt.options)?;
     Ok(())
 }
 
 #[no_mangle]
-pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) {
-    let argv: Vec<_> = (0..argc)
-        .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) })
-        .map(|i| OsStr::from_bytes(i.to_bytes()))
-        .collect();
-
+#[allow(clippy::not_unsafe_ptr_arg_deref)]
+pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) -> c_int {
+    transform_c_args!(argv, argc, argv);
     let opt = Cli::parse_from(argv);
-    
+
     log::set_boxed_logger(Box::new(SimpleLogger)).unwrap();
 
     // @TODO : more granular log levels via mount option
@@ -239,7 +240,9 @@ pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) {
     colored::control::set_override(opt.colorize);
     if let Err(e) = cmd_mount_inner(opt) {
         error!("Fatal error: {}", e);
+        1
     } else {
         info!("Successfully mounted");
+        0
     }
 }
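
Two behavioural changes land here: mountpoint becomes Option<PathBuf>, so omitting it triggers the new "would mount" dry run, and cmd_mount now returns a C-style exit code instead of (). A minimal sketch of the optional positional, assuming clap 4's derive API:

    use clap::Parser;
    use std::path::PathBuf;

    #[derive(Parser, Debug)]
    struct Cli {
        dev: String,
        mountpoint: Option<PathBuf>, // optional positional
    }

    fn main() {
        // Parses fine with the mountpoint left out.
        let opt = Cli::parse_from(["bcachefs-mount", "UUID=1234"]);
        assert!(opt.mountpoint.is_none());
    }
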
index 2b4fc45e8bc494de023dc877c34fae7c7eb21536..93daa2637168c3f7e4b37c400e4c07b4ed0b2a15 100644
@@ -1,50 +1,64 @@
 use log::{info};
 use bch_bindgen::bcachefs::bch_sb_handle;
+use clap::builder::PossibleValue;
 use crate::c_str;
 use anyhow::anyhow;
 
 #[derive(Clone, Debug)]
 pub enum KeyLocation {
+    None,
     Fail,
     Wait,
     Ask,
 }
 
-#[derive(Clone, Debug)]
-pub struct KeyLoc(pub Option<KeyLocation>);
-impl std::ops::Deref for KeyLoc {
-    type Target = Option<KeyLocation>;
-    fn deref(&self) -> &Self::Target {
-        &self.0
-    }
-}
-
-impl std::str::FromStr for KeyLoc {
+impl std::str::FromStr for KeyLocation {
     type Err = anyhow::Error;
     fn from_str(s: &str) -> anyhow::Result<Self> {
         match s {
-            ""      => Ok(KeyLoc(None)),
-            "fail"  => Ok(KeyLoc(Some(KeyLocation::Fail))),
-            "wait"  => Ok(KeyLoc(Some(KeyLocation::Wait))),
-            "ask"   => Ok(KeyLoc(Some(KeyLocation::Ask))),
-            _       => Err(anyhow!("invalid password option")),
+            ""|"none" => Ok(KeyLocation::None),
+            "fail"    => Ok(KeyLocation::Fail),
+            "wait"    => Ok(KeyLocation::Wait),
+            "ask"     => Ok(KeyLocation::Ask),
+            _         => Err(anyhow!("invalid password option")),
         }
     }
 }
 
+impl clap::ValueEnum for KeyLocation {
+    fn value_variants<'a>() -> &'a [Self] {
+        &[
+            KeyLocation::None,
+            KeyLocation::Fail,
+            KeyLocation::Wait,
+            KeyLocation::Ask,
+        ]
+    }
+
+    fn to_possible_value(&self) -> Option<PossibleValue> {
+        Some(match self {
+            Self::None => PossibleValue::new("none").alias(""),
+            Self::Fail => PossibleValue::new("fail"),
+            Self::Wait => PossibleValue::new("wait"),
+            Self::Ask => PossibleValue::new("ask"),
+        })
+    }
+}
+
 fn check_for_key(key_name: &std::ffi::CStr) -> anyhow::Result<bool> {
     use bch_bindgen::keyutils::{self, keyctl_search};
     let key_name = key_name.to_bytes_with_nul().as_ptr() as *const _;
-    let key_type = c_str!("logon");
+    let key_type = c_str!("user");
 
     let key_id = unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) };
     if key_id > 0 {
         info!("Key has became available");
         Ok(true)
-    } else if errno::errno().0 != libc::ENOKEY {
-        Err(crate::ErrnoError(errno::errno()).into())
     } else {
-        Ok(false)
+        match errno::errno().0 {
+            libc::ENOKEY | libc::EKEYREVOKED => Ok(false),
+            _ => Err(crate::ErrnoError(errno::errno()).into()),
+        }
     }
 }
 
@@ -101,7 +115,7 @@ fn ask_for_key(sb: &bch_sb_handle) -> anyhow::Result<()> {
     } else if key.magic != bch_key_magic {
         Err(anyhow!("failed to verify the password"))
     } else {
-        let key_type = c_str!("logon");
+        let key_type = c_str!("user");
         let ret = unsafe {
             bch_bindgen::keyutils::add_key(
                 key_type,
@@ -125,5 +139,6 @@ pub fn prepare_key(sb: &bch_sb_handle, password: KeyLocation) -> anyhow::Result<
         KeyLocation::Fail => Err(anyhow!("no key available")),
         KeyLocation::Wait => Ok(wait_for_key(&sb.sb().uuid())?),
         KeyLocation::Ask => ask_for_key(sb),
+        _ => Err(anyhow!("no keyoption specified for locked filesystem")),
     }
 }
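
Folding KeyLoc into KeyLocation with a hand-written ValueEnum keeps the old empty-string spelling valid via an alias on "none", while giving clap real completion data. A trimmed sketch of that contract, assuming clap 4 (Loc is illustrative):

    use clap::builder::PossibleValue;
    use clap::ValueEnum;

    #[derive(Clone, Debug)]
    enum Loc {
        None,
        Ask,
    }

    impl ValueEnum for Loc {
        fn value_variants<'a>() -> &'a [Self] {
            &[Loc::None, Loc::Ask]
        }

        fn to_possible_value(&self) -> Option<PossibleValue> {
            Some(match self {
                Loc::None => PossibleValue::new("none").alias(""),
                Loc::Ask => PossibleValue::new("ask"),
            })
        }
    }

    fn main() {
        // The provided ValueEnum::from_str matches aliases too.
        assert!(matches!(Loc::from_str("", true), Ok(Loc::None)));
    }
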
index 159d049de51feaffbd7c5619224929783451bbeb..64297b41d7dce8d56c6a1bdbb0c6ef7ea74add00 100644
@@ -1,7 +1,24 @@
+use clap::Subcommand;
+
 pub mod key;
 pub mod logger;
 pub mod cmd_mount;
 pub mod cmd_list;
+pub mod cmd_completions;
+
+#[derive(clap::Parser, Debug)]
+#[command(name = "bcachefs")]
+pub struct Cli {
+    #[command(subcommand)]
+    subcommands: Subcommands,
+}
+
+#[derive(Subcommand, Debug)]
+enum Subcommands {
+    List(cmd_list::Cli),
+    Mount(cmd_mount::Cli),
+    Completions(cmd_completions::Cli),
+}
 
 #[macro_export]
 macro_rules! c_str {
@@ -14,6 +31,18 @@ macro_rules! c_str {
     };
 }
 
+#[macro_export]
+macro_rules! transform_c_args {
+    ($var:ident, $argc:expr, $argv:expr) => {
+        // TODO: `OsStr::from_bytes` only exists on *nix
+        use ::std::os::unix::ffi::OsStrExt;
+        let $var: Vec<_> = (0..$argc)
+        .map(|i| unsafe { ::std::ffi::CStr::from_ptr(*$argv.add(i as usize)) })
+        .map(|i| ::std::ffi::OsStr::from_bytes(i.to_bytes()))
+        .collect();
+    };
+}
+
 #[derive(Debug)]
 struct ErrnoError(errno::Errno);
 impl std::fmt::Display for ErrnoError {
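
transform_c_args! centralizes the argc/argv conversion that cmd_list and cmd_mount previously open-coded. What an expansion boils down to, written as a stand-alone Unix-only function:

    use std::ffi::{c_char, c_int, CStr, OsStr};
    use std::os::unix::ffi::OsStrExt;

    unsafe fn c_args<'a>(argc: c_int, argv: *const *const c_char) -> Vec<&'a OsStr> {
        (0..argc)
            .map(|i| CStr::from_ptr(*argv.add(i as usize)))
            .map(|s| OsStr::from_bytes(s.to_bytes()))
            .collect()
    }
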
index 624656a1fa50d16f322414e6f0aa30c5938075a0..923a6666c72422e5c5e6a4014235c1dddbdd38a9 100644
@@ -17,6 +17,7 @@
 #include <blkid.h>
 #include <uuid/uuid.h>
 
+#include "libbcachefs.h"
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "linux/sort.h"
 #include "tools-util.h"
@@ -186,7 +187,7 @@ ssize_t read_string_list_or_die(const char *opt, const char * const list[],
 }
 
 /* Returns size of file or block device: */
-u64 get_size(const char *path, int fd)
+u64 get_size(int fd)
 {
        struct stat statbuf = xfstat(fd);
 
@@ -199,7 +200,7 @@ u64 get_size(const char *path, int fd)
 }
 
 /* Returns blocksize, in bytes: */
-unsigned get_blocksize(const char *path, int fd)
+unsigned get_blocksize(int fd)
 {
        struct stat statbuf = xfstat(fd);
 
@@ -212,22 +213,24 @@ unsigned get_blocksize(const char *path, int fd)
 }
 
 /* Open a block device, do magic blkid stuff to probe for existing filesystems: */
-int open_for_format(const char *dev, bool force)
+int open_for_format(struct dev_opts *dev, bool force)
 {
        blkid_probe pr;
        const char *fs_type = NULL, *fs_label = NULL;
        size_t fs_type_len, fs_label_len;
 
-       int fd = open(dev, O_RDWR|O_EXCL);
-       if (fd < 0)
-               die("Error opening device to format %s: %m", dev);
+       dev->bdev = blkdev_get_by_path(dev->path, BLK_OPEN_READ|BLK_OPEN_WRITE|BLK_OPEN_EXCL,
+                                      dev, NULL);
+       int ret = PTR_ERR_OR_ZERO(dev->bdev);
+       if (ret < 0)
+               die("Error opening device to format %s: %s", dev->path, strerror(-ret));
 
        if (force)
-               return fd;
+               return 0;
 
        if (!(pr = blkid_new_probe()))
                die("blkid error 1");
-       if (blkid_probe_set_device(pr, fd, 0, 0))
+       if (blkid_probe_set_device(pr, dev->bdev->bd_buffered_fd, 0, 0))
                die("blkid error 2");
        if (blkid_probe_enable_partitions(pr, true))
                die("blkid error 3");
@@ -240,19 +243,19 @@ int open_for_format(const char *dev, bool force)
        if (fs_type) {
                if (fs_label)
                        printf("%s contains a %s filesystem labelled '%s'\n",
-                              dev, fs_type, fs_label);
+                              dev->path, fs_type, fs_label);
                else
                        printf("%s contains a %s filesystem\n",
-                              dev, fs_type);
+                              dev->path, fs_type);
                fputs("Proceed anyway?", stdout);
                if (!ask_yn())
                        exit(EXIT_FAILURE);
                while (blkid_do_probe(pr) == 0)
-                       blkid_do_wipe(pr, 0);
+                       blkid_do_wipe(pr, 0);
        }
 
        blkid_free_probe(pr);
-       return fd;
+       return ret;
 }
 
 bool ask_yn(void)
@@ -670,7 +673,11 @@ struct bbpos bbpos_parse(char *buf)
        if (!(field = strsep(&s, ":")))
                die("invalid bbpos %s", buf);
 
-       ret.btree = read_string_list_or_die(field, bch2_btree_ids, "btree id");
+       ret.btree = read_string_list_or_die(field, __bch2_btree_ids, "btree id");
+
+       if (!s)
+               die("invalid bbpos %s", buf);
+
        ret.pos = bpos_parse(s);
        return ret;
 }
index e7bdd2c3ff50dd01536696be09676da35fece8c3..7a04c1080beb9ae5dc0df82ae40579ad3c20c7d4 100644
@@ -62,9 +62,10 @@ u64 read_file_u64(int, const char *);
 ssize_t read_string_list_or_die(const char *, const char * const[],
                                const char *);
 
-u64 get_size(const char *, int);
-unsigned get_blocksize(const char *, int);
-int open_for_format(const char *, bool);
+u64 get_size(int);
+unsigned get_blocksize(int);
+struct dev_opts;
+int open_for_format(struct dev_opts *, bool);
 
 bool ask_yn(void);