git.sesse.net Git - bcachefs-tools-debian/commitdiff
New upstream snapshot
author    Jonathan Carter <jcc@debian.org>
          Wed, 16 Feb 2022 12:53:14 +0000 (14:53 +0200)
committer Jonathan Carter <jcc@debian.org>
          Wed, 16 Feb 2022 12:53:14 +0000 (14:53 +0200)
185 files changed:
.gitignore
.travis.yml
INSTALL
Makefile
bcachefs.8
bcachefs.c
cmd_data.c
cmd_debug.c
cmd_device.c
cmd_format.c
cmd_fs.c
cmd_fusemount.c
cmd_migrate.c
cmd_subvolume.c [new file with mode: 0644]
cmds.h
crypto.c
debian/changelog
debian/copyright
debian/files
debian/rules
default.nix
flake.lock [new file with mode: 0644]
flake.nix [new file with mode: 0644]
include/linux/backing-dev-defs.h [new file with mode: 0644]
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/compiler.h
include/linux/poison.h
include/linux/siphash.h [new file with mode: 0644]
include/linux/slab.h
include/linux/vmalloc.h
include/trace/events/bcachefs.h
libbcachefs.c
libbcachefs.h
libbcachefs/acl.c
libbcachefs/acl.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/bkey_sort.c
libbcachefs/bkey_sort.h
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache.h
libbcachefs/btree_locking.h
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/buckets_waiting_for_journal.c [new file with mode: 0644]
libbcachefs/buckets_waiting_for_journal.h [new file with mode: 0644]
libbcachefs/buckets_waiting_for_journal_types.h [new file with mode: 0644]
libbcachefs/chardev.c
libbcachefs/checksum.c
libbcachefs/checksum.h
libbcachefs/compress.c
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/disk_groups.c
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/ec_types.h
libbcachefs/errcode.h [new file with mode: 0644]
libbcachefs/error.c
libbcachefs/extent_update.c
libbcachefs/extent_update.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/eytzinger.h
libbcachefs/fs-common.c
libbcachefs/fs-common.h
libbcachefs/fs-io.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/io_types.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_seq_blacklist.c
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/move_types.h
libbcachefs/movinggc.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/quota.c
libbcachefs/rebalance.c
libbcachefs/rebalance_types.h
libbcachefs/recovery.c
libbcachefs/recovery.h
libbcachefs/reflink.c
libbcachefs/reflink.h
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/str_hash.h
libbcachefs/subvolume.c [new file with mode: 0644]
libbcachefs/subvolume.h [new file with mode: 0644]
libbcachefs/subvolume_types.h [new file with mode: 0644]
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/tests.c
libbcachefs/util.c
libbcachefs/util.h
libbcachefs/varint.c
libbcachefs/xattr.c
libbcachefs/xattr.h
linux/blkdev.c
linux/siphash.c [new file with mode: 0644]
mount/build.rs [deleted file]
mount/src/filesystem.rs [deleted file]
mount/src/lib.rs [deleted file]
nix/bcachefs-kernel.nix [new file with mode: 0644]
nix/bcachefs.rev.sha256 [new file with mode: 0644]
nix/overlay.nix [new file with mode: 0644]
packaging/bcachefs-tools.spec
qcow2.c
rust-src/bch_bindgen/.gitignore [new file with mode: 0644]
rust-src/bch_bindgen/Cargo.lock [new file with mode: 0644]
rust-src/bch_bindgen/Cargo.toml [new file with mode: 0644]
rust-src/bch_bindgen/build.rs [new file with mode: 0644]
rust-src/bch_bindgen/default.nix [new file with mode: 0644]
rust-src/bch_bindgen/rustfmt.toml [new file with mode: 0644]
rust-src/bch_bindgen/src/bcachefs.rs [new file with mode: 0644]
rust-src/bch_bindgen/src/keyutils.rs [new file with mode: 0644]
rust-src/bch_bindgen/src/keyutils_wrapper.h [moved from mount/src/keyutils_wrapper.h with 100% similarity]
rust-src/bch_bindgen/src/lib.rs [new file with mode: 0644]
rust-src/bch_bindgen/src/libbcachefs_wrapper.h [moved from mount/src/libbcachefs_wrapper.h with 59% similarity]
rust-src/bch_bindgen/src/rs.rs [new file with mode: 0644]
rust-src/mount/.gitignore [new file with mode: 0644]
rust-src/mount/Cargo.lock [moved from mount/Cargo.lock with 68% similarity]
rust-src/mount/Cargo.toml [moved from mount/Cargo.toml with 54% similarity]
rust-src/mount/README.md [new file with mode: 0644]
rust-src/mount/default.nix [new file with mode: 0644]
rust-src/mount/module.nix [new file with mode: 0644]
rust-src/mount/rustfmt.toml [new file with mode: 0644]
rust-src/mount/src/filesystem.rs [new file with mode: 0644]
rust-src/mount/src/key.rs [moved from mount/src/key.rs with 77% similarity]
rust-src/mount/src/lib.rs [new file with mode: 0644]
rust-src/mount/src/main.rs [new file with mode: 0644]
smoke_test
tests/__init__.py [new file with mode: 0644]
tests/conftest.py
tests/test_basic.py
tests/test_fixture.py
tests/test_fuse.py
tests/util.py
tests/valgrind-suppressions.txt
tools-util.c
tools-util.h

diff --git a/.gitignore b/.gitignore
index 8feb598ed1d42b110e951dbc878c015ddc27cd3a..b1c03cd14a631721195172b076f39d9a27917f73 100644 (file)
@@ -18,4 +18,4 @@ tests/__pycache__/
 
 mount/target
 mount.bcachefs
-doc/bcachefs.5.rst
+bcachefs-principles-of-operation.*
diff --git a/.travis.yml b/.travis.yml
index 3b90b73c27f0c5456c364d211fa431f2c80e0779..e66f0c2a0e9cc3592315ec61bfce726735fcc76a 100644 (file)
@@ -9,6 +9,7 @@ addons:
     apt:
         packages:
             - valgrind
+            - python3-docutils
             - python3-pytest
             - python3-pytest-xdist
             - meson
@@ -18,7 +19,6 @@ addons:
             - libblkid-dev
             - libkeyutils-dev
             - liblz4-dev
-            - libscrypt-dev
             - libsodium-dev
             - liburcu-dev
             - libzstd-dev
diff --git a/INSTALL b/INSTALL
index 85c09a2157119ad86dde0f4712893ed4cdb59db0..b4d60bf48db87b0fff8d7653740d0f443131fe2a 100644 (file)
--- a/INSTALL
+++ b/INSTALL
@@ -6,7 +6,6 @@ Dependencies:
  * libblkid
  * libkeyutils
  * liblz4
- * libscrypt
  * libsodium
  * liburcu
  * libuuid
@@ -17,17 +16,18 @@ Dependencies:
 
 Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with
     apt install -y pkg-config libaio-dev libblkid-dev libkeyutils-dev \
-        liblz4-dev libscrypt-dev libsodium-dev liburcu-dev libzstd-dev \
-        uuid-dev zlib1g-dev valgrind libudev-dev
+        liblz4-dev libsodium-dev liburcu-dev libzstd-dev \
+        uuid-dev zlib1g-dev valgrind libudev-dev git build-essential \
+        python3 python3-docutils
 
 Fedora: install the "Development tools" group along with:
     dnf install -y libaio-devel libsodium-devel \
         libblkid-devel libzstd-devel zlib-devel userspace-rcu-devel \
         lz4-devel libuuid-devel valgrind-devel keyutils-libs-devel \
-        libscrypt-devel findutils
+        findutils
 
 Arch: install bcachefs-tools-git from the AUR.
-Or to build from source, install libscrypt from the AUR along with,
+Or to build from source, install build dependencies with
     pacman -S base-devel libaio keyutils libsodium liburcu zstd valgrind
 
 Then, just make && make install
diff --git a/Makefile b/Makefile
index 23e0508569d8b3856d0a4906d43be01495dda3df..e49534e6c66323504c037119bbf509efa860f6b2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 PREFIX?=/usr/local
 PKG_CONFIG?=pkg-config
 INSTALL=install
-PYTEST=pytest-3
-CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall                           \
+
+CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC                             \
        -Wno-pointer-sign                                       \
        -fno-strict-aliasing                                    \
        -fno-delete-null-pointer-checks                         \
@@ -20,6 +20,21 @@ CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall                         \
        $(EXTRA_CFLAGS)
 LDFLAGS+=$(CFLAGS) $(EXTRA_LDFLAGS)
 
+## Configure Tools
+PYTEST_ARGS?=
+PYTEST_CMD?=$(shell \
+       command -v pytest-3 \
+       || which pytest-3 \
+)
+PYTEST:=$(PYTEST_CMD) $(PYTEST_ARGS)
+
+CARGO_ARGS=
+CARGO=cargo $(CARGO_ARGS)
+CARGO_PROFILE=release
+# CARGO_PROFILE=debug
+
+CARGO_BUILD_ARGS=--$(CARGO_PROFILE)
+CARGO_BUILD=$(CARGO) build $(CARGO_BUILD_ARGS)
 VERSION?=$(shell git describe --dirty=+ 2>/dev/null || echo v0.1-nogit)
 
 include Makefile.compiler
@@ -49,8 +64,7 @@ endif
 
 CFLAGS+=$(PKGCONFIG_CFLAGS)
 LDLIBS+=$(PKGCONFIG_LDLIBS)
-
-LDLIBS+=-lm -lpthread -lrt -lscrypt -lkeyutils -laio -ldl
+LDLIBS+=-lm -lpthread -lrt -lkeyutils -laio -ldl
 LDLIBS+=$(EXTRA_LDLIBS)
 
 ifeq ($(PREFIX),/usr)
@@ -61,31 +75,22 @@ else
        INITRAMFS_DIR=/etc/initramfs-tools
 endif
 
-var := $(shell rst2man -V 2>/dev/null)
-ifeq ($(.SHELLSTATUS),0)
-       RST2MAN=rst2man
-endif
-
-var := $(shell rst2man.py -V 2>/dev/null)
-ifeq ($(.SHELLSTATUS),0)
-       RST2MAN=rst2man.py
-endif
-
-undefine var
-
-ifeq (,$(RST2MAN))
-       @echo "WARNING: no RST2MAN found!"
-endif
-
 .PHONY: all
-all: bcachefs bcachefs.5
+all: bcachefs lib
+
+.PHONY: lib
+lib: libbcachefs.so
 
 .PHONY: tests
 tests: tests/test_helper
 
 .PHONY: check
 check: tests bcachefs
-       cd tests; $(PYTEST)
+ifneq (,$(PYTEST_CMD))
+       $(PYTEST)
+else
+       @echo "WARNING: pytest not found or specified, tests could not be run."
+endif
 
 .PHONY: TAGS tags
 TAGS:
@@ -94,34 +99,32 @@ TAGS:
 tags:
        ctags -R .
 
-DOCSRC := opts_macro.h bcachefs.5.rst.tmpl
-DOCGENERATED := bcachefs.5 doc/bcachefs.5.rst
-DOCDEPS := $(addprefix ./doc/,$(DOCSRC))
-bcachefs.5: $(DOCDEPS)  libbcachefs/opts.h
-       $(CC) doc/opts_macro.h -I libbcachefs -I include -E 2>/dev/null \
-               | doc/macro2rst.py
-       $(RST2MAN) doc/bcachefs.5.rst bcachefs.5
-
-SRCS=$(shell find . -type f -iname '*.c')
+SRCS=$(shell find . -type f ! -path '*/.*/*' -iname '*.c')
 DEPS=$(SRCS:.c=.d)
 -include $(DEPS)
 
 OBJS=$(SRCS:.c=.o)
 bcachefs: $(filter-out ./tests/%.o, $(OBJS))
 
-MOUNT_SRCS=$(shell find mount/src -type f -iname '*.rs') \
-    mount/Cargo.toml mount/Cargo.lock mount/build.rs
+RUST_SRCS=$(shell find rust-src/ -type f -iname '*.rs')
+MOUNT_SRCS=$(filter %mount, $(RUST_SRCS))
 
 debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y
 debug: bcachefs
 
-libbcachefs_mount.a: $(MOUNT_SRCS)
-       LIBBCACHEFS_INCLUDE=$(CURDIR) cargo build --manifest-path mount/Cargo.toml --release
-       cp mount/target/release/libbcachefs_mount.a $@
-
 MOUNT_OBJ=$(filter-out ./bcachefs.o ./tests/%.o ./cmd_%.o , $(OBJS))
-mount.bcachefs: libbcachefs_mount.a $(MOUNT_OBJ)
-       $(CC) -Wl,--gc-sections libbcachefs_mount.a $(MOUNT_OBJ) -o $@ $(LDLIBS)
+libbcachefs.so: LDFLAGS+=-shared
+libbcachefs.so: $(MOUNT_OBJ)
+       $(CC) $(LDFLAGS) $+ -o $@ $(LDLIBS)
+
+MOUNT_TOML=rust-src/mount/Cargo.toml
+mount.bcachefs: lib $(MOUNT_SRCS)
+       LIBBCACHEFS_LIB=$(CURDIR) \
+       LIBBCACHEFS_INCLUDE=$(CURDIR) \
+       $(CARGO_BUILD) --manifest-path $(MOUNT_TOML)
+
+       ln -f rust-src/mount/target/$(CARGO_PROFILE)/bcachefs-mount $@
+
 
 tests/test_helper: $(filter ./tests/%.o, $(OBJS))
 
@@ -138,7 +141,7 @@ cmd_version.o : .version
 .PHONY: install
 install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs
 install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs
-install: bcachefs
+install: bcachefs lib
        $(INSTALL) -m0755 -D bcachefs      -t $(DESTDIR)$(ROOT_SBINDIR)
        $(INSTALL) -m0755    fsck.bcachefs    $(DESTDIR)$(ROOT_SBINDIR)
        $(INSTALL) -m0755    mkfs.bcachefs    $(DESTDIR)$(ROOT_SBINDIR)
@@ -146,18 +149,26 @@ install: bcachefs
        $(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT)
        $(INSTALL) -m0755 -D initramfs/hook   $(DESTDIR)$(INITRAMFS_HOOK)
        $(INSTALL) -m0755 -D mount.bcachefs.sh $(DESTDIR)$(ROOT_SBINDIR)
+       $(INSTALL) -m0755 -D libbcachefs.so -t $(DESTDIR)$(PREFIX)/lib/
+
        sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK)
        echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK)
 
 .PHONY: clean
 clean:
        $(RM) bcachefs mount.bcachefs libbcachefs_mount.a tests/test_helper .version $(OBJS) $(DEPS) $(DOCGENERATED)
-       $(RM) -rf mount/target
+       $(RM) -rf rust-src/*/target
 
 .PHONY: deb
 deb: all
        debuild -us -uc -nc -b -i -I
 
+bcachefs-principles-of-operation.pdf: doc/bcachefs-principles-of-operation.tex
+       pdflatex doc/bcachefs-principles-of-operation.tex
+       pdflatex doc/bcachefs-principles-of-operation.tex
+
+doc: bcachefs-principles-of-operation.pdf
+
 .PHONY: update-bcachefs-sources
 update-bcachefs-sources:
        git rm -rf --ignore-unmatch libbcachefs
@@ -184,6 +195,7 @@ update-bcachefs-sources:
        git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision
        git add .bcachefs_revision
 
+
 .PHONY: update-commit-bcachefs-sources
 update-commit-bcachefs-sources: update-bcachefs-sources
        git commit -m "Update bcachefs sources to $(shell git -C $(LINUX_DIR) show --oneline --no-patch)"
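
The hunks above replace the hard-coded pytest-3 runner and the rst2man probing with overridable variables, build libbcachefs.so alongside the binary, and add a doc target. A minimal sketch of how the new knobs might be exercised (test arguments illustrative; the doc target assumes the TeX source is present in doc/):

    make                                          # builds bcachefs and libbcachefs.so
    make check PYTEST_CMD=pytest PYTEST_ARGS='-v' # override the test runner and its arguments
    make doc                                      # pdflatex -> bcachefs-principles-of-operation.pdf
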
diff --git a/bcachefs.8 b/bcachefs.8
index 61af7f425a809a63cdf48570482f9b0b2729bf66..874068c8e95d313832a725e6ba5e589d42d8ed51 100644 (file)
@@ -99,7 +99,7 @@ Format one or a list of devices with bcachefs data structures.
 You need to do this before you create a volume.
 .Pp
 Device specific options must come before corresponding devices, e.g.
-.Dl bcachefs format --group=ssd /dev/sda --group=hdd /dev/sdb
+.Dl bcachefs format --group=ssd /dev/sda --label=hdd /dev/sdb
 .Bl -tag -width Ds
 .It Fl b , Fl -block Ns = Ns Ar size
 block size, in bytes (e.g. 4k)
@@ -111,7 +111,7 @@ Set metadata checksum type (default:
 .It Fl -data_checksum_type Ns = Ns ( Cm none | crc32c | crc64 )
 Set data checksum type (default:
 .Cm crc32c ) .
-.It Fl -compression_type Ns = Ns ( Cm none | lz4 | gzip )
+.It Fl -compression Ns = Ns ( Cm none | lz4 | gzip | zstd )
 Set compression type (default:
 .Cm none ) .
 .It Fl -data_replicas Ns = Ns Ar number
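
Reflecting the renamed --compression option and the newly documented zstd choice above, a format invocation would now look like this (device path illustrative):

    bcachefs format --compression=zstd /dev/sdb
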
diff --git a/bcachefs.c b/bcachefs.c
index 239b114723831cb630d837c95bcf36f6e58b9cb9..4f2cd55111a2285123eabcee7852598a7830837b 100644 (file)
@@ -37,14 +37,14 @@ static void usage(void)
             "Repair:\n"
             "  fsck                     Check an existing filesystem for errors\n"
             "\n"
-            "Startup/shutdown, assembly of multi device filesystems:\n"
 #if 0
+            "Startup/shutdown, assembly of multi device filesystems:\n"
             "  assemble                 Assemble an existing multi device filesystem\n"
             "  incremental              Incrementally assemble an existing multi device filesystem\n"
             "  run                      Start a partially assembled filesystem\n"
             "  stop                     Stop a running filesystem\n"
-#endif
             "\n"
+#endif
             "Commands for managing a running filesystem:\n"
             "  fs usage                 Show disk usage\n"
             "\n"
@@ -56,7 +56,12 @@ static void usage(void)
             "  device evacuate          Migrate data off of a specific device\n"
             "  device set-state         Mark a device as failed\n"
             "  device resize            Resize filesystem on a device\n"
-            "  device journal-resize    Resize journal on a device\n"
+            "  device resize-journal    Resize journal on a device\n"
+            "\n"
+            "Commands for managing subvolumes and snapshots:\n"
+            "  subvolume create     Create a new subvolume\n"
+            "  subvolume delete     Delete an existing subvolume\n"
+            "  subvolume snapshot   Create a snapshot\n"
             "\n"
             "Commands for managing filesystem data:\n"
             "  data rereplicate         Rereplicate degraded data\n"
@@ -87,14 +92,9 @@ static char *full_cmd;
 
 static char *pop_cmd(int *argc, char *argv[])
 {
-       if (*argc < 2) {
-               printf("%s: missing command\n", argv[0]);
-               usage();
-               exit(EXIT_FAILURE);
-       }
-
        char *cmd = argv[1];
-       memmove(&argv[1], &argv[2], *argc * sizeof(argv[0]));
+       if (!(*argc < 2))
+               memmove(&argv[1], &argv[2], *argc * sizeof(argv[0]));
        (*argc)--;
 
        full_cmd = mprintf("%s %s", full_cmd, cmd);
@@ -105,10 +105,11 @@ static int fs_cmds(int argc, char *argv[])
 {
        char *cmd = pop_cmd(&argc, argv);
 
+       if (argc < 1)
+               return fs_usage();
        if (!strcmp(cmd, "usage"))
                return cmd_fs_usage(argc, argv);
 
-       usage();
        return 0;
 }
 
@@ -116,6 +117,8 @@ static int device_cmds(int argc, char *argv[])
 {
        char *cmd = pop_cmd(&argc, argv);
 
+       if (argc < 1)
+               return device_usage();
        if (!strcmp(cmd, "add"))
                return cmd_device_add(argc, argv);
        if (!strcmp(cmd, "remove"))
@@ -133,7 +136,6 @@ static int device_cmds(int argc, char *argv[])
        if (!strcmp(cmd, "resize-journal"))
                return cmd_device_resize_journal(argc, argv);
 
-       usage();
        return 0;
 }
 
@@ -141,12 +143,28 @@ static int data_cmds(int argc, char *argv[])
 {
        char *cmd = pop_cmd(&argc, argv);
 
+       if (argc < 1)
+               return data_usage();
        if (!strcmp(cmd, "rereplicate"))
                return cmd_data_rereplicate(argc, argv);
        if (!strcmp(cmd, "job"))
                return cmd_data_job(argc, argv);
 
-       usage();
+       return 0;
+}
+
+static int subvolume_cmds(int argc, char *argv[])
+{
+       char *cmd = pop_cmd(&argc, argv);
+       if (argc < 1)
+               return subvolume_usage();
+       if (!strcmp(cmd, "create"))
+               return cmd_subvolume_create(argc, argv);
+       if (!strcmp(cmd, "delete"))
+               return cmd_subvolume_delete(argc, argv);
+       if (!strcmp(cmd, "snapshot"))
+               return cmd_subvolume_snapshot(argc, argv);
+
        return 0;
 }
 
@@ -159,16 +177,34 @@ int main(int argc, char *argv[])
        setvbuf(stdout, NULL, _IOLBF, 0);
 
        char *cmd = pop_cmd(&argc, argv);
+       if (argc < 1) {
+               puts("missing command\n");
+               goto usage;
+       }
 
-       if (!strcmp(cmd, "version"))
-               return cmd_version(argc, argv);
+       /* these subcommands display usage when argc < 2 */
+       if (!strcmp(cmd, "device"))
+               return device_cmds(argc, argv);
+       if (!strcmp(cmd, "fs"))
+               return fs_cmds(argc, argv);
+       if (!strcmp(cmd, "data"))
+               return data_cmds(argc, argv);
+       if (!strcmp(cmd, "subvolume"))
+               return subvolume_cmds(argc, argv);
        if (!strcmp(cmd, "format"))
                return cmd_format(argc, argv);
+       if (!strcmp(cmd, "fsck"))
+               return cmd_fsck(argc, argv);
+       if (!strcmp(cmd, "version"))
+               return cmd_version(argc, argv);
        if (!strcmp(cmd, "show-super"))
                return cmd_show_super(argc, argv);
 
-       if (!strcmp(cmd, "fsck"))
-               return cmd_fsck(argc, argv);
+       if (argc < 2) {
+               printf("%s: missing command\n", argv[0]);
+               usage();
+               exit(EXIT_FAILURE);
+       }
 
 #if 0
        if (!strcmp(cmd, "assemble"))
@@ -181,15 +217,6 @@ int main(int argc, char *argv[])
                return cmd_stop(argc, argv);
 #endif
 
-       if (!strcmp(cmd, "fs"))
-               return fs_cmds(argc, argv);
-
-       if (!strcmp(cmd, "device"))
-               return device_cmds(argc, argv);
-
-       if (!strcmp(cmd, "data"))
-               return data_cmds(argc, argv);
-
        if (!strcmp(cmd, "unlock"))
                return cmd_unlock(argc, argv);
        if (!strcmp(cmd, "set-passphrase"))
@@ -223,6 +250,7 @@ int main(int argc, char *argv[])
        }
 
        printf("Unknown command %s\n", cmd);
+usage:
        usage();
        exit(EXIT_FAILURE);
 }
diff --git a/cmd_data.c b/cmd_data.c
index 25a2dcb22cd0631b41010d1c7fa2b3776a39a6ed..d78598d5abb7ce7cefa478461e24351c1195376d 100644 (file)
@@ -9,6 +9,19 @@
 #include "cmds.h"
 #include "libbcachefs.h"
 
+int data_usage(void)
+{
+       puts("bcachefs data - manage filesystem data\n"
+            "Usage: bcachefs data <CMD> [OPTIONS]\n"
+            "\n"
+            "Commands:\n"
+            "  rereplicate                     Rereplicate degraded data\n"
+            "  job                             Kick off low level data jobs\n"
+            "\n"
+            "Report bugs to <linux-bcache@vger.kernel.org>");
+       return 0;
+}
+
 static void data_rereplicate_usage(void)
 {
        puts("bcachefs data rereplicate\n"
diff --git a/cmd_debug.c b/cmd_debug.c
index 2f56e41e82b70834ecc29b78a69c8ddc842086cb..6ff58a96642bc2cabaee6f5ed2224b3a2db19616 100644 (file)
@@ -37,6 +37,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
        struct bch_sb *sb = ca->disk_sb.sb;
        ranges data;
        unsigned i;
+       int ret;
 
        darray_init(data);
 
@@ -64,12 +65,12 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
                const struct bch_extent_ptr *ptr;
                struct bkey_ptrs_c ptrs;
                struct btree_trans trans;
-               struct btree_iter *iter;
+               struct btree_iter iter;
                struct btree *b;
 
                bch2_trans_init(&trans, c, 0, 0);
 
-               __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b) {
+               __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) {
                        struct btree_node_iter iter;
                        struct bkey u;
                        struct bkey_s_c k;
@@ -85,6 +86,9 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
                        }
                }
 
+               if (ret)
+                       die("error %s walking btree nodes", strerror(-ret));
+
                b = c->btree_roots[i].b;
                if (!btree_node_fake(b)) {
                        ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
@@ -95,6 +99,8 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd)
                                                  ptr->offset << 9,
                                                  btree_bytes(c));
                }
+
+               bch2_trans_iter_exit(&trans, &iter);
                bch2_trans_exit(&trans);
        }
 
@@ -181,7 +187,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
                      struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        char buf[512];
        int ret;
@@ -189,6 +195,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
        bch2_trans_init(&trans, c, 0, 0);
 
        for_each_btree_key(&trans, iter, btree_id, start,
+                          BTREE_ITER_ALL_SNAPSHOTS|
                           BTREE_ITER_PREFETCH, k, ret) {
                if (bkey_cmp(k.k->p, end) > 0)
                        break;
@@ -196,7 +203,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id,
                bch2_bkey_val_to_text(&PBUF(buf), c, k);
                puts(buf);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 }
@@ -205,20 +212,24 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne
                               struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        char buf[4096];
+       int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) {
+       __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) {
                if (bkey_cmp(b->key.k.p, end) > 0)
                        break;
 
                bch2_btree_node_to_text(&PBUF(buf), c, b);
                puts(buf);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret)
+               die("error %s walking btree nodes", strerror(-ret));
 
        bch2_trans_exit(&trans);
 }
@@ -227,13 +238,14 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
                       struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        char buf[4096];
+       int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) {
+       __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) {
                if (bkey_cmp(b->key.k.p, end) > 0)
                        break;
 
@@ -241,7 +253,10 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level,
                fputs(buf, stdout);
                putchar('\n');
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret)
+               die("error %s walking btree nodes", strerror(-ret));
 
        bch2_trans_exit(&trans);
 }
@@ -280,7 +295,7 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b)
        bio_put(bio);
        percpu_ref_put(&ca->io_ref);
 
-       while (offset < c->opts.btree_node_size) {
+       while (offset < btree_sectors(c)) {
                struct bset *i;
                struct nonce nonce;
                struct bch_csum csum;
@@ -346,13 +361,14 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned
                              struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        char buf[4096];
+       int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) {
+       __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) {
                if (bkey_cmp(b->key.k.p, end) > 0)
                        break;
 
@@ -362,7 +378,10 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned
 
                print_node_ondisk(c, b);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret)
+               die("error %s walking btree nodes", strerror(-ret));
 
        bch2_trans_exit(&trans);
 }
@@ -371,16 +390,17 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l
                            struct bpos start, struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree_node_iter node_iter;
        struct bkey unpacked;
        struct bkey_s_c k;
        struct btree *b;
        char buf[4096];
+       int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) {
+       __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) {
                if (bkey_cmp(b->key.k.p, end) > 0)
                        break;
 
@@ -393,7 +413,10 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l
                        puts(buf);
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret)
+               die("error %s walking btree nodes", strerror(-ret));
 
        bch2_trans_exit(&trans);
 }
@@ -443,9 +466,9 @@ int cmd_list(int argc, char *argv[])
        enum btree_id btree_id_start    = 0;
        enum btree_id btree_id_end      = BTREE_ID_NR;
        enum btree_id btree_id;
-       unsigned level;
+       unsigned level = 0;
        struct bpos start = POS_MIN, end = POS_MAX;
-       u64 inum;
+       u64 inum = 0;
        int mode = 0, opt;
 
        opt_set(opts, nochanges,        true);
@@ -572,9 +595,6 @@ int cmd_list_journal(int argc, char *argv[])
 
        struct journal_replay *p;
        struct jset_entry *entry;
-       struct bkey_i *k, *_n;
-
-       /* This could be greatly expanded: */
 
        list_for_each_entry(p, &c->journal_entries, list) {
                printf("journal entry   %8llu\n"
@@ -585,14 +605,13 @@ int cmd_list_journal(int argc, char *argv[])
                       le32_to_cpu(p->j.version),
                       le64_to_cpu(p->j.last_seq));
 
-               for_each_jset_key(k, _n, entry, &p->j) {
-                       char buf[200];
+               vstruct_for_each(&p->j, entry) {
+                       char _buf[4096];
+                       struct printbuf buf = PBUF(_buf);
 
-                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
-                       printf("btree %s l %u: %s\n",
-                              bch2_btree_ids[entry->btree_id],
-                              entry->level,
-                              buf);
+                       printbuf_indent_push(&buf, 2);
+                       bch2_journal_entry_to_text(&buf, c, entry);
+                       printf("%s\n", _buf);
                }
        }
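
With these changes the journal listing goes through bch2_journal_entry_to_text(), printing every journal entry type rather than only btree keys, and the btree walkers now die with an error instead of stopping silently. Assuming the debug commands keep their usual names (the dispatch table is not part of this excerpt):

    bcachefs list_journal /dev/sdb   # dump all journal entries
    bcachefs list /dev/sdb           # list keys; level and inum now default to 0
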
 
diff --git a/cmd_device.c b/cmd_device.c
index b18bdd8c6f2e0a89bffb5e0e74fb0352e2a918f2..ef2dfa14654d2a3c6d5f37f48678f611b9c5fc3c 100644 (file)
 #include "libbcachefs/opts.h"
 #include "tools-util.h"
 
+int device_usage(void)
+{
+       puts("bcachefs device - manage devices within a running filesystem\n"
+            "Usage: bcachefs device <CMD> [OPTION]\n"
+            "\n"
+            "Commands:\n"
+            "  add                     add a new device to an existing filesystem\n"
+            "  remove                  remove a device from an existing filesystem\n"
+            "  online                  re-add an existing member to a filesystem\n"
+            "  offline                 take a device offline, without removing it\n"
+            "  evacuate                migrate data off a specific device\n"
+            "  set-state               mark a device as failed\n"
+            "  resize                  resize filesystem on a device\n"
+            "  resize-journal          resize journal on a device\n"
+            "\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+       return 0;
+}
+
 static void device_add_usage(void)
 {
        puts("bcachefs device add - add a device to an existing filesystem\n"
@@ -30,7 +49,7 @@ static void device_add_usage(void)
             "  -S, --fs_size=size          Size of filesystem on device\n"
             "  -B, --bucket=size           Bucket size\n"
             "  -D, --discard               Enable discards\n"
-            "  -g, --group=group           Disk group\n"
+            "  -l, --label=label           Disk label\n"
             "  -f, --force                 Use device even if it appears to already be formatted\n"
             "  -h, --help                  Display this help and exit\n"
             "\n"
@@ -43,7 +62,7 @@ int cmd_device_add(int argc, char *argv[])
                { "fs_size",            required_argument,      NULL, 'S' },
                { "bucket",             required_argument,      NULL, 'B' },
                { "discard",            no_argument,            NULL, 'D' },
-               { "group",              required_argument,      NULL, 'g' },
+               { "label",              required_argument,      NULL, 'l' },
                { "force",              no_argument,            NULL, 'f' },
                { "help",               no_argument,            NULL, 'h' },
                { NULL }
@@ -59,18 +78,16 @@ int cmd_device_add(int argc, char *argv[])
                case 'S':
                        if (bch2_strtoull_h(optarg, &dev_opts.size))
                                die("invalid filesystem size");
-
-                       dev_opts.size >>= 9;
                        break;
                case 'B':
-                       dev_opts.bucket_size =
-                               hatoi_validate(optarg, "bucket size");
+                       if (bch2_strtoull_h(optarg, &dev_opts.bucket_size))
+                               die("bad bucket_size %s", optarg);
                        break;
                case 'D':
                        dev_opts.discard = true;
                        break;
-               case 'g':
-                       dev_opts.group = strdup(optarg);
+               case 'l':
+                       dev_opts.label = strdup(optarg);
                        break;
                case 'f':
                        force = true;
@@ -85,8 +102,8 @@ int cmd_device_add(int argc, char *argv[])
        if (!fs_path)
                die("Please supply a filesystem");
 
-       char *dev_path = arg_pop();
-       if (!dev_path)
+       dev_opts.path = arg_pop();
+       if (!dev_opts.path)
                die("Please supply a device");
 
        if (argc)
@@ -94,7 +111,6 @@ int cmd_device_add(int argc, char *argv[])
 
        struct bchfs_handle fs = bcache_fs_open(fs_path);
 
-       dev_opts.path = dev_path;
        dev_opts.fd = open_for_format(dev_opts.path, force);
 
        struct bch_opt_strs fs_opt_strs;
@@ -103,9 +119,9 @@ int cmd_device_add(int argc, char *argv[])
        struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs);
 
        opt_set(fs_opts, block_size,
-               read_file_u64(fs.sysfs_fd, "block_size") >> 9);
+               read_file_u64(fs.sysfs_fd, "options/block_size"));
        opt_set(fs_opts, btree_node_size,
-               read_file_u64(fs.sysfs_fd, "btree_node_size") >> 9);
+               read_file_u64(fs.sysfs_fd, "options/btree_node_size"));
 
        struct bch_sb *sb = bch2_format(fs_opt_strs,
                                        fs_opts,
@@ -498,6 +514,9 @@ int cmd_device_resize(int argc, char *argv[])
 
                u64 nbuckets = size / le16_to_cpu(m->bucket_size);
 
+               if (nbuckets < le64_to_cpu(m->nbuckets))
+                       die("Shrinking not supported yet");
+
                printf("resizing %s to %llu buckets\n", dev, nbuckets);
                bchu_disk_resize(fs, idx, nbuckets);
        } else {
@@ -519,6 +538,9 @@ int cmd_device_resize(int argc, char *argv[])
 
                u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size);
 
+               if (nbuckets < le64_to_cpu(resize->mi.nbuckets))
+                       die("Shrinking not supported yet");
+
                printf("resizing %s to %llu buckets\n", dev, nbuckets);
                int ret = bch2_dev_resize(c, resize, nbuckets);
                if (ret)
@@ -533,7 +555,7 @@ int cmd_device_resize(int argc, char *argv[])
 static void device_resize_journal_usage(void)
 {
        puts("bcachefs device resize-journal \n"
-            "Usage: bcachefs device resize-journal device [ size ]\n"
+            "Usage: bcachefs device resize-journal device size\n"
             "\n"
             "Options:\n"
             "  -h, --help                  display this help and exit\n"
@@ -565,7 +587,7 @@ int cmd_device_resize_journal(int argc, char *argv[])
 
        char *size_arg = arg_pop();
        if (!size_arg)
-               size = get_size(dev, dev_fd);
+               die("Please supply a journal size");
        else if (bch2_strtoull_h(size_arg, &size))
                die("invalid size");
 
diff --git a/cmd_format.c b/cmd_format.c
index 3f96f5de3c221acb8cfafca33bd12acb0003aef8..cc16b31fabca66991bbb7d29172303bce4c685d8 100644 (file)
 x(0,   replicas,               required_argument)      \
 x(0,   encrypted,              no_argument)            \
 x(0,   no_passphrase,          no_argument)            \
-x('L', label,                  required_argument)      \
+x('L', fs_label,               required_argument)      \
 x('U', uuid,                   required_argument)      \
 x(0,   fs_size,                required_argument)      \
 x(0,   superblock_size,        required_argument)      \
 x(0,   bucket_size,            required_argument)      \
-x('g', group,                  required_argument)      \
+x('l', label,                  required_argument)      \
 x(0,   discard,                no_argument)            \
 x(0,   data_allowed,           required_argument)      \
 x(0,   durability,             required_argument)      \
@@ -61,7 +61,7 @@ static void usage(void)
             "      --replicas=#            Sets both data and metadata replicas\n"
             "      --encrypted             Enable whole filesystem encryption (chacha20/poly1305)\n"
             "      --no_passphrase         Don't encrypt master encryption key\n"
-            "  -L, --label=label\n"
+            "  -L, --fs_label=label\n"
             "  -U, --uuid=uuid\n"
             "      --superblock_size=size\n"
             "\n"
@@ -69,14 +69,14 @@ static void usage(void)
 
        bch2_opts_usage(OPT_DEVICE);
 
-       puts("  -g, --group=label           Disk group\n"
+       puts("  -l, --label=label           Disk label\n"
             "\n"
             "  -f, --force\n"
             "  -q, --quiet                 Only print errors\n"
             "  -h, --help                  Display this help and exit\n"
             "\n"
             "Device specific options must come before corresponding devices, e.g.\n"
-            "  bcachefs format --group cache /dev/sdb /dev/sdc\n"
+            "  bcachefs format --label cache /dev/sdb /dev/sdc\n"
             "\n"
             "Report bugs to <linux-bcache@vger.kernel.org>");
 }
@@ -147,7 +147,7 @@ int cmd_format(int argc, char *argv[])
                case O_no_passphrase:
                        no_passphrase = true;
                        break;
-               case O_label:
+               case O_fs_label:
                case 'L':
                        opts.label = optarg;
                        break;
@@ -163,8 +163,6 @@ int cmd_format(int argc, char *argv[])
                case O_fs_size:
                        if (bch2_strtoull_h(optarg, &dev_opts.size))
                                die("invalid filesystem size");
-
-                       dev_opts.size >>= 9;
                        break;
                case O_superblock_size:
                        if (bch2_strtouint_h(optarg, &opts.superblock_size))
@@ -173,12 +171,12 @@ int cmd_format(int argc, char *argv[])
                        opts.superblock_size >>= 9;
                        break;
                case O_bucket_size:
-                       dev_opts.bucket_size =
-                               hatoi_validate(optarg, "bucket size");
+                       if (bch2_strtoull_h(optarg, &dev_opts.bucket_size))
+                               die("bad bucket_size %s", optarg);
                        break;
-               case O_group:
-               case 'g':
-                       dev_opts.group = optarg;
+               case O_label:
+               case 'l':
+                       dev_opts.label = optarg;
                        break;
                case O_discard:
                        dev_opts.discard = true;
@@ -258,7 +256,7 @@ int cmd_format(int argc, char *argv[])
                                                darray_size(device_paths),
                                                bch2_opts_empty());
                if (IS_ERR(c))
-                       die("error opening %s: %s", device_paths.item,
+                       die("error opening %s: %s", device_paths.item[0],
                            strerror(-PTR_ERR(c)));
 
                bch2_fs_stop(c);
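
After this rename, -L/--fs_label sets the filesystem-wide label while -l/--label applies to the devices that follow it on the command line, e.g. (labels and devices illustrative):

    bcachefs format --fs_label=backup --label=ssd /dev/sda --label=hdd /dev/sdb
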
diff --git a/cmd_fs.c b/cmd_fs.c
index 8b9d91b80d162a051ce7d7920f43e57dd34a4e98..f8c46429af60344a448849eaf29980df33d27308 100644 (file)
--- a/cmd_fs.c
+++ b/cmd_fs.c
@@ -195,6 +195,18 @@ static void print_fs_usage(const char *path, enum units units)
        bcache_fs_close(fs);
 }
 
+int fs_usage(void)
+{
+       puts("bcachefs fs - manage a running filesystem\n"
+            "Usage: bcachefs fs <CMD> [OPTION]... path\n"
+            "\n"
+            "Commands:\n"
+            "  usage                      show disk usage\n"
+            "\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+       return 0;
+}
+
 int cmd_fs_usage(int argc, char *argv[])
 {
        enum units units = BYTES;
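
The new fs_usage() text is what a bare "bcachefs fs" now prints; with the usage subcommand it reports disk usage as before (the -h human-readable flag is assumed from the rest of cmd_fs_usage, which this hunk only begins to show):

    bcachefs fs usage -h /mnt
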
diff --git a/cmd_fusemount.c b/cmd_fusemount.c
index 2b6b2d7ecc92e8d3b322d9138173b608a55ca534..216094f06ad33e3b34d8ed72e4c96d3b0e4a927e 100644 (file)
@@ -171,7 +171,7 @@ static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t inum,
        struct bch_fs *c = fuse_req_userdata(req);
        struct bch_inode_unpacked inode_u;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        u64 now;
        int ret;
 
@@ -185,8 +185,7 @@ retry:
        bch2_trans_begin(&trans);
        now = bch2_current_time(c);
 
-       iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(iter);
+       ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -208,11 +207,11 @@ retry:
                inode_u.bi_mtime = now;
        /* TODO: CTIME? */
 
-       ret   = bch2_inode_write(&trans, iter, &inode_u) ?:
+       ret   = bch2_inode_write(&trans, &iter, &inode_u) ?:
                bch2_trans_commit(&trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL);
 err:
-        bch2_trans_iter_put(&trans, iter);
+        bch2_trans_iter_exit(&trans, &iter);
        if (ret == -EINTR)
                goto retry;
 
@@ -523,7 +522,7 @@ static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t inum,
 static int inode_update_times(struct bch_fs *c, fuse_ino_t inum)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bch_inode_unpacked inode_u;
        int ret = 0;
        u64 now;
@@ -533,15 +532,14 @@ retry:
        bch2_trans_begin(&trans);
        now = bch2_current_time(c);
 
-       iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(iter);
+       ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        inode_u.bi_mtime = now;
        inode_u.bi_ctime = now;
 
-       ret = bch2_inode_write(&trans, iter, &inode_u);
+       ret = bch2_inode_write(&trans, &iter, &inode_u);
        if (ret)
                goto err;
 
@@ -549,7 +547,7 @@ retry:
                                BTREE_INSERT_NOFAIL);
 
 err:
-        bch2_trans_iter_put(&trans, iter);
+        bch2_trans_iter_exit(&trans, &iter);
        if (ret == -EINTR)
                goto retry;
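
The FUSE path is converted to the same stack-allocated btree_iter API as the rest of the tree. Assuming the usual fusemount invocation (not shown in this patch):

    bcachefs fusemount /dev/sdb /mnt
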
 
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 51260906dccd95f71ed71627095dbe187777ba47..4da3ab1b58c6e0125f68c24e1771db4335d6b244 100644 (file)
@@ -123,6 +123,7 @@ static void update_inode(struct bch_fs *c,
        int ret;
 
        bch2_inode_pack(c, &packed, inode);
+       packed.inode.k.p.snapshot = U32_MAX;
        ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
                                NULL, NULL, 0);
        if (ret)
@@ -138,8 +139,9 @@ static void create_link(struct bch_fs *c,
        struct bch_inode_unpacked inode;
 
        int ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_link_trans(&trans, parent->bi_inum, inum,
-                               &parent_u, &inode, &qstr));
+               bch2_link_trans(&trans,
+                               (subvol_inum) { 1, parent->bi_inum }, &parent_u,
+                               (subvol_inum) { 1, inum }, &inode, &qstr));
        if (ret)
                die("error creating hardlink: %s", strerror(-ret));
 }
@@ -153,13 +155,16 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c,
        struct qstr qstr = QSTR(name);
        struct bch_inode_unpacked new_inode;
 
+       bch2_inode_init_early(c, &new_inode);
+
        int ret = bch2_trans_do(c, NULL, NULL, 0,
                bch2_create_trans(&trans,
-                                 parent->bi_inum, parent,
+                                 (subvol_inum) { 1, parent->bi_inum }, parent,
                                  &new_inode, &qstr,
-                                 uid, gid, mode, rdev, NULL, NULL));
+                                 uid, gid, mode, rdev, NULL, NULL,
+                                 (subvol_inum) {}, 0));
        if (ret)
-               die("error creating file: %s", strerror(-ret));
+               die("error creating %s: %s", name, strerror(-ret));
 
        return new_inode;
 }
@@ -225,44 +230,48 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst,
                const struct xattr_handler *h = xattr_resolve_name(&attr);
 
                int ret = bch2_trans_do(c, NULL, NULL, 0,
-                               bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr,
+                               bch2_xattr_set(&trans,
+                                              (subvol_inum) { 1, dst->bi_inum },
+                                              &hash_info, attr,
                                               val, val_size, h->flags, 0));
                if (ret < 0)
                        die("error creating xattr: %s", strerror(-ret));
        }
 }
 
-static char buf[1 << 20] __aligned(PAGE_SIZE);
+#define WRITE_DATA_BUF (1 << 20)
+
+static char buf[WRITE_DATA_BUF] __aligned(PAGE_SIZE);
 
 static void write_data(struct bch_fs *c,
                       struct bch_inode_unpacked *dst_inode,
                       u64 dst_offset, void *buf, size_t len)
 {
-       struct {
-               struct bch_write_op op;
-               struct bio_vec bv[sizeof(buf) / PAGE_SIZE];
-       } o;
+       struct bch_write_op op;
+       struct bio_vec bv[WRITE_DATA_BUF / PAGE_SIZE];
        struct closure cl;
 
        BUG_ON(dst_offset       & (block_bytes(c) - 1));
        BUG_ON(len              & (block_bytes(c) - 1));
+       BUG_ON(len > WRITE_DATA_BUF);
 
        closure_init_stack(&cl);
 
-       bio_init(&o.op.wbio.bio, o.bv, ARRAY_SIZE(o.bv));
-       bch2_bio_map(&o.op.wbio.bio, buf, len);
+       bio_init(&op.wbio.bio, bv, ARRAY_SIZE(bv));
+       bch2_bio_map(&op.wbio.bio, buf, len);
 
-       bch2_write_op_init(&o.op, c, bch2_opts_to_inode_opts(c->opts));
-       o.op.write_point        = writepoint_hashed(0);
-       o.op.nr_replicas        = 1;
-       o.op.pos                = POS(dst_inode->bi_inum, dst_offset >> 9);
+       bch2_write_op_init(&op, c, bch2_opts_to_inode_opts(c->opts));
+       op.write_point  = writepoint_hashed(0);
+       op.nr_replicas  = 1;
+       op.subvol       = 1;
+       op.pos          = SPOS(dst_inode->bi_inum, dst_offset >> 9, U32_MAX);
 
-       int ret = bch2_disk_reservation_get(c, &o.op.res, len >> 9,
+       int ret = bch2_disk_reservation_get(c, &op.res, len >> 9,
                                            c->opts.data_replicas, 0);
        if (ret)
                die("error reserving space in new filesystem: %s", strerror(-ret));
 
-       closure_call(&o.op.cl, bch2_write, NULL, &cl);
+       closure_call(&op.cl, bch2_write, NULL, &cl);
        closure_sync(&cl);
 
        dst_inode->bi_sectors += len >> 9;
@@ -314,11 +323,12 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                e = bkey_extent_init(&k.k);
                e->k.p.inode    = dst->bi_inum;
                e->k.p.offset   = logical + sectors;
+               e->k.p.snapshot = U32_MAX;
                e->k.size       = sectors;
                bch2_bkey_append_ptr(&e->k_i, (struct bch_extent_ptr) {
                                        .offset = physical,
                                        .dev = 0,
-                                       .gen = bucket(ca, b)->mark.gen,
+                                       .gen = *bucket_gen(ca, b),
                                  });
 
                ret = bch2_disk_reservation_get(c, &res, sectors, 1,
@@ -327,8 +337,6 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                        die("error reserving space in new filesystem: %s",
                            strerror(-ret));
 
-               bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c);
-
                ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i,
                                        &res, NULL, 0);
                if (ret)
@@ -428,6 +436,7 @@ static void copy_dir(struct copy_fs_state *s,
 
                if (!strcmp(d->d_name, ".") ||
                    !strcmp(d->d_name, "..") ||
+                   !strcmp(d->d_name, "lost+found") ||
                    stat.st_ino == s->bcachefs_inum)
                        continue;
 
@@ -569,7 +578,8 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
        syncfs(src_fd);
 
        struct bch_inode_unpacked root_inode;
-       int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, &root_inode);
+       int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO },
+                                         &root_inode);
        if (ret)
                die("error looking up root directory: %s", strerror(-ret));
 
@@ -595,8 +605,6 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path,
 
        darray_free(s.extents);
        genradix_free(&s.hardlinks);
-
-       bch2_alloc_write(c, false);
 }
 
 static void find_superblock_space(ranges extents,
@@ -672,7 +680,7 @@ static int migrate_fs(const char            *fs_path,
 
        u64 bcachefs_inum;
        ranges extents = reserve_new_fs_space(file_path,
-                               fs_opts.block_size << 9,
+                               fs_opts.block_size >> 9,
                                get_size(dev.path, dev.fd) / 5,
                                &bcachefs_inum, stat.st_dev, force);
 
@@ -694,6 +702,7 @@ static int migrate_fs(const char            *fs_path,
        opt_set(opts, sb,       sb_offset);
        opt_set(opts, nostart,  true);
        opt_set(opts, noexcl,   true);
+       opt_set(opts, buckets_nouse, true);
 
        c = bch2_fs_open(path, 1, opts);
        if (IS_ERR(c))
diff --git a/cmd_subvolume.c b/cmd_subvolume.c
new file mode 100644 (file)
index 0000000..99a302b
--- /dev/null
+++ b/cmd_subvolume.c
@@ -0,0 +1,188 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <libgen.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "libbcachefs/bcachefs.h"
+#include "libbcachefs/bcachefs_ioctl.h"
+#include "cmds.h"
+#include "libbcachefs.h"
+#include "libbcachefs/opts.h"
+#include "tools-util.h"
+
+int subvolume_usage(void)
+{
+       puts("bcachefs subvolume - manage subvolumes and snapshots\n"
+            "Usage: bcachefs subvolume <CMD> [OPTION]\n"
+            "\n"
+            "Commands:\n"
+            "  create                  create a subvolume\n"
+            "  delete                  delete a subvolume\n"
+            "  snapshot                create a snapshot\n"
+            "\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+       return 0;
+}
+
+static void subvolume_create_usage(void)
+{
+       puts("bcachefs subvolume create - create a new subvolume\n"
+            "Usage: bcachefs subvolume create [OPTION]... path\n"
+            "\n"
+            "Options:\n"
+            "  -h, --help                  Display this help and exit\n"
+            "\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_subvolume_create(int argc, char *argv[])
+{
+       static const struct option longopts[] = {
+               { "help",               no_argument,            NULL, 'h' },
+               { NULL }
+       };
+       char *path;
+       int opt;
+
+       while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+               switch (opt) {
+               case 'h':
+                       subvolume_create_usage();
+                       exit(EXIT_SUCCESS);
+               }
+       args_shift(optind);
+
+       while ((path = arg_pop())) {
+               char *dir = dirname(strdup(path));
+
+               struct bchfs_handle fs = bcache_fs_open(dir);
+
+               struct bch_ioctl_subvolume i = {
+                       .dirfd          = AT_FDCWD,
+                       .mode           = 0777,
+                       .dst_ptr        = (unsigned long)path,
+               };
+
+               xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_CREATE, &i);
+               bcache_fs_close(fs);
+       }
+
+       return 0;
+}
+
+static void subvolume_delete_usage(void)
+{
+       puts("bcachefs subvolume delete - delete an existing subvolume\n"
+            "Usage: bcachefs subvolume delete [OPTION]... path\n"
+            "\n"
+            "Options:\n"
+            "  -h, --help                  Display this help and exit\n"
+            "\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_subvolume_delete(int argc, char *argv[])
+{
+       static const struct option longopts[] = {
+               { "help",               no_argument,            NULL, 'h' },
+               { NULL }
+       };
+       char *path;
+       int opt;
+
+       while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1)
+               switch (opt) {
+               case 'h':
+                       subvolume_delete_usage();
+                       exit(EXIT_SUCCESS);
+               }
+       args_shift(optind);
+
+       while ((path = arg_pop())) {
+               char *dir = dirname(strdup(path));
+
+               struct bchfs_handle fs = bcache_fs_open(dir);
+
+               struct bch_ioctl_subvolume i = {
+                       .dirfd          = AT_FDCWD,
+                       .mode           = 0777,
+                       .dst_ptr        = (unsigned long)path,
+               };
+
+               xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_DESTROY, &i);
+               bcache_fs_close(fs);
+       }
+
+       return 0;
+}
+
+static void snapshot_create_usage(void)
+{
+       puts("bcachefs subvolume snapshot - create a snapshot \n"
+            "Usage: bcachefs subvolume snapshot [OPTION]... <source> <dest>\n"
+            "\n"
+            "Create a snapshot of <source> at <dest>. If specified, <source> must be a subvolume;\n"
+            "if not specified the snapshot will be of the subvolme containing <dest>.\n"
+            "Options:\n"
+            "  -r                          Make snapshot read only\n"
+            "  -h, --help                  Display this help and exit\n"
+            "\n"
+            "Report bugs to <linux-bcachefs@vger.kernel.org>");
+}
+
+int cmd_subvolume_snapshot(int argc, char *argv[])
+{
+       static const struct option longopts[] = {
+               { "help",               no_argument,            NULL, 'h' },
+               { NULL }
+       };
+       unsigned flags = BCH_SUBVOL_SNAPSHOT_CREATE;
+       int opt;
+
+       while ((opt = getopt_long(argc, argv, "rh", longopts, NULL)) != -1)
+               switch (opt) {
+               case 'r':
+                       flags |= BCH_SUBVOL_SNAPSHOT_RO;
+                       break;
+               case 'h':
+                       snapshot_create_usage();
+                       exit(EXIT_SUCCESS);
+               }
+       args_shift(optind);
+
+       char *src = arg_pop();
+       char *dst = arg_pop();
+
+       if (argc)
+               die("Too many arguments");
+
+       if (!dst)
+               swap(src, dst);
+       if (!dst)
+               die("Please specify a path to create");
+
+       char *dir = dirname(strdup(dst));
+
+       struct bchfs_handle fs = bcache_fs_open(dir);
+
+       struct bch_ioctl_subvolume i = {
+               .flags          = flags,
+               .dirfd          = AT_FDCWD,
+               .mode           = 0777,
+               .src_ptr        = (unsigned long)src,
+               .dst_ptr        = (unsigned long)dst,
+       };
+
+       xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_CREATE, &i);
+       bcache_fs_close(fs);
+       return 0;
+}
diff --git a/cmds.h b/cmds.h
index cc490844dc8684533a9ff99ce6d039d71999d186..52db63f3040fc332812d8992fc3f88efcbd6cecb 100644 (file)
--- a/cmds.h
+++ b/cmds.h
@@ -19,8 +19,10 @@ int cmd_run(int argc, char *argv[]);
 int cmd_stop(int argc, char *argv[]);
 #endif
 
+int fs_usage(void);
 int cmd_fs_usage(int argc, char *argv[]);
 
+int device_usage(void);
 int cmd_device_add(int argc, char *argv[]);
 int cmd_device_remove(int argc, char *argv[]);
 int cmd_device_online(int argc, char *argv[]);
@@ -30,6 +32,7 @@ int cmd_device_set_state(int argc, char *argv[]);
 int cmd_device_resize(int argc, char *argv[]);
 int cmd_device_resize_journal(int argc, char *argv[]);
 
+int data_usage(void);
 int cmd_data_rereplicate(int argc, char *argv[]);
 int cmd_data_job(int argc, char *argv[]);
 
@@ -50,6 +53,11 @@ int cmd_version(int argc, char *argv[]);
 
 int cmd_setattr(int argc, char *argv[]);
 
+int subvolume_usage(void);
+int cmd_subvolume_create(int argc, char *argv[]);
+int cmd_subvolume_delete(int argc, char *argv[]);
+int cmd_subvolume_snapshot(int argc, char *argv[]);
+
 int cmd_fusemount(int argc, char *argv[]);
 
 #endif /* _CMDS_H */
diff --git a/crypto.c b/crypto.c
index 7f7fbd5a337ac8d04585f30cfc468720892d6037..43753a3e8902e019371d5258b37681ff320211b2 100644 (file)
--- a/crypto.c
+++ b/crypto.c
@@ -12,7 +12,7 @@
 
 #include <keyutils.h>
 #include <linux/random.h>
-#include <libscrypt.h>
+#include <sodium/crypto_pwhash_scryptsalsa208sha256.h>
 #include <uuid/uuid.h>
 
 #include "libbcachefs/checksum.h"
@@ -84,12 +84,13 @@ struct bch_key derive_passphrase(struct bch_sb_field_crypt *crypt,
 
        switch (BCH_CRYPT_KDF_TYPE(crypt)) {
        case BCH_KDF_SCRYPT:
-               ret = libscrypt_scrypt((void *) passphrase, strlen(passphrase),
-                                      salt, sizeof(salt),
-                                      1ULL << BCH_KDF_SCRYPT_N(crypt),
-                                      1ULL << BCH_KDF_SCRYPT_R(crypt),
-                                      1ULL << BCH_KDF_SCRYPT_P(crypt),
-                                      (void *) &key, sizeof(key));
+               ret = crypto_pwhash_scryptsalsa208sha256_ll(
+                       (void *) passphrase, strlen(passphrase),
+                       salt, sizeof(salt),
+                       1ULL << BCH_KDF_SCRYPT_N(crypt),
+                       1ULL << BCH_KDF_SCRYPT_R(crypt),
+                       1ULL << BCH_KDF_SCRYPT_P(crypt),
+                       (void *) &key, sizeof(key));
                if (ret)
                        die("scrypt error: %i", ret);
                break;
@@ -170,9 +171,9 @@ void bch_sb_crypt_init(struct bch_sb *sb,
        if (passphrase) {
 
                SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
-               SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
-               SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
-               SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
+               SET_BCH_KDF_SCRYPT_N(crypt, ilog2(16384));
+               SET_BCH_KDF_SCRYPT_R(crypt, ilog2(8));
+               SET_BCH_KDF_SCRYPT_P(crypt, ilog2(16));
 
                struct bch_key passphrase_key = derive_passphrase(crypt, passphrase);
 
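
The switch from libscrypt to libsodium keeps the same KDF: crypto_pwhash_scryptsalsa208sha256_ll() is libsodium's low-level scrypt entry point. A standalone editorial sketch with the cost parameters the patch hard-codes (N=16384, r=8, p=16); the fixed salt is illustrative only, since the real code derives it from the superblock's crypt field:

/* Editorial sketch (not part of the patch): deriving a 32-byte key with
 * libsodium's low-level scrypt, as crypto.c now does. Build with -lsodium. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sodium/crypto_pwhash_scryptsalsa208sha256.h>

int main(void)
{
        const char *passphrase = "correct horse battery staple";
        uint8_t salt[16] = { 0 };       /* illustrative; not a real fs salt */
        uint8_t key[32];

        int ret = crypto_pwhash_scryptsalsa208sha256_ll(
                (const uint8_t *) passphrase, strlen(passphrase),
                salt, sizeof(salt),
                16384, 8, 16,           /* N, r, p as stored via ilog2() above */
                key, sizeof(key));
        if (ret)
                fprintf(stderr, "scrypt error: %i\n", ret);
        return ret;
}
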
diff --git a/debian/changelog b/debian/changelog
index 64bcbcb9fb8291267b812d1b2377ddc3134bf32b..3cb088260aa8438a82faf6136e719ae8755148d3 100644 (file)
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,12 @@
+bcachefs-tools (0.1+git20220216.a1e928a-1) unstable; urgency=medium
+
+  * New upstream snapshot
+  * Grab patch from Ubuntu to reduce memory on amd64 builders
+    (http://launchpadlibrarian.net/580140160/bcachefs-tools_0.1+git20210805.6c42566-2_0.1+git20210805.6c42566-2ubuntu1.diff.gz)
+  * Update copyright years
+
+ -- Jonathan Carter <jcc@debian.org>  Wed, 16 Feb 2022 14:42:20 +0200
+
 bcachefs-tools (0.1+git20210805.6c42566-2) unstable; urgency=medium
 
   * Remove valgrind as build-dependency, seems unneeded unless
diff --git a/debian/copyright b/debian/copyright
index 3125db17e51cb5ed15b1e5769fd67b36ec91611a..7fe4f5b51cddc9f741c2b242a22b791b8d1e392f 100644 (file)
--- a/debian/copyright
+++ b/debian/copyright
@@ -4,7 +4,7 @@ Upstream-Contact: kmo@daterainc.com
 Source: https://evilpiepirate.org/git/bcachefs-tools.git
 
 Files: *
-Copyright: 2013-2020 Kent Overstreet <kmo@daterainc.com>
+Copyright: 2013-2022 Kent Overstreet <kmo@daterainc.com>
            2013 Gabriel de Perthuis <g2p.code@gmail.com>
            2008 Intel Corporation <willy@linux.intel.com>
 License: GPL-2
@@ -38,7 +38,7 @@ License: expat
  THE SOFTWARE.
 
 Files: debian/*
-Copyright: 2019-2020 Jonathan Carter <jcc@debian.org>
+Copyright: 2019-2022 Jonathan Carter <jcc@debian.org>
            2014 Tom Strickx <tstrickx@rootcu.be>,
            2014 David Mohr <david@mcbf.net>
 License: GPL-2+
diff --git a/debian/files b/debian/files
index 66d7523c9ac6d4914d43e99bed71de95dc26a6fe..2ea4bfc7b4cc3a829312a1b02edbcc8bc32b511a 100644 (file)
--- a/debian/files
+++ b/debian/files
@@ -1 +1 @@
-bcachefs-tools_0.1+git20210805.6c42566-2_source.buildinfo utils optional
+bcachefs-tools_0.1+git20220216.a1e928a-1_source.buildinfo utils optional
diff --git a/debian/rules b/debian/rules
index b3ee89d7dc4a2646888ceb1c80cc4a136b6b1a36..748186e32beb73482f2334fc2719b5f3e9e9c0c7 100755 (executable)
--- a/debian/rules
+++ b/debian/rules
@@ -3,6 +3,12 @@
 export DEB_BUILD_MAINT_OPTIONS=hardening=+all
 PREFIX := /usr
 
+DEB_BUILD_ARCH ?= $(shell dpkg-architecture -qDEB_BUILD_ARCH)
+
+ifeq ($(DEB_BUILD_ARCH),amd64)
+    DEB_BUILD_MAINT_OPTIONS += optimize=-lto
+endif
+
 %:
        dh $@
 
diff --git a/default.nix b/default.nix
index f19ff1076ca200cb660cdadc83cd946d481d6799..48f2aa93bacabffcd38f70ad59fda0ce64391157 100644 (file)
--- a/default.nix
+++ b/default.nix
-{ nixpkgs ? (import ./nix/nixpkgs.nix)
-}:
-
-with nixpkgs;
-
-stdenv.mkDerivation rec {
-  name = "bcachefs-tools-${version}";
-  version = "git";
-
-  src = lib.cleanSource ./.; # NOTE: ignore .git, otherwise things get weird!
-
-  nativeBuildInputs = [ git pkgconfig ];
-  buildInputs =
-    [ liburcu libuuid libaio zlib attr keyutils
-      libsodium libscrypt
-    ];
-
-  enableParallelBuilding = true;
-  makeFlags =
-    [ "PREFIX=$(out)"
-    ];
-
-  meta = with stdenv.lib; {
-    description = "Userspace tools for bcachefs";
-    homepage    = http://bcachefs.org;
-    license     = licenses.gpl2;
-    platforms   = platforms.linux;
-    maintainers =
-      [ "Kent Overstreet <kent.overstreet@gmail.com>"
-      ];
-  };
+{ lib
+, filter
+
+, stdenv
+, pkg-config
+, attr
+, libuuid
+, libsodium
+, keyutils
+
+, liburcu
+, zlib
+, libaio
+, udev
+, zstd
+, lz4
+
+, python39
+, python39Packages
+, docutils
+, nixosTests
+
+, lastModified
+, versionString ? lastModified
+
+, inShell ? false
+, debugMode ? inShell
+
+, testWithValgrind ? true
+, valgrind 
+
+, fuseSupport ? false
+, fuse3 ? null }:
+
+assert fuseSupport -> fuse3 != null;
+assert testWithValgrind -> valgrind != null;
+stdenv.mkDerivation {
+       pname = "bcachefs-tools";
+
+       version = "v0.1-flake-${versionString}";
+       VERSION = "v0.1-flake-${versionString}";
+       
+       src = filter.filter {
+               name = "bcachefs-tools";
+               root = ./.;
+               exclude = [
+                       ./rust-src
+                       
+                       ./.git
+                       ./nix
+                       
+                       ./flake.nix
+                       ./flake.lock
+               ];
+       };
+
+       postPatch = "patchShebangs --build doc/macro2rst.py";
+
+       nativeBuildInputs = [
+               # used to find dependencies
+               ## see ./INSTALL
+               pkg-config
+       ];
+       buildInputs = [
+               # bcachefs explicit dependencies
+               ## see ./INSTALL
+               libaio
+               
+               # libblkid
+               keyutils # libkeyutils
+               lz4 # liblz4
+               
+               libsodium
+               liburcu
+               libuuid
+               zstd # libzstd
+               zlib # zlib1g
+
+               # unspecified dependencies
+               attr
+               udev
+
+               # documentation dependencies
+               docutils
+               python39Packages.pygments
+       ] ++ (lib.optional fuseSupport fuse3)
+       ++ (lib.optional testWithValgrind valgrind);
+
+       makeFlags = [
+               "PREFIX=${placeholder "out"}"
+       ] ++ lib.optional debugMode "EXTRA_CFLAGS=-ggdb";
+
+       installFlags = [
+               "INITRAMFS_DIR=${placeholder "out"}/etc/initramfs-tools"
+       ];
+
+       doCheck = true; # needs bcachefs module loaded on builder
+
+       checkInputs = [
+               python39Packages.pytest
+               python39Packages.pytest-xdist
+       ] ++ lib.optional testWithValgrind valgrind;
+       
+       checkFlags = [ 
+               "BCACHEFS_TEST_USE_VALGRIND=${if testWithValgrind then "yes" else "no"}"
+               # cannot escape spaces within make flags, quotes are stripped
+               "PYTEST_CMD=pytest" # "PYTEST_ARGS='-n4 --version'"
+       ];
+
+       preCheck =
+               ''
+                       makeFlagsArray+=(PYTEST_ARGS="--verbose -n2")
+               '' +
+               lib.optionalString fuseSupport ''
+                       rm tests/test_fuse.py
+               '';
+
+       dontStrip = debugMode == true;
+       passthru = {
+               bcachefs_revision = let 
+                       file = builtins.readFile ./.bcachefs_revision;
+                       removeLineFeeds = str: lib.lists.foldr (lib.strings.removeSuffix) str ["\r" "\n"];
+               in removeLineFeeds file;
+               
+               tests = {
+                       smoke-test = nixosTests.bcachefs;
+               };
+       };
+
+       enableParallelBuilding = true;
+       meta = with lib; {
+               description = "Userspace tools for bcachefs";
+               homepage    = http://bcachefs.org;
+               license     = licenses.gpl2;
+               platforms   = platforms.linux;
+               maintainers =
+                       [ "Kent Overstreet <kent.overstreet@gmail.com>"
+                       ];
+
+       };
 }
diff --git a/flake.lock b/flake.lock
new file mode 100644 (file)
index 0000000..2c9c15b
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,59 @@
+{
+  "nodes": {
+    "filter": {
+      "locked": {
+        "lastModified": 1620202920,
+        "narHash": "sha256-BOkm3eKT45Dk4NNxJT0xL9NnyYeZcF+t79zPnJkggac=",
+        "owner": "numtide",
+        "repo": "nix-filter",
+        "rev": "3c9e33ed627e009428197b07216613206f06ed80",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "nix-filter",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1633351077,
+        "narHash": "sha256-z38JG4Bb0GtM1aF1pANVdp1dniMP23Yb3HnRoJRy2uU=",
+        "owner": "nixos",
+        "repo": "nixpkgs",
+        "rev": "14aef06d9b3ad1d07626bdbb16083b83f92dc6c1",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nixos",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "filter": "filter",
+        "nixpkgs": "nixpkgs",
+        "utils": "utils"
+      }
+    },
+    "utils": {
+      "locked": {
+        "lastModified": 1629481132,
+        "narHash": "sha256-JHgasjPR0/J1J3DRm4KxM4zTyAj4IOJY8vIl75v/kPI=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "997f7efcb746a9c140ce1f13c72263189225f482",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644 (file)
index 0000000..b52bc7e
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,96 @@
+{
+       description = "Userspace tools for bcachefs";
+
+       # Nixpkgs / NixOS version to use.
+       inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable";
+       inputs.utils.url = "github:numtide/flake-utils";
+       inputs.filter.url = "github:numtide/nix-filter";
+
+       outputs = { self, nixpkgs, utils, filter, ... }@inputs:
+               let
+                       # System types to support.
+                       supportedSystems = [ "x86_64-linux" ];
+               in
+               {
+                       version = "${builtins.substring 0 8 self.lastModifiedDate}-${self.shortRev or "dirty"}";
+
+                       overlay = import ./nix/overlay.nix inputs;
+                       nixosModule = self.nixosModules.bcachefs;
+                       nixosModules.bcachefs = import ./rust-src/mount/module.nix;
+                       nixosModules.bcachefs-enable-boot = ({config, pkgs, lib, ... }:{
+                               # Disable Upstream NixOS Module when this is in use
+                               disabledModules = [ "tasks/filesystems/bcachefs.nix" ];
+                               # Import needed packages
+                               nixpkgs.overlays = [ self.overlay ];
+
+                               # Add bcachefs to boot and kernel
+                               boot.initrd.supportedFilesystems = [ "bcachefs" ];
+                               boot.supportedFilesystems = [ "bcachefs" ];
+                       });
+
+                       nixosConfigurations.netboot-bcachefs = self.systems.netboot-bcachefs "x86_64-linux";
+                       systems.netboot-bcachefs = system: (nixpkgs.lib.nixosSystem { 
+                                       inherit system; modules = [
+                                               self.nixosModule 
+                                               self.nixosModules.bcachefs-enable-boot
+                                               ("${nixpkgs}/nixos/modules/installer/netboot/netboot-minimal.nix")
+                                               ({ lib, pkgs, config, ... }: {
+                                                       # installation disk autologin
+                                                       services.getty.autologinUser = lib.mkForce "root";
+                                                       users.users.root.initialPassword = "toor";
+                                                       
+                                                       # Symlink everything together
+                                                       system.build.netboot = pkgs.symlinkJoin {
+                                                               name = "netboot";
+                                                               paths = with config.system.build; [
+                                                                       netbootRamdisk
+                                                                       kernel
+                                                                       netbootIpxeScript
+                                                               ];
+                                                               preferLocalBuild = true;
+                                                       };
+                                               })
+                                       ]; 
+                               });
+               }
+               // utils.lib.eachSystem supportedSystems (system: 
+               let pkgs = import nixpkgs { 
+                       inherit system; 
+                       overlays = [ self.overlay ]; 
+               }; 
+               in rec {
+                       
+                       # A Nixpkgs overlay.
+
+                       # Provide some binary packages for selected system types.
+                       defaultPackage = pkgs.bcachefs.tools;
+                       packages = {
+                               inherit (pkgs.bcachefs)
+                                       tools
+                                       toolsValgrind
+                                       toolsDebug
+                                       mount
+                                       bch_bindgen
+                                       kernel;
+
+                               tools-musl = pkgs.pkgsMusl.bcachefs.tools;
+                               mount-musl = pkgs.pkgsMusl.bcachefs.mount;
+                       };
+
+                       checks = { 
+                               kernelSrc = packages.kernel.src;
+                               inherit (packages) 
+                                       mount
+                                       bch_bindgen
+                                       toolsValgrind;
+
+                               # Build and test initrd with bcachefs and bcachefs.mount installed
+                               # Disabled Test because it takes a while to build the kernel
+                               # bootStage1Module = self.nixosConfigurations.netboot-bcachefs.config.system.build.bootStage1;
+                       };
+
+                       devShell = devShells.tools;
+                       devShells.tools = pkgs.bcachefs.tools.override { inShell = true; };
+                       devShells.mount = pkgs.bcachefs.mount.override { inShell = true; };
+               });
+}
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8aef4bb8e661b39dc75f73fc2ac061eb02a8b767..be736c8c70edd32decfc5e0222e0448212051dae 100644 (file)
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -21,6 +21,8 @@ struct request_queue {
 };
 
 struct gendisk {
+       struct backing_dev_info *bdi;
+       struct backing_dev_info __bdi;
 };
 
 struct hd_struct {
@@ -38,9 +40,6 @@ struct block_device {
        struct gendisk          __bd_disk;
        int                     bd_fd;
        int                     bd_sync_fd;
-
-       struct backing_dev_info *bd_bdi;
-       struct backing_dev_info __bd_bdi;
 };
 
 #define bdev_kobj(_bdev) (&((_bdev)->kobj))
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 35082ae30460911d26fc11bb1f052724c1d62378..4ce43b5cc629b6e4150b12eb7b61f3f281834816 100644 (file)
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -6,7 +6,7 @@
 #include <linux/kobject.h>
 #include <linux/types.h>
 
-#define BIO_MAX_VECS   256
+#define BIO_MAX_VECS   256U
 
 typedef unsigned fmode_t;
 
@@ -74,6 +74,17 @@ int blkdev_issue_discard(struct block_device *, sector_t,
 
 #define bdev_get_queue(bdev)           (&((bdev)->queue))
 
+#ifndef SECTOR_SHIFT
+#define SECTOR_SHIFT 9
+#endif
+#ifndef SECTOR_SIZE
+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
+#endif
+
+#define PAGE_SECTORS_SHIFT     (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS           (1 << PAGE_SECTORS_SHIFT)
+#define SECTOR_MASK            (PAGE_SECTORS - 1)
+
 #define blk_queue_discard(q)           ((void) (q), 0)
 #define blk_queue_nonrot(q)            ((void) (q), 0)
 
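
These fallback definitions encode the fixed kernel convention of 512-byte sectors. A quick editorial illustration of the arithmetic, assuming 4 KiB pages (PAGE_SHIFT here is an assumption, not taken from this diff):

/* Editorial sketch: 1 sector = 512 bytes, so with 4 KiB pages
 * PAGE_SECTORS_SHIFT = 12 - 9 = 3 and PAGE_SECTORS = 8. */
#include <stdio.h>

#define SECTOR_SHIFT 9
#define PAGE_SHIFT   12                         /* assumption: 4 KiB pages */

int main(void)
{
        unsigned long long bytes = 1ULL << 20;  /* 1 MiB */

        printf("%llu bytes = %llu sectors\n", bytes, bytes >> SECTOR_SHIFT);
        printf("sectors per page: %d\n", 1 << (PAGE_SHIFT - SECTOR_SHIFT));
        return 0;                               /* prints 2048, then 8 */
}
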
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 2bfbfadb34b250d9f0840aded40060896bb72b37..6d039ea376df93743bd0e488a66b07dd1bfd0ba5 100644 (file)
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -60,6 +60,7 @@
 #define unlikely(x)            __builtin_expect(!!(x), 0)
 #define unreachable()          __builtin_unreachable()
 #define __same_type(a, b)      __builtin_types_compatible_p(typeof(a), typeof(b))
+#define fallthrough            __attribute__((__fallthrough__))
 
 #define ___PASTE(a,b) a##b
 #define __PASTE(a,b) ___PASTE(a,b)
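
The new fallthrough macro maps to the compiler attribute so deliberate case fall-through doesn't trip -Wimplicit-fallthrough. A small self-contained editorial sketch of its use, repeating the definition above for standalone compilation:

/* Editorial sketch: annotating intentional switch fall-through. */
#define fallthrough __attribute__((__fallthrough__))

enum stage { STAGE_PREP, STAGE_RUN };

static int run_from(enum stage s)
{
        int steps = 0;

        switch (s) {
        case STAGE_PREP:
                steps++;        /* do prep work ... */
                fallthrough;    /* ... and always continue into RUN */
        case STAGE_RUN:
                steps++;
                break;
        }
        return steps;           /* 2 from STAGE_PREP, 1 from STAGE_RUN */
}
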
diff --git a/include/linux/poison.h b/include/linux/poison.h
index aff1c9250c8216e4f14a99a6844c00988d7dbe8b..d62ef5a6b4e9c624383cd92b3773a9ea44e2c500 100644 (file)
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -78,4 +78,7 @@
 /********** security/ **********/
 #define KEY_DESTROY            0xbd
 
+/********** net/core/page_pool.c **********/
+#define PP_SIGNATURE           (0x40 + POISON_POINTER_DELTA)
+
 #endif
diff --git a/include/linux/siphash.h b/include/linux/siphash.h
new file mode 100644 (file)
index 0000000..bf21591
--- /dev/null
+++ b/include/linux/siphash.h
@@ -0,0 +1,145 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
+ */
+
+#ifndef _LINUX_SIPHASH_H
+#define _LINUX_SIPHASH_H
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#define SIPHASH_ALIGNMENT __alignof__(u64)
+typedef struct {
+       u64 key[2];
+} siphash_key_t;
+
+static inline bool siphash_key_is_zero(const siphash_key_t *key)
+{
+       return !(key->key[0] | key->key[1]);
+}
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key);
+#endif
+
+u64 siphash_1u64(const u64 a, const siphash_key_t *key);
+u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key);
+u64 siphash_3u64(const u64 a, const u64 b, const u64 c,
+                const siphash_key_t *key);
+u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d,
+                const siphash_key_t *key);
+u64 siphash_1u32(const u32 a, const siphash_key_t *key);
+u64 siphash_3u32(const u32 a, const u32 b, const u32 c,
+                const siphash_key_t *key);
+
+static inline u64 siphash_2u32(const u32 a, const u32 b,
+                              const siphash_key_t *key)
+{
+       return siphash_1u64((u64)b << 32 | a, key);
+}
+static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c,
+                              const u32 d, const siphash_key_t *key)
+{
+       return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key);
+}
+
+
+static inline u64 ___siphash_aligned(const __le64 *data, size_t len,
+                                    const siphash_key_t *key)
+{
+       if (__builtin_constant_p(len) && len == 4)
+               return siphash_1u32(le32_to_cpup((const __le32 *)data), key);
+       if (__builtin_constant_p(len) && len == 8)
+               return siphash_1u64(le64_to_cpu(data[0]), key);
+       if (__builtin_constant_p(len) && len == 16)
+               return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+                                   key);
+       if (__builtin_constant_p(len) && len == 24)
+               return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+                                   le64_to_cpu(data[2]), key);
+       if (__builtin_constant_p(len) && len == 32)
+               return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]),
+                                   le64_to_cpu(data[2]), le64_to_cpu(data[3]),
+                                   key);
+       return __siphash_aligned(data, len, key);
+}
+
+/**
+ * siphash - compute 64-bit siphash PRF value
+ * @data: buffer to hash
+ * @len: length of @data
+ * @key: the siphash key
+ */
+static inline u64 siphash(const void *data, size_t len,
+                         const siphash_key_t *key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT))
+               return __siphash_unaligned(data, len, key);
+#endif
+       return ___siphash_aligned(data, len, key);
+}
+
+#define HSIPHASH_ALIGNMENT __alignof__(unsigned long)
+typedef struct {
+       unsigned long key[2];
+} hsiphash_key_t;
+
+u32 __hsiphash_aligned(const void *data, size_t len,
+                      const hsiphash_key_t *key);
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+                        const hsiphash_key_t *key);
+#endif
+
+u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key);
+u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key);
+u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c,
+                 const hsiphash_key_t *key);
+u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d,
+                 const hsiphash_key_t *key);
+
+static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len,
+                                     const hsiphash_key_t *key)
+{
+       if (__builtin_constant_p(len) && len == 4)
+               return hsiphash_1u32(le32_to_cpu(data[0]), key);
+       if (__builtin_constant_p(len) && len == 8)
+               return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+                                    key);
+       if (__builtin_constant_p(len) && len == 12)
+               return hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+                                    le32_to_cpu(data[2]), key);
+       if (__builtin_constant_p(len) && len == 16)
+               return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]),
+                                    le32_to_cpu(data[2]), le32_to_cpu(data[3]),
+                                    key);
+       return __hsiphash_aligned(data, len, key);
+}
+
+/**
+ * hsiphash - compute 32-bit hsiphash PRF value
+ * @data: buffer to hash
+ * @len: length of @data
+ * @key: the hsiphash key
+ */
+static inline u32 hsiphash(const void *data, size_t len,
+                          const hsiphash_key_t *key)
+{
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT))
+               return __hsiphash_unaligned(data, len, key);
+#endif
+       return ___hsiphash_aligned(data, len, key);
+}
+
+#endif /* _LINUX_SIPHASH_H */
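
Usage note: because ___siphash_aligned() dispatches on __builtin_constant_p(len), hashing a fixed-size, u64-aligned struct compiles straight down to one of the siphash_Nu64() helpers. A short editorial sketch, assuming the header above and this tree's u64/siphash_key_t types:

/* Editorial sketch: a constant 16-byte, u64-aligned input constant-folds
 * to siphash_2u64() and never takes the out-of-line path. */
#include <linux/siphash.h>      /* the header added above */

struct bucket_id {
        u64     dev;
        u64     offset;
};

static u64 hash_bucket(const struct bucket_id *b, const siphash_key_t *key)
{
        return siphash(b, sizeof(*b), key);     /* len == 16, aligned */
}
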
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ef86153898130432d8119f9373546f7cb13124ec..bc99973fccd22059805f43d99fbda4f9e7daefaf 100644 (file)
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
 
 static inline void *kmalloc(size_t size, gfp_t flags)
 {
+       unsigned i = 0;
        void *p;
 
-       run_shrinkers();
-
-       if (size) {
-               size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE);
-               alignment = max(sizeof(void *), alignment);
-               if (posix_memalign(&p, alignment, size))
-                       p = NULL;
-       } else {
-               p = malloc(0);
-       }
-       if (p && (flags & __GFP_ZERO))
-               memset(p, 0, size);
+       do {
+               run_shrinkers();
+
+               if (size) {
+                       size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE);
+                       alignment = max(sizeof(void *), alignment);
+                       if (posix_memalign(&p, alignment, size))
+                               p = NULL;
+               } else {
+                       p = malloc(0);
+               }
+               if (p && (flags & __GFP_ZERO))
+                       memset(p, 0, size);
+       } while (!p && i++ < 10);
 
        return p;
 }
@@ -38,8 +41,6 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
 {
        void *new;
 
-       run_shrinkers();
-
        new = kmalloc(size, flags);
        if (!new)
                return NULL;
@@ -62,6 +63,10 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
        ((size) != 0 && (n) > SIZE_MAX / (size)                         \
         ? NULL : kmalloc((n) * (size), flags))
 
+#define kvmalloc_array(n, size, flags)                                 \
+       ((size) != 0 && (n) > SIZE_MAX / (size)                         \
+        ? NULL : kmalloc((n) * (size), flags))
+
 #define kcalloc(n, size, flags)                kmalloc_array(n, size, flags|__GFP_ZERO)
 
 #define kfree(p)                       free(p)
@@ -74,13 +79,16 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags)
 static inline struct page *alloc_pages(gfp_t flags, unsigned int order)
 {
        size_t size = PAGE_SIZE << order;
+       unsigned i = 0;
        void *p;
 
-       run_shrinkers();
+       do {
+               run_shrinkers();
 
-       p = aligned_alloc(PAGE_SIZE, size);
-       if (p && (flags & __GFP_ZERO))
-               memset(p, 0, size);
+               p = aligned_alloc(PAGE_SIZE, size);
+               if (p && (flags & __GFP_ZERO))
+                       memset(p, 0, size);
+       } while (!p && i++ < 10);
 
        return p;
 }
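
Both hunks above adopt the same shape: on allocation failure, run the userspace shrinkers and retry a bounded number of times before giving up. Distilled into an editorial sketch, with run_shrinkers() assumed from this tree's kernel shims:

/* Editorial sketch: the bounded retry-under-pressure idiom used above. */
#include <stdlib.h>

void run_shrinkers(void);       /* provided by the tools' kernel shims */

static void *alloc_with_retry(size_t size)
{
        unsigned i = 0;
        void *p;

        do {
                run_shrinkers();        /* ask caches to release memory */
                p = malloc(size);
        } while (!p && i++ < 10);       /* up to 10 attempts, then NULL */

        return p;
}
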
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c674d9a2c05737da3e94d21234dd366f59ecbab9..ccb319eb52a4a444db0f5981f23a7acfa92bc2f7 100644 (file)
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
 
 static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 {
+       unsigned i = 0;
        void *p;
 
        size = round_up(size, PAGE_SIZE);
 
-       run_shrinkers();
+       do {
+               run_shrinkers();
 
-       p = aligned_alloc(PAGE_SIZE, size);
-       if (!p)
-               return NULL;
-
-       if (gfp_mask & __GFP_ZERO)
-               memset(p, 0, size);
+               p = aligned_alloc(PAGE_SIZE, size);
+               if (p && gfp_mask & __GFP_ZERO)
+                       memset(p, 0, size);
+       } while (!p && i++ < 10);
 
        return p;
 }
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index a11bb5f7180eec21aafe49f7beb9c64d2cb96560..a21a39230a091b6076052007c14cdf0c872b87c1 100644 (file)
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -298,28 +298,6 @@ TRACE_EVENT(btree_reserve_get_fail,
                  __entry->required, __entry->cl)
 );
 
-TRACE_EVENT(btree_insert_key,
-       TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k),
-       TP_ARGS(c, b, k),
-
-       TP_STRUCT__entry(
-               __field(u8,             id                      )
-               __field(u64,            inode                   )
-               __field(u64,            offset                  )
-               __field(u32,            size                    )
-       ),
-
-       TP_fast_assign(
-               __entry->id             = b->c.btree_id;
-               __entry->inode          = k->k.p.inode;
-               __entry->offset         = k->k.p.offset;
-               __entry->size           = k->k.size;
-       ),
-
-       TP_printk("btree %u: %llu:%llu len %u", __entry->id,
-                 __entry->inode, __entry->offset, __entry->size)
-);
-
 DEFINE_EVENT(btree_node, btree_split,
        TP_PROTO(struct bch_fs *c, struct btree *b),
        TP_ARGS(c, b)
@@ -340,6 +318,80 @@ DEFINE_EVENT(btree_node, btree_set_root,
        TP_ARGS(c, b)
 );
 
+TRACE_EVENT(btree_cache_scan,
+       TP_PROTO(unsigned long nr_to_scan_pages,
+                unsigned long nr_to_scan_nodes,
+                unsigned long can_free_nodes,
+                long ret),
+       TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret),
+
+       TP_STRUCT__entry(
+               __field(unsigned long,  nr_to_scan_pages        )
+               __field(unsigned long,  nr_to_scan_nodes        )
+               __field(unsigned long,  can_free_nodes          )
+               __field(long,           ret                     )
+       ),
+
+       TP_fast_assign(
+               __entry->nr_to_scan_pages       = nr_to_scan_pages;
+               __entry->nr_to_scan_nodes       = nr_to_scan_nodes;
+               __entry->can_free_nodes         = can_free_nodes;
+               __entry->ret                    = ret;
+       ),
+
+       TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li",
+                 __entry->nr_to_scan_pages,
+                 __entry->nr_to_scan_nodes,
+                 __entry->can_free_nodes,
+                 __entry->ret)
+);
+
+TRACE_EVENT(btree_node_relock_fail,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos,
+                unsigned long node,
+                u32 iter_lock_seq,
+                u32 node_lock_seq),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq),
+
+       TP_STRUCT__entry(
+               __array(char,                   trans_fn, 24    )
+               __field(unsigned long,          caller_ip       )
+               __field(u8,                     btree_id        )
+               __field(u64,                    pos_inode       )
+               __field(u64,                    pos_offset      )
+               __field(u32,                    pos_snapshot    )
+               __field(unsigned long,          node            )
+               __field(u32,                    iter_lock_seq   )
+               __field(u32,                    node_lock_seq   )
+       ),
+
+       TP_fast_assign(
+               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip              = caller_ip;
+               __entry->btree_id               = btree_id;
+               __entry->pos_inode              = pos->inode;
+               __entry->pos_offset             = pos->offset;
+               __entry->pos_snapshot           = pos->snapshot;
+               __entry->node                   = node;
+               __entry->iter_lock_seq          = iter_lock_seq;
+               __entry->node_lock_seq          = node_lock_seq;
+       ),
+
+       TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u",
+                 __entry->trans_fn,
+                 (void *) __entry->caller_ip,
+                 __entry->btree_id,
+                 __entry->pos_inode,
+                 __entry->pos_offset,
+                 __entry->pos_snapshot,
+                 __entry->node,
+                 __entry->iter_lock_seq,
+                 __entry->node_lock_seq)
+);
+
 /* Garbage collection */
 
 DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
@@ -381,7 +433,7 @@ TRACE_EVENT(alloc_scan,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ca->disk_sb.bdev->bd_dev;
+               __entry->dev            = ca->dev;
                __entry->found          = found;
                __entry->inc_gen        = inc_gen;
                __entry->inc_gen_skipped = inc_gen_skipped;
@@ -403,7 +455,7 @@ TRACE_EVENT(invalidate,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ca->disk_sb.bdev->bd_dev;
+               __entry->dev            = ca->dev;
                __entry->offset         = offset,
                __entry->sectors        = sectors;
        ),
@@ -425,7 +477,7 @@ DECLARE_EVENT_CLASS(bucket_alloc,
        ),
 
        TP_fast_assign(
-               __entry->dev            = ca->disk_sb.bdev->bd_dev;
+               __entry->dev            = ca->dev;
                __entry->reserve        = reserve;
        ),
 
@@ -540,157 +592,87 @@ TRACE_EVENT(copygc_wait,
                  __entry->wait_amount, __entry->until)
 );
 
-TRACE_EVENT(trans_get_iter,
-       TP_PROTO(unsigned long trans_ip,
-                unsigned long caller_ip,
-                enum btree_id btree_id,
-                struct bpos *got_pos,
-                unsigned got_locks,
-                unsigned got_uptodate,
-                struct bpos *src_pos,
-                unsigned src_locks,
-                unsigned src_uptodate),
-       TP_ARGS(trans_ip, caller_ip, btree_id,
-               got_pos, got_locks, got_uptodate,
-               src_pos, src_locks, src_uptodate),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip                )
-               __field(unsigned long,          caller_ip               )
-               __field(u8,                     btree_id                )
-               __field(u64,                    got_pos_inode           )
-               __field(u64,                    got_pos_offset          )
-               __field(u32,                    got_pos_snapshot        )
-               __field(u8,                     got_locks               )
-               __field(u8,                     got_uptodate            )
-               __field(u64,                    src_pos_inode           )
-               __field(u64,                    src_pos_offset          )
-               __field(u32,                    src_pos_snapshot        )
-               __field(u8,                     src_locks               )
-               __field(u8,                     src_uptodate            )
-       ),
-
-       TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
-               __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = btree_id;
-               __entry->got_pos_inode          = got_pos->inode;
-               __entry->got_pos_offset         = got_pos->offset;
-               __entry->got_pos_snapshot       = got_pos->snapshot;
-               __entry->got_locks              = got_locks;
-               __entry->got_uptodate           = got_uptodate;
-               __entry->src_pos_inode          = src_pos->inode;
-               __entry->src_pos_offset         = src_pos->offset;
-               __entry->src_pos_snapshot       = src_pos->snapshot;
-               __entry->src_locks              = src_locks;
-               __entry->src_uptodate           = src_uptodate;
-       ),
-
-       TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u "
-                 "src %llu:%llu:%u l %u u %u",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip,
-                 __entry->btree_id,
-                 __entry->got_pos_inode,
-                 __entry->got_pos_offset,
-                 __entry->got_pos_snapshot,
-                 __entry->got_locks,
-                 __entry->got_uptodate,
-                 __entry->src_pos_inode,
-                 __entry->src_pos_offset,
-                 __entry->src_pos_snapshot,
-                 __entry->src_locks,
-                 __entry->src_uptodate)
-);
-
-TRACE_EVENT(transaction_restart_ip,
-       TP_PROTO(unsigned long caller, unsigned long ip),
-       TP_ARGS(caller, ip),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,          caller  )
-               __field(unsigned long,          ip      )
-       ),
-
-       TP_fast_assign(
-               __entry->caller = caller;
-               __entry->ip     = ip;
-       ),
-
-       TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
-);
-
 DECLARE_EVENT_CLASS(transaction_restart,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
-       TP_ARGS(trans_ip, caller_ip),
+       TP_ARGS(trans_fn, caller_ip),
 
        TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip        )
+               __array(char,                   trans_fn, 24    )
                __field(unsigned long,          caller_ip       )
        ),
 
        TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
+               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
                __entry->caller_ip              = caller_ip;
        ),
 
-       TP_printk("%ps %pS",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip)
+       TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart,      transaction_restart_ip,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip),
+       TP_ARGS(trans_fn, caller_ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_blocked_journal_reclaim,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
-       TP_ARGS(trans_ip, caller_ip)
+       TP_ARGS(trans_fn, caller_ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_journal_res_get,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
-       TP_ARGS(trans_ip, caller_ip)
+       TP_ARGS(trans_fn, caller_ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_journal_preres_get,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
-       TP_ARGS(trans_ip, caller_ip)
+       TP_ARGS(trans_fn, caller_ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_journal_reclaim,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
-       TP_ARGS(trans_ip, caller_ip)
+       TP_ARGS(trans_fn, caller_ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_fault_inject,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
-       TP_ARGS(trans_ip, caller_ip)
+       TP_ARGS(trans_fn, caller_ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_traverse_all,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
-       TP_ARGS(trans_ip, caller_ip)
+       TP_ARGS(trans_fn, caller_ip)
 );
 
 DEFINE_EVENT(transaction_restart,      trans_restart_mark_replicas,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip),
+       TP_ARGS(trans_fn, caller_ip)
+);
+
+DEFINE_EVENT(transaction_restart,      trans_restart_key_cache_raced,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip),
-       TP_ARGS(trans_ip, caller_ip)
+       TP_ARGS(trans_fn, caller_ip)
 );
 
 DECLARE_EVENT_CLASS(transaction_restart_iter,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
                 enum btree_id btree_id,
                 struct bpos *pos),
-       TP_ARGS(trans_ip, caller_ip, btree_id, pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos),
 
        TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip        )
+               __array(char,                   trans_fn, 24    )
                __field(unsigned long,          caller_ip       )
                __field(u8,                     btree_id        )
                __field(u64,                    pos_inode       )
@@ -699,7 +681,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
        ),
 
        TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
+               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
                __entry->caller_ip              = caller_ip;
                __entry->btree_id               = btree_id;
                __entry->pos_inode              = pos->inode;
@@ -707,8 +689,8 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
                __entry->pos_snapshot           = pos->snapshot;
        ),
 
-       TP_printk("%ps %pS btree %u pos %llu:%llu:%u",
-                 (void *) __entry->trans_ip,
+       TP_printk("%s %pS btree %u pos %llu:%llu:%u",
+                 __entry->trans_fn,
                  (void *) __entry->caller_ip,
                  __entry->btree_id,
                  __entry->pos_inode,
@@ -717,153 +699,111 @@ DECLARE_EVENT_CLASS(transaction_restart_iter,
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
                 enum btree_id btree_id,
                 struct bpos *pos),
-       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
                 enum btree_id btree_id,
                 struct bpos *pos),
-       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_mark,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
                 enum btree_id btree_id,
                 struct bpos *pos),
-       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
                 enum btree_id btree_id,
                 struct bpos *pos),
-       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
                 enum btree_id btree_id,
                 struct bpos *pos),
-       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
-       TP_PROTO(unsigned long trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
                 enum btree_id btree_id,
                 struct bpos *pos),
-       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
-DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
-       TP_PROTO(unsigned long trans_ip,
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
                 enum btree_id btree_id,
                 struct bpos *pos),
-       TP_ARGS(trans_ip, caller_ip, btree_id, pos)
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
-TRACE_EVENT(iter_traverse,
-       TP_PROTO(unsigned long  trans_ip,
-                unsigned long  caller_ip,
-                bool key_cache,
-                enum btree_id  btree_id,
-                struct bpos    *pos,
-                int ret),
-       TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, ret),
-
-       TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip        )
-               __field(unsigned long,          caller_ip       )
-               __field(u8,                     key_cache       )
-               __field(u8,                     btree_id        )
-               __field(u64,                    pos_inode       )
-               __field(u64,                    pos_offset      )
-               __field(u32,                    pos_snapshot    )
-               __field(s32,                    ret             )
-       ),
-
-       TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
-               __entry->caller_ip              = caller_ip;
-               __entry->key_cache              = key_cache;
-               __entry->btree_id               = btree_id;
-               __entry->pos_inode              = pos->inode;
-               __entry->pos_offset             = pos->offset;
-               __entry->pos_snapshot           = pos->snapshot;
-               __entry->ret                    = ret;
-       ),
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
 
-       TP_printk("%ps %pS key cache %u btree %u %llu:%llu:%u ret %i",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip,
-                 __entry->key_cache,
-                 __entry->btree_id,
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->ret)
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
-TRACE_EVENT(iter_set_search_pos,
-       TP_PROTO(unsigned long  trans_ip,
-                unsigned long  caller_ip,
-                enum btree_id  btree_id,
-                struct bpos    *old_pos,
-                struct bpos    *new_pos,
-                unsigned       good_level),
-       TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level),
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
 
-       TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip                )
-               __field(unsigned long,          caller_ip               )
-               __field(u8,                     btree_id                )
-               __field(u64,                    old_pos_inode           )
-               __field(u64,                    old_pos_offset          )
-               __field(u32,                    old_pos_snapshot        )
-               __field(u64,                    new_pos_inode           )
-               __field(u64,                    new_pos_offset          )
-               __field(u32,                    new_pos_snapshot        )
-               __field(u8,                     good_level              )
-       ),
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
 
-       TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
-               __entry->caller_ip              = caller_ip;
-               __entry->btree_id               = btree_id;
-               __entry->old_pos_inode          = old_pos->inode;
-               __entry->old_pos_offset         = old_pos->offset;
-               __entry->old_pos_snapshot       = old_pos->snapshot;
-               __entry->new_pos_inode          = new_pos->inode;
-               __entry->new_pos_offset         = new_pos->offset;
-               __entry->new_pos_snapshot       = new_pos->snapshot;
-               __entry->good_level             = good_level;
-       ),
+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
+);
 
-       TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip,
-                 __entry->btree_id,
-                 __entry->old_pos_inode,
-                 __entry->old_pos_offset,
-                 __entry->old_pos_snapshot,
-                 __entry->new_pos_inode,
-                 __entry->new_pos_offset,
-                 __entry->new_pos_snapshot,
-                 __entry->good_level)
+DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip,
+                enum btree_id btree_id,
+                struct bpos *pos),
+       TP_ARGS(trans_fn, caller_ip, btree_id, pos)
 );
 
 TRACE_EVENT(trans_restart_would_deadlock,
-       TP_PROTO(unsigned long  trans_ip,
+       TP_PROTO(const char *trans_fn,
                 unsigned long  caller_ip,
                 bool           in_traverse_all,
                 unsigned       reason,
@@ -873,12 +813,12 @@ TRACE_EVENT(trans_restart_would_deadlock,
                 enum btree_id  want_btree_id,
                 unsigned       want_iter_type,
                 struct bpos    *want_pos),
-       TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason,
+       TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason,
                have_btree_id, have_iter_type, have_pos,
                want_btree_id, want_iter_type, want_pos),
 
        TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip        )
+               __array(char,                   trans_fn, 24    )
                __field(unsigned long,          caller_ip       )
                __field(u8,                     in_traverse_all )
                __field(u8,                     reason          )
@@ -896,7 +836,7 @@ TRACE_EVENT(trans_restart_would_deadlock,
        ),
 
        TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
+               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
                __entry->caller_ip              = caller_ip;
                __entry->in_traverse_all        = in_traverse_all;
                __entry->reason                 = reason;
@@ -914,8 +854,8 @@ TRACE_EVENT(trans_restart_would_deadlock,
                __entry->want_pos_snapshot      = want_pos->snapshot;
        ),
 
-       TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
-                 (void *) __entry->trans_ip,
+       TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
+                 __entry->trans_fn,
                  (void *) __entry->caller_ip,
                  __entry->in_traverse_all,
                  __entry->reason,
@@ -931,99 +871,43 @@ TRACE_EVENT(trans_restart_would_deadlock,
                  __entry->want_pos_snapshot)
 );
 
-TRACE_EVENT(trans_restart_mem_realloced,
-       TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
-                unsigned long bytes),
-       TP_ARGS(trans_ip, caller_ip, bytes),
+TRACE_EVENT(trans_restart_would_deadlock_write,
+       TP_PROTO(const char *trans_fn),
+       TP_ARGS(trans_fn),
 
        TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip        )
-               __field(unsigned long,          caller_ip       )
-               __field(unsigned long,          bytes           )
+               __array(char,                   trans_fn, 24    )
        ),
 
        TP_fast_assign(
-               __entry->trans_ip       = trans_ip;
-               __entry->caller_ip      = caller_ip;
-               __entry->bytes          = bytes;
+               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
        ),
 
-       TP_printk("%ps %pS bytes %lu",
-                 (void *) __entry->trans_ip,
-                 (void *) __entry->caller_ip,
-                 __entry->bytes)
+       TP_printk("%s", __entry->trans_fn)
 );
 
-DECLARE_EVENT_CLASS(node_lock_fail,
-       TP_PROTO(unsigned long trans_ip,
+TRACE_EVENT(trans_restart_mem_realloced,
+       TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
-                bool key_cache,
-                enum btree_id btree_id,
-                struct bpos *pos,
-                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
-               level, iter_seq, node, node_seq),
+                unsigned long bytes),
+       TP_ARGS(trans_fn, caller_ip, bytes),
 
        TP_STRUCT__entry(
-               __field(unsigned long,          trans_ip        )
+               __array(char,                   trans_fn, 24    )
                __field(unsigned long,          caller_ip       )
-               __field(u8,                     key_cache       )
-               __field(u8,                     btree_id        )
-               __field(u64,                    pos_inode       )
-               __field(u64,                    pos_offset      )
-               __field(u32,                    pos_snapshot    )
-               __field(u32,                    level           )
-               __field(u32,                    iter_seq        )
-               __field(u32,                    node            )
-               __field(u32,                    node_seq        )
+               __field(unsigned long,          bytes           )
        ),
 
        TP_fast_assign(
-               __entry->trans_ip               = trans_ip;
-               __entry->caller_ip              = caller_ip;
-               __entry->key_cache              = key_cache;
-               __entry->btree_id               = btree_id;
-               __entry->pos_inode              = pos->inode;
-               __entry->pos_offset             = pos->offset;
-               __entry->pos_snapshot           = pos->snapshot;
-               __entry->level                  = level;
-               __entry->iter_seq               = iter_seq;
-               __entry->node                   = node;
-               __entry->node_seq               = node_seq;
+               strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip      = caller_ip;
+               __entry->bytes          = bytes;
        ),
 
-       TP_printk("%ps %pS key cache %u btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u",
-                 (void *) __entry->trans_ip,
+       TP_printk("%s %pS bytes %lu",
+                 __entry->trans_fn,
                  (void *) __entry->caller_ip,
-                 __entry->key_cache,
-                 __entry->btree_id,
-                 __entry->pos_inode,
-                 __entry->pos_offset,
-                 __entry->pos_snapshot,
-                 __entry->level, __entry->iter_seq,
-                 __entry->node, __entry->node_seq)
-);
-
-DEFINE_EVENT(node_lock_fail, node_upgrade_fail,
-       TP_PROTO(unsigned long trans_ip,
-                unsigned long caller_ip,
-                bool key_cache,
-                enum btree_id btree_id,
-                struct bpos *pos,
-                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
-               level, iter_seq, node, node_seq)
-);
-
-DEFINE_EVENT(node_lock_fail, node_relock_fail,
-       TP_PROTO(unsigned long trans_ip,
-                unsigned long caller_ip,
-                bool key_cache,
-                enum btree_id btree_id,
-                struct bpos *pos,
-                unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
-       TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos,
-               level, iter_seq, node, node_seq)
+                 __entry->bytes)
 );
 
 #endif /* _TRACE_BCACHE_H */
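The rework above replaces the trans_ip instruction-pointer field (formerly printed with %ps) with a fixed 24-byte trans_fn string naming the function that started the transaction. One caveat: strncpy() into a fixed __array() leaves the buffer unterminated when the name fills all 24 bytes. A plain userspace sketch of the copy semantics, not the kernel tracing macros themselves, with a defensive terminator added:

#include <stdio.h>
#include <string.h>

#define TRANS_FN_LEN 24         /* mirrors __array(char, trans_fn, 24) */

static void record_trans_fn(char dst[TRANS_FN_LEN], const char *trans_fn)
{
        /* strncpy() NUL-pads short names, but a name of TRANS_FN_LEN
         * bytes or more is copied without a terminator: */
        strncpy(dst, trans_fn, TRANS_FN_LEN);
        dst[TRANS_FN_LEN - 1] = '\0';   /* defensive termination */
}

int main(void)
{
        char buf[TRANS_FN_LEN];

        record_trans_fn(buf, "bch2_bucket_io_time_reset");
        printf("%s\n", buf);    /* what TP_printk("%s", ...) would emit */
        return 0;
}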
diff --git a/libbcachefs.c b/libbcachefs.c
index 34246dc9106d5ffd83712f4b6571b4e91e042abe..f78ebf04f4a8616969f7908b13b16c8132e644c1 100644 (file)
 
 #define NSEC_PER_SEC   1000000000L
 
-/* minimum size filesystem we can create, given a bucket size: */
-static u64 min_size(unsigned bucket_size)
-{
-       return BCH_MIN_NR_NBUCKETS * bucket_size;
-}
-
 static void init_layout(struct bch_sb_layout *l,
                        unsigned block_size,
                        unsigned sb_size,
                        u64 sb_start, u64 sb_end)
 {
+       u64 sb_pos = sb_start;
        unsigned i;
 
        memset(l, 0, sizeof(*l));
@@ -51,25 +46,32 @@ static void init_layout(struct bch_sb_layout *l,
 
        /* Create two superblocks in the allowed range: */
        for (i = 0; i < l->nr_superblocks; i++) {
-               if (sb_start != BCH_SB_SECTOR)
-                       sb_start = round_up(sb_start, block_size);
+               if (sb_pos != BCH_SB_SECTOR)
+                       sb_pos = round_up(sb_pos, block_size);
 
-               l->sb_offset[i] = cpu_to_le64(sb_start);
-               sb_start += sb_size;
+               l->sb_offset[i] = cpu_to_le64(sb_pos);
+               sb_pos += sb_size;
        }
 
-       if (sb_start >= sb_end)
-               die("insufficient space for superblocks");
+       if (sb_pos > sb_end)
+               die("insufficient space for superblocks: start %llu end %llu > %llu size %u",
+                   sb_start, sb_pos, sb_end, sb_size);
+}
+
+/* minimum size filesystem we can create, given a bucket size: */
+static u64 min_size(unsigned bucket_size)
+{
+       return BCH_MIN_NR_NBUCKETS * bucket_size;
 }
 
 void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
 {
        if (!dev->size)
-               dev->size = get_size(dev->path, dev->fd) >> 9;
+               dev->size = get_size(dev->path, dev->fd);
 
        if (!dev->bucket_size) {
                if (dev->size < min_size(opts.block_size))
-                       die("cannot format %s, too small (%llu sectors, min %llu)",
+                       die("cannot format %s, too small (%llu bytes, min %llu)",
                            dev->path, dev->size, min_size(opts.block_size));
 
                /* Bucket size must be >= block size: */
@@ -81,16 +83,16 @@ void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
                                                 opts.btree_node_size);
 
                /* Want a bucket size of at least 128k, if possible: */
-               dev->bucket_size = max(dev->bucket_size, 256U);
+               dev->bucket_size = max(dev->bucket_size, 128ULL << 10);
 
                if (dev->size >= min_size(dev->bucket_size)) {
                        unsigned scale = max(1,
-                                            ilog2(dev->size / min_size(dev->bucket_size)) / 4);
+                               ilog2(dev->size / min_size(dev->bucket_size)) / 4);
 
                        scale = rounddown_pow_of_two(scale);
 
                        /* max bucket size 1 mb */
-                       dev->bucket_size = min(dev->bucket_size * scale, 1U << 11);
+                       dev->bucket_size = min(dev->bucket_size * scale, 1ULL << 20);
                } else {
                        do {
                                dev->bucket_size /= 2;
@@ -98,19 +100,24 @@ void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev)
                }
        }
 
-       dev->nbuckets   = dev->size / dev->bucket_size;
+       dev->nbuckets = dev->size / dev->bucket_size;
 
        if (dev->bucket_size < opts.block_size)
-               die("Bucket size cannot be smaller than block size");
+               die("Bucket size (%llu) cannot be smaller than block size (%u)",
+                   dev->bucket_size, opts.block_size);
 
        if (opt_defined(opts, btree_node_size) &&
            dev->bucket_size < opts.btree_node_size)
-               die("Bucket size cannot be smaller than btree node size");
+               die("Bucket size (%llu) cannot be smaller than btree node size (%u)",
+                   dev->bucket_size, opts.btree_node_size);
 
        if (dev->nbuckets < BCH_MIN_NR_NBUCKETS)
-               die("Not enough buckets: %llu, need %u (bucket size %u)",
+               die("Not enough buckets: %llu, need %u (bucket size %llu)",
                    dev->nbuckets, BCH_MIN_NR_NBUCKETS, dev->bucket_size);
 
+       if (dev->bucket_size > (u32) U16_MAX << 9)
+               die("Bucket size (%llu) too big (max %u)",
+                   dev->bucket_size, (u32) U16_MAX << 9);
 }
 
 static unsigned parse_target(struct bch_sb_handle *sb,
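bch2_pick_bucket_size() now works in bytes end to end: the floor is 128k, the size is scaled up by a power of two derived from how many times the minimum filesystem size fits on the device, and the result is capped at 1MB (with a separate check, above, against the U16_MAX-sector limit of the on-disk member field). A standalone sketch of the scaling arithmetic; the value 64 for BCH_MIN_NR_NBUCKETS is an assumption for illustration:

#include <stdint.h>
#include <stdio.h>

#define MIN_NBUCKETS    64ULL   /* assumed stand-in for BCH_MIN_NR_NBUCKETS */

static unsigned ilog2_u64(uint64_t v)
{
        unsigned r = 0;

        while (v >>= 1)
                r++;
        return r;
}

static uint64_t pick_bucket_size(uint64_t dev_size)
{
        uint64_t bucket_size = 128ULL << 10;    /* 128k floor */
        uint64_t min_size = MIN_NBUCKETS * bucket_size;

        if (dev_size >= min_size) {
                unsigned scale = ilog2_u64(dev_size / min_size) / 4;

                if (!scale)
                        scale = 1;
                scale = 1U << ilog2_u64(scale); /* rounddown_pow_of_two() */
                bucket_size *= scale;
                if (bucket_size > 1ULL << 20)   /* 1MB cap */
                        bucket_size = 1ULL << 20;
        }
        return bucket_size;
}

int main(void)
{
        /* A 1TB device: ratio 2^17, ilog2/4 = 4, giving 128k * 4 = 512k */
        printf("%llu\n", (unsigned long long) pick_bucket_size(1ULL << 40));
        return 0;
}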
@@ -165,7 +172,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
        /* calculate btree node size: */
        if (!opt_defined(fs_opts, btree_node_size)) {
                /* 256k default btree node size */
-               opt_set(fs_opts, btree_node_size, 512);
+               opt_set(fs_opts, btree_node_size, 256 << 10);
 
                for (i = devs; i < devs + nr_devs; i++)
                        fs_opts.btree_node_size =
@@ -173,12 +180,6 @@ struct bch_sb *bch2_format(struct bch_opt_strs     fs_opt_strs,
                                      i->bucket_size);
        }
 
-       if (!is_power_of_2(fs_opts.block_size))
-               die("block size must be power of 2");
-
-       if (!is_power_of_2(fs_opts.btree_node_size))
-               die("btree node size must be power of 2");
-
        if (uuid_is_null(opts.uuid.b))
                uuid_generate(opts.uuid.b);
 
@@ -188,7 +189,6 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
        sb.sb->version          = le16_to_cpu(opts.version);
        sb.sb->version_min      = le16_to_cpu(opts.version);
        sb.sb->magic            = BCACHE_MAGIC;
-       sb.sb->block_size       = cpu_to_le16(fs_opts.block_size);
        sb.sb->user_uuid        = opts.uuid;
        sb.sb->nr_devices       = nr_devs;
 
@@ -205,22 +205,15 @@ struct bch_sb *bch2_format(struct bch_opt_strs    fs_opt_strs,
        for (opt_id = 0;
             opt_id < bch2_opts_nr;
             opt_id++) {
-               const struct bch_option *opt = &bch2_opt_table[opt_id];
                u64 v;
 
-               if (opt->set_sb == SET_NO_SB_OPT)
-                       continue;
-
                v = bch2_opt_defined_by_id(&fs_opts, opt_id)
                        ? bch2_opt_get_by_id(&fs_opts, opt_id)
                        : bch2_opt_get_by_id(&bch2_opts_default, opt_id);
 
-               opt->set_sb(sb.sb, v);
+               __bch2_opt_set_sb(sb.sb, &bch2_opt_table[opt_id], v);
        }
 
-       SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb.sb,
-                               ilog2(opts.encoded_extent_max));
-
        struct timespec now;
        if (clock_gettime(CLOCK_REALTIME, &now))
                die("error getting current time: %m");
@@ -231,7 +224,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
        /* Member info: */
        mi = bch2_sb_resize_members(&sb,
                        (sizeof(*mi) + sizeof(struct bch_member) *
-                        nr_devs) / sizeof(u64));
+                       nr_devs) / sizeof(u64));
 
        for (i = devs; i < devs + nr_devs; i++) {
                struct bch_member *m = mi->members + (i - devs);
@@ -239,25 +232,31 @@ struct bch_sb *bch2_format(struct bch_opt_strs    fs_opt_strs,
                uuid_generate(m->uuid.b);
                m->nbuckets     = cpu_to_le64(i->nbuckets);
                m->first_bucket = 0;
-               m->bucket_size  = cpu_to_le16(i->bucket_size);
+               m->bucket_size  = cpu_to_le16(i->bucket_size >> 9);
 
-               SET_BCH_MEMBER_REPLACEMENT(m,   BCH_CACHE_REPLACEMENT_lru);
                SET_BCH_MEMBER_DISCARD(m,       i->discard);
                SET_BCH_MEMBER_DATA_ALLOWED(m,  i->data_allowed);
                SET_BCH_MEMBER_DURABILITY(m,    i->durability + 1);
        }
 
-       /* Disk groups */
+       /* Disk labels */
        for (i = devs; i < devs + nr_devs; i++) {
-               struct bch_member *m = mi->members + (i - devs);
+               struct bch_member *m;
                int idx;
 
-               if (!i->group)
+               if (!i->label)
                        continue;
 
-               idx = bch2_disk_path_find_or_create(&sb, i->group);
+               idx = bch2_disk_path_find_or_create(&sb, i->label);
                if (idx < 0)
-                       die("error creating disk path: %s", idx);
+                       die("error creating disk path: %s", strerror(-idx));
+
+               /*
+                * Recompute mi and m after each sb modification: their
+                * locations in memory may have changed due to reallocation.
+                */
+               mi = bch2_sb_get_members(sb.sb);
+               m = mi->members + (i - devs);
 
                SET_BCH_MEMBER_GROUP(m, idx + 1);
        }
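The comment above records a classic growable-buffer hazard: bch2_disk_path_find_or_create() may resize the superblock, so pointers computed into the old allocation (mi and m here) must be re-derived after the call. A generic illustration of the pattern, using a hypothetical buffer type rather than the real superblock handle:

#include <stdlib.h>
#include <string.h>

/* Hypothetical growable buffer, standing in for the sb handle: */
struct buf {
        char    *data;
        size_t  len;
};

/* Appending may realloc() and move the whole allocation: */
static int buf_append(struct buf *b, const char *s)
{
        size_t n = strlen(s) + 1;
        char *p = realloc(b->data, b->len + n);

        if (!p)
                return -1;
        memcpy(p + b->len, s, n);
        b->data = p;    /* old interior pointers are now dangling */
        b->len += n;
        return 0;
}

int main(void)
{
        struct buf b = { NULL, 0 };
        size_t first_off = b.len;

        buf_append(&b, "ssd");
        buf_append(&b, "hdd");  /* may have moved b.data */

        /* Re-derive from the current base, never cache across appends: */
        const char *first = b.data + first_off;

        (void) first;
        free(b.data);
        return 0;
}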
@@ -281,11 +280,13 @@ struct bch_sb *bch2_format(struct bch_opt_strs    fs_opt_strs,
        }
 
        for (i = devs; i < devs + nr_devs; i++) {
+               u64 size_sectors = i->size >> 9;
+
                sb.sb->dev_idx = i - devs;
 
                if (!i->sb_offset) {
                        i->sb_offset    = BCH_SB_SECTOR;
-                       i->sb_end       = i->size;
+                       i->sb_end       = size_sectors;
                }
 
                init_layout(&sb.sb->layout, fs_opts.block_size,
@@ -301,9 +302,9 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
                 */
                if (i->sb_offset == BCH_SB_SECTOR) {
                        struct bch_sb_layout *l = &sb.sb->layout;
-                       u64 backup_sb = i->size - (1 << l->sb_max_size_bits);
+                       u64 backup_sb = size_sectors - (1 << l->sb_max_size_bits);
 
-                       backup_sb = rounddown(backup_sb, i->bucket_size);
+                       backup_sb = rounddown(backup_sb, i->bucket_size >> 9);
                        l->sb_offset[l->nr_superblocks++] = cpu_to_le64(backup_sb);
                }
 
@@ -311,7 +312,8 @@ struct bch_sb *bch2_format(struct bch_opt_strs      fs_opt_strs,
                        /* Zero start of disk */
                        static const char zeroes[BCH_SB_SECTOR << 9];
 
-                       xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
+                       xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0,
+                               "zeroing start of disk");
                }
 
                bch2_super_write(i->fd, sb.sb);
@@ -332,12 +334,14 @@ void bch2_super_write(int fd, struct bch_sb *sb)
                if (sb->offset == BCH_SB_SECTOR) {
                        /* Write backup layout */
                        xpwrite(fd, &sb->layout, sizeof(sb->layout),
-                               BCH_SB_LAYOUT_SECTOR << 9);
+                               BCH_SB_LAYOUT_SECTOR << 9,
+                               "backup layout");
                }
 
                sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb);
                xpwrite(fd, sb, vstruct_bytes(sb),
-                       le64_to_cpu(sb->offset) << 9);
+                       le64_to_cpu(sb->offset) << 9,
+                       "superblock");
        }
 
        fsync(fd);
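Every xpwrite() call here gained a trailing description, used to identify the failing write in the error message. The helper itself lives in the tools' utility code; a plausible approximation of the new signature, for reference only:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Approximation of an xpwrite()-style wrapper: on a failed or short
 * write, exit with the caller-supplied context string rather than
 * returning an error (%m is glibc's errno-string conversion).
 */
static void xpwrite_sketch(int fd, const void *buf, size_t count,
                           off_t offset, const char *msg)
{
        ssize_t r = pwrite(fd, buf, count, offset);

        if (r != (ssize_t) count) {
                fprintf(stderr, "error writing %s: %m\n", msg);
                exit(EXIT_FAILURE);
        }
}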
@@ -410,10 +414,10 @@ static int bch2_sb_get_target(struct bch_sb *sb, char *buf, size_t len, u64 v)
                struct bch_disk_group *g = gi->entries + t.group;
 
                if (t.group < disk_groups_nr(gi) && !BCH_GROUP_DELETED(g)) {
-                       ret = scnprintf(buf, len, "Group %u (%.*s)", t.group,
+                       ret = scnprintf(buf, len, "Label %u (%.*s)", t.group,
                                BCH_SB_LABEL_SIZE, g->label);
                } else {
-                       ret = scnprintf(buf, len, "Bad group %u", t.group);
+                       ret = scnprintf(buf, len, "Bad label %u", t.group);
                }
                break;
        }
@@ -475,7 +479,7 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
                char member_uuid_str[40];
                char data_allowed_str[100];
                char data_has_str[100];
-               char group[BCH_SB_LABEL_SIZE+10];
+               char label[BCH_SB_LABEL_SIZE+10];
                char time_str[64];
 
                if (!bch2_member_exists(m))
@@ -487,14 +491,14 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
                        unsigned idx = BCH_MEMBER_GROUP(m) - 1;
 
                        if (idx < disk_groups_nr(gi)) {
-                               snprintf(group, sizeof(group), "%.*s (%u)",
+                               scnprintf(label, sizeof(label), "%.*s (%u)",
                                        BCH_SB_LABEL_SIZE,
                                        gi->entries[idx].label, idx);
                        } else {
-                               strcpy(group, "(bad disk groups section)");
+                               strcpy(label, "(bad disk labels section)");
                        }
                } else {
-                       strcpy(group, "(none)");
+                       strcpy(label, "(none)");
                }
 
                bch2_flags_to_text(&PBUF(data_allowed_str),
@@ -531,7 +535,6 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
 
                       "    Has data:                   %s\n"
 
-                      "    Replacement policy:         %s\n"
                       "    Discard:                    %llu\n",
                       i, member_uuid_str,
                       pr_units(le16_to_cpu(m->bucket_size) *
@@ -545,14 +548,10 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f,
                       ? bch2_member_states[BCH_MEMBER_STATE(m)]
                       : "unknown",
 
-                      group,
+                      label,
                       data_allowed_str,
                       data_has_str,
 
-                      BCH_MEMBER_REPLACEMENT(m) < BCH_CACHE_REPLACEMENT_NR
-                      ? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)]
-                      : "unknown",
-
                       BCH_MEMBER_DISCARD(m));
        }
 }
@@ -573,7 +572,7 @@ static void bch2_sb_print_crypt(struct bch_sb *sb, struct bch_sb_field *f,
 }
 
 static void bch2_sb_print_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
-                                  enum units units)
+                                     enum units units)
 {
        struct bch_sb_field_replicas_v0 *replicas = field_to_type(f, replicas_v0);
        struct bch_replicas_entry_v0 *e;
@@ -636,7 +635,7 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
 }
 
 static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f,
-                               enum units units)
+                                               enum units units)
 {
        struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist);
        unsigned i, nr = blacklist_nr_entries(bl);
@@ -899,7 +898,9 @@ struct bchfs_handle bcache_fs_open(const char *path)
                free(ctl);
        } else {
                /* It's a path: */
-               ret.ioctl_fd = xopen(path, O_RDONLY);
+               ret.ioctl_fd = open(path, O_RDONLY);
+               if (ret.ioctl_fd < 0)
+                       die("Error opening filesystem at %s: %m", path);
 
                struct bch_ioctl_query_uuid uuid;
                if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid) < 0)
@@ -1062,7 +1063,7 @@ struct bch_opt_strs bch2_cmdline_opts_get(int *argc, char *argv[],
 
                optid = bch2_opt_lookup(optstr);
                if (optid < 0 ||
-                   !(bch2_opt_table[optid].mode & opt_types)) {
+                   !(bch2_opt_table[optid].flags & opt_types)) {
                        i++;
                        goto next;
                }
@@ -1102,7 +1103,8 @@ struct bch_opts bch2_parse_opts(struct bch_opt_strs strs)
                    bch2_opt_table[i].type == BCH_OPT_FN)
                        continue;
 
-               ret = bch2_opt_parse(NULL, &bch2_opt_table[i],
+               ret = bch2_opt_parse(NULL, "option",
+                                    &bch2_opt_table[i],
                                     strs.by_id[i], &v);
                if (ret < 0)
                        die("Invalid %s: %s",
@@ -1130,7 +1132,7 @@ void bch2_opts_usage(unsigned opt_types)
        for (opt = bch2_opt_table;
             opt < bch2_opt_table + bch2_opts_nr;
             opt++) {
-               if (!(opt->mode & opt_types))
+               if (!(opt->flags & opt_types))
                        continue;
 
                c += printf("      --%s", opt->attr.name);
diff --git a/libbcachefs.h b/libbcachefs.h
index 7cdbf69622548aa9f88bf982a65756a6673c8ef0..ab4f0cd67fa079306a89c71cc34f059d55c8308c 100644 (file)
@@ -35,7 +35,6 @@ struct format_opts {
        uuid_le         uuid;
        unsigned        version;
        unsigned        superblock_size;
-       unsigned        encoded_extent_max;
        bool            encrypted;
        char            *passphrase;
 };
@@ -45,16 +44,15 @@ static inline struct format_opts format_opts_default()
        return (struct format_opts) {
                .version                = bcachefs_metadata_version_current,
                .superblock_size        = SUPERBLOCK_SIZE_DEFAULT,
-               .encoded_extent_max     = 128,
        };
 }
 
 struct dev_opts {
        int             fd;
        char            *path;
-       u64             size; /* 512 byte sectors */
-       unsigned        bucket_size;
-       const char      *group;
+       u64             size;           /* bytes */
+       u64             bucket_size;    /* bytes */
+       const char      *label;
        unsigned        data_allowed;
        unsigned        durability;
        bool            discard;
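dev_opts now carries size and bucket_size in bytes rather than 512-byte sectors; conversion to sectors happens only where the on-disk format requires it (the >> 9 shifts when filling in bch_member and the superblock layout). For reference:

#include <stdint.h>

#define SECTOR_SHIFT    9       /* 512-byte sectors */

static inline uint64_t bytes_to_sectors(uint64_t bytes)
{
        return bytes >> SECTOR_SHIFT;
}

static inline uint64_t sectors_to_bytes(uint64_t sectors)
{
        return sectors << SECTOR_SHIFT;
}

/*
 * bch_member stores bucket_size as a 16-bit sector count, which is
 * why the format path rejects bucket sizes above (u32) U16_MAX << 9
 * bytes.
 */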
diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c
index eb907e5d33d3fcf364f0f977d390d916336d391d..5070caf8f349adbec532ae966346fe22e6fda142 100644 (file)
@@ -212,36 +212,38 @@ bch2_acl_to_xattr(struct btree_trans *trans,
        return xattr;
 }
 
-struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu)
 {
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c_xattr xattr;
        struct posix_acl *acl = NULL;
        struct bkey_s_c k;
        int ret;
 
+       if (rcu)
+               return ERR_PTR(-ECHILD);
+
        bch2_trans_init(&trans, c, 0, 0);
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
-                       &hash, inode->v.i_ino,
+       ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc,
+                       &hash, inode_inum(inode),
                        &X_SEARCH(acl_to_xattr_type(type), "", 0),
                        0);
-       if (IS_ERR(iter)) {
-               if (PTR_ERR(iter) == -EINTR)
+       if (ret) {
+               if (ret == -EINTR)
                        goto retry;
-
-               if (PTR_ERR(iter) != -ENOENT)
-                       acl = ERR_CAST(iter);
+               if (ret != -ENOENT)
+                       acl = ERR_PTR(ret);
                goto out;
        }
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret) {
                acl = ERR_PTR(ret);
@@ -254,17 +256,17 @@ retry:
 
        if (!IS_ERR(acl))
                set_cached_acl(&inode->v, type, acl);
-       bch2_trans_iter_put(&trans, iter);
 out:
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return acl;
 }
 
-int bch2_set_acl_trans(struct btree_trans *trans,
+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
                       struct bch_inode_unpacked *inode_u,
-                      const struct bch_hash_info *hash_info,
                       struct posix_acl *acl, int type)
 {
+       struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u);
        int ret;
 
        if (type == ACL_TYPE_DEFAULT &&
@@ -277,14 +279,14 @@ int bch2_set_acl_trans(struct btree_trans *trans,
                if (IS_ERR(xattr))
                        return PTR_ERR(xattr);
 
-               ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
-                                   inode_u->bi_inum, &xattr->k_i, 0);
+               ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info,
+                                   inum, &xattr->k_i, 0);
        } else {
                struct xattr_search_key search =
                        X_SEARCH(acl_to_xattr_type(type), "", 0);
 
-               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
-                                      inode_u->bi_inum, &search);
+               ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info,
+                                      inum, &search);
        }
 
        return ret == -ENOENT ? 0 : ret;
@@ -296,9 +298,8 @@ int bch2_set_acl(struct user_namespace *mnt_userns,
        struct bch_inode_info *inode = to_bch_ei(vinode);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans trans;
-       struct btree_iter *inode_iter;
+       struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
-       struct bch_hash_info hash_info;
        struct posix_acl *acl;
        umode_t mode;
        int ret;
@@ -309,9 +310,8 @@ retry:
        bch2_trans_begin(&trans);
        acl = _acl;
 
-       inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
-                                    BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+                             BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
 
@@ -323,27 +323,24 @@ retry:
                        goto btree_err;
        }
 
-       hash_info = bch2_hash_info_init(c, &inode_u);
-
-       ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type);
+       ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type);
        if (ret)
                goto btree_err;
 
        inode_u.bi_ctime        = bch2_current_time(c);
        inode_u.bi_mode         = mode;
 
-       ret =   bch2_inode_write(&trans, inode_iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL,
-                                 &inode->ei_journal_seq, 0);
+       ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+               bch2_trans_commit(&trans, NULL, NULL, 0);
 btree_err:
-       bch2_trans_iter_put(&trans, inode_iter);
+       bch2_trans_iter_exit(&trans, &inode_iter);
 
        if (ret == -EINTR)
                goto retry;
        if (unlikely(ret))
                goto err;
 
-       bch2_inode_update_after_write(c, inode, &inode_u,
+       bch2_inode_update_after_write(&trans, inode, &inode_u,
                                      ATTR_CTIME|ATTR_MODE);
 
        set_cached_acl(&inode->v, type, acl);
@@ -354,28 +351,27 @@ err:
        return ret;
 }
 
-int bch2_acl_chmod(struct btree_trans *trans,
+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
                   struct bch_inode_unpacked *inode,
                   umode_t mode,
                   struct posix_acl **new_acl)
 {
        struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c_xattr xattr;
        struct bkey_i_xattr *new;
        struct posix_acl *acl;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
-                       &hash_info, inode->bi_inum,
+       ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
+                              &hash_info, inum,
                        &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
                        BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(iter);
        if (ret)
                return ret == -ENOENT ? 0 : ret;
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        xattr = bkey_s_c_to_xattr(k);
        if (ret)
                goto err;
@@ -396,12 +392,12 @@ int bch2_acl_chmod(struct btree_trans *trans,
                goto err;
        }
 
-       new->k.p = iter->pos;
-       ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+       new->k.p = iter.pos;
+       ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
        *new_acl = acl;
        acl = NULL;
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        if (!IS_ERR_OR_NULL(acl))
                kfree(acl);
        return ret;
diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h
index 25fc54dd08845884dd0b8e0fad064920ae230741..2d76a4897ba89ae3d4ef43135ecea243af9e1a7e 100644 (file)
@@ -26,27 +26,26 @@ typedef struct {
        __le32          a_version;
 } bch_acl_header;
 
-struct posix_acl *bch2_get_acl(struct inode *, int);
+struct posix_acl *bch2_get_acl(struct inode *, int, bool);
 
-int bch2_set_acl_trans(struct btree_trans *,
+int bch2_set_acl_trans(struct btree_trans *, subvol_inum,
                       struct bch_inode_unpacked *,
-                      const struct bch_hash_info *,
                       struct posix_acl *, int);
 int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int);
-int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
+int bch2_acl_chmod(struct btree_trans *, subvol_inum,
+                  struct bch_inode_unpacked *,
                   umode_t, struct posix_acl **);
 
 #else
 
-static inline int bch2_set_acl_trans(struct btree_trans *trans,
+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum,
                                     struct bch_inode_unpacked *inode_u,
-                                    const struct bch_hash_info *hash_info,
                                     struct posix_acl *acl, int type)
 {
        return 0;
 }
 
-static inline int bch2_acl_chmod(struct btree_trans *trans,
+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum,
                                 struct bch_inode_unpacked *inode,
                                 umode_t mode,
                                 struct posix_acl **new_acl)
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 886861a00df30ef6393c33c8775cc923f2943dfe..023db6219ad878f8106b439895adf7d8f362a358 100644 (file)
@@ -9,6 +9,7 @@
 #include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "ec.h"
@@ -147,10 +148,44 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
        return 0;
 }
 
-static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
+                               struct bkey_s_c k)
+{
+       struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
+       const u8 *in = a.v->data;
+       const u8 *end = bkey_val_end(a);
+       unsigned fieldnr = 0;
+       int ret;
+       u64 v;
+
+       out->gen        = a.v->gen;
+       out->oldest_gen = a.v->oldest_gen;
+       out->data_type  = a.v->data_type;
+       out->journal_seq = le64_to_cpu(a.v->journal_seq);
+
+#define x(_name, _bits)                                                        \
+       if (fieldnr < a.v->nr_fields) {                                 \
+               ret = bch2_varint_decode_fast(in, end, &v);             \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               in += ret;                                              \
+       } else {                                                        \
+               v = 0;                                                  \
+       }                                                               \
+       out->_name = v;                                                 \
+       if (v != out->_name)                                            \
+               return -1;                                              \
+       fieldnr++;
+
+       BCH_ALLOC_FIELDS_V2()
+#undef  x
+       return 0;
+}
+
+static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
                               const struct bkey_alloc_unpacked src)
 {
-       struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+       struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
        unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
        u8 *out = a->v.data;
        u8 *end = (void *) &dst[1];
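bch2_alloc_unpack_v3() above walks the value's packed fields with bch2_varint_decode_fast(), consuming a variable number of bytes per field and substituting zero once the stored field count is exhausted. The actual encoding is defined in libbcachefs/varint.c; purely to illustrate the decode-loop shape, here is a classic LEB128 decoder:

#include <stdint.h>
#include <stdio.h>

/* Unsigned LEB128: 7 value bits per byte, high bit = continuation.
 * Returns bytes consumed, or -1 on a truncated buffer. */
static int leb128_decode(const uint8_t *in, const uint8_t *end,
                         uint64_t *v)
{
        const uint8_t *p = in;
        uint64_t r = 0;
        unsigned shift = 0;

        while (p < end) {
                r |= (uint64_t) (*p & 0x7f) << shift;
                if (!(*p++ & 0x80)) {
                        *v = r;
                        return p - in;
                }
                shift += 7;
        }
        return -1;
}

int main(void)
{
        const uint8_t buf[] = { 0xe5, 0x8e, 0x26 };     /* encodes 624485 */
        uint64_t v;
        int n = leb128_decode(buf, buf + sizeof(buf), &v);

        printf("consumed %d bytes -> %llu\n", n, (unsigned long long) v);
        return 0;
}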
@@ -161,6 +196,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
        a->v.gen        = src.gen;
        a->v.oldest_gen = src.oldest_gen;
        a->v.data_type  = src.data_type;
+       a->v.journal_seq = cpu_to_le64(src.journal_seq);
 
 #define x(_name, _bits)                                                        \
        nr_fields++;                                                    \
@@ -194,19 +230,40 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
                .gen    = 0,
        };
 
-       if (k.k->type == KEY_TYPE_alloc_v2)
-               bch2_alloc_unpack_v2(&ret, k);
-       else if (k.k->type == KEY_TYPE_alloc)
+       switch (k.k->type) {
+       case KEY_TYPE_alloc:
                bch2_alloc_unpack_v1(&ret, k);
+               break;
+       case KEY_TYPE_alloc_v2:
+               bch2_alloc_unpack_v2(&ret, k);
+               break;
+       case KEY_TYPE_alloc_v3:
+               bch2_alloc_unpack_v3(&ret, k);
+               break;
+       }
 
        return ret;
 }
 
-void bch2_alloc_pack(struct bch_fs *c,
-                    struct bkey_alloc_buf *dst,
-                    const struct bkey_alloc_unpacked src)
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans,
+                                      const struct bkey_alloc_unpacked src)
 {
-       bch2_alloc_pack_v2(dst, src);
+       struct bkey_alloc_buf *dst;
+
+       dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+       if (!IS_ERR(dst))
+               bch2_alloc_pack_v3(dst, src);
+
+       return dst;
+}
+
+int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
+                    struct bkey_alloc_unpacked *u, unsigned trigger_flags)
+{
+       struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u);
+
+       return PTR_ERR_OR_ZERO(a) ?:
+               bch2_trans_update(trans, iter, &a->k, trigger_flags);
 }
 
 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
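bch2_alloc_write() above chains PTR_ERR_OR_ZERO() into bch2_trans_update() with GNU C's binary `?:`, so the first nonzero error short-circuits the rest; callers use the same idiom (e.g. bch2_alloc_write(...) ?: bch2_trans_commit(...) further down). A minimal demonstration of the operator (GCC/Clang extension):

#include <stdio.h>

/*
 * "a ?: b" evaluates a once and yields it when nonzero, otherwise b;
 * chaining int-returning steps this way stops at the first error:
 */
static int step1(void) { return 0; }    /* succeeds */
static int step2(void) { return -5; }   /* fails */
static int step3(void) { return 0; }    /* never evaluated */

int main(void)
{
        int ret = step1() ?: step2() ?: step3();

        printf("ret = %d\n", ret);      /* prints -5 */
        return 0;
}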
@@ -249,137 +306,81 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
        return NULL;
 }
 
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
-                          struct bkey_s_c k)
-{
-       struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
-
-       pr_buf(out, "gen %u oldest_gen %u data_type %s",
-              u.gen, u.oldest_gen, bch2_data_types[u.data_type]);
-#define x(_name, ...)  pr_buf(out, " " #_name " %llu", (u64) u._name);
-       BCH_ALLOC_FIELDS_V2()
-#undef  x
-}
-
-static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-       struct bch_dev *ca;
-       struct bucket *g;
        struct bkey_alloc_unpacked u;
 
-       if (k.k->type != KEY_TYPE_alloc &&
-           k.k->type != KEY_TYPE_alloc_v2)
-               return 0;
-
-       ca = bch_dev_bkey_exists(c, k.k->p.inode);
-       g = bucket(ca, k.k->p.offset);
-       u = bch2_alloc_unpack(k);
+       if (k.k->p.inode >= c->sb.nr_devices ||
+           !c->devs[k.k->p.inode])
+               return "invalid device";
 
-       g->_mark.gen            = u.gen;
-       g->_mark.data_type      = u.data_type;
-       g->_mark.dirty_sectors  = u.dirty_sectors;
-       g->_mark.cached_sectors = u.cached_sectors;
-       g->io_time[READ]        = u.read_time;
-       g->io_time[WRITE]       = u.write_time;
-       g->oldest_gen           = u.oldest_gen;
-       g->gen_valid            = 1;
+       if (bch2_alloc_unpack_v3(&u, k))
+               return "unpack error";
 
-       return 0;
+       return NULL;
 }
 
-int bch2_alloc_read(struct bch_fs *c)
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+                          struct bkey_s_c k)
 {
-       int ret;
-
-       down_read(&c->gc_lock);
-       ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn);
-       up_read(&c->gc_lock);
-       if (ret) {
-               bch_err(c, "error reading alloc info: %i", ret);
-               return ret;
-       }
+       struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
 
-       return 0;
+       pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
+              u.gen, u.oldest_gen, bch2_data_types[u.data_type],
+              u.journal_seq);
+#define x(_name, ...)  pr_buf(out, " " #_name " %llu", (u64) u._name);
+       BCH_ALLOC_FIELDS_V2()
+#undef  x
 }
 
-static int bch2_alloc_write_key(struct btree_trans *trans,
-                               struct btree_iter *iter,
-                               unsigned flags)
+int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
 {
-       struct bch_fs *c = trans->c;
+       struct btree_trans trans;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
        struct bucket *g;
-       struct bucket_mark m;
-       struct bkey_alloc_unpacked old_u, new_u;
-       struct bkey_alloc_buf a;
+       struct bkey_alloc_unpacked u;
        int ret;
-retry:
-       bch2_trans_begin(trans);
-
-       ret = bch2_btree_key_cache_flush(trans,
-                       BTREE_ID_alloc, iter->pos);
-       if (ret)
-               goto err;
-
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto err;
-
-       old_u = bch2_alloc_unpack(k);
-
-       percpu_down_read(&c->mark_lock);
-       ca      = bch_dev_bkey_exists(c, iter->pos.inode);
-       g       = bucket(ca, iter->pos.offset);
-       m       = READ_ONCE(g->mark);
-       new_u   = alloc_mem_to_key(iter, g, m);
-       percpu_up_read(&c->mark_lock);
-
-       if (!bkey_alloc_unpacked_cmp(old_u, new_u))
-               return 0;
 
-       bch2_alloc_pack(c, &a, new_u);
-       ret   = bch2_trans_update(trans, iter, &a.k,
-                                 BTREE_TRIGGER_NORUN) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL|flags);
-err:
-       if (ret == -EINTR)
-               goto retry;
-       return ret;
-}
-
-int bch2_alloc_write(struct bch_fs *c, unsigned flags)
-{
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               ca = bch_dev_bkey_exists(c, k.k->p.inode);
+               g = __bucket(ca, k.k->p.offset, gc);
+               u = bch2_alloc_unpack(k);
+
+               if (!gc)
+                       *bucket_gen(ca, k.k->p.offset) = u.gen;
+
+               g->_mark.gen            = u.gen;
+               g->io_time[READ]        = u.read_time;
+               g->io_time[WRITE]       = u.write_time;
+               g->oldest_gen           = !gc ? u.oldest_gen : u.gen;
+               g->gen_valid            = 1;
+
+               if (!gc ||
+                   (metadata_only &&
+                    (u.data_type == BCH_DATA_user ||
+                     u.data_type == BCH_DATA_cached ||
+                     u.data_type == BCH_DATA_parity))) {
+                       g->_mark.data_type      = u.data_type;
+                       g->_mark.dirty_sectors  = u.dirty_sectors;
+                       g->_mark.cached_sectors = u.cached_sectors;
+                       g->_mark.stripe         = u.stripe != 0;
+                       g->stripe               = u.stripe;
+                       g->stripe_redundancy    = u.stripe_redundancy;
+               }
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN,
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       }
+       bch2_trans_iter_exit(&trans, &iter);
 
-       for_each_member_device(ca, c, i) {
-               bch2_btree_iter_set_pos(iter,
-                       POS(ca->dev_idx, ca->mi.first_bucket));
+       bch2_trans_exit(&trans);
 
-               while (iter->pos.offset < ca->mi.nbuckets) {
-                       bch2_trans_cond_resched(&trans);
+       if (ret)
+               bch_err(c, "error reading alloc info: %i", ret);
 
-                       ret = bch2_alloc_write_key(&trans, iter, flags);
-                       if (ret) {
-                               percpu_ref_put(&ca->ref);
-                               goto err;
-                       }
-                       bch2_btree_iter_advance(iter);
-               }
-       }
-err:
-       bch2_trans_iter_put(&trans, iter);
-       bch2_trans_exit(&trans);
        return ret;
 }
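bch2_alloc_read() now drives a plain for_each_btree_key() loop instead of the special-cased btree-and-journal walk: the macro owns the iterator, the body sees each key in order, and ret reports how the loop ended. A toy model of that macro shape; everything here is illustrative, not the bcachefs iterator:

#include <stdio.h>

struct toy_iter {
        int     pos;
        int     limit;
};

/* Returns nonzero when iteration is done: */
static int toy_peek(struct toy_iter *it, int *key)
{
        if (it->pos >= it->limit)
                return 1;
        *key = it->pos;
        return 0;
}

/* The iterator, key, and result are all owned by the macro's caller,
 * as with for_each_btree_key(): */
#define for_each_toy_key(_iter, _key, _ret)             \
        for ((_ret) = 0;                                \
             !toy_peek(&(_iter), &(_key));              \
             (_iter).pos++)

int main(void)
{
        struct toy_iter iter = { .pos = 0, .limit = 3 };
        int key, ret;

        for_each_toy_key(iter, key, ret)
                printf("key %d\n", key);

        return ret;     /* 0 here; the real macro reports errors too */
}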
 
@@ -389,31 +390,21 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
                              size_t bucket_nr, int rw)
 {
        struct bch_fs *c = trans->c;
-       struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
-       struct btree_iter *iter;
-       struct bucket *g;
-       struct bkey_alloc_buf *a;
+       struct btree_iter iter;
+       struct bkey_s_c k;
        struct bkey_alloc_unpacked u;
        u64 *time, now;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
-                                  BTREE_ITER_CACHED|
-                                  BTREE_ITER_CACHED_NOFILL|
-                                  BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               goto out;
-
-       a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
-       ret = PTR_ERR_OR_ZERO(a);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
        if (ret)
                goto out;
 
-       percpu_down_read(&c->mark_lock);
-       g = bucket(ca, bucket_nr);
-       u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
-       percpu_up_read(&c->mark_lock);
+       u = bch2_alloc_unpack(k);
 
        time = rw == READ ? &u.read_time : &u.write_time;
        now = atomic64_read(&c->io_clock[rw].now);
@@ -422,11 +413,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
 
        *time = now;
 
-       bch2_alloc_pack(c, a, u);
-       ret   = bch2_trans_update(trans, iter, &a->k, 0) ?:
+       ret   = bch2_alloc_write(trans, &iter, &u, 0) ?:
                bch2_trans_commit(trans, NULL, NULL, 0);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -453,6 +443,18 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
            test_bit(b, ca->buckets_nouse))
                return false;
 
+       if (ca->new_fs_bucket_idx) {
+               /*
+                * Device or filesystem is still being initialized, and we
+                * haven't fully marked superblocks & journal:
+                */
+               if (is_superblock_bucket(ca, b))
+                       return false;
+
+               if (b < ca->new_fs_bucket_idx)
+                       return false;
+       }
+
        gc_gen = bucket_gc_gen(bucket(ca, b));
 
        ca->inc_gen_needs_gc            += gc_gen >= BUCKET_GC_GEN_MAX / 2;
@@ -469,7 +471,7 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
 static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
                                u64 now, u64 last_seq_ondisk)
 {
-       unsigned used = bucket_sectors_used(m);
+       unsigned used = m.cached_sectors;
 
        if (used) {
                /*
@@ -488,8 +490,7 @@ static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
                 * keys when there's only a small difference, so that we can
                 * keep sequential buckets together:
                 */
-               return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
-                       (bucket_gc_gen(g) >> 4);
+               return bucket_gc_gen(g) >> 4;
        }
 }
 
@@ -521,7 +522,7 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
        buckets = bucket_array(ca);
        ca->alloc_heap.used = 0;
        now = atomic64_read(&c->io_clock[READ].now);
-       last_seq_ondisk = c->journal.last_seq_ondisk;
+       last_seq_ondisk = c->journal.flushed_seq_ondisk;
 
        /*
         * Find buckets with lowest read priority, by building a maxheap sorted
@@ -538,6 +539,14 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
                if (!bch2_can_invalidate_bucket(ca, b, m))
                        continue;
 
+               if (!m.data_type &&
+                   bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+                                                    last_seq_ondisk,
+                                                    ca->dev_idx, b)) {
+                       ca->buckets_waiting_on_journal++;
+                       continue;
+               }
+
                if (e.nr && e.bucket + e.nr == b && e.key == key) {
                        e.nr++;
                } else {
@@ -568,94 +577,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
        up_read(&ca->bucket_lock);
 }
 
-static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
-{
-       struct bucket_array *buckets = bucket_array(ca);
-       struct bucket_mark m;
-       size_t b, start;
-
-       if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
-           ca->fifo_last_bucket >= ca->mi.nbuckets)
-               ca->fifo_last_bucket = ca->mi.first_bucket;
-
-       start = ca->fifo_last_bucket;
-
-       do {
-               ca->fifo_last_bucket++;
-               if (ca->fifo_last_bucket == ca->mi.nbuckets)
-                       ca->fifo_last_bucket = ca->mi.first_bucket;
-
-               b = ca->fifo_last_bucket;
-               m = READ_ONCE(buckets->b[b].mark);
-
-               if (bch2_can_invalidate_bucket(ca, b, m)) {
-                       struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
-
-                       heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-                       if (heap_full(&ca->alloc_heap))
-                               break;
-               }
-
-               cond_resched();
-       } while (ca->fifo_last_bucket != start);
-}
-
-static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
-{
-       struct bucket_array *buckets = bucket_array(ca);
-       struct bucket_mark m;
-       size_t checked, i;
-
-       for (checked = 0;
-            checked < ca->mi.nbuckets / 2;
-            checked++) {
-               size_t b = bch2_rand_range(ca->mi.nbuckets -
-                                          ca->mi.first_bucket) +
-                       ca->mi.first_bucket;
-
-               m = READ_ONCE(buckets->b[b].mark);
-
-               if (bch2_can_invalidate_bucket(ca, b, m)) {
-                       struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
-
-                       heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
-                       if (heap_full(&ca->alloc_heap))
-                               break;
-               }
-
-               cond_resched();
-       }
-
-       sort(ca->alloc_heap.data,
-            ca->alloc_heap.used,
-            sizeof(ca->alloc_heap.data[0]),
-            bucket_idx_cmp, NULL);
-
-       /* remove duplicates: */
-       for (i = 0; i + 1 < ca->alloc_heap.used; i++)
-               if (ca->alloc_heap.data[i].bucket ==
-                   ca->alloc_heap.data[i + 1].bucket)
-                       ca->alloc_heap.data[i].nr = 0;
-}
-
 static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
 {
        size_t i, nr = 0;
 
        ca->inc_gen_needs_gc                    = 0;
        ca->inc_gen_really_needs_gc             = 0;
+       ca->buckets_waiting_on_journal          = 0;
 
-       switch (ca->mi.replacement) {
-       case BCH_CACHE_REPLACEMENT_lru:
-               find_reclaimable_buckets_lru(c, ca);
-               break;
-       case BCH_CACHE_REPLACEMENT_fifo:
-               find_reclaimable_buckets_fifo(c, ca);
-               break;
-       case BCH_CACHE_REPLACEMENT_random:
-               find_reclaimable_buckets_random(c, ca);
-               break;
-       }
+       find_reclaimable_buckets_lru(c, ca);
 
        heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
 
@@ -665,92 +595,61 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
        return nr;
 }
 
-/*
- * returns sequence number of most recent journal entry that updated this
- * bucket:
- */
-static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
-{
-       if (m.journal_seq_valid) {
-               u64 journal_seq = atomic64_read(&c->journal.seq);
-               u64 bucket_seq  = journal_seq;
-
-               bucket_seq &= ~((u64) U16_MAX);
-               bucket_seq |= m.journal_seq;
-
-               if (bucket_seq > journal_seq)
-                       bucket_seq -= 1 << 16;
-
-               return bucket_seq;
-       } else {
-               return 0;
-       }
-}
-
 static int bucket_invalidate_btree(struct btree_trans *trans,
-                                  struct bch_dev *ca, u64 b)
+                                  struct bch_dev *ca, u64 b,
+                                  struct bkey_alloc_unpacked *u)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_alloc_buf *a;
-       struct bkey_alloc_unpacked u;
-       struct bucket *g;
-       struct bucket_mark m;
-       struct btree_iter *iter =
-               bch2_trans_get_iter(trans, BTREE_ID_alloc,
-                                   POS(ca->dev_idx, b),
-                                   BTREE_ITER_CACHED|
-                                   BTREE_ITER_CACHED_NOFILL|
-                                   BTREE_ITER_INTENT);
+       struct btree_iter iter;
+       struct bkey_s_c k;
        int ret;
 
-       a = bch2_trans_kmalloc(trans, sizeof(*a));
-       ret = PTR_ERR_OR_ZERO(a);
-       if (ret)
-               goto err;
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            POS(ca->dev_idx, b),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
 
-       ret = bch2_btree_iter_traverse(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
        if (ret)
                goto err;
 
-       percpu_down_read(&c->mark_lock);
-       g = bucket(ca, b);
-       m = READ_ONCE(g->mark);
-       u = alloc_mem_to_key(iter, g, m);
-       percpu_up_read(&c->mark_lock);
-
-       u.gen++;
-       u.data_type     = 0;
-       u.dirty_sectors = 0;
-       u.cached_sectors = 0;
-       u.read_time     = atomic64_read(&c->io_clock[READ].now);
-       u.write_time    = atomic64_read(&c->io_clock[WRITE].now);
+       *u = bch2_alloc_unpack(k);
+       u->gen++;
+       u->data_type            = 0;
+       u->dirty_sectors        = 0;
+       u->cached_sectors       = 0;
+       u->read_time            = atomic64_read(&c->io_clock[READ].now);
+       u->write_time           = atomic64_read(&c->io_clock[WRITE].now);
 
-       bch2_alloc_pack(c, a, u);
-       ret = bch2_trans_update(trans, iter, &a->k,
-                               BTREE_TRIGGER_BUCKET_INVALIDATE);
+       ret = bch2_alloc_write(trans, &iter, u,
+                              BTREE_TRIGGER_BUCKET_INVALIDATE);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
 static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
                                      u64 *journal_seq, unsigned flags)
 {
-       struct bucket *g;
-       struct bucket_mark m;
+       struct bkey_alloc_unpacked u;
        size_t b;
+       u64 commit_seq = 0;
        int ret = 0;
 
+       /*
+        * If the read-only path is trying to shut down, we can't be generating
+        * new btree updates:
+        */
+       if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
+               return 1;
+
        BUG_ON(!ca->alloc_heap.used ||
               !ca->alloc_heap.data[0].nr);
        b = ca->alloc_heap.data[0].bucket;
 
        /* first, put on free_inc and mark as owned by allocator: */
        percpu_down_read(&c->mark_lock);
-       g = bucket(ca, b);
-       m = READ_ONCE(g->mark);
-
-       BUG_ON(m.dirty_sectors);
 
        bch2_mark_alloc_bucket(c, ca, b, true);
 
@@ -759,37 +658,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
        BUG_ON(!fifo_push(&ca->free_inc, b));
        spin_unlock(&c->freelist_lock);
 
-       /*
-        * If we're not invalidating cached data, we only increment the bucket
-        * gen in memory here, the incremented gen will be updated in the btree
-        * by bch2_trans_mark_pointer():
-        */
-       if (!m.cached_sectors &&
-           !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
-               BUG_ON(m.data_type);
-               bucket_cmpxchg(g, m, m.gen++);
-               percpu_up_read(&c->mark_lock);
-               goto out;
-       }
-
        percpu_up_read(&c->mark_lock);
 
-       /*
-        * If the read-only path is trying to shut down, we can't be generating
-        * new btree updates:
-        */
-       if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
-               ret = 1;
-               goto out;
-       }
-
-       ret = bch2_trans_do(c, NULL, journal_seq,
+       ret = bch2_trans_do(c, NULL, &commit_seq,
                            BTREE_INSERT_NOCHECK_RW|
                            BTREE_INSERT_NOFAIL|
                            BTREE_INSERT_JOURNAL_RESERVED|
                            flags,
-                           bucket_invalidate_btree(&trans, ca, b));
-out:
+                           bucket_invalidate_btree(&trans, ca, b, &u));
+
        if (!ret) {
                /* remove from alloc_heap: */
                struct alloc_heap_entry e, *top = ca->alloc_heap.data;
@@ -801,11 +678,17 @@ out:
                        heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
 
                /*
-                * Make sure we flush the last journal entry that updated this
-                * bucket (i.e. deleting the last reference) before writing to
-                * this bucket again:
+                * If we're invalidating cached data then we need to wait on the
+                * journal commit:
                 */
-               *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+               if (u.data_type)
+                       *journal_seq = max(*journal_seq, commit_seq);
+
+               /*
+                * We were already waiting on u.journal_seq when we filtered
+                * out buckets that need journal commit:
+                */
+               BUG_ON(*journal_seq > u.journal_seq);
        } else {
                size_t b2;
 
@@ -856,10 +739,10 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
        /* If we used NOWAIT, don't return the error: */
        if (!fifo_empty(&ca->free_inc))
                ret = 0;
-       if (ret) {
+       if (ret < 0)
                bch_err(ca, "error invalidating buckets: %i", ret);
+       if (ret)
                return ret;
-       }
 
        if (journal_seq)
                ret = bch2_journal_flush_seq(&c->journal, journal_seq);
@@ -972,8 +855,14 @@ static int bch2_allocator_thread(void *arg)
                        gc_count = c->gc_count;
                        nr = find_reclaimable_buckets(c, ca);
 
-                       trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
-                                        ca->inc_gen_really_needs_gc);
+                       if (!nr && ca->buckets_waiting_on_journal) {
+                               ret = bch2_journal_flush(&c->journal);
+                               if (ret)
+                                       goto stop;
+                       } else if (nr < (ca->mi.nbuckets >> 6) &&
+                                  ca->buckets_waiting_on_journal >= nr / 2) {
+                               bch2_journal_flush_async(&c->journal, NULL);
+                       }
 
                        if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
                             ca->inc_gen_really_needs_gc) &&
@@ -981,6 +870,9 @@ static int bch2_allocator_thread(void *arg)
                                atomic_inc(&c->kick_gc);
                                wake_up_process(c->gc_thread);
                        }
+
+                       trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
+                                        ca->inc_gen_really_needs_gc);
                }
 
                ret = bch2_invalidate_buckets(c, ca);
@@ -1015,7 +907,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
        lockdep_assert_held(&c->state_lock);
 
        for_each_online_member(ca, c, i) {
-               struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
+               struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
 
                ra_pages += bdi->ra_pages;
        }
@@ -1085,7 +977,7 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
             ob++) {
                spin_lock(&ob->lock);
                if (ob->valid && !ob->on_partial_list &&
-                   ob->ptr.dev == ca->dev_idx)
+                   ob->dev == ca->dev_idx)
                        ret = true;
                spin_unlock(&ob->lock);
        }
@@ -1232,22 +1124,3 @@ void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
 }
-
-void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
-{
-       struct open_bucket *ob;
-
-       for (ob = c->open_buckets;
-            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
-            ob++) {
-               spin_lock(&ob->lock);
-               if (ob->valid && !ob->on_partial_list) {
-                       pr_buf(out, "%zu ref %u type %s\n",
-                              ob - c->open_buckets,
-                              atomic_read(&ob->pin),
-                              bch2_data_types[ob->type]);
-               }
-               spin_unlock(&ob->lock);
-       }
-
-}
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index a4f6bf56b18f6eee5266852e994f28f6d5b5f738..98c7866e20b57ded9f8d629d8427d5966f97bfb5 100644 (file)
@@ -4,11 +4,14 @@
 
 #include "bcachefs.h"
 #include "alloc_types.h"
+#include "buckets.h"
 #include "debug.h"
+#include "super.h"
 
 extern const char * const bch2_allocator_states[];
 
 struct bkey_alloc_unpacked {
+       u64             journal_seq;
        u64             bucket;
        u8              dev;
        u8              gen;
@@ -19,23 +22,6 @@ struct bkey_alloc_unpacked {
 #undef  x
 };
 
-struct bkey_alloc_buf {
-       struct bkey_i   k;
-
-       union {
-       struct {
-#define x(_name,  _bits)               + _bits / 8
-       u8              _pad[8 + BCH_ALLOC_FIELDS_V1()];
-#undef  x
-       } _v1;
-       struct {
-#define x(_name,  _bits)               + 8 + _bits / 8
-       u8              _pad[8 + BCH_ALLOC_FIELDS_V2()];
-#undef  x
-       } _v2;
-       };
-} __attribute__((packed, aligned(8)));
-
 /* How out of date a pointer gen is allowed to be: */
 #define BUCKET_GC_GEN_MAX      96U
 
@@ -52,33 +38,28 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
        ;
 }
 
+struct bkey_alloc_buf {
+       struct bkey_i   k;
+       struct bch_alloc_v3 v;
+
+#define x(_name,  _bits)               + _bits / 8
+       u8              _pad[0 + BCH_ALLOC_FIELDS_V2()];
+#undef  x
+} __attribute__((packed, aligned(8)));
+
 struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
-                    const struct bkey_alloc_unpacked);
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *,
+                                      const struct bkey_alloc_unpacked);
+int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
+                    struct bkey_alloc_unpacked *, unsigned);
 
 int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
 
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct btree_iter *iter,
-                struct bucket *g, struct bucket_mark m)
-{
-       return (struct bkey_alloc_unpacked) {
-               .dev            = iter->pos.inode,
-               .bucket         = iter->pos.offset,
-               .gen            = m.gen,
-               .oldest_gen     = g->oldest_gen,
-               .data_type      = m.data_type,
-               .dirty_sectors  = m.dirty_sectors,
-               .cached_sectors = m.cached_sectors,
-               .read_time      = g->io_time[READ],
-               .write_time     = g->io_time[WRITE],
-       };
-}
-
 #define ALLOC_SCAN_BATCH(ca)           max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
 
 const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
 const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_alloc (struct bkey_ops) {                \
@@ -91,7 +72,19 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
        .val_to_text    = bch2_alloc_to_text,           \
 }
 
-int bch2_alloc_read(struct bch_fs *);
+#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) {     \
+       .key_invalid    = bch2_alloc_v3_invalid,        \
+       .val_to_text    = bch2_alloc_to_text,           \
+}
+
+static inline bool bkey_is_alloc(const struct bkey *k)
+{
+       return  k->type == KEY_TYPE_alloc ||
+               k->type == KEY_TYPE_alloc_v2 ||
+               k->type == KEY_TYPE_alloc_v3;
+}
+
+int bch2_alloc_read(struct bch_fs *, bool, bool);
 
 static inline void bch2_wake_allocator(struct bch_dev *ca)
 {
@@ -129,9 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
 
-int bch2_alloc_write(struct bch_fs *, unsigned);
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
-void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
-
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 412fed47948278060b77516e6c5a4801df124a78..9b81ed2665c8d93324d19dc1e5f8e5f0e4930eae 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
  * reference _after_ doing the index update that makes its allocation reachable.
  */
 
+static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
+{
+       open_bucket_idx_t idx = ob - c->open_buckets;
+       open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+       ob->hash = *slot;
+       *slot = idx;
+}
+
+static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
+{
+       open_bucket_idx_t idx = ob - c->open_buckets;
+       open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);
+
+       while (*slot != idx) {
+               BUG_ON(!*slot);
+               slot = &c->open_buckets[*slot].hash;
+       }
+
+       *slot = ob->hash;
+       ob->hash = 0;
+}
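/*
 * Editorial sketch, not part of the commit: the pair of functions above
 * implement an intrusive chained hash table whose links are array indices
 * rather than pointers, with index 0 doubling as the chain terminator
 * (open bucket 0 is never used). A standalone miniature of the same
 * pattern, with a trivial mask in place of jhash_3words():
 */
#include <assert.h>
#include <stdint.h>

#define NENTRIES        16              /* power of two, like OPEN_BUCKETS_COUNT */

struct entry {
        uint64_t        key;
        uint16_t        hash;           /* next entry in chain; 0 == end */
};

static struct entry     table[NENTRIES];
static uint16_t         heads[NENTRIES];

static uint16_t *hashslot(uint64_t key)
{
        return heads + (key & (NENTRIES - 1));
}

static void hash_add(struct entry *e)
{
        uint16_t *slot = hashslot(e->key);

        e->hash = *slot;                /* push onto front of chain */
        *slot   = (uint16_t) (e - table);
}

static void hash_remove(struct entry *e)
{
        uint16_t idx    = (uint16_t) (e - table);
        uint16_t *slot  = hashslot(e->key);

        while (*slot != idx) {          /* walk links until we find ourselves */
                assert(*slot);
                slot = &table[*slot].hash;
        }

        *slot   = e->hash;              /* unlink */
        e->hash = 0;
}

int main(void)
{
        table[1].key = 42;
        table[2].key = 42 + NENTRIES;   /* collides: chained via ->hash */

        hash_add(&table[1]);
        hash_add(&table[2]);
        hash_remove(&table[1]);

        assert(*hashslot(42) == 2 && table[2].hash == 0);
        return 0;
}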
+
 void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 {
-       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
 
        if (ob->ec) {
                bch2_ec_bucket_written(c, ob);
@@ -55,14 +78,16 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
        percpu_down_read(&c->mark_lock);
        spin_lock(&ob->lock);
 
-       bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false);
+       bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
        ob->valid = false;
-       ob->type = 0;
+       ob->data_type = 0;
 
        spin_unlock(&ob->lock);
        percpu_up_read(&c->mark_lock);
 
        spin_lock(&c->freelist_lock);
+       bch2_open_bucket_hash_remove(c, ob);
+
        ob->freelist = c->open_buckets_freelist;
        c->open_buckets_freelist = ob - c->open_buckets;
 
@@ -81,8 +106,7 @@ void bch2_open_bucket_write_error(struct bch_fs *c,
        unsigned i;
 
        open_bucket_for_each(c, obs, ob, i)
-               if (ob->ptr.dev == dev &&
-                   ob->ec)
+               if (ob->dev == dev && ob->ec)
                        bch2_ec_bucket_cancel(c, ob);
 }
 
@@ -95,7 +119,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
        ob = c->open_buckets + c->open_buckets_freelist;
        c->open_buckets_freelist = ob->freelist;
        atomic_set(&ob->pin, 1);
-       ob->type = 0;
+       ob->data_type = 0;
 
        c->open_buckets_nr_free--;
        return ob;
@@ -105,8 +129,8 @@ static void open_bucket_free_unused(struct bch_fs *c,
                                    struct write_point *wp,
                                    struct open_bucket *ob)
 {
-       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-       bool may_realloc = wp->type == BCH_DATA_user;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+       bool may_realloc = wp->data_type == BCH_DATA_user;
 
        BUG_ON(ca->open_buckets_partial_nr >
               ARRAY_SIZE(ca->open_buckets_partial));
@@ -127,37 +151,18 @@ static void open_bucket_free_unused(struct bch_fs *c,
        }
 }
 
-static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct open_bucket *ob;
-       unsigned i;
-
-       open_bucket_for_each(c, obs, ob, i) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-
-               BUG_ON(ptr_stale(ca, &ob->ptr));
-       }
-#endif
-}
-
 /* _only_ for allocating the journal on a new device: */
 long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
 {
-       struct bucket_array *buckets;
-       ssize_t b;
+       while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
+               u64 b = ca->new_fs_bucket_idx++;
 
-       rcu_read_lock();
-       buckets = bucket_array(ca);
-
-       for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
-               if (is_available_bucket(buckets->b[b].mark) &&
-                   !buckets->b[b].mark.owned_by_allocator)
-                       goto success;
-       b = -1;
-success:
-       rcu_read_unlock();
-       return b;
+               if (!is_superblock_bucket(ca, b) &&
+                   (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
+                       return b;
+       }
+
+       return -1;
 }
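/*
 * Editorial sketch, not part of the commit: the rewrite above replaces a
 * full rescan of the bucket array with a persistent cursor
 * (new_fs_bucket_idx), so repeated calls during filesystem creation hand
 * out each candidate bucket at most once. The same pattern in miniature,
 * with a stand-in usability predicate:
 */
#include <stdbool.h>
#include <stdint.h>

struct dev { uint64_t cursor, nbuckets; };

/* stand-in for the superblock/buckets_nouse checks in the real code */
static bool usable(uint64_t b) { return b >= 2; }

static int64_t alloc_next(struct dev *ca)
{
        while (ca->cursor < ca->nbuckets) {
                uint64_t b = ca->cursor++;      /* advance even on failure */

                if (usable(b))
                        return b;
        }
        return -1;                              /* device exhausted */
}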
 
 static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
@@ -251,15 +256,14 @@ out:
        ob->valid       = true;
        ob->sectors_free = ca->mi.bucket_size;
        ob->alloc_reserve = reserve;
-       ob->ptr         = (struct bch_extent_ptr) {
-               .type   = 1 << BCH_EXTENT_ENTRY_ptr,
-               .gen    = bucket(ca, b)->mark.gen,
-               .offset = bucket_to_sector(ca, b),
-               .dev    = ca->dev_idx,
-       };
-
+       ob->dev         = ca->dev_idx;
+       ob->gen         = *bucket_gen(ca, b);
+       ob->bucket      = b;
        spin_unlock(&ob->lock);
 
+       ca->nr_open_buckets++;
+       bch2_open_bucket_hash_add(c, ob);
+
        if (c->blocked_allocate_open_bucket) {
                bch2_time_stats_update(
                        &c->times[BCH_TIME_blocked_allocate_open_bucket],
@@ -274,7 +278,6 @@ out:
                c->blocked_allocate = 0;
        }
 
-       ca->nr_open_buckets++;
        spin_unlock(&c->freelist_lock);
 
        bch2_wake_allocator(ca);
@@ -338,9 +341,9 @@ static void add_new_bucket(struct bch_fs *c,
                           struct open_bucket *ob)
 {
        unsigned durability =
-               bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability;
+               bch_dev_bkey_exists(c, ob->dev)->mi.durability;
 
-       __clear_bit(ob->ptr.dev, devs_may_alloc->d);
+       __clear_bit(ob->dev, devs_may_alloc->d);
        *nr_effective   += (flags & BUCKET_ALLOC_USE_DURABILITY)
                ? durability : 1;
        *have_cache     |= !durability;
@@ -348,8 +351,7 @@ static void add_new_bucket(struct bch_fs *c,
        ob_push(c, ptrs, ob);
 }
 
-enum bucket_alloc_ret
-bch2_bucket_alloc_set(struct bch_fs *c,
+int bch2_bucket_alloc_set(struct bch_fs *c,
                      struct open_buckets *ptrs,
                      struct dev_stripe_state *stripe,
                      struct bch_devs_mask *devs_may_alloc,
@@ -363,7 +365,7 @@ bch2_bucket_alloc_set(struct bch_fs *c,
        struct dev_alloc_list devs_sorted =
                bch2_dev_alloc_list(c, stripe, devs_may_alloc);
        struct bch_dev *ca;
-       enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES;
+       int ret = -INSUFFICIENT_DEVICES;
        unsigned i;
 
        BUG_ON(*nr_effective >= nr_replicas);
@@ -381,7 +383,7 @@ bch2_bucket_alloc_set(struct bch_fs *c,
                ob = bch2_bucket_alloc(c, ca, reserve,
                                flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
                if (IS_ERR(ob)) {
-                       ret = -PTR_ERR(ob);
+                       ret = PTR_ERR(ob);
 
                        if (cl)
                                return ret;
@@ -394,7 +396,7 @@ bch2_bucket_alloc_set(struct bch_fs *c,
                bch2_dev_stripe_increment(ca, stripe);
 
                if (*nr_effective >= nr_replicas)
-                       return ALLOC_SUCCESS;
+                       return 0;
        }
 
        return ret;
@@ -408,8 +410,7 @@ bch2_bucket_alloc_set(struct bch_fs *c,
  * it's to a device we don't want:
  */
 
-static enum bucket_alloc_ret
-bucket_alloc_from_stripe(struct bch_fs *c,
+static int bucket_alloc_from_stripe(struct bch_fs *c,
                         struct open_buckets *ptrs,
                         struct write_point *wp,
                         struct bch_devs_mask *devs_may_alloc,
@@ -452,13 +453,13 @@ bucket_alloc_from_stripe(struct bch_fs *c,
                                continue;
 
                        ob = c->open_buckets + h->s->blocks[ec_idx];
-                       if (ob->ptr.dev == devs_sorted.devs[i] &&
+                       if (ob->dev == devs_sorted.devs[i] &&
                            !test_and_set_bit(ec_idx, h->s->blocks_allocated))
                                goto got_bucket;
                }
        goto out_put_head;
 got_bucket:
-       ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+       ca = bch_dev_bkey_exists(c, ob->dev);
 
        ob->ec_idx      = ec_idx;
        ob->ec          = h->s;
@@ -488,12 +489,12 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
        unsigned i;
 
        open_bucket_for_each(c, &wp->ptrs, ob, i) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
 
                if (*nr_effective < nr_replicas &&
-                   test_bit(ob->ptr.dev, devs_may_alloc->d) &&
+                   test_bit(ob->dev, devs_may_alloc->d) &&
                    (ca->mi.durability ||
-                    (wp->type == BCH_DATA_user && !*have_cache)) &&
+                    (wp->data_type == BCH_DATA_user && !*have_cache)) &&
                    (ob->ec || !need_ec)) {
                        add_new_bucket(c, ptrs, devs_may_alloc,
                                       nr_effective, have_cache,
@@ -505,8 +506,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
        wp->ptrs = ptrs_skip;
 }
 
-static enum bucket_alloc_ret
-open_bucket_add_buckets(struct bch_fs *c,
+static int open_bucket_add_buckets(struct bch_fs *c,
                        struct open_buckets *ptrs,
                        struct write_point *wp,
                        struct bch_devs_list *devs_have,
@@ -522,11 +522,11 @@ open_bucket_add_buckets(struct bch_fs *c,
        struct bch_devs_mask devs;
        struct open_bucket *ob;
        struct closure *cl = NULL;
-       enum bucket_alloc_ret ret;
+       int ret;
        unsigned i;
 
        rcu_read_lock();
-       devs = target_rw_devs(c, wp->type, target);
+       devs = target_rw_devs(c, wp->data_type, target);
        rcu_read_unlock();
 
        /* Don't allocate from devices we already have pointers to: */
@@ -534,7 +534,7 @@ open_bucket_add_buckets(struct bch_fs *c,
                __clear_bit(devs_have->devs[i], devs.d);
 
        open_bucket_for_each(c, ptrs, ob, i)
-               __clear_bit(ob->ptr.dev, devs.d);
+               __clear_bit(ob->dev, devs.d);
 
        if (erasure_code) {
                if (!ec_open_bucket(c, ptrs)) {
@@ -550,8 +550,8 @@ open_bucket_add_buckets(struct bch_fs *c,
                                                 target, erasure_code,
                                                 nr_replicas, nr_effective,
                                                 have_cache, flags, _cl);
-                       if (ret == FREELIST_EMPTY ||
-                           ret == OPEN_BUCKETS_EMPTY)
+                       if (ret == -FREELIST_EMPTY ||
+                           ret == -OPEN_BUCKETS_EMPTY)
                                return ret;
                        if (*nr_effective >= nr_replicas)
                                return 0;
@@ -575,7 +575,7 @@ retry_blocking:
        ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
                                nr_replicas, nr_effective, have_cache,
                                reserve, flags, cl);
-       if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) {
+       if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
                cl = _cl;
                goto retry_blocking;
        }
@@ -594,7 +594,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
        unsigned i, j;
 
        open_bucket_for_each(c, obs, ob, i) {
-               bool drop = !ca || ob->ptr.dev == ca->dev_idx;
+               bool drop = !ca || ob->dev == ca->dev_idx;
 
                if (!drop && ob->ec) {
                        mutex_lock(&ob->ec->lock);
@@ -603,7 +603,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
                                        continue;
 
                                ob2 = c->open_buckets + ob->ec->blocks[j];
-                               drop |= ob2->ptr.dev == ca->dev_idx;
+                               drop |= ob2->dev == ca->dev_idx;
                        }
                        mutex_unlock(&ob->ec->lock);
                }
@@ -772,7 +772,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
        unsigned nr_effective, write_points_nr;
        unsigned ob_flags = 0;
        bool have_cache;
-       enum bucket_alloc_ret ret;
+       int ret;
        int i;
 
        if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
@@ -787,11 +787,11 @@ retry:
 
        wp = writepoint_find(c, write_point.v);
 
-       if (wp->type == BCH_DATA_user)
+       if (wp->data_type == BCH_DATA_user)
                ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
 
        /* metadata may not allocate on cache devices: */
-       if (wp->type != BCH_DATA_user)
+       if (wp->data_type != BCH_DATA_user)
                have_cache = true;
 
        if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
@@ -821,7 +821,7 @@ alloc_done:
        if (erasure_code && !ec_open_bucket(c, &ptrs))
                pr_debug("failed to get ec bucket: ret %u", ret);
 
-       if (ret == INSUFFICIENT_DEVICES &&
+       if (ret == -INSUFFICIENT_DEVICES &&
            nr_effective >= nr_replicas_required)
                ret = 0;
 
@@ -841,8 +841,6 @@ alloc_done:
 
        BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
 
-       verify_not_stale(c, &wp->ptrs);
-
        return wp;
 err:
        open_bucket_for_each(c, &wp->ptrs, ob, i)
@@ -854,27 +852,42 @@ err:
 
        mutex_unlock(&wp->lock);
 
-       if (ret == FREELIST_EMPTY &&
+       if (ret == -FREELIST_EMPTY &&
            try_decrease_writepoints(c, write_points_nr))
                goto retry;
 
        switch (ret) {
-       case OPEN_BUCKETS_EMPTY:
-       case FREELIST_EMPTY:
+       case -OPEN_BUCKETS_EMPTY:
+       case -FREELIST_EMPTY:
                return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
-       case INSUFFICIENT_DEVICES:
+       case -INSUFFICIENT_DEVICES:
                return ERR_PTR(-EROFS);
        default:
                BUG();
        }
 }
 
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+
+       return (struct bch_extent_ptr) {
+               .type   = 1 << BCH_EXTENT_ENTRY_ptr,
+               .gen    = ob->gen,
+               .dev    = ob->dev,
+               .offset = bucket_to_sector(ca, ob->bucket) +
+                       ca->mi.bucket_size -
+                       ob->sectors_free,
+       };
+}
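/*
 * Editorial worked example (made-up geometry): with 512-sector buckets,
 * ob->bucket == 10 and ob->sectors_free == 128 give
 *
 *      offset = 10 * 512 + 512 - 128 = 5504
 *
 * i.e. buckets fill from the front, and the pointer computed here addresses
 * the first still-free sector -- which is why it must be taken before
 * ob->sectors_free is decremented for the new write.
 */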
+
 /*
  * Append pointers to the space we just allocated to @k, and mark @sectors space
  * as allocated out of @ob
  */
 void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
-                                   struct bkey_i *k, unsigned sectors)
+                                   struct bkey_i *k, unsigned sectors,
+                                   bool cached)
 
 {
        struct open_bucket *ob;
@@ -884,14 +897,14 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
        wp->sectors_free -= sectors;
 
        open_bucket_for_each(c, &wp->ptrs, ob, i) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-               struct bch_extent_ptr tmp = ob->ptr;
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
+               struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob);
 
-               tmp.cached = !ca->mi.durability &&
-                       wp->type == BCH_DATA_user;
+               ptr.cached = cached ||
+                       (!ca->mi.durability &&
+                        wp->data_type == BCH_DATA_user);
 
-               tmp.offset += ca->mi.bucket_size - ob->sectors_free;
-               bch2_bkey_append_ptr(k, tmp);
+               bch2_bkey_append_ptr(k, ptr);
 
                BUG_ON(sectors > ob->sectors_free);
                ob->sectors_free -= sectors;
@@ -921,7 +934,7 @@ static inline void writepoint_init(struct write_point *wp,
                                   enum bch_data_type type)
 {
        mutex_init(&wp->lock);
-       wp->type = type;
+       wp->data_type = type;
 }
 
 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
@@ -958,3 +971,22 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
                                   writepoint_hash(c, wp->write_point));
        }
 }
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       struct open_bucket *ob;
+
+       for (ob = c->open_buckets;
+            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+            ob++) {
+               spin_lock(&ob->lock);
+               if (ob->valid && !ob->on_partial_list) {
+                       pr_buf(out, "%zu ref %u type %s\n",
+                              ob - c->open_buckets,
+                              atomic_read(&ob->pin),
+                              bch2_data_types[ob->data_type]);
+               }
+               spin_unlock(&ob->lock);
+       }
+
+}
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index c658295cb8e09f375b5b416a222b14e3070fb779..d466bda9afc8fdddb49f7b353c8c571b12f1fcf6 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -12,13 +12,6 @@ struct bch_dev;
 struct bch_fs;
 struct bch_devs_List;
 
-enum bucket_alloc_ret {
-       ALLOC_SUCCESS,
-       OPEN_BUCKETS_EMPTY,
-       FREELIST_EMPTY,         /* Allocator thread not keeping up */
-       INSUFFICIENT_DEVICES,
-};
-
 struct dev_alloc_list {
        unsigned        nr;
        u8              devs[BCH_SB_MEMBERS_MAX];
@@ -92,14 +85,37 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
        unsigned i;
 
        open_bucket_for_each(c, &wp->ptrs, ob, i) {
-               ob->type = wp->type;
+               ob->data_type = wp->data_type;
                atomic_inc(&ob->pin);
                ob_push(c, ptrs, ob);
        }
 }
 
-enum bucket_alloc_ret
-bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
+static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c,
+                                                 unsigned dev, u64 bucket)
+{
+       return c->open_buckets_hash +
+               (jhash_3words(dev, bucket, bucket >> 32, 0) &
+                (OPEN_BUCKETS_COUNT - 1));
+}
+
+static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+       open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket);
+
+       while (slot) {
+               struct open_bucket *ob = &c->open_buckets[slot];
+
+               if (ob->dev == dev && ob->bucket == bucket)
+                       return true;
+
+               slot = ob->hash;
+       }
+
+       return false;
+}
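/*
 * Editorial note: masking with (OPEN_BUCKETS_COUNT - 1) relies on
 * OPEN_BUCKETS_COUNT being a power of two -- e.g. with a table of 1024
 * slots, a jhash value of 0x12345 lands in slot 0x345. Chains are walked
 * through ob->hash indices, with 0 terminating the chain (slot 0 is never
 * used, per the comment in alloc_types.h).
 */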
+
+int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
                      struct dev_stripe_state *, struct bch_devs_mask *,
                      unsigned, unsigned *, bool *, enum alloc_reserve,
                      unsigned, struct closure *);
@@ -113,8 +129,9 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
                                             unsigned,
                                             struct closure *);
 
+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *);
 void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
-                                   struct bkey_i *, unsigned);
+                                   struct bkey_i *, unsigned, bool);
 void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
 
 void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
@@ -135,4 +152,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
 
 void bch2_fs_allocator_foreground_init(struct bch_fs *);
 
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+
 #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 4a1cd8b73d16b1e500a84ae9290d9bf144c502cf..409232e3d99800ef652ce6fcd8b2cf0a2e6476b9 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -37,24 +37,31 @@ typedef FIFO(long)  alloc_fifo;
 #define WRITE_POINT_HASH_NR    32
 #define WRITE_POINT_MAX                32
 
+/*
+ * 0 is never a valid open_bucket_idx_t:
+ */
 typedef u16                    open_bucket_idx_t;
 
 struct open_bucket {
        spinlock_t              lock;
        atomic_t                pin;
        open_bucket_idx_t       freelist;
+       open_bucket_idx_t       hash;
 
        /*
         * When an open bucket has an ec_stripe attached, this is the index of
         * the block in the stripe this open_bucket corresponds to:
         */
        u8                      ec_idx;
-       u8                      type;
+       enum bch_data_type      data_type:3;
        unsigned                valid:1;
        unsigned                on_partial_list:1;
        int                     alloc_reserve:3;
+
        unsigned                sectors_free;
-       struct bch_extent_ptr   ptr;
+       u8                      dev;
+       u8                      gen;
+       u64                     bucket;
        struct ec_stripe_new    *ec;
 };
 
@@ -74,7 +81,7 @@ struct write_point {
        struct mutex            lock;
        u64                     last_used;
        unsigned long           write_point;
-       enum bch_data_type      type;
+       enum bch_data_type      data_type;
 
        /* calculated based on how many pointers we're actually going to use: */
        unsigned                sectors_free;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 051aba63eaa54ff7b130d1b6a8f618e0e56111f4..0e9689f6878afd3062710d9f6b4bd288bf8eef84 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
  */
 
 #undef pr_fmt
+#ifdef __KERNEL__
 #define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
+#else
+#define pr_fmt(fmt) "%s() " fmt "\n", __func__
+#endif
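/*
 * Editorial illustration of the dual pr_fmt: in a kernel build, a call like
 *
 *      pr_info("mounting");
 *
 * inside a (hypothetical) function foo() logs "bcachefs: foo() mounting",
 * while the userspace build logs just "foo() mounting".
 */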
 
+#include <linux/backing-dev-defs.h>
 #include <linux/bug.h>
 #include <linux/bio.h>
 #include <linux/closure.h>
 #include <linux/zstd.h>
 
 #include "bcachefs_format.h"
+#include "errcode.h"
 #include "fifo.h"
 #include "opts.h"
 #include "util.h"
@@ -275,9 +281,6 @@ do {                                                                        \
                "significantly affect performance")                     \
        BCH_DEBUG_PARAM(debug_check_iterators,                          \
                "Enables extra verification for btree iterators")       \
-       BCH_DEBUG_PARAM(debug_check_bkeys,                              \
-               "Run bkey_debugcheck (primarily checking GC/allocation "\
-               "information) when iterating over keys")                \
        BCH_DEBUG_PARAM(debug_check_btree_accounting,                   \
                "Verify btree accounting for keys within a node")       \
        BCH_DEBUG_PARAM(journal_seq_verify,                             \
@@ -319,8 +322,12 @@ BCH_DEBUG_PARAMS_DEBUG()
 #define BCH_TIME_STATS()                       \
        x(btree_node_mem_alloc)                 \
        x(btree_node_split)                     \
+       x(btree_node_compact)                   \
+       x(btree_node_merge)                     \
        x(btree_node_sort)                      \
        x(btree_node_read)                      \
+       x(btree_interior_update_foreground)     \
+       x(btree_interior_update_total)          \
        x(btree_gc)                             \
        x(btree_lock_contended_read)            \
        x(btree_lock_contended_intent)          \
@@ -328,8 +335,8 @@ BCH_DEBUG_PARAMS_DEBUG()
        x(data_write)                           \
        x(data_read)                            \
        x(data_promote)                         \
-       x(journal_write)                        \
-       x(journal_delay)                        \
+       x(journal_flush_write)                  \
+       x(journal_noflush_write)                \
        x(journal_flush_seq)                    \
        x(blocked_journal)                      \
        x(blocked_allocate)                     \
@@ -345,6 +352,7 @@ enum bch_time_stats {
 #include "alloc_types.h"
 #include "btree_types.h"
 #include "buckets_types.h"
+#include "buckets_waiting_for_journal_types.h"
 #include "clock_types.h"
 #include "ec_types.h"
 #include "journal_types.h"
@@ -352,6 +360,7 @@ enum bch_time_stats {
 #include "quota_types.h"
 #include "rebalance_types.h"
 #include "replicas_types.h"
+#include "subvolume_types.h"
 #include "super_types.h"
 
 /* Number of nodes btree coalesce will try to coalesce at once */
@@ -380,6 +389,8 @@ enum gc_phase {
        GC_PHASE_BTREE_alloc,
        GC_PHASE_BTREE_quotas,
        GC_PHASE_BTREE_reflink,
+       GC_PHASE_BTREE_subvolumes,
+       GC_PHASE_BTREE_snapshots,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -423,6 +434,7 @@ struct bch_dev {
        struct bch_sb_handle    disk_sb;
        struct bch_sb           *sb_read_scratch;
        int                     sb_write_error;
+       dev_t                   dev;
 
        struct bch_devs_mask    self;
 
@@ -436,6 +448,8 @@ struct bch_dev {
         * Or rcu_read_lock(), but only for ptr_stale():
         */
        struct bucket_array __rcu *buckets[2];
+       struct bucket_gens __rcu *bucket_gens;
+       u8                      *oldest_gen;
        unsigned long           *buckets_nouse;
        struct rw_semaphore     bucket_lock;
 
@@ -444,6 +458,7 @@ struct bch_dev {
        struct bch_dev_usage __percpu   *usage_gc;
 
        /* Allocator: */
+       u64                     new_fs_bucket_idx;
        struct task_struct __rcu *alloc_thread;
 
        /*
@@ -466,6 +481,7 @@ struct bch_dev {
 
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
+       size_t                  buckets_waiting_on_journal;
 
        enum allocator_states   allocator_state;
 
@@ -491,6 +507,7 @@ struct bch_dev {
 
 enum {
        /* startup: */
+       BCH_FS_INITIALIZED,
        BCH_FS_ALLOC_READ_DONE,
        BCH_FS_ALLOC_CLEAN,
        BCH_FS_ALLOCATOR_RUNNING,
@@ -498,7 +515,6 @@ enum {
        BCH_FS_INITIAL_GC_DONE,
        BCH_FS_INITIAL_GC_UNFIXED,
        BCH_FS_TOPOLOGY_REPAIR_DONE,
-       BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
        BCH_FS_FSCK_DONE,
        BCH_FS_STARTED,
        BCH_FS_RW,
@@ -518,7 +534,6 @@ enum {
        /* misc: */
        BCH_FS_NEED_ANOTHER_GC,
        BCH_FS_DELETED_NODES,
-       BCH_FS_NEED_ALLOC_WRITE,
        BCH_FS_REBUILD_REPLICAS,
        BCH_FS_HOLD_BTREE_WRITES,
 };
@@ -548,6 +563,7 @@ struct journal_keys {
                enum btree_id   btree_id:8;
                unsigned        level:8;
                bool            allocated;
+               bool            overwritten;
                struct bkey_i   *k;
                u32             journal_seq;
                u32             journal_offset;
@@ -557,12 +573,27 @@ struct journal_keys {
        u64                     journal_seq_base;
 };
 
-struct btree_iter_buf {
-       struct btree_iter       *iter;
+struct btree_path_buf {
+       struct btree_path       *path;
 };
 
 #define REPLICAS_DELTA_LIST_MAX        (1U << 16)
 
+struct snapshot_t {
+       u32                     parent;
+       u32                     children[2];
+       u32                     subvol; /* Nonzero only if a subvolume points to this node: */
+       u32                     equiv;
+};
+
+typedef struct {
+       u32             subvol;
+       u64             inum;
+} subvol_inum;
+
+#define BCACHEFS_ROOT_SUBVOL_INUM                                      \
+       ((subvol_inum) { BCACHEFS_ROOT_SUBVOL,  BCACHEFS_ROOT_INO })
+
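/*
 * Editorial note: with snapshots an inode number alone no longer names a
 * unique file, so lookups now carry (subvolume, inum) pairs:
 *
 *      subvol_inum root = BCACHEFS_ROOT_SUBVOL_INUM;
 *      // root.subvol == BCACHEFS_ROOT_SUBVOL (1), root.inum == BCACHEFS_ROOT_INO
 */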
 struct bch_fs {
        struct closure          cl;
 
@@ -609,7 +640,6 @@ struct bch_fs {
 
                u16             version;
                u16             version_min;
-               u16             encoded_extent_max;
 
                u8              nr_devices;
                u8              clean;
@@ -634,6 +664,15 @@ struct bch_fs {
        struct closure          sb_write;
        struct mutex            sb_lock;
 
+       /* snapshot.c: */
+       GENRADIX(struct snapshot_t) snapshots;
+       struct bch_snapshot_table __rcu *snapshot_table;
+       struct mutex            snapshot_table_lock;
+       struct work_struct      snapshot_delete_work;
+       struct work_struct      snapshot_wait_for_pagecache_and_delete_work;
+       struct snapshot_id_list snapshots_unlinked;
+       struct mutex            snapshots_unlinked_lock;
+
        /* BTREE CACHE */
        struct bio_set          btree_bio;
        struct workqueue_struct *io_complete_wq;
@@ -666,13 +705,15 @@ struct bch_fs {
        /* btree_iter.c: */
        struct mutex            btree_trans_lock;
        struct list_head        btree_trans_list;
-       mempool_t               btree_iters_pool;
+       mempool_t               btree_paths_pool;
        mempool_t               btree_trans_mem_pool;
-       struct btree_iter_buf  __percpu *btree_iters_bufs;
+       struct btree_path_buf  __percpu *btree_paths_bufs;
 
        struct srcu_struct      btree_trans_barrier;
+       bool                    btree_trans_barrier_initialized;
 
        struct btree_key_cache  btree_key_cache;
+       unsigned                btree_key_cache_btrees;
 
        struct workqueue_struct *btree_update_wq;
        struct workqueue_struct *btree_io_complete_wq;
@@ -721,10 +762,12 @@ struct bch_fs {
        struct closure_waitlist freelist_wait;
        u64                     blocked_allocate;
        u64                     blocked_allocate_open_bucket;
+
        open_bucket_idx_t       open_buckets_freelist;
        open_bucket_idx_t       open_buckets_nr_free;
        struct closure_waitlist open_buckets_wait;
        struct open_bucket      open_buckets[OPEN_BUCKETS_COUNT];
+       open_bucket_idx_t       open_buckets_hash[OPEN_BUCKETS_COUNT];
 
        struct write_point      btree_write_point;
        struct write_point      rebalance_write_point;
@@ -734,6 +777,8 @@ struct bch_fs {
        struct mutex            write_points_hash_lock;
        unsigned                write_points_nr;
 
+       struct buckets_waiting_for_journal buckets_waiting_for_journal;
+
        /* GARBAGE COLLECTION */
        struct task_struct      *gc_thread;
        atomic_t                kick_gc;
@@ -759,6 +804,7 @@ struct bch_fs {
         * it's not while a gc is in progress.
         */
        struct rw_semaphore     gc_lock;
+       struct mutex            gc_gens_lock;
 
        /* IO PATH */
        struct semaphore        io_in_flight;
@@ -791,8 +837,13 @@ struct bch_fs {
        struct write_point      copygc_write_point;
        s64                     copygc_wait;
 
+       /* DATA PROGRESS STATS */
+       struct list_head        data_progress_list;
+       struct mutex            data_progress_lock;
+
        /* STRIPES: */
-       GENRADIX(struct stripe) stripes[2];
+       GENRADIX(struct stripe) stripes;
+       GENRADIX(struct gc_stripe) gc_stripes;
 
        ec_stripes_heap         ec_stripes_heap;
        spinlock_t              ec_stripes_heap_lock;
@@ -816,7 +867,6 @@ struct bch_fs {
        u64                     reflink_hint;
        reflink_gc_table        reflink_gc_table;
        size_t                  reflink_gc_nr;
-       size_t                  reflink_gc_idx;
 
        /* VFS IO PATH - fs-io.c */
        struct bio_set          writepage_bioset;
@@ -888,10 +938,25 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca)
 
 static inline unsigned block_bytes(const struct bch_fs *c)
 {
-       return c->opts.block_size << 9;
+       return c->opts.block_size;
+}
+
+static inline unsigned block_sectors(const struct bch_fs *c)
+{
+       return c->opts.block_size >> 9;
+}
+
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+       return c->opts.btree_node_size >> 9;
+}
+
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+       return c->btree_key_cache_btrees & (1U << btree);
 }
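/*
 * Editorial worked example: opts.block_size and opts.btree_node_size are
 * now stored in bytes rather than sectors (note block_bytes() no longer
 * shifts), so with 4KiB blocks and 256KiB btree nodes:
 *
 *      block_bytes(c)   == 4096
 *      block_sectors(c) == 4096   >> 9 == 8
 *      btree_sectors(c) == 262144 >> 9 == 512
 */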
 
-static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time)
+static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
 {
        struct timespec64 t;
        s32 rem;
@@ -903,13 +968,13 @@ static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time
        return t;
 }
 
-static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
+static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts)
 {
        return (ts.tv_sec * c->sb.time_units_per_sec +
                (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
 }
 
-static inline s64 bch2_current_time(struct bch_fs *c)
+static inline s64 bch2_current_time(const struct bch_fs *c)
 {
        struct timespec64 now;
 
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 98779e46bbd0878d3773023aeaf2d276bb252a87..5153f0e420541c1b8a03e4ebf76501a7c87c2b6a 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -76,6 +76,7 @@
 #include <asm/byteorder.h>
 #include <linux/kernel.h>
 #include <linux/uuid.h>
+#include "vstructs.h"
 
 #define LE_BITMASK(_bits, name, type, field, offset, end)              \
 static const unsigned  name##_OFFSET = offset;                         \
@@ -323,7 +324,7 @@ static inline void bkey_init(struct bkey *k)
 */
 #define BCH_BKEY_TYPES()                               \
        x(deleted,              0)                      \
-       x(discard,              1)                      \
+       x(whiteout,             1)                      \
        x(error,                2)                      \
        x(cookie,               3)                      \
        x(hash_whiteout,        4)                      \
@@ -342,7 +343,11 @@ static inline void bkey_init(struct bkey *k)
        x(inline_data,          17)                     \
        x(btree_ptr_v2,         18)                     \
        x(indirect_inline_data, 19)                     \
-       x(alloc_v2,             20)
+       x(alloc_v2,             20)                     \
+       x(subvolume,            21)                     \
+       x(snapshot,             22)                     \
+       x(inode_v2,             23)                     \
+       x(alloc_v3,             24)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -355,7 +360,7 @@ struct bch_deleted {
        struct bch_val          v;
 };
 
-struct bch_discard {
+struct bch_whiteout {
        struct bch_val          v;
 };
 
@@ -679,6 +684,16 @@ struct bch_inode {
        __u8                    fields[0];
 } __attribute__((packed, aligned(8)));
 
+struct bch_inode_v2 {
+       struct bch_val          v;
+
+       __le64                  bi_journal_seq;
+       __le64                  bi_hash_seed;
+       __le64                  bi_flags;
+       __le16                  bi_mode;
+       __u8                    fields[0];
+} __attribute__((packed, aligned(8)));
+
 struct bch_inode_generation {
        struct bch_val          v;
 
@@ -686,6 +701,10 @@ struct bch_inode_generation {
        __le32                  pad;
 } __attribute__((packed, aligned(8)));
 
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
 #define BCH_INODE_FIELDS()                     \
        x(bi_atime,                     96)     \
        x(bi_ctime,                     96)     \
@@ -709,7 +728,9 @@ struct bch_inode_generation {
        x(bi_erasure_code,              16)     \
        x(bi_fields_set,                16)     \
        x(bi_dir,                       64)     \
-       x(bi_dir_offset,                64)
+       x(bi_dir_offset,                64)     \
+       x(bi_subvol,                    32)     \
+       x(bi_parent_subvol,             32)
 
 /* subset of BCH_INODE_FIELDS */
 #define BCH_INODE_OPTS()                       \
@@ -764,6 +785,9 @@ LE32_BITMASK(INODE_STR_HASH,        struct bch_inode, bi_flags, 20, 24);
 LE32_BITMASK(INODE_NR_FIELDS,  struct bch_inode, bi_flags, 24, 31);
 LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
 
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS,        struct bch_inode_v2, bi_flags, 24, 31);
+
 /* Dirents */
 
 /*
@@ -781,7 +805,13 @@ struct bch_dirent {
        struct bch_val          v;
 
        /* Target inode number: */
+       union {
        __le64                  d_inum;
+       struct {                /* DT_SUBVOL */
+       __le32                  d_child_subvol;
+       __le32                  d_parent_subvol;
+       };
+       };
 
        /*
         * Copy of mode bits 12-15 from the target inode - so userspace can get
@@ -792,6 +822,9 @@ struct bch_dirent {
        __u8                    d_name[];
 } __attribute__((packed, aligned(8)));
 
+#define DT_SUBVOL      16
+#define BCH_DT_MAX     17
+
 #define BCH_NAME_MAX   (U8_MAX * sizeof(u64) -                         \
                         sizeof(struct bkey) -                          \
                         offsetof(struct bch_dirent, d_name))
@@ -849,6 +882,17 @@ struct bch_alloc_v2 {
        x(stripe,               32)             \
        x(stripe_redundancy,    8)
 
+struct bch_alloc_v3 {
+       struct bch_val          v;
+       __le64                  journal_seq;
+       __le32                  flags;
+       __u8                    nr_fields;
+       __u8                    gen;
+       __u8                    oldest_gen;
+       __u8                    data_type;
+       __u8                    data[];
+} __attribute__((packed, aligned(8)));
+
 enum {
 #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
        BCH_ALLOC_FIELDS_V1()
@@ -902,18 +946,24 @@ struct bch_stripe {
 struct bch_reflink_p {
        struct bch_val          v;
        __le64                  idx;
-
-       __le32                  reservation_generation;
-       __u8                    nr_replicas;
-       __u8                    pad[3];
-};
+       /*
+        * A reflink pointer might point to an indirect extent which is then
+        * later split (by copygc or rebalance). If we only pointed to part of
+        * the original indirect extent, and then one of the fragments is
+        * outside the range we point to, we'd leak a refcount: so when creating
+        * reflink pointers, we need to store pad values to remember the full
+        * range we were taking a reference on.
+        */
+       __le32                  front_pad;
+       __le32                  back_pad;
+} __attribute__((packed, aligned(8)));
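/*
 * Editorial worked example (made-up numbers): say an indirect extent spans
 * idx 0..128 and a reflink pointer is created covering idx 64..96. Storing
 * front_pad = 64 and back_pad = 32 records that the reference was taken on
 * the whole 0..128 range; if copygc later splits the indirect extent at 64,
 * deleting the pointer can still drop the refcount on the 0..64 fragment,
 * which its own 64..96 range never overlaps, instead of leaking it.
 */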
 
 struct bch_reflink_v {
        struct bch_val          v;
        __le64                  refcount;
        union bch_extent_entry  start[0];
        __u64                   _data[0];
-};
+} __attribute__((packed, aligned(8)));
 
 struct bch_indirect_inline_data {
        struct bch_val          v;
@@ -928,6 +978,43 @@ struct bch_inline_data {
        u8                      data[0];
 };
 
+/* Subvolumes: */
+
+#define SUBVOL_POS_MIN         POS(0, 1)
+#define SUBVOL_POS_MAX         POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL   1
+
+struct bch_subvolume {
+       struct bch_val          v;
+       __le32                  flags;
+       __le32                  snapshot;
+       __le64                  inode;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO,         struct bch_subvolume, flags,  0,  1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP,       struct bch_subvolume, flags,  1,  2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED,   struct bch_subvolume, flags,  2,  3)
+
+/* Snapshots */
+
+struct bch_snapshot {
+       struct bch_val          v;
+       __le32                  flags;
+       __le32                  parent;
+       __le32                  children[2];
+       __le32                  subvol;
+       __le32                  pad;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED,     struct bch_snapshot, flags,  0,  1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,      struct bch_snapshot, flags,  1,  2)
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -977,15 +1064,12 @@ struct bch_member {
 };
 
 LE64_BITMASK(BCH_MEMBER_STATE,         struct bch_member, flags[0],  0,  4)
-/* 4-10 unused, was TIER, HAS_(META)DATA */
-LE64_BITMASK(BCH_MEMBER_REPLACEMENT,   struct bch_member, flags[0], 10, 14)
+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
 LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags[0], 14, 15)
 LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags[0], 15, 20)
 LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags[0], 20, 28)
 LE64_BITMASK(BCH_MEMBER_DURABILITY,    struct bch_member, flags[0], 28, 30)
 
-#define BCH_TIER_MAX                   4U
-
 #if 0
 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
 LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
@@ -1004,18 +1088,6 @@ enum bch_member_state {
        BCH_MEMBER_STATE_NR
 };
 
-#define BCH_CACHE_REPLACEMENT_POLICIES()       \
-       x(lru,          0)                      \
-       x(fifo,         1)                      \
-       x(random,       2)
-
-enum bch_cache_replacement_policies {
-#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n,
-       BCH_CACHE_REPLACEMENT_POLICIES()
-#undef x
-       BCH_CACHE_REPLACEMENT_NR
-};
-
 struct bch_sb_field_members {
        struct bch_sb_field     field;
        struct bch_member       members[0];
@@ -1211,7 +1283,11 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_snapshot              = 12,
        bcachefs_metadata_version_inode_backpointers    = 13,
        bcachefs_metadata_version_btree_ptr_sectors_written = 14,
-       bcachefs_metadata_version_max                   = 15,
+       bcachefs_metadata_version_snapshot_2            = 15,
+       bcachefs_metadata_version_reflink_p_fix         = 16,
+       bcachefs_metadata_version_subvol_dirent         = 17,
+       bcachefs_metadata_version_inode_v2              = 18,
+       bcachefs_metadata_version_max                   = 19,
 };
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
@@ -1348,6 +1424,10 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE,        struct bch_sb, flags[3],  0, 16);
 LE64_BITMASK(BCH_SB_METADATA_TARGET,   struct bch_sb, flags[3], 16, 28);
 LE64_BITMASK(BCH_SB_SHARD_INUMS,       struct bch_sb, flags[3], 28, 29);
 LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
 
 /*
  * Features:
@@ -1355,7 +1435,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
  * journal_seq_blacklist_v3:   gates BCH_SB_FIELD_journal_seq_blacklist
  * reflink:                    gates KEY_TYPE_reflink
  * inline_data:                        gates KEY_TYPE_inline_data
- * new_siphash:                        gates BCH_STR_HASH_SIPHASH
+ * new_siphash:                        gates BCH_STR_HASH_siphash
  * new_extent_overwrite:       gates BTREE_NODE_NEW_EXTENT_OVERWRITE
  */
 #define BCH_SB_FEATURES()                      \
@@ -1431,12 +1511,17 @@ enum bch_error_actions {
        BCH_ON_ERROR_NR
 };
 
+#define BCH_STR_HASH_TYPES()           \
+       x(crc32c,               0)      \
+       x(crc64,                1)      \
+       x(siphash_old,          2)      \
+       x(siphash,              3)
+
 enum bch_str_hash_type {
-       BCH_STR_HASH_CRC32C             = 0,
-       BCH_STR_HASH_CRC64              = 1,
-       BCH_STR_HASH_SIPHASH_OLD        = 2,
-       BCH_STR_HASH_SIPHASH            = 3,
-       BCH_STR_HASH_NR                 = 4,
+#define x(t, n) BCH_STR_HASH_##t = n,
+       BCH_STR_HASH_TYPES()
+#undef x
+       BCH_STR_HASH_NR
 };
 
 #define BCH_STR_HASH_OPTS()            \
@@ -1451,34 +1536,39 @@ enum bch_str_hash_opts {
        BCH_STR_HASH_OPT_NR
 };
 
+#define BCH_CSUM_TYPES()                       \
+       x(none,                         0)      \
+       x(crc32c_nonzero,               1)      \
+       x(crc64_nonzero,                2)      \
+       x(chacha20_poly1305_80,         3)      \
+       x(chacha20_poly1305_128,        4)      \
+       x(crc32c,                       5)      \
+       x(crc64,                        6)      \
+       x(xxhash,                       7)
+
 enum bch_csum_type {
-       BCH_CSUM_NONE                   = 0,
-       BCH_CSUM_CRC32C_NONZERO         = 1,
-       BCH_CSUM_CRC64_NONZERO          = 2,
-       BCH_CSUM_CHACHA20_POLY1305_80   = 3,
-       BCH_CSUM_CHACHA20_POLY1305_128  = 4,
-       BCH_CSUM_CRC32C                 = 5,
-       BCH_CSUM_CRC64                  = 6,
-       BCH_CSUM_XXHASH                 = 7,
-       BCH_CSUM_NR                     = 8,
+#define x(t, n) BCH_CSUM_##t = n,
+       BCH_CSUM_TYPES()
+#undef x
+       BCH_CSUM_NR
 };
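/*
 * Editorial sketch of the x-macro pattern: BCH_CSUM_TYPES() expands the one
 * list into the enum above --
 *
 *      enum bch_csum_type {
 *              BCH_CSUM_none           = 0,
 *              BCH_CSUM_crc32c_nonzero = 1,
 *              ...
 *              BCH_CSUM_NR
 *      };
 *
 * -- and can be re-expanded elsewhere, e.g. into a table of type names.
 */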
 
 static const unsigned bch_crc_bytes[] = {
-       [BCH_CSUM_NONE]                         = 0,
-       [BCH_CSUM_CRC32C_NONZERO]               = 4,
-       [BCH_CSUM_CRC32C]                       = 4,
-       [BCH_CSUM_CRC64_NONZERO]                = 8,
-       [BCH_CSUM_CRC64]                        = 8,
-       [BCH_CSUM_XXHASH]                       = 8,
-       [BCH_CSUM_CHACHA20_POLY1305_80]         = 10,
-       [BCH_CSUM_CHACHA20_POLY1305_128]        = 16,
+       [BCH_CSUM_none]                         = 0,
+       [BCH_CSUM_crc32c_nonzero]               = 4,
+       [BCH_CSUM_crc32c]                       = 4,
+       [BCH_CSUM_crc64_nonzero]                = 8,
+       [BCH_CSUM_crc64]                        = 8,
+       [BCH_CSUM_xxhash]                       = 8,
+       [BCH_CSUM_chacha20_poly1305_80]         = 10,
+       [BCH_CSUM_chacha20_poly1305_128]        = 16,
 };
 
 static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
 {
        switch (type) {
-       case BCH_CSUM_CHACHA20_POLY1305_80:
-       case BCH_CSUM_CHACHA20_POLY1305_128:
+       case BCH_CSUM_chacha20_poly1305_80:
+       case BCH_CSUM_chacha20_poly1305_128:
                return true;
        default:
                return false;
@@ -1572,7 +1662,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
        x(usage,                5)              \
        x(data_usage,           6)              \
        x(clock,                7)              \
-       x(dev_usage,            8)
+       x(dev_usage,            8)              \
+       x(log,                  9)
 
 enum {
 #define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
@@ -1602,11 +1693,16 @@ struct jset_entry_blacklist_v2 {
        __le64                  end;
 };
 
+#define BCH_FS_USAGE_TYPES()                   \
+       x(reserved,             0)              \
+       x(inodes,               1)              \
+       x(key_version,          2)
+
 enum {
-       FS_USAGE_RESERVED               = 0,
-       FS_USAGE_INODES                 = 1,
-       FS_USAGE_KEY_VERSION            = 2,
-       FS_USAGE_NR                     = 3
+#define x(f, nr)       BCH_FS_USAGE_##f        = nr,
+       BCH_FS_USAGE_TYPES()
+#undef x
+       BCH_FS_USAGE_NR
 };
 
 struct jset_entry_usage {
@@ -1644,6 +1740,17 @@ struct jset_entry_dev_usage {
        struct jset_entry_dev_usage_type d[];
 } __attribute__((packed));
 
+static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u)
+{
+       return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) /
+               sizeof(struct jset_entry_dev_usage_type);
+}
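/*
 * Editorial worked example (sizes illustrative, not taken from the on-disk
 * format): if vstruct_bytes(&u->entry) is 104 and the fixed
 * jset_entry_dev_usage header is 40 bytes with 16-byte
 * jset_entry_dev_usage_type elements, the entry carries
 * (104 - 40) / 16 == 4 usage types.
 */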
+
+struct jset_entry_log {
+       struct jset_entry       entry;
+       u8                      d[];
+} __attribute__((packed));
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
@@ -1695,7 +1802,9 @@ LE32_BITMASK(JSET_NO_FLUSH,       struct jset, flags, 5, 6);
        x(alloc,        4)                      \
        x(quotas,       5)                      \
        x(stripes,      6)                      \
-       x(reflink,      7)
+       x(reflink,      7)                      \
+       x(subvolumes,   8)                      \
+       x(snapshots,    9)
 
 enum btree_id {
 #define x(kwd, val) BTREE_ID_##kwd = val,
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index f679fc2151bc4cfdd2e18a42674352f87e7fba7e..930981ad55355a2ad64eea94681fc6010b50fbb7 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -78,6 +78,9 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_DISK_RESIZE  _IOW(0xbc,      14,  struct bch_ioctl_disk_resize)
 #define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15,  struct bch_ioctl_disk_resize_journal)
 
+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc,  16,  struct bch_ioctl_subvolume)
+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17,  struct bch_ioctl_subvolume)
+
 /* ioctl below act on a particular file, not the filesystem as a whole: */
 
 #define BCHFS_IOC_REINHERIT_ATTRS      _IOR(0xbc, 64, const char __user *)
@@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal {
        __u64                   nbuckets;
 };
 
+struct bch_ioctl_subvolume {
+       __u32                   flags;
+       __u32                   dirfd;
+       __u16                   mode;
+       __u16                   pad[3];
+       __u64                   dst_ptr;
+       __u64                   src_ptr;
+};
+
+#define BCH_SUBVOL_SNAPSHOT_CREATE     (1U << 0)
+#define BCH_SUBVOL_SNAPSHOT_RO         (1U << 1)
+
 #endif /* _BCACHEFS_IOCTL_H */
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 2e45d88fab0382cdc9e99e9d5449702adc8f30d0..7dee3d8e0a3d169160fab7018c6fe1ef55660eb5 100644
--- a/libbcachefs/bkey.h
+++ b/libbcachefs/bkey.h
@@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
 #define bkey_deleted(_k)       ((_k)->type == KEY_TYPE_deleted)
 
 #define bkey_whiteout(_k)                              \
-       ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
+       ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
 
 enum bkey_lr_packed {
        BKEY_PACKED_BOTH,
@@ -163,37 +163,6 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r)
        return bpos_cmp(l, r) > 0 ? l : r;
 }
 
-#define sbb(a, b, borrow)                              \
-do {                                                   \
-       typeof(a) d1, d2;                               \
-                                                       \
-       d1 = a - borrow;                                \
-       borrow  = d1 > a;                               \
-                                                       \
-       d2 = d1 - b;                                    \
-       borrow += d2 > d1;                              \
-       a = d2;                                         \
-} while (0)
-
-/* returns a - b: */
-static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
-{
-       int borrow = 0;
-
-       sbb(a.snapshot, b.snapshot,     borrow);
-       sbb(a.offset,   b.offset,       borrow);
-       sbb(a.inode,    b.inode,        borrow);
-       return a;
-}
-
-static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
-{
-       if (bpos_cmp(l, r) > 0)
-               swap(l, r);
-
-       return bpos_sub(r, l);
-}
-
 void bch2_bpos_swab(struct bpos *);
 void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
 
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index f8adbf4372764852a6838f9ed8d1aa540d2752c2..e83aeb683a0977c84f82b0e4559c1ccc1e2d3194 100644
--- a/libbcachefs/bkey_methods.c
+++ b/libbcachefs/bkey_methods.c
@@ -11,6 +11,7 @@
 #include "inode.h"
 #include "quota.h"
 #include "reflink.h"
+#include "subvolume.h"
 #include "xattr.h"
 
 const char * const bch2_bkey_types[] = {
@@ -30,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c,
        .key_invalid = deleted_key_invalid,             \
 }
 
-#define bch2_bkey_ops_discard (struct bkey_ops) {      \
+#define bch2_bkey_ops_whiteout (struct bkey_ops) {     \
        .key_invalid = deleted_key_invalid,             \
 }
 
@@ -100,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
 
 static unsigned bch2_key_types_allowed[] = {
        [BKEY_TYPE_extents] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_error)|
                (1U << KEY_TYPE_cookie)|
                (1U << KEY_TYPE_extent)|
@@ -107,26 +110,45 @@ static unsigned bch2_key_types_allowed[] = {
                (1U << KEY_TYPE_reflink_p)|
                (1U << KEY_TYPE_inline_data),
        [BKEY_TYPE_inodes] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_inode)|
+               (1U << KEY_TYPE_inode_v2)|
                (1U << KEY_TYPE_inode_generation),
        [BKEY_TYPE_dirents] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_hash_whiteout)|
                (1U << KEY_TYPE_dirent),
        [BKEY_TYPE_xattrs] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_whiteout)|
                (1U << KEY_TYPE_cookie)|
                (1U << KEY_TYPE_hash_whiteout)|
                (1U << KEY_TYPE_xattr),
        [BKEY_TYPE_alloc] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_alloc)|
-               (1U << KEY_TYPE_alloc_v2),
+               (1U << KEY_TYPE_alloc_v2)|
+               (1U << KEY_TYPE_alloc_v3),
        [BKEY_TYPE_quotas] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_quota),
        [BKEY_TYPE_stripes] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_stripe),
        [BKEY_TYPE_reflink] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_reflink_v)|
                (1U << KEY_TYPE_indirect_inline_data),
+       [BKEY_TYPE_subvolumes] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_subvolume),
+       [BKEY_TYPE_snapshots] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_snapshot),
        [BKEY_TYPE_btree] =
+               (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_btree_ptr)|
                (1U << KEY_TYPE_btree_ptr_v2),
 };
@@ -134,21 +156,18 @@ static unsigned bch2_key_types_allowed[] = {
 const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
                                enum btree_node_type type)
 {
-       unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
-               bch2_key_types_allowed[type] ;
-
        if (k.k->u64s < BKEY_U64s)
                return "u64s too small";
 
-       if (!(key_types_allowed & (1U << k.k->type)))
+       if (!(bch2_key_types_allowed[type] & (1U << k.k->type)))
                return "invalid key type for this btree";
 
        if (type == BKEY_TYPE_btree &&
            bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
                return "value too big";
 
-       if (btree_node_type_is_extents(type)) {
-               if ((k.k->size == 0) != bkey_deleted(k.k))
+       if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
+               if (k.k->size == 0)
                        return "bad size field";
 
                if (k.k->size > k.k->p.offset)
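
With KEY_TYPE_deleted folded into every entry of bch2_key_types_allowed[] above, validity becomes a single table lookup and bit test, with no special case OR-ed in at the call site. A minimal sketch of that table-plus-mask pattern, with illustrative enums rather than the kernel's:

/*
 * Minimal sketch of per-btree allowed-key-type bitmasks; the enum values
 * here are illustrative, not bcachefs's.
 */
#include <stdbool.h>
#include <stdio.h>

enum key_type   { KT_deleted, KT_whiteout, KT_extent, KT_inode, KT_NR };
enum btree_type { BT_extents, BT_inodes, BT_NR };

static const unsigned allowed[BT_NR] = {
	[BT_extents] = (1U << KT_deleted)|(1U << KT_whiteout)|(1U << KT_extent),
	[BT_inodes]  = (1U << KT_deleted)|(1U << KT_whiteout)|(1U << KT_inode),
};

static bool key_type_allowed(enum btree_type b, enum key_type k)
{
	return allowed[b] & (1U << k);
}

int main(void)
{
	printf("%d\n", key_type_allowed(BT_extents, KT_inode));		/* 0 */
	printf("%d\n", key_type_allowed(BT_inodes, KT_deleted));	/* 1 */
	return 0;
}
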
@@ -165,7 +184,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
        if (type != BKEY_TYPE_btree &&
            btree_type_has_snapshots(type) &&
-           k.k->p.snapshot != U32_MAX)
+           !k.k->p.snapshot)
                return "invalid snapshot field";
 
        if (type != BKEY_TYPE_btree &&
@@ -193,28 +212,14 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
        return NULL;
 }
 
-void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
-{
-       const char *invalid;
-
-       BUG_ON(!k.k->u64s);
-
-       invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
-               bch2_bkey_in_btree_node(b, k);
-       if (invalid) {
-               char buf[160];
-
-               bch2_bkey_val_to_text(&PBUF(buf), c, k);
-               bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
-       }
-}
-
 void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
 {
        if (!bpos_cmp(pos, POS_MIN))
                pr_buf(out, "POS_MIN");
        else if (!bpos_cmp(pos, POS_MAX))
                pr_buf(out, "POS_MAX");
+       else if (!bpos_cmp(pos, SPOS_MAX))
+               pr_buf(out, "SPOS_MAX");
        else {
                if (pos.inode == U64_MAX)
                        pr_buf(out, "U64_MAX");
index 3012035db1a33b3acd902563c0a7ea70c2718108..4fdac545cf88af8f2425f30477efd54bbf568d27 100644
@@ -34,8 +34,6 @@ const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
                              enum btree_node_type);
 const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
 
-void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
-
 void bch2_bpos_to_text(struct printbuf *, struct bpos);
 void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
 void bch2_val_to_text(struct printbuf *, struct bch_fs *,
index 537ab7919e886eec958e49e12e6b18962172e725..b1385a77da1146f6efd643d389a73aa999745244 100644
@@ -117,23 +117,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
        return nr;
 }
 
-static void extent_sort_append(struct bch_fs *c,
-                              struct bkey_format *f,
-                              struct btree_nr_keys *nr,
-                              struct bkey_packed **out,
-                              struct bkey_s k)
-{
-       if (!bkey_deleted(k.k)) {
-               if (!bch2_bkey_pack_key(*out, k.k, f))
-                       memcpy_u64s_small(*out, k.k, BKEY_U64s);
-
-               memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
-
-               btree_keys_account_key_add(nr, 0, *out);
-               *out = bkey_next(*out);
-       }
-}
-
 /* Sort + repack in a new format: */
 struct btree_nr_keys
 bch2_sort_repack(struct bset *dst, struct btree *src,
@@ -144,6 +127,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
        struct bkey_format *in_f = &src->format;
        struct bkey_packed *in, *out = vstruct_last(dst);
        struct btree_nr_keys nr;
+       bool transform = memcmp(out_f, &src->format, sizeof(*out_f));
 
        memset(&nr, 0, sizeof(nr));
 
@@ -151,8 +135,10 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
                if (filter_whiteouts && bkey_deleted(in))
                        continue;
 
-               if (bch2_bkey_transform(out_f, out, bkey_packed(in)
-                                      ? in_f : &bch2_bkey_format_current, in))
+               if (!transform)
+                       bkey_copy(out, in);
+               else if (bch2_bkey_transform(out_f, out, bkey_packed(in)
+                                            ? in_f : &bch2_bkey_format_current, in))
                        out->format = KEY_FORMAT_LOCAL_BTREE;
                else
                        bch2_bkey_unpack(src, (void *) out, in);
@@ -165,47 +151,6 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
        return nr;
 }
 
-/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
-struct btree_nr_keys
-bch2_sort_repack_merge(struct bch_fs *c,
-                      struct bset *dst, struct btree *src,
-                      struct btree_node_iter *iter,
-                      struct bkey_format *out_f,
-                      bool filter_whiteouts)
-{
-       struct bkey_packed *out = vstruct_last(dst), *k_packed;
-       struct bkey_buf k;
-       struct btree_nr_keys nr;
-
-       memset(&nr, 0, sizeof(nr));
-       bch2_bkey_buf_init(&k);
-
-       while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
-               if (filter_whiteouts && bkey_deleted(k_packed))
-                       continue;
-
-               /*
-                * NOTE:
-                * bch2_bkey_normalize may modify the key we pass it (dropping
-                * stale pointers) and we don't have a write lock on the src
-                * node; we have to make a copy of the entire key before calling
-                * normalize
-                */
-               bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s);
-               bch2_bkey_unpack(src, k.k, k_packed);
-
-               if (filter_whiteouts &&
-                   bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
-                       continue;
-
-               extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
-       }
-
-       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-       bch2_bkey_buf_exit(&k, c);
-       return nr;
-}
-
 static inline int sort_keys_cmp(struct btree *b,
                                struct bkey_packed *l,
                                struct bkey_packed *r)
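
bch2_sort_repack() above now decides once, before the loop, whether the source and destination key formats differ (a memcmp() of the two format structs) and plain-copies each key when they match instead of repacking it. A minimal sketch of hoisting that check, with stand-in types and a pretend transform:

/*
 * Sketch of the hoisted format check: compare the formats once, then per
 * element either copy or transform. Types are stand-ins, not the kernel's
 * key packing code.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct key_format { unsigned bits[4]; };

static unsigned transform_key(unsigned k) { return k + 100; } /* pretend repack */

static void repack(unsigned *dst, const unsigned *src, unsigned nr,
		   const struct key_format *out_f,
		   const struct key_format *in_f)
{
	bool transform = memcmp(out_f, in_f, sizeof(*out_f));
	unsigned i;

	for (i = 0; i < nr; i++)
		dst[i] = transform ? transform_key(src[i]) : src[i];
}

int main(void)
{
	struct key_format f = {{ 8, 8, 8, 8 }};
	unsigned in[3] = { 1, 2, 3 }, out[3];

	repack(out, in, 3, &f, &f);			/* identical formats */
	printf("%u %u %u\n", out[0], out[1], out[2]);	/* 1 2 3 */
	return 0;
}
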
index 1059996dac7807cc4b23573303d7c9717668d551..79cf11d1b4e7e69d5e13512b3d5d7a8b14f39c42 100644
@@ -37,11 +37,6 @@ struct btree_nr_keys
 bch2_sort_repack(struct bset *, struct btree *,
                 struct btree_node_iter *,
                 struct bkey_format *, bool);
-struct btree_nr_keys
-bch2_sort_repack_merge(struct bch_fs *,
-                      struct bset *, struct btree *,
-                      struct btree_node_iter *,
-                      struct bkey_format *, bool);
 
 unsigned bch2_sort_keys(struct bkey_packed *,
                        struct sort_iter *, bool);
index 0eb85acdbf8bcaec42dfa6f5e33ee69df83fa9d4..6000a8796bc55326b47ed4e535f9e69962799f78 100644
@@ -197,9 +197,11 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
                return;
 
        /* Verify no duplicates: */
-       btree_node_iter_for_each(iter, set)
+       btree_node_iter_for_each(iter, set) {
+               BUG_ON(set->k > set->end);
                btree_node_iter_for_each(iter, s2)
                        BUG_ON(set != s2 && set->end == s2->end);
+       }
 
        /* Verify that set->end is correct: */
        btree_node_iter_for_each(iter, set) {
@@ -471,7 +473,7 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
                                               unsigned j)
 {
        return cacheline_to_bkey(b, t,
-                       __eytzinger1_to_inorder(j, t->size, t->extra),
+                       __eytzinger1_to_inorder(j, t->size - 1, t->extra),
                        bkey_float(b, t, j)->key_offset);
 }
 
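
The t->size - 1 adjustments in this file account for the auxiliary search tree holding t->size - 1 nodes in a 1-indexed Eytzinger (BFS) layout: node j's children are 2j and 2j + 1, and an in-order walk of the implicit tree visits sorted order. A small sketch of that indexing for the plain power-of-two case (the kernel's __eytzinger1_to_inorder() additionally handles non-power-of-two sizes via t->extra):

/* In-order positions of a 1-indexed Eytzinger layout, computed by walking
 * the implicit tree; children of node j are 2j and 2j + 1. */
#include <stdio.h>

#define NR 7	/* complete tree: nodes 1..NR */

static void walk(unsigned j, unsigned *next, unsigned map[])
{
	if (j > NR)
		return;
	walk(2 * j, next, map);		/* left subtree */
	map[j] = (*next)++;		/* in-order position of node j */
	walk(2 * j + 1, next, map);	/* right subtree */
}

int main(void)
{
	unsigned map[NR + 1], next = 1, j;

	walk(1, &next, map);
	for (j = 1; j <= NR; j++)
		printf("eytzinger %u -> inorder %u\n", j, map[j]);
	return 0;
}
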
@@ -605,10 +607,10 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k,
 }
 
 __always_inline
-static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
-                                unsigned j,
-                                struct bkey_packed *min_key,
-                                struct bkey_packed *max_key)
+static inline void make_bfloat(struct btree *b, struct bset_tree *t,
+                              unsigned j,
+                              struct bkey_packed *min_key,
+                              struct bkey_packed *max_key)
 {
        struct bkey_float *f = bkey_float(b, t, j);
        struct bkey_packed *m = tree_to_bkey(b, t, j);
@@ -677,34 +679,6 @@ static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
        f->mantissa = mantissa;
 }
 
-static void make_bfloat(struct btree *b, struct bset_tree *t,
-                       unsigned j,
-                       struct bkey_packed *min_key,
-                       struct bkey_packed *max_key)
-{
-       struct bkey_i *k;
-
-       if (is_power_of_2(j) &&
-           !min_key->u64s) {
-               if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
-                       k = (void *) min_key;
-                       bkey_init(&k->k);
-                       k->k.p = b->data->min_key;
-               }
-       }
-
-       if (is_power_of_2(j + 1) &&
-           !max_key->u64s) {
-               if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
-                       k = (void *) max_key;
-                       bkey_init(&k->k);
-                       k->k.p = b->data->max_key;
-               }
-       }
-
-       __make_bfloat(b, t, j, min_key, max_key);
-}
-
 /* bytes remaining - only valid for last bset: */
 static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
 {
@@ -761,7 +735,7 @@ retry:
        t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
 
        /* First we figure out where the first key in each cacheline is */
-       eytzinger1_for_each(j, t->size) {
+       eytzinger1_for_each(j, t->size - 1) {
                while (bkey_to_cacheline(b, t, k) < cacheline)
                        prev = k, k = bkey_next(k);
 
@@ -793,10 +767,10 @@ retry:
        }
 
        /* Then we build the tree */
-       eytzinger1_for_each(j, t->size)
-               __make_bfloat(b, t, j,
-                             bkey_to_packed(&min_key),
-                             bkey_to_packed(&max_key));
+       eytzinger1_for_each(j, t->size - 1)
+               make_bfloat(b, t, j,
+                           bkey_to_packed(&min_key),
+                           bkey_to_packed(&max_key));
 }
 
 static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
@@ -895,7 +869,7 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
                do {
                        p = j ? tree_to_bkey(b, t,
                                        __inorder_to_eytzinger1(j--,
-                                                       t->size, t->extra))
+                                                       t->size - 1, t->extra))
                              : btree_bkey_first(b, t);
                } while (p >= k);
                break;
@@ -941,91 +915,6 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
 
 /* Insert */
 
-static void rw_aux_tree_fix_invalidated_key(struct btree *b,
-                                           struct bset_tree *t,
-                                           struct bkey_packed *k)
-{
-       unsigned offset = __btree_node_key_to_offset(b, k);
-       unsigned j = rw_aux_tree_bsearch(b, t, offset);
-
-       if (j < t->size &&
-           rw_aux_tree(b, t)[j].offset == offset)
-               rw_aux_tree_set(b, t, j, k);
-
-       bch2_bset_verify_rw_aux_tree(b, t);
-}
-
-static void ro_aux_tree_fix_invalidated_key(struct btree *b,
-                                           struct bset_tree *t,
-                                           struct bkey_packed *k)
-{
-       struct bkey_packed min_key, max_key;
-       unsigned inorder, j;
-
-       EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
-
-       /* signal to make_bfloat() that they're uninitialized: */
-       min_key.u64s = max_key.u64s = 0;
-
-       if (bkey_next(k) == btree_bkey_last(b, t)) {
-               for (j = 1; j < t->size; j = j * 2 + 1)
-                       make_bfloat(b, t, j, &min_key, &max_key);
-       }
-
-       inorder = bkey_to_cacheline(b, t, k);
-
-       if (inorder &&
-           inorder < t->size) {
-               j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
-
-               if (k == tree_to_bkey(b, t, j)) {
-                       /* Fix the node this key corresponds to */
-                       make_bfloat(b, t, j, &min_key, &max_key);
-
-                       /* Children for which this key is the right boundary */
-                       for (j = eytzinger1_left_child(j);
-                            j < t->size;
-                            j = eytzinger1_right_child(j))
-                               make_bfloat(b, t, j, &min_key, &max_key);
-               }
-       }
-
-       if (inorder + 1 < t->size) {
-               j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
-
-               if (k == tree_to_prev_bkey(b, t, j)) {
-                       make_bfloat(b, t, j, &min_key, &max_key);
-
-                       /* Children for which this key is the left boundary */
-                       for (j = eytzinger1_right_child(j);
-                            j < t->size;
-                            j = eytzinger1_left_child(j))
-                               make_bfloat(b, t, j, &min_key, &max_key);
-               }
-       }
-}
-
-/**
- * bch2_bset_fix_invalidated_key() - given an existing  key @k that has been
- * modified, fix any auxiliary search tree by remaking all the nodes in the
- * auxiliary search tree that @k corresponds to
- */
-void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
-{
-       struct bset_tree *t = bch2_bkey_to_bset(b, k);
-
-       switch (bset_aux_tree_type(t)) {
-       case BSET_NO_AUX_TREE:
-               break;
-       case BSET_RO_AUX_TREE:
-               ro_aux_tree_fix_invalidated_key(b, t, k);
-               break;
-       case BSET_RW_AUX_TREE:
-               rw_aux_tree_fix_invalidated_key(b, t, k);
-               break;
-       }
-}
-
 static void bch2_bset_fix_lookup_table(struct btree *b,
                                       struct bset_tree *t,
                                       struct bkey_packed *_where,
@@ -1260,7 +1149,7 @@ slowpath:
                n = n * 2 + (cmp < 0);
        } while (n < t->size);
 
-       inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
+       inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra);
 
        /*
         * n would have been the node we recursed to - the low bit tells us if
@@ -1271,7 +1160,7 @@ slowpath:
                if (unlikely(!inorder))
                        return btree_bkey_first(b, t);
 
-               f = &base->f[eytzinger1_prev(n >> 1, t->size)];
+               f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)];
        }
 
        return cacheline_to_bkey(b, t, inorder, f->key_offset);
@@ -1545,10 +1434,6 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
 
        EBUG_ON(iter->data->k > iter->data->end);
 
-       while (!__btree_node_iter_set_end(iter, 0) &&
-              !__bch2_btree_node_iter_peek_all(iter, b)->u64s)
-               iter->data->k++;
-
        if (unlikely(__btree_node_iter_set_end(iter, 0))) {
                bch2_btree_node_iter_set_drop(iter, iter->data);
                return;
@@ -1692,7 +1577,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
        if (!inorder || inorder >= t->size)
                return;
 
-       j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
+       j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra);
        if (k != tree_to_bkey(b, t, j))
                return;
 
index e42f866cf2ec01d33ed1c04472281aeb5e9fe52a..0d46534c3dcd148e872f222c125c144fac115b7a 100644
@@ -361,7 +361,6 @@ void bch2_bset_init_first(struct btree *, struct bset *);
 void bch2_bset_init_next(struct bch_fs *, struct btree *,
                         struct btree_node_entry *);
 void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
-void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *);
 
 void bch2_bset_insert(struct btree *, struct btree_node_iter *,
                     struct bkey_packed *, struct bkey_i *, unsigned);
index cd0c5009e167b9f529fc8f9d0edfeaac460ff902..986d08d708cc9593238e482b226c16fb4d01fe2f 100644
@@ -83,6 +83,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
        b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
                           PROT_READ|PROT_WRITE|PROT_EXEC,
                           MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+       if (b->aux_data == MAP_FAILED)
+               b->aux_data = NULL;
 #endif
        if (!b->aux_data) {
                kvpfree(b->data, btree_bytes(c));
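
The MAP_FAILED check added above matters because mmap() reports failure as MAP_FAILED ((void *) -1), never NULL, while the code below tests !b->aux_data; normalizing the error to NULL keeps that test correct. A small userspace sketch of the convention (this one passes fd -1 for the anonymous mapping, the portable spelling):

/* mmap() failure convention: compare against MAP_FAILED, then normalize
 * to NULL for callers that use !ptr checks. */
#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

static void *alloc_exec(size_t bytes)
{
	void *p = mmap(NULL, bytes, PROT_READ|PROT_WRITE|PROT_EXEC,
		       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

	return p == MAP_FAILED ? NULL : p;
}

int main(void)
{
	void *p = alloc_exec(4096);

	printf("%s\n", p ? "mapped" : "failed");
	if (p)
		munmap(p, 4096);
	return 0;
}
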
@@ -128,7 +130,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 
 void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
 {
-       rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+       int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+       BUG_ON(ret);
 
        /* Cause future lookups for this node to fail: */
        b->hash_val = 0;
@@ -273,6 +276,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
        unsigned long touched = 0;
        unsigned long freed = 0;
        unsigned i, flags;
+       unsigned long ret = SHRINK_STOP;
 
        if (bch2_btree_shrinker_disabled)
                return SHRINK_STOP;
@@ -281,7 +285,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
        if (sc->gfp_mask & __GFP_FS)
                mutex_lock(&bc->lock);
        else if (!mutex_trylock(&bc->lock))
-               return -1;
+               goto out_norestore;
 
        flags = memalloc_nofs_save();
 
@@ -298,13 +302,19 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
 
        i = 0;
        list_for_each_entry_safe(b, t, &bc->freeable, list) {
+               /*
+                * Leave a few nodes on the freeable list, so that a btree split
+                * won't have to hit the system allocator:
+                */
+               if (++i <= 3)
+                       continue;
+
                touched++;
 
-               if (freed >= nr)
+               if (touched >= nr)
                        break;
 
-               if (++i > 3 &&
-                   !btree_node_reclaim(c, b)) {
+               if (!btree_node_reclaim(c, b)) {
                        btree_node_data_free(c, b);
                        six_unlock_write(&b->c.lock);
                        six_unlock_intent(&b->c.lock);
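
Two behaviour changes land in this hunk: a small reserve of freeable nodes is skipped so a btree split can reuse them without going to the system allocator, and the loop now stops after examining nr objects (touched) rather than after freeing nr, which matches what a shrinker's nr_to_scan asks for. A toy sketch of that scan policy:

/* Toy scan loop: keep a small reserve, stop once nr_to_scan items have
 * been examined, report how many were freed. */
#include <stdbool.h>
#include <stdio.h>

#define RESERVE 3

static unsigned long scan(bool reclaimable[], unsigned long n,
			  unsigned long nr_to_scan)
{
	unsigned long i, touched = 0, freed = 0;

	for (i = 0; i < n; i++) {
		if (i < RESERVE)	/* leave a few for fast reuse */
			continue;

		touched++;
		if (touched >= nr_to_scan)
			break;

		if (reclaimable[i]) {
			reclaimable[i] = false;
			freed++;
		}
	}
	return freed;
}

int main(void)
{
	bool items[8] = { true, true, true, true, true, true, true, true };

	printf("freed %lu\n", scan(items, 8, 4));	/* frees items 3..5 */
	return 0;
}
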
@@ -315,7 +325,7 @@ restart:
        list_for_each_entry_safe(b, t, &bc->live, list) {
                touched++;
 
-               if (freed >= nr) {
+               if (touched >= nr) {
                        /* Save position */
                        if (&t->list != &bc->live)
                                list_move_tail(&bc->live, &t->list);
@@ -350,8 +360,14 @@ restart:
 
        mutex_unlock(&bc->lock);
 out:
+       ret = (unsigned long) freed * btree_pages(c);
        memalloc_nofs_restore(flags);
-       return (unsigned long) freed * btree_pages(c);
+out_norestore:
+       trace_btree_cache_scan(sc->nr_to_scan,
+                              sc->nr_to_scan / btree_pages(c),
+                              btree_cache_can_free(bc),
+                              ret);
+       return ret;
 }
 
 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
@@ -632,7 +648,8 @@ err:
 
 /* Slowpath, don't want it inlined into btree_iter_traverse() */
 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
-                               struct btree_iter *iter,
+                               struct btree_trans *trans,
+                               struct btree_path *path,
                                const struct bkey_i *k,
                                enum btree_id btree_id,
                                unsigned level,
@@ -648,8 +665,10 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
-       if (iter && !bch2_btree_node_relock(iter, level + 1)) {
-               btree_trans_restart(iter->trans);
+       if (trans && !bch2_btree_node_relock(trans, path, level + 1)) {
+               trace_trans_restart_relock_parent_for_fill(trans->fn,
+                                       _THIS_IP_, btree_id, &path->pos);
+               btree_trans_restart(trans);
                return ERR_PTR(-EINTR);
        }
 
@@ -680,23 +699,25 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
        six_unlock_intent(&b->c.lock);
 
        /* Unlock before doing IO: */
-       if (iter && sync)
-               bch2_trans_unlock(iter->trans);
+       if (trans && sync)
+               bch2_trans_unlock(trans);
 
        bch2_btree_node_read(c, b, sync);
 
        if (!sync)
                return NULL;
 
-       if (iter &&
-           (!bch2_trans_relock(iter->trans) ||
-            !bch2_btree_iter_relock_intent(iter))) {
-               BUG_ON(!iter->trans->restarted);
+       if (trans &&
+           (!bch2_trans_relock(trans) ||
+            !bch2_btree_path_relock_intent(trans, path))) {
+               BUG_ON(!trans->restarted);
                return ERR_PTR(-EINTR);
        }
 
        if (!six_relock_type(&b->c.lock, lock_type, seq)) {
-               btree_trans_restart(iter->trans);
+               trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_,
+                                          btree_id, &path->pos);
+               btree_trans_restart(trans);
                return ERR_PTR(-EINTR);
        }
 
@@ -754,7 +775,7 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b)
  * The btree node will have either a read or a write lock held, depending on
  * the @write parameter.
  */
-struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *iter,
+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path,
                                  const struct bkey_i *k, unsigned level,
                                  enum six_lock_type lock_type,
                                  unsigned long trace_ip)
@@ -766,11 +787,17 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 
-       if (c->opts.btree_node_mem_ptr_optimization) {
-               b = btree_node_mem_ptr(k);
-               if (b)
+       b = btree_node_mem_ptr(k);
+
+       /*
+        * Check b->hash_val _before_ calling btree_node_lock() - this might not
+        * be the node we want anymore, and trying to lock the wrong node could
+        * cause an unnecessary transaction restart:
+        */
+       if (likely(c->opts.btree_node_mem_ptr_optimization &&
+                  b &&
+                  b->hash_val == btree_ptr_hash_val(k)))
                        goto lock_node;
-       }
 retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
@@ -779,7 +806,7 @@ retry:
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
-               b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+               b = bch2_btree_node_fill(c, trans, path, k, path->btree_id,
                                         level, lock_type, true);
 
                /* We raced and found the btree node in the cache */
@@ -818,10 +845,10 @@ lock_node:
                 * the parent was modified, when the pointer to the node we want
                 * was removed - and we'll bail out:
                 */
-               if (btree_node_read_locked(iter, level + 1))
-                       btree_node_unlock(iter, level + 1);
+               if (btree_node_read_locked(path, level + 1))
+                       btree_node_unlock(path, level + 1);
 
-               if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
+               if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type,
                                     lock_node_check_fn, (void *) k, trace_ip)) {
                        if (!trans->restarted)
                                goto retry;
@@ -832,13 +859,13 @@ lock_node:
                             b->c.level != level ||
                             race_fault())) {
                        six_unlock_type(&b->c.lock, lock_type);
-                       if (bch2_btree_node_relock(iter, level + 1))
+                       if (bch2_btree_node_relock(trans, path, level + 1))
                                goto retry;
 
-                       trace_trans_restart_btree_node_reused(trans->ip,
+                       trace_trans_restart_btree_node_reused(trans->fn,
                                                              trace_ip,
-                                                             iter->btree_id,
-                                                             &iter->real_pos);
+                                                             path->btree_id,
+                                                             &path->pos);
                        btree_trans_restart(trans);
                        return ERR_PTR(-EINTR);
                }
@@ -853,12 +880,12 @@ lock_node:
                bch2_btree_node_wait_on_read(b);
 
                /*
-                * should_be_locked is not set on this iterator yet, so we need
-                * to relock it specifically:
+                * should_be_locked is not set on this path yet, so we need to
+                * relock it specifically:
                 */
-               if (iter &&
+               if (trans &&
                    (!bch2_trans_relock(trans) ||
-                    !bch2_btree_iter_relock_intent(iter))) {
+                    !bch2_btree_path_relock_intent(trans, path))) {
                        BUG_ON(!trans->restarted);
                        return ERR_PTR(-EINTR);
                }
@@ -886,7 +913,7 @@ lock_node:
                return ERR_PTR(-EIO);
        }
 
-       EBUG_ON(b->c.btree_id != iter->btree_id);
+       EBUG_ON(b->c.btree_id != path->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
        btree_check_header(c, b);
 
@@ -917,7 +944,7 @@ retry:
                if (nofill)
                        goto out;
 
-               b = bch2_btree_node_fill(c, NULL, k, btree_id,
+               b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);
 
                /* We raced and found the btree node in the cache */
@@ -975,21 +1002,24 @@ out:
        return b;
 }
 
-int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
+int bch2_btree_node_prefetch(struct bch_fs *c,
+                            struct btree_trans *trans,
+                            struct btree_path *path,
                             const struct bkey_i *k,
                             enum btree_id btree_id, unsigned level)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
 
-       BUG_ON(iter && !btree_node_locked(iter, level + 1));
+       BUG_ON(trans && !btree_node_locked(path, level + 1));
        BUG_ON(level >= BTREE_MAX_DEPTH);
 
        b = btree_cache_find(bc, k);
        if (b)
                return 0;
 
-       b = bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
+       b = bch2_btree_node_fill(c, trans, path, k, btree_id,
+                                level, SIX_LOCK_read, false);
        return PTR_ERR_OR_ZERO(b);
 }
 
index 5032293e8628a5fc0c9c23616e30923b1feb9b7c..f7e10986f317cc2036abcb143648bc721c0c2eb0 100644
@@ -22,14 +22,14 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
 
-struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *,
+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *,
                                  const struct bkey_i *, unsigned,
                                  enum six_lock_type, unsigned long);
 
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
                                         enum btree_id, unsigned, bool);
 
-int bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
+int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *,
                             const struct bkey_i *, enum btree_id, unsigned);
 
 void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
@@ -71,7 +71,7 @@ static inline bool btree_node_hashed(struct btree *b)
 
 static inline size_t btree_bytes(struct bch_fs *c)
 {
-       return c->opts.btree_node_size << 9;
+       return c->opts.btree_node_size;
 }
 
 static inline size_t btree_max_u64s(struct bch_fs *c)
@@ -86,7 +86,7 @@ static inline size_t btree_pages(struct bch_fs *c)
 
 static inline unsigned btree_blocks(struct bch_fs *c)
 {
-       return c->opts.btree_node_size >> c->block_bits;
+       return btree_sectors(c) >> c->block_bits;
 }
 
 #define BTREE_SPLIT_THRESHOLD(c)               (btree_max_u64s(c) * 2 / 3)
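
Both hunks in this file follow from c->opts.btree_node_size changing units, from 512-byte sectors to bytes: btree_bytes() drops its << 9 and btree_blocks() now derives from btree_sectors(c). A one-line sketch of the conversion, assuming btree_sectors(c) is simply the byte count shifted down:

/* Sector/byte conversion assumed above: 512-byte sectors, so bytes >> 9. */
#include <stdio.h>

#define SECTOR_SHIFT 9

static unsigned btree_sectors_from_bytes(unsigned bytes)
{
	return bytes >> SECTOR_SHIFT;
}

int main(void)
{
	unsigned btree_node_size = 256 << 10;	/* 256 KiB, now stored in bytes */

	printf("%u sectors\n", btree_sectors_from_bytes(btree_node_size)); /* 512 */
	return 0;
}
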
index 3dd1094d10c9cdf61ef233ac9c94616141ce4286..648779cc643d225e63aad0175d981b0560081b6d 100644
@@ -9,6 +9,7 @@
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "bkey_buf.h"
+#include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -155,6 +156,34 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
        }
 }
 
+static void bch2_btree_node_update_key_early(struct bch_fs *c,
+                                            enum btree_id btree, unsigned level,
+                                            struct bkey_s_c old, struct bkey_i *new)
+{
+       struct btree *b;
+       struct bkey_buf tmp;
+       int ret;
+
+       bch2_bkey_buf_init(&tmp);
+       bch2_bkey_buf_reassemble(&tmp, c, old);
+
+       b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true);
+       if (!IS_ERR_OR_NULL(b)) {
+               mutex_lock(&c->btree_cache.lock);
+
+               bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+               bkey_copy(&b->key, new);
+               ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+               BUG_ON(ret);
+
+               mutex_unlock(&c->btree_cache.lock);
+               six_unlock_read(&b->c.lock);
+       }
+
+       bch2_bkey_buf_exit(&tmp, c);
+}
+
 static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
 {
        struct bkey_i_btree_ptr_v2 *new;
@@ -169,7 +198,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
        new->v.min_key          = new_min;
        SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
 
-       ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+       ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
        if (ret) {
                kfree(new);
                return ret;
@@ -198,7 +227,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
        new->k.p                = new_max;
        SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
 
-       ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+       ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i);
        if (ret) {
                kfree(new);
                return ret;
@@ -498,28 +527,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
        char buf[200];
        int ret = 0;
 
+       /*
+        * XXX
+        * use check_bucket_ref here
+        */
        bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-               struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
-               struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
+               struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
                enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
 
-               if (fsck_err_on(g->mark.data_type &&
-                               g->mark.data_type != data_type, c,
-                               "bucket %u:%zu different types of data in same bucket: %s, %s\n"
-                               "while marking %s",
-                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                               bch2_data_types[g->mark.data_type],
-                               bch2_data_types[data_type],
-                               (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
-                       if (data_type == BCH_DATA_btree) {
-                               g2->_mark.data_type = g->_mark.data_type = data_type;
-                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
-                       } else {
-                               do_update = true;
-                       }
-               }
-
                if (fsck_err_on(!g->gen_valid, c,
                                "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
                                "while marking %s",
@@ -528,9 +544,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                p.ptr.gen,
                                (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
                        if (!p.ptr.cached) {
-                               g2->_mark.gen   = g->_mark.gen          = p.ptr.gen;
-                               g2->gen_valid   = g->gen_valid          = true;
-                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+                               g->_mark.gen            = p.ptr.gen;
+                               g->gen_valid            = true;
                        } else {
                                do_update = true;
                        }
@@ -544,18 +559,26 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                p.ptr.gen, g->mark.gen,
                                (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
                        if (!p.ptr.cached) {
-                               g2->_mark.gen   = g->_mark.gen  = p.ptr.gen;
-                               g2->gen_valid   = g->gen_valid  = true;
-                               g2->_mark.data_type             = 0;
-                               g2->_mark.dirty_sectors         = 0;
-                               g2->_mark.cached_sectors        = 0;
+                               g->_mark.gen            = p.ptr.gen;
+                               g->gen_valid            = true;
+                               g->_mark.data_type      = 0;
+                               g->_mark.dirty_sectors  = 0;
+                               g->_mark.cached_sectors = 0;
                                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
-                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
                        } else {
                                do_update = true;
                        }
                }
 
+               if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+                               "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+                               "while marking %s",
+                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+                               bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+                               p.ptr.gen,
+                               (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
+                       do_update = true;
+
                if (fsck_err_on(!p.ptr.cached &&
                                gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
                                "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
@@ -566,8 +589,27 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf)))
                        do_update = true;
 
+               if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen)
+                       continue;
+
+               if (fsck_err_on(g->mark.data_type &&
+                               g->mark.data_type != data_type, c,
+                               "bucket %u:%zu different types of data in same bucket: %s, %s\n"
+                               "while marking %s",
+                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+                               bch2_data_types[g->mark.data_type],
+                               bch2_data_types[data_type],
+                               (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
+                       if (data_type == BCH_DATA_btree) {
+                               g->_mark.data_type      = data_type;
+                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+                       } else {
+                               do_update = true;
+                       }
+               }
+
                if (p.has_ec) {
-                       struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
+                       struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx);
 
                        if (fsck_err_on(!m || !m->alive, c,
                                        "pointer to nonexistent stripe %llu\n"
@@ -613,20 +655,21 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                        ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
                        bkey_for_each_ptr(ptrs, ptr) {
                                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-                               struct bucket *g = PTR_BUCKET(ca, ptr, true);
+                               struct bucket *g = PTR_GC_BUCKET(ca, ptr);
 
                                ptr->gen = g->mark.gen;
                        }
                } else {
                        bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
                                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-                               struct bucket *g = PTR_BUCKET(ca, ptr, true);
+                               struct bucket *g = PTR_GC_BUCKET(ca, ptr);
                                enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
 
                                (ptr->cached &&
                                 (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
                                (!ptr->cached &&
                                 gen_cmp(ptr->gen, g->mark.gen) < 0) ||
+                               gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
                                (g->mark.data_type &&
                                 g->mark.data_type != data_type);
                        }));
@@ -634,7 +677,7 @@ again:
                        ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
                        bkey_extent_entry_for_each(ptrs, entry) {
                                if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
-                                       struct stripe *m = genradix_ptr(&c->stripes[true],
+                                       struct gc_stripe *m = genradix_ptr(&c->gc_stripes,
                                                                        entry->stripe_ptr.idx);
                                        union bch_extent_entry *next_ptr;
 
@@ -659,11 +702,20 @@ found:
                        }
                }
 
-               ret = bch2_journal_key_insert(c, btree_id, level, new);
-               if (ret)
+               ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+               if (ret) {
                        kfree(new);
-               else
-                       *k = bkey_i_to_s_c(new);
+                       return ret;
+               }
+
+               if (level)
+                       bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
+
+               bch2_bkey_val_to_text(&PBUF(buf), c, *k);
+               bch_info(c, "updated %s", buf);
+               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new));
+               bch_info(c, "new key %s", buf);
+               *k = bkey_i_to_s_c(new);
        }
 fsck_err:
        return ret;
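
Several call sites in this file switch from bch2_journal_key_insert() to bch2_journal_key_insert_take(); judging from the error paths above, the _take variant consumes the caller's allocation on success, so the caller frees only on failure. A toy sketch of that ownership convention (names and the single-slot table are illustrative):

/* Toy _take-style insert: on success the callee owns the allocation, on
 * failure the caller still does and must free it. */
#include <stdio.h>
#include <stdlib.h>

struct key { int v; };

static struct key *slot;	/* stand-in for the journal keys table */

static int key_insert_take(struct key *k)
{
	if (slot)
		return -17;	/* -EEXIST; caller still owns k */
	slot = k;		/* success; ownership transferred */
	return 0;
}

int main(void)
{
	struct key *k = malloc(sizeof(*k));
	int ret;

	if (!k)
		return 1;
	k->v = 42;

	ret = key_insert_take(k);
	if (ret) {
		free(k);	/* free only on failure */
		return 1;
	}
	printf("inserted %d\n", slot->v);
	free(slot);
	return 0;
}
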
@@ -671,19 +723,21 @@ fsck_err:
 
 /* marking of btree keys/nodes: */
 
-static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
                            unsigned level, bool is_root,
                            struct bkey_s_c *k,
-                           u8 *max_stale, bool initial)
+                           bool initial)
 {
-       struct bkey_ptrs_c ptrs;
-       const struct bch_extent_ptr *ptr;
+       struct bch_fs *c = trans->c;
+       struct bkey deleted = KEY(0, 0, 0);
+       struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
        unsigned flags =
-               BTREE_TRIGGER_INSERT|
                BTREE_TRIGGER_GC|
                (initial ? BTREE_TRIGGER_NOATOMIC : 0);
        int ret = 0;
 
+       deleted.p = k->k->p;
+
        if (initial) {
                BUG_ON(bch2_journal_seq_verify &&
                       k->k->version.lo > journal_cur_seq(&c->journal));
@@ -697,31 +751,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
                                k->k->version.lo,
                                atomic64_read(&c->key_version)))
                        atomic64_set(&c->key_version, k->k->version.lo);
-
-               if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                   fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c,
-                               "superblock not marked as containing replicas (type %u)",
-                               k->k->type)) {
-                       ret = bch2_mark_bkey_replicas(c, *k);
-                       if (ret) {
-                               bch_err(c, "error marking bkey replicas: %i", ret);
-                               goto err;
-                       }
-               }
        }
 
-       ptrs = bch2_bkey_ptrs_c(*k);
-       bkey_for_each_ptr(ptrs, ptr) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-               struct bucket *g = PTR_BUCKET(ca, ptr, true);
-
-               if (gen_after(g->oldest_gen, ptr->gen))
-                       g->oldest_gen = ptr->gen;
-
-               *max_stale = max(*max_stale, ptr_stale(ca, ptr));
-       }
-
-       bch2_mark_key(c, *k, flags);
+       ret = bch2_mark_key(trans, old, *k, flags);
 fsck_err:
 err:
        if (ret)
@@ -729,17 +761,15 @@ err:
        return ret;
 }
 
-static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
-                             bool initial)
+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
 {
+       struct bch_fs *c = trans->c;
        struct btree_node_iter iter;
        struct bkey unpacked;
        struct bkey_s_c k;
        struct bkey_buf prev, cur;
        int ret = 0;
 
-       *max_stale = 0;
-
        if (!btree_node_type_needs_gc(btree_node_type(b)))
                return 0;
 
@@ -749,8 +779,8 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
        bkey_init(&prev.k->k);
 
        while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
-               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
-                                      &k, max_stale, initial);
+               ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
+                                      &k, initial);
                if (ret)
                        break;
 
@@ -771,52 +801,32 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
        return ret;
 }
 
-static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
                         bool initial, bool metadata_only)
 {
-       struct btree_trans trans;
-       struct btree_iter *iter;
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
        struct btree *b;
        unsigned depth = metadata_only                  ? 1
                : bch2_expensive_debug_checks           ? 0
                : !btree_node_type_needs_gc(btree_id)   ? 1
                : 0;
-       u8 max_stale = 0;
        int ret = 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
        gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
 
-       __for_each_btree_node(&trans, iter, btree_id, POS_MIN,
-                             0, depth, BTREE_ITER_PREFETCH, b) {
+       __for_each_btree_node(trans, iter, btree_id, POS_MIN,
+                             0, depth, BTREE_ITER_PREFETCH, b, ret) {
                bch2_verify_btree_nr_keys(b);
 
                gc_pos_set(c, gc_pos_btree_node(b));
 
-               ret = btree_gc_mark_node(c, b, &max_stale, initial);
+               ret = btree_gc_mark_node(trans, b, initial);
                if (ret)
                        break;
-
-               if (!initial) {
-                       if (max_stale > 64)
-                               bch2_btree_node_rewrite(&trans, iter,
-                                               b->data->keys.seq,
-                                               BTREE_INSERT_NOWAIT|
-                                               BTREE_INSERT_GC_LOCK_HELD);
-                       else if (!bch2_btree_gc_rewrite_disabled &&
-                                (bch2_btree_gc_always_rewrite || max_stale > 16))
-                               bch2_btree_node_rewrite(&trans, iter,
-                                               b->data->keys.seq,
-                                               BTREE_INSERT_NOWAIT|
-                                               BTREE_INSERT_GC_LOCK_HELD);
-               }
-
-               bch2_trans_cond_resched(&trans);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
 
-       ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
                return ret;
 
@@ -825,8 +835,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        if (!btree_node_fake(b)) {
                struct bkey_s_c k = bkey_i_to_s_c(&b->key);
 
-               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
-                                      &k, &max_stale, initial);
+               ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+                                      true, &k, initial);
        }
        gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
        mutex_unlock(&c->btree_root_lock);
@@ -834,13 +844,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        return ret;
 }
 
-static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b,
                                      unsigned target_depth)
 {
+       struct bch_fs *c = trans->c;
        struct btree_and_journal_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
-       u8 max_stale = 0;
        char buf[200];
        int ret = 0;
 
@@ -853,8 +863,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
                BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
 
-               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
-                                      &k, &max_stale, true);
+               ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level,
+                                      false, &k, true);
                if (ret) {
                        bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
                        goto fsck_err;
@@ -920,7 +930,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                break;
                        }
 
-                       ret = bch2_gc_btree_init_recurse(c, child,
+                       ret = bch2_gc_btree_init_recurse(trans, child,
                                                         target_depth);
                        six_unlock_read(&child->c.lock);
 
@@ -935,16 +945,16 @@ fsck_err:
        return ret;
 }
 
-static int bch2_gc_btree_init(struct bch_fs *c,
+static int bch2_gc_btree_init(struct btree_trans *trans,
                              enum btree_id btree_id,
                              bool metadata_only)
 {
+       struct bch_fs *c = trans->c;
        struct btree *b;
        unsigned target_depth = metadata_only           ? 1
                : bch2_expensive_debug_checks           ? 0
                : !btree_node_type_needs_gc(btree_id)   ? 1
                : 0;
-       u8 max_stale = 0;
        char buf[100];
        int ret = 0;
 
@@ -971,13 +981,13 @@ static int bch2_gc_btree_init(struct bch_fs *c,
        }
 
        if (b->c.level >= target_depth)
-               ret = bch2_gc_btree_init_recurse(c, b, target_depth);
+               ret = bch2_gc_btree_init_recurse(trans, b, target_depth);
 
        if (!ret) {
                struct bkey_s_c k = bkey_i_to_s_c(&b->key);
 
-               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
-                                      &k, &max_stale, true);
+               ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true,
+                                      &k, true);
        }
 fsck_err:
        six_unlock_read(&b->c.lock);
@@ -995,21 +1005,26 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 
 static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 {
+       struct btree_trans trans;
        enum btree_id ids[BTREE_ID_NR];
        unsigned i;
        int ret = 0;
 
+       bch2_trans_init(&trans, c, 0, 0);
+
        for (i = 0; i < BTREE_ID_NR; i++)
                ids[i] = i;
        bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
 
        for (i = 0; i < BTREE_ID_NR && !ret; i++)
                ret = initial
-                       ? bch2_gc_btree_init(c, ids[i], metadata_only)
-                       : bch2_gc_btree(c, ids[i], initial, metadata_only);
+                       ? bch2_gc_btree_init(&trans, ids[i], metadata_only)
+                       : bch2_gc_btree(&trans, ids[i], initial, metadata_only);
 
        if (ret < 0)
                bch_err(c, "%s: ret %i", __func__, ret);
+
+       bch2_trans_exit(&trans);
        return ret;
 }
 
@@ -1031,23 +1046,13 @@ static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
        } while (start < end);
 }
 
-void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
-                             unsigned flags)
+static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
+                                    unsigned flags)
 {
        struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
        unsigned i;
        u64 b;
 
-       /*
-        * This conditional is kind of gross, but we may be called from the
-        * device add path, before the new device has actually been added to the
-        * running filesystem:
-        */
-       if (c) {
-               lockdep_assert_held(&c->sb_lock);
-               percpu_down_read(&c->mark_lock);
-       }
-
        for (i = 0; i < layout->nr_superblocks; i++) {
                u64 offset = le64_to_cpu(layout->sb_offset[i]);
 
@@ -1066,9 +1071,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
                                          ca->mi.bucket_size,
                                          gc_phase(GC_PHASE_SB), flags);
        }
-
-       if (c)
-               percpu_up_read(&c->mark_lock);
 }
 
 static void bch2_mark_superblocks(struct bch_fs *c)
@@ -1096,8 +1098,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 
        for_each_pending_btree_node_free(c, as, d)
                if (d->index_update_done)
-                       bch2_mark_key(c, bkey_i_to_s_c(&d->key),
-                                     BTREE_TRIGGER_INSERT|BTREE_TRIGGER_GC);
+                       bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC);
 
        mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -1108,7 +1109,8 @@ static void bch2_gc_free(struct bch_fs *c)
        struct bch_dev *ca;
        unsigned i;
 
-       genradix_free(&c->stripes[1]);
+       genradix_free(&c->reflink_gc_table);
+       genradix_free(&c->gc_stripes);
 
        for_each_member_device(ca, c, i) {
                kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
@@ -1133,13 +1135,14 @@ static int bch2_gc_done(struct bch_fs *c,
        unsigned i, dev;
        int ret = 0;
 
+       percpu_down_write(&c->mark_lock);
+
 #define copy_field(_f, _msg, ...)                                      \
        if (dst->_f != src->_f) {                                       \
                if (verify)                                             \
                        fsck_err(c, _msg ": got %llu, should be %llu"   \
                                , ##__VA_ARGS__, dst->_f, src->_f);     \
                dst->_f = src->_f;                                      \
-               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_stripe_field(_f, _msg, ...)                               \
        if (dst->_f != src->_f) {                                       \
@@ -1149,85 +1152,28 @@ static int bch2_gc_done(struct bch_fs *c,
                                iter.pos, ##__VA_ARGS__,                \
                                dst->_f, src->_f);                      \
                dst->_f = src->_f;                                      \
-               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
-       }
-#define copy_bucket_field(_f)                                          \
-       if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
-               if (verify)                                             \
-                       fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f  \
-                               ": got %u, should be %u", dev, b,       \
-                               dst->b[b].mark.gen,                     \
-                               bch2_data_types[dst->b[b].mark.data_type],\
-                               dst->b[b].mark._f, src->b[b].mark._f);  \
-               dst->b[b]._mark._f = src->b[b].mark._f;                 \
-               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
 #define copy_fs_field(_f, _msg, ...)                                   \
        copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
 
-       if (!metadata_only) {
-               struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
-               struct stripe *dst, *src;
-
-               while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
-                       dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
-
-                       if (dst->alive          != src->alive ||
-                           dst->sectors        != src->sectors ||
-                           dst->algorithm      != src->algorithm ||
-                           dst->nr_blocks      != src->nr_blocks ||
-                           dst->nr_redundant   != src->nr_redundant) {
-                               bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused");
-                               ret = -EINVAL;
-                               goto fsck_err;
-                       }
-
-                       for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
-                               copy_stripe_field(block_sectors[i],
-                                                 "block_sectors[%u]", i);
-
-                       dst->blocks_nonempty = 0;
-                       for (i = 0; i < dst->nr_blocks; i++)
-                               dst->blocks_nonempty += dst->block_sectors[i] != 0;
-
-                       genradix_iter_advance(&iter, &c->stripes[1]);
-               }
-       }
-
        for (i = 0; i < ARRAY_SIZE(c->usage); i++)
                bch2_fs_usage_acc_to_base(c, i);
 
        for_each_member_device(ca, c, dev) {
-               struct bucket_array *dst = __bucket_array(ca, 0);
-               struct bucket_array *src = __bucket_array(ca, 1);
-               size_t b;
-
-               for (b = 0; b < src->nbuckets; b++) {
-                       copy_bucket_field(gen);
-                       copy_bucket_field(data_type);
-                       copy_bucket_field(stripe);
-                       copy_bucket_field(dirty_sectors);
-                       copy_bucket_field(cached_sectors);
-
-                       dst->b[b].oldest_gen = src->b[b].oldest_gen;
-               }
-
-               {
-                       struct bch_dev_usage *dst = ca->usage_base;
-                       struct bch_dev_usage *src = (void *)
-                               bch2_acc_percpu_u64s((void *) ca->usage_gc,
-                                                    dev_usage_u64s());
-
-                       copy_dev_field(buckets_ec,              "buckets_ec");
-                       copy_dev_field(buckets_unavailable,     "buckets_unavailable");
-
-                       for (i = 0; i < BCH_DATA_NR; i++) {
-                               copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
-                               copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
-                               copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
-                       }
+               struct bch_dev_usage *dst = ca->usage_base;
+               struct bch_dev_usage *src = (void *)
+                       bch2_acc_percpu_u64s((void *) ca->usage_gc,
+                                            dev_usage_u64s());
+
+               copy_dev_field(buckets_ec,              "buckets_ec");
+               copy_dev_field(buckets_unavailable,     "buckets_unavailable");
+
+               for (i = 0; i < BCH_DATA_NR; i++) {
+                       copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
+                       copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
+                       copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
                }
        };
 
@@ -1269,7 +1215,6 @@ static int bch2_gc_done(struct bch_fs *c,
 
 #undef copy_fs_field
 #undef copy_dev_field
-#undef copy_bucket_field
 #undef copy_stripe_field
 #undef copy_field
 fsck_err:
@@ -1277,6 +1222,8 @@ fsck_err:
                percpu_ref_put(&ca->ref);
        if (ret)
                bch_err(c, "%s: ret %i", __func__, ret);
+
+       percpu_up_write(&c->mark_lock);
        return ret;
 }
 
@@ -1285,7 +1232,6 @@ static int bch2_gc_start(struct bch_fs *c,
 {
        struct bch_dev *ca = NULL;
        unsigned i;
-       int ret;
 
        BUG_ON(c->usage_gc);
 
@@ -1300,15 +1246,6 @@ static int bch2_gc_start(struct bch_fs *c,
                BUG_ON(ca->buckets[1]);
                BUG_ON(ca->usage_gc);
 
-               ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
-                               ca->mi.nbuckets * sizeof(struct bucket),
-                               GFP_KERNEL|__GFP_ZERO);
-               if (!ca->buckets[1]) {
-                       percpu_ref_put(&ca->ref);
-                       bch_err(c, "error allocating ca->buckets[gc]");
-                       return -ENOMEM;
-               }
-
                ca->usage_gc = alloc_percpu(struct bch_dev_usage);
                if (!ca->usage_gc) {
                        bch_err(c, "error allocating ca->usage_gc");
@@ -1317,104 +1254,175 @@ static int bch2_gc_start(struct bch_fs *c,
                }
        }
 
-       ret = bch2_ec_mem_alloc(c, true);
-       if (ret) {
-               bch_err(c, "error allocating ec gc mem");
-               return ret;
-       }
+       return 0;
+}
 
-       percpu_down_write(&c->mark_lock);
+static int bch2_alloc_write_key(struct btree_trans *trans,
+                               struct btree_iter *iter,
+                               bool initial, bool metadata_only)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+       struct bucket *g;
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked old_u, new_u, gc_u;
+       struct bkey_alloc_buf *a;
+       int ret;
 
-       /*
-        * indicate to stripe code that we need to allocate for the gc stripes
-        * radix tree, too
-        */
-       gc_pos_set(c, gc_phase(GC_PHASE_START));
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
 
-       for_each_member_device(ca, c, i) {
-               struct bucket_array *dst = __bucket_array(ca, 1);
-               struct bucket_array *src = __bucket_array(ca, 0);
-               size_t b;
+       old_u = new_u = bch2_alloc_unpack(k);
 
-               dst->first_bucket       = src->first_bucket;
-               dst->nbuckets           = src->nbuckets;
+       percpu_down_read(&c->mark_lock);
+       g       = gc_bucket(ca, iter->pos.offset);
+       gc_u = (struct bkey_alloc_unpacked) {
+               .dev            = iter->pos.inode,
+               .bucket         = iter->pos.offset,
+               .gen            = g->mark.gen,
+               .data_type      = g->mark.data_type,
+               .dirty_sectors  = g->mark.dirty_sectors,
+               .cached_sectors = g->mark.cached_sectors,
+               .read_time      = g->io_time[READ],
+               .write_time     = g->io_time[WRITE],
+               .stripe         = g->stripe,
+               .stripe_redundancy = g->stripe_redundancy,
+       };
+       percpu_up_read(&c->mark_lock);
 
-               for (b = 0; b < src->nbuckets; b++) {
-                       struct bucket *d = &dst->b[b];
-                       struct bucket *s = &src->b[b];
+       if (metadata_only &&
+           gc_u.data_type != BCH_DATA_sb &&
+           gc_u.data_type != BCH_DATA_journal &&
+           gc_u.data_type != BCH_DATA_btree)
+               return 0;
 
-                       d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
-                       d->gen_valid = s->gen_valid;
+       if (gen_after(old_u.gen, gc_u.gen))
+               return 0;
 
-                       if (metadata_only &&
-                           (s->mark.data_type == BCH_DATA_user ||
-                            s->mark.data_type == BCH_DATA_cached))
-                               d->_mark = s->mark;
-               }
-       };
+#define copy_bucket_field(_f)                                          \
+       if (fsck_err_on(new_u._f != gc_u._f, c,                         \
+                       "bucket %llu:%llu gen %u data type %s has wrong " #_f   \
+                       ": got %u, should be %u",                       \
+                       iter->pos.inode, iter->pos.offset,              \
+                       new_u.gen,                                      \
+                       bch2_data_types[new_u.data_type],               \
+                       new_u._f, gc_u._f))                             \
+               new_u._f = gc_u._f;                                     \
+
+       copy_bucket_field(gen);
+       copy_bucket_field(data_type);
+       copy_bucket_field(stripe);
+       copy_bucket_field(dirty_sectors);
+       copy_bucket_field(cached_sectors);
+       copy_bucket_field(stripe_redundancy);
+#undef copy_bucket_field
 
-       percpu_up_write(&c->mark_lock);
+       if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+               return 0;
 
-       return 0;
+       a = bch2_alloc_pack(trans, new_u);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
+
+       ret = initial
+               ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k)
+               : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN);
+fsck_err:
+       return ret;
 }
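For reference, copy_bucket_field(gen) above expands (written out by hand) to the following compare-and-repair step; the adjacent string literals in the format are concatenated by the preprocessor:

	if (fsck_err_on(new_u.gen != gc_u.gen, c,
			"bucket %llu:%llu gen %u data type %s has wrong gen"
			": got %u, should be %u",
			iter->pos.inode, iter->pos.offset,
			new_u.gen,
			bch2_data_types[new_u.data_type],
			new_u.gen, gc_u.gen))
		new_u.gen = gc_u.gen;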
 
-static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only)
 {
-       struct reflink_gc *r;
-       const __le64 *refcount = bkey_refcount_c(k);
-       char buf[200];
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_dev *ca;
+       unsigned i;
        int ret = 0;
 
-       if (!refcount)
-               return 0;
+       bch2_trans_init(&trans, c, 0, 0);
 
-       r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
-       if (!r)
-               return -ENOMEM;
+       for_each_member_device(ca, c, i) {
+               for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+                                  POS(ca->dev_idx, ca->mi.first_bucket),
+                                  BTREE_ITER_SLOTS|
+                                  BTREE_ITER_PREFETCH, k, ret) {
+                       if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+                               break;
 
-       if (!r ||
-           r->offset != k.k->p.offset ||
-           r->size != k.k->size) {
-               bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
-               return -EINVAL;
+                       ret = __bch2_trans_do(&trans, NULL, NULL,
+                                             BTREE_INSERT_LAZY_RW,
+                                       bch2_alloc_write_key(&trans, &iter,
+                                                            initial, metadata_only));
+                       if (ret)
+                               break;
+               }
+               bch2_trans_iter_exit(&trans, &iter);
+
+               if (ret) {
+                       bch_err(c, "error writing alloc info: %i", ret);
+                       percpu_ref_put(&ca->ref);
+                       break;
+               }
        }
 
-       if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
-                       "reflink key has wrong refcount:\n"
-                       "  %s\n"
-                       "  should be %u",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
-                       r->refcount)) {
-               struct bkey_i *new;
+       bch2_trans_exit(&trans);
+       return ret;
+}
 
-               new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
-               if (!new) {
-                       ret = -ENOMEM;
-                       goto fsck_err;
+static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+                               ca->mi.nbuckets * sizeof(struct bucket),
+                               GFP_KERNEL|__GFP_ZERO);
+               if (!buckets) {
+                       percpu_ref_put(&ca->ref);
+                       bch_err(c, "error allocating ca->buckets[gc]");
+                       return -ENOMEM;
                }
 
-               bkey_reassemble(new, k);
+               buckets->first_bucket   = ca->mi.first_bucket;
+               buckets->nbuckets       = ca->mi.nbuckets;
+               rcu_assign_pointer(ca->buckets[1], buckets);
+       }
 
-               if (!r->refcount) {
-                       new->k.type = KEY_TYPE_deleted;
-                       new->k.size = 0;
-               } else {
-                       *bkey_refcount(new) = cpu_to_le64(r->refcount);
-               }
+       return bch2_alloc_read(c, true, metadata_only);
+}
 
-               ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
-               if (ret)
-                       kfree(new);
-       }
-fsck_err:
-       return ret;
+static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *buckets = __bucket_array(ca, true);
+               struct bucket *g;
+
+               for_each_bucket(g, buckets) {
+                       if (metadata_only &&
+                           (g->mark.data_type == BCH_DATA_user ||
+                            g->mark.data_type == BCH_DATA_cached ||
+                            g->mark.data_type == BCH_DATA_parity))
+                               continue;
+                       g->_mark.dirty_sectors = 0;
+                       g->_mark.cached_sectors = 0;
+               }
+       }
 }
 
 static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                                bool metadata_only)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct reflink_gc *r;
        size_t idx = 0;
@@ -1424,14 +1432,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
        if (metadata_only)
                return 0;
 
-       if (initial) {
-               c->reflink_gc_idx = 0;
-
-               ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
-                               bch2_gc_reflink_done_initial_fn);
-               goto out;
-       }
-
        bch2_trans_init(&trans, c, 0, 0);
 
        for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
@@ -1441,7 +1441,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                if (!refcount)
                        continue;
 
-               r = genradix_ptr(&c->reflink_gc_table, idx);
+               r = genradix_ptr(&c->reflink_gc_table, idx++);
                if (!r ||
                    r->offset != k.k->p.offset ||
                    r->size != k.k->size) {
@@ -1466,12 +1466,22 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 
                        bkey_reassemble(new, k);
 
-                       if (!r->refcount)
+                       if (!r->refcount) {
                                new->k.type = KEY_TYPE_deleted;
-                       else
+                               /*
+                                * XXX ugly: bch2_journal_key_insert() queues up
+                                * the key for the journal replay code, which
+                                * doesn't run the extent overwrite pass
+                                */
+                               if (initial)
+                                       new->k.size = 0;
+                       } else {
                                *bkey_refcount(new) = cpu_to_le64(r->refcount);
+                       }
 
-                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       ret = initial
+                              ? bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new)
+                              : __bch2_trans_do(&trans, NULL, NULL, 0,
                                        __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
                        kfree(new);
 
@@ -1480,54 +1490,26 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
                }
        }
 fsck_err:
-       bch2_trans_iter_put(&trans, iter);
-       bch2_trans_exit(&trans);
-out:
-       genradix_free(&c->reflink_gc_table);
+       bch2_trans_iter_exit(&trans, &iter);
        c->reflink_gc_nr = 0;
+       bch2_trans_exit(&trans);
        return ret;
 }
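The dispatch at the end of the repair branch is a pattern that recurs in every *_done() helper in this patch: during initial GC the fix is queued as a journal key (replay has not finished), while at runtime it goes through an ordinary btree transaction. Schematically, with btree_id standing in for whichever btree is being repaired:

	ret = initial
		? bch2_journal_key_insert(c, btree_id, 0, new)
		: __bch2_trans_do(&trans, NULL, NULL, 0,
				  __bch2_btree_insert(&trans, btree_id, new));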
 
-static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k)
-{
-
-       struct reflink_gc *r;
-       const __le64 *refcount = bkey_refcount_c(k);
-
-       if (!refcount)
-               return 0;
-
-       r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
-                              GFP_KERNEL);
-       if (!r)
-               return -ENOMEM;
-
-       r->offset       = k.k->p.offset;
-       r->size         = k.k->size;
-       r->refcount     = 0;
-       return 0;
-}
-
 static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
                                 bool metadata_only)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct reflink_gc *r;
-       int ret;
+       int ret = 0;
 
        if (metadata_only)
                return 0;
 
-       genradix_free(&c->reflink_gc_table);
-       c->reflink_gc_nr = 0;
-
-       if (initial)
-               return bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
-                               bch2_gc_reflink_start_initial_fn);
-
        bch2_trans_init(&trans, c, 0, 0);
+       c->reflink_gc_nr = 0;
 
        for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
@@ -1547,10 +1529,89 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
                r->size         = k.k->size;
                r->refcount     = 0;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
-       return 0;
+       return ret;
+}
+
+static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial,
+                                 bool metadata_only)
+{
+       struct genradix_iter iter;
+       struct reflink_gc *r;
+
+       genradix_for_each(&c->reflink_gc_table, iter, r)
+               r->refcount = 0;
+}
+
+static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
+                               bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct gc_stripe *m;
+       const struct bch_stripe *s;
+       char buf[200];
+       unsigned i;
+       int ret = 0;
+
+       if (metadata_only)
+               return 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               if (k.k->type != KEY_TYPE_stripe)
+                       continue;
+
+               s = bkey_s_c_to_stripe(k).v;
+               m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+
+               for (i = 0; i < s->nr_blocks; i++)
+                       if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+                               goto inconsistent;
+               continue;
+inconsistent:
+               if (fsck_err_on(true, c,
+                               "stripe has wrong block sector count %u:\n"
+                               "  %s\n"
+                               "  should be %u", i,
+                               (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+                               m ? m->block_sectors[i] : 0)) {
+                       struct bkey_i_stripe *new;
+
+                       new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+                       if (!new) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+
+                       bkey_reassemble(&new->k_i, k);
+
+                       for (i = 0; i < new->v.nr_blocks; i++)
+                               stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+
+                       ret = initial
+                               ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
+                               : __bch2_trans_do(&trans, NULL, NULL, 0,
+                                       __bch2_btree_insert(&trans, BTREE_ID_stripes, &new->k_i));
+                       kfree(new);
+               }
+       }
+fsck_err:
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial,
+                               bool metadata_only)
+{
+       genradix_free(&c->gc_stripes);
 }
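GC's stripe state now lives in the c->gc_stripes genradix instead of a second stripes array; a missing entry simply means GC saw no live data in that stripe. A lookup sketch, with idx and block as illustrative variables:

	struct gc_stripe *m = genradix_ptr(&c->gc_stripes, idx);
	unsigned live_sectors = m ? m->block_sectors[block] : 0;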
 
 /**
@@ -1586,15 +1647,18 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
        /* flush interior btree updates: */
        closure_wait_event(&c->btree_interior_update_wait,
                           !bch2_btree_interior_updates_nr_pending(c));
-again:
+
        ret   = bch2_gc_start(c, metadata_only) ?:
+               bch2_gc_alloc_start(c, initial, metadata_only) ?:
                bch2_gc_reflink_start(c, initial, metadata_only);
        if (ret)
                goto out;
+again:
+       gc_pos_set(c, gc_phase(GC_PHASE_START));
 
        bch2_mark_superblocks(c);
 
-       if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) &&
+       if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
            !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
            c->opts.fix_errors != FSCK_OPT_NO) {
                bch_info(c, "starting topology repair pass");
@@ -1628,39 +1692,40 @@ again:
 
        if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
            (!iter && bch2_test_restart_gc)) {
+               if (iter++ > 2) {
+                       bch_info(c, "Unable to fix bucket gens, looping");
+                       ret = -EINVAL;
+                       goto out;
+               }
+
                /*
                 * XXX: make sure gens we fixed got saved
                 */
-               if (iter++ <= 2) {
-                       bch_info(c, "Second GC pass needed, restarting:");
-                       clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
-                       __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+               bch_info(c, "Second GC pass needed, restarting:");
+               clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+               __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
-                       percpu_down_write(&c->mark_lock);
-                       bch2_gc_free(c);
-                       percpu_up_write(&c->mark_lock);
-                       /* flush fsck errors, reset counters */
-                       bch2_flush_fsck_errs(c);
-
-                       goto again;
-               }
+               bch2_gc_stripes_reset(c, initial, metadata_only);
+               bch2_gc_alloc_reset(c, initial, metadata_only);
+               bch2_gc_reflink_reset(c, initial, metadata_only);
 
-               bch_info(c, "Unable to fix bucket gens, looping");
-               ret = -EINVAL;
+               /* flush fsck errors, reset counters */
+               bch2_flush_fsck_errs(c);
+               goto again;
        }
 out:
        if (!ret) {
                bch2_journal_block(&c->journal);
 
-               percpu_down_write(&c->mark_lock);
-               ret   = bch2_gc_reflink_done(c, initial, metadata_only) ?:
+               ret   = bch2_gc_stripes_done(c, initial, metadata_only) ?:
+                       bch2_gc_reflink_done(c, initial, metadata_only) ?:
+                       bch2_gc_alloc_done(c, initial, metadata_only) ?:
                        bch2_gc_done(c, initial, metadata_only);
 
                bch2_journal_unblock(&c->journal);
-       } else {
-               percpu_down_write(&c->mark_lock);
        }
 
+       percpu_down_write(&c->mark_lock);
        /* Indicates that gc is no longer in progress: */
        __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
@@ -1695,9 +1760,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
        percpu_down_read(&c->mark_lock);
        bkey_for_each_ptr(ptrs, ptr) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-               struct bucket *g = PTR_BUCKET(ca, ptr, false);
 
-               if (gen_after(g->mark.gen, ptr->gen) > 16) {
+               if (ptr_stale(ca, ptr) > 16) {
                        percpu_up_read(&c->mark_lock);
                        return true;
                }
@@ -1705,10 +1769,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
 
        bkey_for_each_ptr(ptrs, ptr) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-               struct bucket *g = PTR_BUCKET(ca, ptr, false);
+               u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
 
-               if (gen_after(g->gc_gen, ptr->gen))
-                       g->gc_gen = ptr->gen;
+               if (gen_after(*gen, ptr->gen))
+                       *gen = ptr->gen;
        }
        percpu_up_read(&c->mark_lock);
 
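ptr_stale() and gen_after() both measure how far one 8-bit bucket generation is ahead of another, tolerating wraparound, which is why the result can be compared against 16 above. A sketch of the helper; this definition is an assumption about what lives in buckets.h:

	static inline u8 gen_after(u8 a, u8 b)
	{
		u8 r = a - b;

		/* treat a distance over half the u8 range as "not after": */
		return r > 128U ? 0 : r;
	}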
@@ -1719,57 +1783,85 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
  * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
  * node pointers currently never have cached pointers that can become stale:
  */
-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
+static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id)
 {
-       struct btree_trans trans;
-       struct btree_iter *iter;
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf sk;
        int ret = 0, commit_err = 0;
 
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
-                                  BTREE_ITER_PREFETCH|
-                                  BTREE_ITER_NOT_EXTENTS|
-                                  BTREE_ITER_ALL_SNAPSHOTS);
+       bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
-              !(ret = bkey_err(k))) {
-               c->gc_gens_pos = iter->pos;
+       while ((bch2_trans_begin(trans),
+               k = bch2_btree_iter_peek(&iter)).k) {
+               ret = bkey_err(k);
+
+               if (ret == -EINTR)
+                       continue;
+               if (ret)
+                       break;
+
+               c->gc_gens_pos = iter.pos;
 
                if (gc_btree_gens_key(c, k) && !commit_err) {
                        bch2_bkey_buf_reassemble(&sk, c, k);
                        bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
-
                        commit_err =
-                               bch2_trans_update(&trans, iter, sk.k, 0) ?:
-                               bch2_trans_commit(&trans, NULL, NULL,
-                                                      BTREE_INSERT_NOWAIT|
-                                                      BTREE_INSERT_NOFAIL);
+                               bch2_trans_update(trans, &iter, sk.k, 0) ?:
+                               bch2_trans_commit(trans, NULL, NULL,
+                                                 BTREE_INSERT_NOWAIT|
+                                                 BTREE_INSERT_NOFAIL);
                        if (commit_err == -EINTR) {
                                commit_err = 0;
                                continue;
                        }
                }
 
-               bch2_btree_iter_advance(iter);
+               bch2_btree_iter_advance(&iter);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
 
-       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
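The loop above is the transaction-restart idiom this patch standardizes on: bch2_trans_begin() re-arms the transaction every iteration, and -EINTR means the transaction was restarted (locks were dropped), so the key is retried rather than treated as an error. Reduced to its skeleton:

	while ((bch2_trans_begin(trans),
		k = bch2_btree_iter_peek(&iter)).k) {
		ret = bkey_err(k);
		if (ret == -EINTR)	/* restart: retry this position */
			continue;
		if (ret)
			break;

		/* ... process k ... */

		bch2_btree_iter_advance(&iter);
	}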
 
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked u;
+       int ret;
+
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       u = bch2_alloc_unpack(k);
+
+       if (u.oldest_gen == ca->oldest_gen[iter->pos.offset])
+               return 0;
+
+       u.oldest_gen = ca->oldest_gen[iter->pos.offset];
+
+       return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN);
+}
+
 int bch2_gc_gens(struct bch_fs *c)
 {
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
        struct bch_dev *ca;
-       struct bucket_array *buckets;
-       struct bucket *g;
+       u64 b, start_time = local_clock();
        unsigned i;
        int ret;
 
@@ -1778,43 +1870,69 @@ int bch2_gc_gens(struct bch_fs *c)
         * introduces a deadlock in the RO path - we currently take the state
         * lock at the start of going RO, thus the gc thread may get stuck:
         */
+       if (!mutex_trylock(&c->gc_gens_lock))
+               return 0;
+
        down_read(&c->gc_lock);
+       bch2_trans_init(&trans, c, 0, 0);
 
        for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
+               struct bucket_gens *gens;
+
+               BUG_ON(ca->oldest_gen);
 
-               for_each_bucket(g, buckets)
-                       g->gc_gen = g->mark.gen;
-               up_read(&ca->bucket_lock);
+               ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+               if (!ca->oldest_gen) {
+                       percpu_ref_put(&ca->ref);
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               gens = bucket_gens(ca);
+
+               for (b = gens->first_bucket;
+                    b < gens->nbuckets; b++)
+                       ca->oldest_gen[b] = gens->b[b];
        }
 
        for (i = 0; i < BTREE_ID_NR; i++)
                if ((1 << i) & BTREE_ID_HAS_PTRS) {
                        c->gc_gens_btree = i;
                        c->gc_gens_pos = POS_MIN;
-                       ret = bch2_gc_btree_gens(c, i);
+                       ret = bch2_gc_btree_gens(&trans, i);
                        if (ret) {
                                bch_err(c, "error recalculating oldest_gen: %i", ret);
                                goto err;
                        }
                }
 
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets)
-                       g->oldest_gen = g->gc_gen;
-               up_read(&ca->bucket_lock);
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL,
+                               bch2_alloc_write_oldest_gen(&trans, &iter));
+               if (ret) {
+                       bch_err(c, "error writing oldest_gen: %i", ret);
+                       break;
+               }
        }
+       bch2_trans_iter_exit(&trans, &iter);
 
        c->gc_gens_btree        = 0;
        c->gc_gens_pos          = POS_MIN;
 
        c->gc_count++;
+
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 err:
+       for_each_member_device(ca, c, i) {
+               kvfree(ca->oldest_gen);
+               ca->oldest_gen = NULL;
+       }
+
+       bch2_trans_exit(&trans);
        up_read(&c->gc_lock);
+       mutex_unlock(&c->gc_gens_lock);
        return ret;
 }
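ca->oldest_gen is sized as one byte per bucket (generations are u8), hence the bare kvmalloc(ca->mi.nbuckets, GFP_KERNEL) with no sizeof() factor; looking up a pointer's oldest gen is then a single array index, as in gc_btree_gens_key() above:

	u8 oldest = ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];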
 
index 59dfb069e699402eb57bcc62b63626d3b4f2f25d..0665f5941fcc5a6196c4a80b8e7c6f5479055ca6 100644 (file)
@@ -8,7 +8,6 @@ int bch2_gc(struct bch_fs *, bool, bool);
 int bch2_gc_gens(struct bch_fs *);
 void bch2_gc_thread_stop(struct bch_fs *);
 int bch2_gc_thread_start(struct bch_fs *);
-void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
 
 /*
  * For concurrent mark and sweep (with other index updates), we define a total
index 40fa0111a3f635cbee8b95582d2aacabe1a93b8a..a3651325a02209672ce5375421a1a3717c992782 100644 (file)
@@ -391,16 +391,10 @@ void bch2_btree_sort_into(struct bch_fs *c,
 
        bch2_btree_node_iter_init_from_start(&src_iter, src);
 
-       if (btree_node_is_extents(src))
-               nr = bch2_sort_repack_merge(c, btree_bset_first(dst),
-                               src, &src_iter,
-                               &dst->format,
-                               true);
-       else
-               nr = bch2_sort_repack(btree_bset_first(dst),
-                               src, &src_iter,
-                               &dst->format,
-                               true);
+       nr = bch2_sort_repack(btree_bset_first(dst),
+                       src, &src_iter,
+                       &dst->format,
+                       true);
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
                               start_time);
@@ -465,16 +459,13 @@ void bch2_btree_build_aux_trees(struct btree *b)
  *
  * Returns true if we sorted (i.e. invalidated iterators)
  */
-void bch2_btree_init_next(struct btree_trans *trans,
-                         struct btree_iter *iter,
-                         struct btree *b)
+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
 {
        struct bch_fs *c = trans->c;
        struct btree_node_entry *bne;
        bool reinit_iter = false;
 
        EBUG_ON(!(b->c.lock.state.seq & 1));
-       EBUG_ON(iter && iter->l[b->c.level].b != b);
        BUG_ON(bset_written(b, bset(b, &b->set[1])));
 
        if (b->nsets == MAX_BSETS &&
@@ -503,8 +494,8 @@ void bch2_btree_init_next(struct btree_trans *trans,
 
        bch2_btree_build_aux_trees(b);
 
-       if (iter && reinit_iter)
-               bch2_btree_iter_reinit_node(iter, b);
+       if (reinit_iter)
+               bch2_trans_node_reinit_iter(trans, b);
 }
 
 static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
@@ -569,7 +560,8 @@ enum btree_validate_ret {
                                                                        \
        switch (write) {                                                \
        case READ:                                                      \
-               bch_err(c, "%s", _buf2);                                \
+               if (_buf2)                                              \
+                       bch_err(c, "%s", _buf2);                        \
                                                                        \
                switch (type) {                                         \
                case BTREE_ERR_FIXABLE:                                 \
@@ -695,7 +687,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                     BTREE_ERR_FATAL, c, ca, b, i,
                     "BSET_SEPARATE_WHITEOUTS no longer supported");
 
-       if (btree_err_on(offset + sectors > c->opts.btree_node_size,
+       if (btree_err_on(offset + sectors > btree_sectors(c),
                         BTREE_ERR_FIXABLE, c, ca, b, i,
                         "bset past end of btree node")) {
                i->u64s = 0;
@@ -909,7 +901,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                             b->data->keys.seq, bp->seq);
        }
 
-       while (b->written < (ptr_written ?: c->opts.btree_node_size)) {
+       while (b->written < (ptr_written ?: btree_sectors(c))) {
                unsigned sectors, whiteout_u64s = 0;
                struct nonce nonce;
                struct bch_csum csum;
@@ -980,19 +972,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
                SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
 
-               b->written += sectors;
-
                blacklisted = bch2_journal_seq_is_blacklisted(c,
                                        le64_to_cpu(i->journal_seq),
                                        true);
 
                btree_err_on(blacklisted && first,
                             BTREE_ERR_FIXABLE, c, ca, b, i,
-                            "first btree node bset has blacklisted journal seq");
+                            "first btree node bset has blacklisted journal seq (%llu)",
+                            le64_to_cpu(i->journal_seq));
 
                btree_err_on(blacklisted && ptr_written,
                             BTREE_ERR_FIXABLE, c, ca, b, i,
-                            "found blacklisted bset in btree node with sectors_written");
+                            "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u",
+                            le64_to_cpu(i->journal_seq),
+                            b->written, b->written + sectors, ptr_written);
+
+               b->written += sectors;
+
                if (blacklisted && !first)
                        continue;
 
@@ -1218,7 +1214,7 @@ static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
        if (le64_to_cpu(bn->magic) != bset_magic(c))
                return 0;
 
-       while (offset < c->opts.btree_node_size) {
+       while (offset < btree_sectors(c)) {
                if (!offset) {
                        offset += vstruct_sectors(bn, c->block_bits);
                } else {
@@ -1240,7 +1236,7 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
        if (!offset)
                return false;
 
-       while (offset < c->opts.btree_node_size) {
+       while (offset < btree_sectors(c)) {
                bne = data + (offset << 9);
                if (bne->keys.seq == bn->keys.seq)
                        return true;
@@ -1260,7 +1256,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl)
        bool dump_bset_maps = false;
        bool have_retry = false;
        int ret = 0, best = -1, write = READ;
-       unsigned i, written, written2;
+       unsigned i, written = 0, written2 = 0;
        __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
                ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
 
@@ -1310,7 +1306,7 @@ fsck_err:
                        if (ra->err[i])
                                continue;
 
-                       while (offset < c->opts.btree_node_size) {
+                       while (offset < btree_sectors(c)) {
                                if (!offset) {
                                        sectors = vstruct_sectors(bn, c->block_bits);
                                } else {
@@ -1327,7 +1323,7 @@ fsck_err:
                                offset += sectors;
                        }
 
-                       while (offset < c->opts.btree_node_size) {
+                       while (offset < btree_sectors(c)) {
                                bne = ra->buf[i] + (offset << 9);
                                if (bne->keys.seq == bn->keys.seq) {
                                        if (!gap)
@@ -1805,8 +1801,8 @@ do_write:
        BUG_ON(btree_node_fake(b));
        BUG_ON((b->will_make_reachable != 0) != !b->written);
 
-       BUG_ON(b->written >= c->opts.btree_node_size);
-       BUG_ON(b->written & (c->opts.block_size - 1));
+       BUG_ON(b->written >= btree_sectors(c));
+       BUG_ON(b->written & (block_sectors(c) - 1));
        BUG_ON(bset_written(b, btree_bset_last(b)));
        BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
        BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
@@ -1879,7 +1875,7 @@ do_write:
        memset(data + bytes_to_write, 0,
               (sectors_to_write << 9) - bytes_to_write);
 
-       BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
+       BUG_ON(b->written + sectors_to_write > btree_sectors(c));
        BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
        BUG_ON(i->seq != b->data->keys.seq);
 
index 7fdcf879c7d468ae796c4079a791a9c7570b648a..0f20224e2a77cec3070850226ea52cd45ebb3695 100644 (file)
@@ -134,8 +134,7 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
 void bch2_btree_node_drop_keys_outside_node(struct btree *);
 
 void bch2_btree_build_aux_trees(struct btree *);
-void bch2_btree_init_next(struct btree_trans *, struct btree_iter *,
-                         struct btree *);
+void bch2_btree_init_next(struct btree_trans *, struct btree *);
 
 int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
                              struct btree *, bool);
index fe710d19ca1994d5149f2496dc500704b4c10a42..ae63ecbc19548a66f8fd9396eca41e650dc02ec5 100644 (file)
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
+#include "recovery.h"
 #include "replicas.h"
+#include "subvolume.h"
 
 #include <linux/prefetch.h>
 #include <trace/events/bcachefs.h>
 
-static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
-static void btree_trans_sort_iters(struct btree_trans *);
-static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *);
-static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long);
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *,
-                                                struct btree_iter *);
-static void btree_iter_copy(struct btree_iter *, struct btree_iter *);
+static void btree_trans_verify_sorted(struct btree_trans *);
+static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int);
 
-static inline int btree_iter_cmp(const struct btree_iter *l,
-                                const struct btree_iter *r)
+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
+                                      struct btree_path *);
+
+static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
 {
-       return   cmp_int(l->btree_id, r->btree_id) ?:
-               -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?:
-                bkey_cmp(l->real_pos, r->real_pos);
+#ifdef CONFIG_BCACHEFS_DEBUG
+       return iter->ip_allocated;
+#else
+       return 0;
+#endif
 }
 
-static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+
+/*
+ * Unlocks before scheduling
+ * Note: does not revalidate iterator
+ */
+static inline int bch2_trans_cond_resched(struct btree_trans *trans)
+{
+       if (need_resched() || race_fault()) {
+               bch2_trans_unlock(trans);
+               schedule();
+               return bch2_trans_relock(trans) ? 0 : -EINTR;
+       } else {
+               return 0;
+       }
+}
+
+static inline int __btree_path_cmp(const struct btree_path *l,
+                                  enum btree_id        r_btree_id,
+                                  bool                 r_cached,
+                                  struct bpos          r_pos,
+                                  unsigned             r_level)
+{
+       /*
+        * Must match lock ordering as defined by __bch2_btree_node_lock:
+        */
+       return   cmp_int(l->btree_id,   r_btree_id) ?:
+                cmp_int((int) l->cached,       (int) r_cached) ?:
+                bpos_cmp(l->pos,       r_pos) ?:
+               -cmp_int(l->level,      r_level);
+}
+
+static inline int btree_path_cmp(const struct btree_path *l,
+                                const struct btree_path *r)
 {
-       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
+       return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level);
+}
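__btree_path_cmp() defines the global lock-acquisition order: btree id first, then non-cached before cached, then position, then higher (interior) level first. An illustrative consequence, using hypothetical bare paths that exist only for this example:

	struct btree_path a = { .btree_id = BTREE_ID_extents, .cached = false };
	struct btree_path b = { .btree_id = BTREE_ID_extents, .cached = true  };

	/* same btree: the non-cached path orders (and locks) strictly first,
	 * regardless of position: */
	BUG_ON(btree_path_cmp(&a, &b) >= 0);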
 
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
+{
        /* Are we iterating over keys in all snapshots? */
        if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
                p = bpos_successor(p);
@@ -50,8 +88,6 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
 
 static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
 {
-       EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
-
        /* Are we iterating over keys in all snapshots? */
        if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
                p = bpos_predecessor(p);
@@ -63,10 +99,10 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos
        return p;
 }
 
-static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
+static inline bool is_btree_node(struct btree_path *path, unsigned l)
 {
        return l < BTREE_MAX_DEPTH &&
-               (unsigned long) iter->l[l].b >= 128;
+               (unsigned long) path->l[l].b >= 128;
 }
 
 static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
@@ -79,41 +115,40 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
        return pos;
 }
 
-static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
+static inline bool btree_path_pos_before_node(struct btree_path *path,
                                              struct btree *b)
 {
-       return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
+       return bpos_cmp(path->pos, b->data->min_key) < 0;
 }
 
-static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
+static inline bool btree_path_pos_after_node(struct btree_path *path,
                                             struct btree *b)
 {
-       return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
+       return bpos_cmp(b->key.k.p, path->pos) < 0;
 }
 
-static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
+static inline bool btree_path_pos_in_node(struct btree_path *path,
                                          struct btree *b)
 {
-       return iter->btree_id == b->c.btree_id &&
-               !btree_iter_pos_before_node(iter, b) &&
-               !btree_iter_pos_after_node(iter, b);
+       return path->btree_id == b->c.btree_id &&
+               !btree_path_pos_before_node(path, b) &&
+               !btree_path_pos_after_node(path, b);
 }
 
 /* Btree node locking: */
 
-void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
+void bch2_btree_node_unlock_write(struct btree_trans *trans,
+                       struct btree_path *path, struct btree *b)
 {
-       bch2_btree_node_unlock_write_inlined(b, iter);
+       bch2_btree_node_unlock_write_inlined(trans, path, b);
 }
 
-void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b)
 {
-       struct btree_iter *linked;
+       struct btree_path *linked;
        unsigned readers = 0;
 
-       EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
-
-       trans_for_each_iter(iter->trans, linked)
+       trans_for_each_path(trans, linked)
                if (linked->l[b->c.level].b == b &&
                    btree_node_read_locked(linked, b->c.level))
                        readers++;
@@ -124,140 +159,155 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
         * goes to 0, and it's safe because we have the node intent
         * locked:
         */
-       atomic64_sub(__SIX_VAL(read_lock, readers),
-                    &b->c.lock.state.counter);
-       btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
-       atomic64_add(__SIX_VAL(read_lock, readers),
-                    &b->c.lock.state.counter);
+       if (!b->c.lock.readers)
+               atomic64_sub(__SIX_VAL(read_lock, readers),
+                            &b->c.lock.state.counter);
+       else
+               this_cpu_sub(*b->c.lock.readers, readers);
+
+       six_lock_write(&b->c.lock, NULL, NULL);
+
+       if (!b->c.lock.readers)
+               atomic64_add(__SIX_VAL(read_lock, readers),
+                            &b->c.lock.state.counter);
+       else
+               this_cpu_add(*b->c.lock.readers, readers);
 }
 
-bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+bool __bch2_btree_node_relock(struct btree_trans *trans,
+                             struct btree_path *path, unsigned level)
 {
-       struct btree *b = btree_iter_node(iter, level);
-       int want = __btree_lock_want(iter, level);
+       struct btree *b = btree_path_node(path, level);
+       int want = __btree_lock_want(path, level);
 
-       if (!is_btree_node(iter, level))
-               return false;
+       if (!is_btree_node(path, level))
+               goto fail;
 
        if (race_fault())
-               return false;
+               goto fail;
 
-       if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
-           (btree_node_lock_seq_matches(iter, b, level) &&
-            btree_node_lock_increment(iter->trans, b, level, want))) {
-               mark_btree_node_locked(iter, level, want);
+       if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) ||
+           (btree_node_lock_seq_matches(path, b, level) &&
+            btree_node_lock_increment(trans, b, level, want))) {
+               mark_btree_node_locked(path, level, want);
                return true;
-       } else {
-               return false;
        }
+fail:
+       trace_btree_node_relock_fail(trans->fn, _RET_IP_,
+                                    path->btree_id,
+                                    &path->pos,
+                                    (unsigned long) b,
+                                    path->l[level].lock_seq,
+                                    is_btree_node(path, level) ? b->c.lock.state.seq : 0);
+       return false;
 }
 
-static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+bool bch2_btree_node_upgrade(struct btree_trans *trans,
+                            struct btree_path *path, unsigned level)
 {
-       struct btree *b = iter->l[level].b;
+       struct btree *b = path->l[level].b;
 
-       EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
-
-       if (!is_btree_node(iter, level))
+       if (!is_btree_node(path, level))
                return false;
 
-       if (btree_node_intent_locked(iter, level))
+       switch (btree_lock_want(path, level)) {
+       case BTREE_NODE_UNLOCKED:
+               BUG_ON(btree_node_locked(path, level));
+               return true;
+       case BTREE_NODE_READ_LOCKED:
+               BUG_ON(btree_node_intent_locked(path, level));
+               return bch2_btree_node_relock(trans, path, level);
+       case BTREE_NODE_INTENT_LOCKED:
+               break;
+       }
+
+       if (btree_node_intent_locked(path, level))
                return true;
 
        if (race_fault())
                return false;
 
-       if (btree_node_locked(iter, level)
+       if (btree_node_locked(path, level)
            ? six_lock_tryupgrade(&b->c.lock)
-           : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
+           : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq))
                goto success;
 
-       if (btree_node_lock_seq_matches(iter, b, level) &&
-           btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
-               btree_node_unlock(iter, level);
+       if (btree_node_lock_seq_matches(path, b, level) &&
+           btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
+               btree_node_unlock(path, level);
                goto success;
        }
 
        return false;
 success:
-       mark_btree_node_intent_locked(iter, level);
+       mark_btree_node_intent_locked(path, level);
        return true;
 }
 
-static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade,
-                                       unsigned long trace_ip)
+static inline bool btree_path_get_locks(struct btree_trans *trans,
+                                       struct btree_path *path,
+                                       bool upgrade)
 {
-       unsigned l = iter->level;
+       unsigned l = path->level;
        int fail_idx = -1;
 
        do {
-               if (!btree_iter_node(iter, l))
+               if (!btree_path_node(path, l))
                        break;
 
                if (!(upgrade
-                     ? bch2_btree_node_upgrade(iter, l)
-                     : bch2_btree_node_relock(iter, l))) {
-                       (upgrade
-                        ? trace_node_upgrade_fail
-                        : trace_node_relock_fail)(iter->trans->ip, trace_ip,
-                                       btree_iter_type(iter) == BTREE_ITER_CACHED,
-                                       iter->btree_id, &iter->real_pos,
-                                       l, iter->l[l].lock_seq,
-                                       is_btree_node(iter, l)
-                                       ? 0
-                                       : (unsigned long) iter->l[l].b,
-                                       is_btree_node(iter, l)
-                                       ? iter->l[l].b->c.lock.state.seq
-                                       : 0);
+                     ? bch2_btree_node_upgrade(trans, path, l)
+                     : bch2_btree_node_relock(trans, path, l)))
                        fail_idx = l;
-                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-               }
 
                l++;
-       } while (l < iter->locks_want);
+       } while (l < path->locks_want);
 
        /*
         * When we fail to get a lock, we have to ensure that any child nodes
-        * can't be relocked so bch2_btree_iter_traverse has to walk back up to
+        * can't be relocked so bch2_btree_path_traverse has to walk back up to
         * the node that we failed to relock:
         */
-       while (fail_idx >= 0) {
-               btree_node_unlock(iter, fail_idx);
-               iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
-               --fail_idx;
+       if (fail_idx >= 0) {
+               __bch2_btree_path_unlock(path);
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+
+               do {
+                       path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+                       --fail_idx;
+               } while (fail_idx >= 0);
        }
 
-       if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
-               iter->uptodate = BTREE_ITER_NEED_PEEK;
+       if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+               path->uptodate = BTREE_ITER_UPTODATE;
 
-       bch2_btree_trans_verify_locks(iter->trans);
+       bch2_trans_verify_locks(trans);
 
-       return iter->uptodate < BTREE_ITER_NEED_RELOCK;
+       return path->uptodate < BTREE_ITER_NEED_RELOCK;
 }
 
 static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
-                                 enum btree_iter_type type)
+                                 bool cached)
 {
-       return  type != BTREE_ITER_CACHED
+       return !cached
                ? container_of(_b, struct btree, c)->key.k.p
                : container_of(_b, struct bkey_cached, c)->key.pos;
 }
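
btree_node_pos() works because both struct btree and struct bkey_cached embed the same struct btree_bkey_cached_common, so container_of() can recover either outer object from a pointer to the shared member. A self-contained illustration of the pattern with toy types (not the real bcachefs structs):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct common { int level; };
struct node   { long key; struct common c; };
struct cached { struct common c; long pos; };

static long toy_node_pos(struct common *_b, int cached)
{
	return !cached
		? container_of(_b, struct node, c)->key
		: container_of(_b, struct cached, c)->pos;
}

int main(void)
{
	struct node n = { .key = 42 };
	struct cached k = { .pos = 7 };

	printf("%ld %ld\n", toy_node_pos(&n.c, 0), toy_node_pos(&k.c, 1));
	return 0;
}
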
 
 /* Slowpath: */
-bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
-                           unsigned level, struct btree_iter *iter,
+bool __bch2_btree_node_lock(struct btree_trans *trans,
+                           struct btree_path *path,
+                           struct btree *b,
+                           struct bpos pos, unsigned level,
                            enum six_lock_type type,
                            six_lock_should_sleep_fn should_sleep_fn, void *p,
                            unsigned long ip)
 {
-       struct btree_trans *trans = iter->trans;
-       struct btree_iter *linked, *deadlock_iter = NULL;
-       u64 start_time = local_clock();
-       unsigned reason = 9;
-       bool ret;
+       struct btree_path *linked;
+       unsigned reason;
 
        /* Check if it's safe to block: */
-       trans_for_each_iter(trans, linked) {
+       trans_for_each_path(trans, linked) {
                if (!linked->nodes_locked)
                        continue;
 
@@ -275,141 +325,114 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                 */
                if (type == SIX_LOCK_intent &&
                    linked->nodes_locked != linked->nodes_intent_locked) {
-                       deadlock_iter = linked;
                        reason = 1;
+                       goto deadlock;
                }
 
-               if (linked->btree_id != iter->btree_id) {
-                       if (linked->btree_id > iter->btree_id) {
-                               deadlock_iter = linked;
-                               reason = 3;
-                       }
-                       continue;
+               if (linked->btree_id != path->btree_id) {
+                       if (linked->btree_id < path->btree_id)
+                               continue;
+
+                       reason = 3;
+                       goto deadlock;
                }
 
                /*
-                * Within the same btree, cached iterators come before non
-                * cached iterators:
+                * Within the same btree, non-cached paths come before cached
+                * paths:
                 */
-               if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
-                       if (btree_iter_is_cached(iter)) {
-                               deadlock_iter = linked;
-                               reason = 4;
-                       }
-                       continue;
+               if (linked->cached != path->cached) {
+                       if (!linked->cached)
+                               continue;
+
+                       reason = 4;
+                       goto deadlock;
                }
 
                /*
                 * Interior nodes must be locked before their descendants: if
-                * another iterator has possible descendants locked of the node
+                * another path holds locks on possible descendants of the node
                 * we're about to lock, it must hold the ancestor locks too:
                 */
                if (level > __fls(linked->nodes_locked)) {
-                       deadlock_iter = linked;
                        reason = 5;
+                       goto deadlock;
                }
 
                /* Must lock btree nodes in key order: */
                if (btree_node_locked(linked, level) &&
                    bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
-                                                btree_iter_type(linked))) <= 0) {
-                       deadlock_iter = linked;
-                       reason = 7;
+                                                linked->cached)) <= 0) {
                        BUG_ON(trans->in_traverse_all);
+                       reason = 7;
+                       goto deadlock;
                }
        }
 
-       if (unlikely(deadlock_iter)) {
-               trace_trans_restart_would_deadlock(trans->ip, ip,
-                               trans->in_traverse_all, reason,
-                               deadlock_iter->btree_id,
-                               btree_iter_type(deadlock_iter),
-                               &deadlock_iter->real_pos,
-                               iter->btree_id,
-                               btree_iter_type(iter),
-                               &pos);
-               btree_trans_restart(trans);
-               return false;
-       }
-
-       if (six_trylock_type(&b->c.lock, type))
-               return true;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans->locking_iter_idx = iter->idx;
-       trans->locking_pos      = pos;
-       trans->locking_btree_id = iter->btree_id;
-       trans->locking_level    = level;
-       trans->locking          = b;
-#endif
-
-       ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
-
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans->locking = NULL;
-#endif
-       if (ret)
-               bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
-                                      start_time);
-       return ret;
+       return btree_node_lock_type(trans, path, b, pos, level,
+                                   type, should_sleep_fn, p);
+deadlock:
+       trace_trans_restart_would_deadlock(trans->fn, ip,
+                       trans->in_traverse_all, reason,
+                       linked->btree_id,
+                       linked->cached,
+                       &linked->pos,
+                       path->btree_id,
+                       path->cached,
+                       &pos);
+       btree_trans_restart(trans);
+       return false;
 }
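
The checks above amount to a single global lock order: ascending btree id, then non-cached paths before cached ones, then interior nodes before their descendants, then ascending key order. A lock request that would go backwards in this order restarts the transaction instead of blocking, which is what rules out deadlock cycles. A hedged sketch of that order as a standalone comparator (illustrative fields only; the real code also special-cases intent locks via the nodes_locked bitmasks):

#include <stdbool.h>

struct lock_req {
	unsigned btree_id;
	bool     cached;
	unsigned level;	/* 0 = leaf */
	long     pos;
};

/*
 * Returns true if 'next' may be taken while 'held' is held, i.e. 'next'
 * comes strictly after 'held' in the global lock order sketched above:
 */
static bool lock_order_ok(const struct lock_req *held,
			  const struct lock_req *next)
{
	if (held->btree_id != next->btree_id)
		return held->btree_id < next->btree_id;
	if (held->cached != next->cached)
		return !held->cached;		  /* non-cached first */
	if (held->level != next->level)
		return held->level > next->level; /* ancestors first */
	return held->pos < next->pos;		  /* key order */
}

int main(void)
{
	struct lock_req root = { .btree_id = 0, .cached = false, .level = 1, .pos = 0 };
	struct lock_req leaf = { .btree_id = 0, .cached = false, .level = 0, .pos = 10 };

	return !lock_order_ok(&root, &leaf); /* 0: descending to a child is fine */
}
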
 
 /* Btree iterator locking: */
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
+
+static void bch2_btree_path_verify_locks(struct btree_path *path)
 {
        unsigned l;
 
-       if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
-               BUG_ON(iter->nodes_locked);
+       if (!path->nodes_locked) {
+               BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+                      btree_path_node(path, path->level));
                return;
        }
 
-       for (l = 0; btree_iter_node(iter, l); l++) {
-               if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
-                   !btree_node_locked(iter, l))
-                       continue;
-
-               BUG_ON(btree_lock_want(iter, l) !=
-                      btree_node_locked_type(iter, l));
-       }
+       for (l = 0; btree_path_node(path, l); l++)
+               BUG_ON(btree_lock_want(path, l) !=
+                      btree_node_locked_type(path, l));
 }
 
-void bch2_btree_trans_verify_locks(struct btree_trans *trans)
+void bch2_trans_verify_locks(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               bch2_btree_iter_verify_locks(iter);
+       trans_for_each_path(trans, path)
+               bch2_btree_path_verify_locks(path);
 }
 #else
-static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
 #endif
 
+/* Btree path locking: */
+
 /*
  * Only for btree_cache.c - only relocks intent locks
  */
-bool bch2_btree_iter_relock_intent(struct btree_iter *iter)
+bool bch2_btree_path_relock_intent(struct btree_trans *trans,
+                                  struct btree_path *path)
 {
        unsigned l;
 
-       for (l = iter->level;
-            l < iter->locks_want && btree_iter_node(iter, l);
+       for (l = path->level;
+            l < path->locks_want && btree_path_node(path, l);
             l++) {
-               if (!bch2_btree_node_relock(iter, l)) {
-                       trace_node_relock_fail(iter->trans->ip, _RET_IP_,
-                                       btree_iter_type(iter) == BTREE_ITER_CACHED,
-                                       iter->btree_id, &iter->real_pos,
-                                       l, iter->l[l].lock_seq,
-                                       is_btree_node(iter, l)
-                                       ? 0
-                                       : (unsigned long) iter->l[l].b,
-                                       is_btree_node(iter, l)
-                                       ? iter->l[l].b->c.lock.state.seq
-                                       : 0);
-                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-                       btree_trans_restart(iter->trans);
+               if (!bch2_btree_node_relock(trans, path, l)) {
+                       __bch2_btree_path_unlock(path);
+                       btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+                       trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_,
+                                                  path->btree_id, &path->pos);
+                       btree_trans_restart(trans);
                        return false;
                }
        }
@@ -418,25 +441,30 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter)
 }
 
 __flatten
-bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip)
+static bool bch2_btree_path_relock(struct btree_trans *trans,
+                       struct btree_path *path, unsigned long trace_ip)
 {
-       bool ret = btree_iter_get_locks(iter, false, trace_ip);
+       bool ret = btree_path_get_locks(trans, path, false);
 
-       if (!ret)
-               btree_trans_restart(iter->trans);
+       if (!ret) {
+               trace_trans_restart_relock_path(trans->fn, trace_ip,
+                                               path->btree_id, &path->pos);
+               btree_trans_restart(trans);
+       }
        return ret;
 }
 
-bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
+bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+                              struct btree_path *path,
                               unsigned new_locks_want)
 {
-       struct btree_iter *linked;
+       struct btree_path *linked;
 
-       EBUG_ON(iter->locks_want >= new_locks_want);
+       EBUG_ON(path->locks_want >= new_locks_want);
 
-       iter->locks_want = new_locks_want;
+       path->locks_want = new_locks_want;
 
-       if (btree_iter_get_locks(iter, true, _THIS_IP_))
+       if (btree_path_get_locks(trans, path, true))
                return true;
 
        /*
@@ -444,7 +472,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
         * iterators in the btree_trans here.
         *
         * On failure to upgrade the iterator, setting iter->locks_want and
-        * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
+        * calling get_locks() is sufficient to make bch2_btree_path_traverse()
         * get the locks we want on transaction restart.
         *
         * But if this iterator was a clone, on transaction restart what we did
@@ -456,75 +484,67 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
         *
         * The code below used to be needed to ensure ancestor nodes get locked
         * before interior nodes - now that's handled by
-        * bch2_btree_iter_traverse_all().
+        * bch2_btree_path_traverse_all().
         */
-       trans_for_each_iter(iter->trans, linked)
-               if (linked != iter &&
-                   btree_iter_type(linked) == btree_iter_type(iter) &&
-                   linked->btree_id == iter->btree_id &&
+       trans_for_each_path(trans, linked)
+               if (linked != path &&
+                   linked->cached == path->cached &&
+                   linked->btree_id == path->btree_id &&
                    linked->locks_want < new_locks_want) {
                        linked->locks_want = new_locks_want;
-                       btree_iter_get_locks(linked, true, _THIS_IP_);
+                       btree_path_get_locks(trans, linked, true);
                }
 
-       if (iter->should_be_locked)
-               btree_trans_restart(iter->trans);
        return false;
 }
 
-void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+void __bch2_btree_path_downgrade(struct btree_path *path,
                                 unsigned new_locks_want)
 {
        unsigned l;
 
-       EBUG_ON(iter->locks_want < new_locks_want);
+       EBUG_ON(path->locks_want < new_locks_want);
 
-       iter->locks_want = new_locks_want;
+       path->locks_want = new_locks_want;
 
-       while (iter->nodes_locked &&
-              (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
-               if (l > iter->level) {
-                       btree_node_unlock(iter, l);
+       while (path->nodes_locked &&
+              (l = __fls(path->nodes_locked)) >= path->locks_want) {
+               if (l > path->level) {
+                       btree_node_unlock(path, l);
                } else {
-                       if (btree_node_intent_locked(iter, l)) {
-                               six_lock_downgrade(&iter->l[l].b->c.lock);
-                               iter->nodes_intent_locked ^= 1 << l;
+                       if (btree_node_intent_locked(path, l)) {
+                               six_lock_downgrade(&path->l[l].b->c.lock);
+                               path->nodes_intent_locked ^= 1 << l;
                        }
                        break;
                }
        }
 
-       bch2_btree_trans_verify_locks(iter->trans);
+       bch2_btree_path_verify_locks(path);
 }
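
Downgrade walks the nodes_locked bitmask from the most significant bit down: __fls() gives the highest locked level, levels >= locks_want that are above path->level are unlocked outright, and at path->level the intent lock is demoted to a read lock. A toy model of that loop using plain bitmasks (illustrative names, not the real six-lock state):

#include <stdio.h>

/* Highest set bit, like the kernel's __fls() for nonzero values: */
static unsigned fls_u32(unsigned v)
{
	unsigned r = 0;

	while (v >>= 1)
		r++;
	return r;
}

struct toy_path {
	unsigned level;
	unsigned locks_want;
	unsigned nodes_locked;		/* bit l set: level l locked */
	unsigned nodes_intent_locked;	/* bit l set: lock is intent */
};

static void toy_downgrade(struct toy_path *p, unsigned new_want)
{
	unsigned l;

	p->locks_want = new_want;
	while (p->nodes_locked &&
	       (l = fls_u32(p->nodes_locked)) >= p->locks_want) {
		if (l > p->level) {
			p->nodes_locked		&= ~(1U << l);	/* unlock */
			p->nodes_intent_locked	&= ~(1U << l);
		} else {
			p->nodes_intent_locked	&= ~(1U << l);	/* intent -> read */
			break;
		}
	}
}

int main(void)
{
	struct toy_path p = {
		.level = 0, .locks_want = 4,
		.nodes_locked = 0xf, .nodes_intent_locked = 0xf,
	};

	toy_downgrade(&p, 0);
	/* keeps a read lock on the leaf: locked 0x1, intent 0x0 */
	printf("locked %x intent %x\n", p.nodes_locked, p.nodes_intent_locked);
	return 0;
}
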
 
 void bch2_trans_downgrade(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               bch2_btree_iter_downgrade(iter);
+       trans_for_each_path(trans, path)
+               bch2_btree_path_downgrade(path);
 }
 
 /* Btree transaction locking: */
 
-static inline bool btree_iter_should_be_locked(struct btree_iter *iter)
-{
-       return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) ||
-               iter->should_be_locked;
-}
-
 bool bch2_trans_relock(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
        if (unlikely(trans->restarted))
                return false;
 
-       trans_for_each_iter(trans, iter)
-               if (btree_iter_should_be_locked(iter) &&
-                   !bch2_btree_iter_relock(iter, _RET_IP_)) {
-                       trace_trans_restart_relock(trans->ip, _RET_IP_,
-                                       iter->btree_id, &iter->real_pos);
+       trans_for_each_path(trans, path)
+               if (path->should_be_locked &&
+                   !bch2_btree_path_relock(trans, path, _RET_IP_)) {
+                       trace_trans_restart_relock(trans->fn, _RET_IP_,
+                                       path->btree_id, &path->pos);
                        BUG_ON(!trans->restarted);
                        return false;
                }
@@ -533,10 +553,10 @@ bool bch2_trans_relock(struct btree_trans *trans)
 
 void bch2_trans_unlock(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               __bch2_btree_iter_unlock(iter);
+       trans_for_each_path(trans, path)
+               __bch2_btree_path_unlock(path);
 
        BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
 }
@@ -545,26 +565,27 @@ void bch2_trans_unlock(struct btree_trans *trans)
 
 #ifdef CONFIG_BCACHEFS_DEBUG
 
-static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
+static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+                                         struct btree_path *path)
 {
        struct bkey_cached *ck;
-       bool locked = btree_node_locked(iter, 0);
+       bool locked = btree_node_locked(path, 0);
 
-       if (!bch2_btree_node_relock(iter, 0))
+       if (!bch2_btree_node_relock(trans, path, 0))
                return;
 
-       ck = (void *) iter->l[0].b;
-       BUG_ON(ck->key.btree_id != iter->btree_id ||
-              bkey_cmp(ck->key.pos, iter->pos));
+       ck = (void *) path->l[0].b;
+       BUG_ON(ck->key.btree_id != path->btree_id ||
+              bkey_cmp(ck->key.pos, path->pos));
 
        if (!locked)
-               btree_node_unlock(iter, 0);
+               btree_node_unlock(path, 0);
 }
 
-static void bch2_btree_iter_verify_level(struct btree_iter *iter,
-                                        unsigned level)
+static void bch2_btree_path_verify_level(struct btree_trans *trans,
+                               struct btree_path *path, unsigned level)
 {
-       struct btree_iter_level *l;
+       struct btree_path_level *l;
        struct btree_node_iter tmp;
        bool locked;
        struct bkey_packed *p, *k;
@@ -574,65 +595,52 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter,
        if (!bch2_debug_check_iterators)
                return;
 
-       l       = &iter->l[level];
+       l       = &path->l[level];
        tmp     = l->iter;
-       locked  = btree_node_locked(iter, level);
+       locked  = btree_node_locked(path, level);
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
+       if (path->cached) {
                if (!level)
-                       bch2_btree_iter_verify_cached(iter);
+                       bch2_btree_path_verify_cached(trans, path);
                return;
        }
 
-       BUG_ON(iter->level < iter->min_depth);
-
-       if (!btree_iter_node(iter, level))
+       if (!btree_path_node(path, level))
                return;
 
-       if (!bch2_btree_node_relock(iter, level))
+       if (!bch2_btree_node_relock(trans, path, level))
                return;
 
-       BUG_ON(!btree_iter_pos_in_node(iter, l->b));
-
-       /*
-        * node iterators don't use leaf node iterator:
-        */
-       if (btree_iter_type(iter) == BTREE_ITER_NODES &&
-           level <= iter->min_depth)
-               goto unlock;
+       BUG_ON(!btree_path_pos_in_node(path, l->b));
 
        bch2_btree_node_iter_verify(&l->iter, l->b);
 
        /*
-        * For interior nodes, the iterator will have skipped past
-        * deleted keys:
-        *
-        * For extents, the iterator may have skipped past deleted keys (but not
-        * whiteouts)
+        * For interior nodes, the iterator will have skipped past deleted keys:
         */
-       p = level || btree_node_type_is_extents(iter->btree_id)
+       p = level
                ? bch2_btree_node_iter_prev(&tmp, l->b)
                : bch2_btree_node_iter_prev_all(&tmp, l->b);
        k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
 
-       if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
+       if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) {
                msg = "before";
                goto err;
        }
 
-       if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
+       if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
                msg = "after";
                goto err;
        }
-unlock:
+
        if (!locked)
-               btree_node_unlock(iter, level);
+               btree_node_unlock(path, level);
        return;
 err:
        strcpy(buf2, "(none)");
        strcpy(buf3, "(none)");
 
-       bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+       bch2_bpos_to_text(&PBUF(buf1), path->pos);
 
        if (p) {
                struct bkey uk = bkey_unpack_key(l->b, p);
@@ -644,79 +652,175 @@ err:
                bch2_bkey_to_text(&PBUF(buf3), &uk);
        }
 
-       panic("iterator should be %s key at level %u:\n"
-             "iter pos %s\n"
+       panic("path should be %s key at level %u:\n"
+             "path pos %s\n"
              "prev key %s\n"
              "cur  key %s\n",
              msg, level, buf1, buf2, buf3);
 }
 
-static void bch2_btree_iter_verify(struct btree_iter *iter)
+static void bch2_btree_path_verify(struct btree_trans *trans,
+                                  struct btree_path *path)
 {
-       struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
-       enum btree_iter_type type = btree_iter_type(iter);
        unsigned i;
 
-       EBUG_ON(iter->btree_id >= BTREE_ID_NR);
+       EBUG_ON(path->btree_id >= BTREE_ID_NR);
 
-       BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
-              iter->pos.snapshot != iter->snapshot);
+       for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) {
+               if (!path->l[i].b) {
+                       BUG_ON(!path->cached &&
+                              c->btree_roots[path->btree_id].b->c.level > i);
+                       break;
+               }
+
+               bch2_btree_path_verify_level(trans, path, i);
+       }
+
+       bch2_btree_path_verify_locks(path);
+}
+
+void bch2_trans_verify_paths(struct btree_trans *trans)
+{
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               bch2_btree_path_verify(trans, path);
+}
+
+static void bch2_btree_iter_verify(struct btree_iter *iter)
+{
+       struct btree_trans *trans = iter->trans;
+
+       BUG_ON(iter->btree_id >= BTREE_ID_NR);
+
+       BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
 
        BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
               (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
 
-       BUG_ON(type == BTREE_ITER_NODES &&
-              !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
-
-       BUG_ON(type != BTREE_ITER_NODES &&
+       BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
               (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
               !btree_type_has_snapshots(iter->btree_id));
 
-       for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) {
-               if (!iter->l[i].b) {
-                       BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i);
-                       break;
-               }
-
-               bch2_btree_iter_verify_level(iter, i);
-       }
-
-       bch2_btree_iter_verify_locks(iter);
+       if (iter->update_path)
+               bch2_btree_path_verify(trans, iter->update_path);
+       bch2_btree_path_verify(trans, iter->path);
 }
 
 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
 {
-       enum btree_iter_type type = btree_iter_type(iter);
+       BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+              !iter->pos.snapshot);
 
        BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
               iter->pos.snapshot != iter->snapshot);
 
-       BUG_ON((type == BTREE_ITER_KEYS ||
-               type == BTREE_ITER_CACHED) &&
-              (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
-               bkey_cmp(iter->pos, iter->k.p) > 0));
+       BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
+              bkey_cmp(iter->pos, iter->k.p) > 0);
 }
 
-void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k)
 {
-       struct btree_iter *iter;
+       struct btree_trans *trans = iter->trans;
+       struct btree_iter copy;
+       struct bkey_s_c prev;
+       int ret = 0;
 
        if (!bch2_debug_check_iterators)
-               return;
+               return 0;
+
+       if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS))
+               return 0;
+
+       if (bkey_err(k) || !k.k)
+               return 0;
+
+       BUG_ON(!bch2_snapshot_is_ancestor(trans->c,
+                                         iter->snapshot,
+                                         k.k->p.snapshot));
+
+       bch2_trans_iter_init(trans, &copy, iter->btree_id, iter->pos,
+                            BTREE_ITER_NOPRESERVE|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       prev = bch2_btree_iter_prev(&copy);
+       if (!prev.k)
+               goto out;
+
+       ret = bkey_err(prev);
+       if (ret)
+               goto out;
+
+       if (!bkey_cmp(prev.k->p, k.k->p) &&
+           bch2_snapshot_is_ancestor(trans->c, iter->snapshot,
+                                     prev.k->p.snapshot) > 0) {
+               char buf1[100], buf2[200];
+
+               bch2_bkey_to_text(&PBUF(buf1), k.k);
+               bch2_bkey_to_text(&PBUF(buf2), prev.k);
+
+               panic("iter snap %u\n"
+                     "k    %s\n"
+                     "prev %s\n",
+                     iter->snapshot,
+                     buf1, buf2);
+       }
+out:
+       bch2_trans_iter_exit(trans, &copy);
+       return ret;
+}
+
+void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+                           struct bpos pos, bool key_cache)
+{
+       struct btree_path *path;
+       unsigned idx;
+       char buf[100];
+
+       trans_for_each_path_inorder(trans, path, idx) {
+               int cmp = cmp_int(path->btree_id, id) ?:
+                       cmp_int(path->cached, key_cache);
+
+               if (cmp > 0)
+                       break;
+               if (cmp < 0)
+                       continue;
+
+               if (!(path->nodes_locked & 1) ||
+                   !path->should_be_locked)
+                       continue;
+
+               if (!key_cache) {
+                       if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 &&
+                           bkey_cmp(pos, path->l[0].b->key.k.p) <= 0)
+                               return;
+               } else {
+                       if (!bkey_cmp(pos, path->pos))
+                               return;
+               }
+       }
 
-       trans_for_each_iter_with_node(trans, b, iter)
-               bch2_btree_iter_verify_level(iter, b->c.level);
+       bch2_dump_trans_paths_updates(trans);
+       panic("not locked: %s %s%s\n",
+             bch2_btree_ids[id],
+             (bch2_bpos_to_text(&PBUF(buf), pos), buf),
+             key_cache ? " cached" : "");
 }
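
The in-order scan above relies on the kernel's cmp_int() plus the GNU `?:` extension: when the first comparison is nonzero it is the result, otherwise evaluation falls through to the next field, giving a lexicographic compare without nested ifs. A small standalone sketch of the idiom (cmp_int here is a local macro; the construct needs GCC/Clang):

#include <stdio.h>

#define cmp_int(a, b) ((a) > (b) ? 1 : (a) < (b) ? -1 : 0)

struct key { int btree_id; int cached; long pos; };

static int key_cmp(const struct key *l, const struct key *r)
{
	/* lexicographic: btree_id first, then cached, then pos */
	return cmp_int(l->btree_id, r->btree_id) ?:
	       cmp_int(l->cached, r->cached) ?:
	       cmp_int(l->pos, r->pos);
}

int main(void)
{
	struct key a = { 1, 0, 10 }, b = { 1, 0, 20 };

	printf("%d\n", key_cmp(&a, &b)); /* -1 */
	return 0;
}
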
 
 #else
 
-static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
+static inline void bch2_btree_path_verify_level(struct btree_trans *trans,
+                                               struct btree_path *path, unsigned l) {}
+static inline void bch2_btree_path_verify(struct btree_trans *trans,
+                                         struct btree_path *path) {}
 static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
 static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; }
 
 #endif
 
+/* Btree path: fixups after btree updates */
+
 static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
                                        struct btree *b,
                                        struct bset_tree *t,
@@ -734,40 +838,38 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
        bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
 }
 
-static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+static void __bch2_btree_path_fix_key_modified(struct btree_path *path,
                                               struct btree *b,
                                               struct bkey_packed *where)
 {
-       struct btree_iter_level *l = &iter->l[b->c.level];
+       struct btree_path_level *l = &path->l[b->c.level];
 
        if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
                return;
 
-       if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
+       if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0)
                bch2_btree_node_iter_advance(&l->iter, l->b);
-
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
 
-void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
                                      struct btree *b,
                                      struct bkey_packed *where)
 {
-       struct btree_iter *linked;
+       struct btree_path *path;
 
-       trans_for_each_iter_with_node(iter->trans, b, linked) {
-               __bch2_btree_iter_fix_key_modified(linked, b, where);
-               bch2_btree_iter_verify_level(linked, b->c.level);
+       trans_for_each_path_with_node(trans, b, path) {
+               __bch2_btree_path_fix_key_modified(path, b, where);
+               bch2_btree_path_verify_level(trans, path, b->c.level);
        }
 }
 
-static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
-                                     struct btree *b,
-                                     struct btree_node_iter *node_iter,
-                                     struct bset_tree *t,
-                                     struct bkey_packed *where,
-                                     unsigned clobber_u64s,
-                                     unsigned new_u64s)
+static void __bch2_btree_node_iter_fix(struct btree_path *path,
+                                      struct btree *b,
+                                      struct btree_node_iter *node_iter,
+                                      struct bset_tree *t,
+                                      struct bkey_packed *where,
+                                      unsigned clobber_u64s,
+                                      unsigned new_u64s)
 {
        const struct bkey_packed *end = btree_bkey_last(b, t);
        struct btree_node_iter_set *set;
@@ -785,7 +887,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
 
        /* didn't find the bset in the iterator - might have to re-add it: */
        if (new_u64s &&
-           bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
+           bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
                bch2_btree_node_iter_push(node_iter, b, where, end);
                goto fixup_done;
        } else {
@@ -800,7 +902,7 @@ found:
                return;
 
        if (new_u64s &&
-           bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
+           bkey_iter_pos_cmp(b, where, &path->pos) >= 0) {
                set->k = offset;
        } else if (set->k < offset + clobber_u64s) {
                set->k = offset + new_u64s;
@@ -826,8 +928,7 @@ fixup_done:
         */
        if (!bch2_btree_node_iter_end(node_iter) &&
            iter_current_key_modified &&
-           (b->c.level ||
-            btree_node_type_is_extents(iter->btree_id))) {
+           b->c.level) {
                struct bset_tree *t;
                struct bkey_packed *k, *k2, *p;
 
@@ -852,14 +953,10 @@ fixup_done:
                                                            b, t, k2);
                }
        }
-
-       if (!b->c.level &&
-           node_iter == &iter->l[0].iter &&
-           iter_current_key_modified)
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
 
-void bch2_btree_node_iter_fix(struct btree_iter *iter,
+void bch2_btree_node_iter_fix(struct btree_trans *trans,
+                             struct btree_path *path,
                              struct btree *b,
                              struct btree_node_iter *node_iter,
                              struct bkey_packed *where,
@@ -867,31 +964,31 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
                              unsigned new_u64s)
 {
        struct bset_tree *t = bch2_bkey_to_bset(b, where);
-       struct btree_iter *linked;
+       struct btree_path *linked;
 
-       if (node_iter != &iter->l[b->c.level].iter) {
-               __bch2_btree_node_iter_fix(iter, b, node_iter, t,
+       if (node_iter != &path->l[b->c.level].iter) {
+               __bch2_btree_node_iter_fix(path, b, node_iter, t,
                                           where, clobber_u64s, new_u64s);
 
                if (bch2_debug_check_iterators)
                        bch2_btree_node_iter_verify(node_iter, b);
        }
 
-       trans_for_each_iter_with_node(iter->trans, b, linked) {
+       trans_for_each_path_with_node(trans, b, linked) {
                __bch2_btree_node_iter_fix(linked, b,
                                           &linked->l[b->c.level].iter, t,
                                           where, clobber_u64s, new_u64s);
-               bch2_btree_iter_verify_level(linked, b->c.level);
+               bch2_btree_path_verify_level(trans, linked, b->c.level);
        }
 }
 
-static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
-                                                 struct btree_iter_level *l,
+/* Btree path level: pointer to a particular btree node and node iter */
+
+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c,
+                                                 struct btree_path_level *l,
                                                  struct bkey *u,
                                                  struct bkey_packed *k)
 {
-       struct bkey_s_c ret;
-
        if (unlikely(!k)) {
                /*
                 * signal to bch2_btree_iter_peek_slot() that we're currently at
@@ -901,58 +998,50 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
                return bkey_s_c_null;
        }
 
-       ret = bkey_disassemble(l->b, k, u);
-
-       /*
-        * XXX: bch2_btree_bset_insert_key() generates invalid keys when we
-        * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key
-        * being overwritten but doesn't change k->size. But this is ok, because
-        * those keys are never written out, we just have to avoid a spurious
-        * assertion here:
-        */
-       if (bch2_debug_check_bkeys && !bkey_deleted(ret.k))
-               bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
-
-       return ret;
+       return bkey_disassemble(l->b, k, u);
 }
 
-/* peek_all() doesn't skip deleted keys */
-static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
-                                                       struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c,
+                                                       struct btree_path_level *l,
+                                                       struct bkey *u)
 {
-       return __btree_iter_unpack(iter, l, &iter->k,
+       return __btree_iter_unpack(c, l, u,
                        bch2_btree_node_iter_peek_all(&l->iter, l->b));
 }
 
-static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
-                                                   struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c,
+                                                   struct btree_path *path,
+                                                   struct btree_path_level *l,
+                                                   struct bkey *u)
 {
-       struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
+       struct bkey_s_c k = __btree_iter_unpack(c, l, u,
                        bch2_btree_node_iter_peek(&l->iter, l->b));
 
-       iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
+       path->pos = k.k ? k.k->p : l->b->key.k.p;
        return k;
 }
 
-static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
-                                                   struct btree_iter_level *l)
+static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c,
+                                                   struct btree_path *path,
+                                                   struct btree_path_level *l,
+                                                   struct bkey *u)
 {
-       struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
+       struct bkey_s_c k = __btree_iter_unpack(c, l, u,
                        bch2_btree_node_iter_prev(&l->iter, l->b));
 
-       iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
+       path->pos = k.k ? k.k->p : l->b->data->min_key;
        return k;
 }
 
-static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
-                                            struct btree_iter_level *l,
+static inline bool btree_path_advance_to_pos(struct btree_path *path,
+                                            struct btree_path_level *l,
                                             int max_advance)
 {
        struct bkey_packed *k;
        int nr_advanced = 0;
 
        while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
-              bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
+              bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) {
                if (max_advance > 0 && nr_advanced >= max_advance)
                        return false;
 
@@ -966,9 +1055,11 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
 /*
  * Verify that iterator for parent node points to child node:
  */
-static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
+static void btree_path_verify_new_node(struct btree_trans *trans,
+                                      struct btree_path *path, struct btree *b)
 {
-       struct btree_iter_level *l;
+       struct bch_fs *c = trans->c;
+       struct btree_path_level *l;
        unsigned plevel;
        bool parent_locked;
        struct bkey_packed *k;
@@ -976,16 +1067,19 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
        if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
                return;
 
+       if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+               return;
+
        plevel = b->c.level + 1;
-       if (!btree_iter_node(iter, plevel))
+       if (!btree_path_node(path, plevel))
                return;
 
-       parent_locked = btree_node_locked(iter, plevel);
+       parent_locked = btree_node_locked(path, plevel);
 
-       if (!bch2_btree_node_relock(iter, plevel))
+       if (!bch2_btree_node_relock(trans, path, plevel))
                return;
 
-       l = &iter->l[plevel];
+       l = &path->l[plevel];
        k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
        if (!k ||
            bkey_deleted(k) ||
@@ -996,8 +1090,8 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
                char buf4[100];
                struct bkey uk = bkey_unpack_key(b, k);
 
-               bch2_dump_btree_node(iter->trans->c, l->b);
-               bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
+               bch2_dump_btree_node(c, l->b);
+               bch2_bpos_to_text(&PBUF(buf1), path->pos);
                bch2_bkey_to_text(&PBUF(buf2), &uk);
                bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
                bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
@@ -1005,20 +1099,20 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
                      "iter pos %s %s\n"
                      "iter key %s\n"
                      "new node %s-%s\n",
-                     bch2_btree_ids[iter->btree_id], buf1,
+                     bch2_btree_ids[path->btree_id], buf1,
                      buf2, buf3, buf4);
        }
 
        if (!parent_locked)
-               btree_node_unlock(iter, b->c.level + 1);
+               btree_node_unlock(path, plevel);
 }
 
-static inline void __btree_iter_init(struct btree_iter *iter,
-                                    unsigned level)
+static inline void __btree_path_level_init(struct btree_path *path,
+                                          unsigned level)
 {
-       struct btree_iter_level *l = &iter->l[level];
+       struct btree_path_level *l = &path->l[level];
 
-       bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
+       bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
 
        /*
         * Iterators to interior nodes should always be pointed at the first non
@@ -1026,63 +1120,48 @@ static inline void __btree_iter_init(struct btree_iter *iter,
         */
        if (level)
                bch2_btree_node_iter_peek(&l->iter, l->b);
-
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
 }
 
-static inline void btree_iter_node_set(struct btree_iter *iter,
-                                      struct btree *b)
+static inline void btree_path_level_init(struct btree_trans *trans,
+                                        struct btree_path *path,
+                                        struct btree *b)
 {
-       BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
+       BUG_ON(path->cached);
 
-       btree_iter_verify_new_node(iter, b);
+       btree_path_verify_new_node(trans, path, b);
 
-       EBUG_ON(!btree_iter_pos_in_node(iter, b));
+       EBUG_ON(!btree_path_pos_in_node(path, b));
        EBUG_ON(b->c.lock.state.seq & 1);
 
-       iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
-       iter->l[b->c.level].b = b;
-       __btree_iter_init(iter, b->c.level);
+       path->l[b->c.level].lock_seq = b->c.lock.state.seq;
+       path->l[b->c.level].b = b;
+       __btree_path_level_init(path, b->c.level);
 }
 
+/* Btree path: fixups after btree node updates: */
+
 /*
  * A btree node is being replaced - update the iterator to point to the new
  * node:
  */
-void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
 {
-       enum btree_node_locked_type t;
-       struct btree_iter *linked;
+       struct btree_path *path;
 
-       trans_for_each_iter(iter->trans, linked)
-               if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
-                   btree_iter_pos_in_node(linked, b)) {
-                       /*
-                        * bch2_btree_iter_node_drop() has already been called -
-                        * the old node we're replacing has already been
-                        * unlocked and the pointer invalidated
-                        */
-                       BUG_ON(btree_node_locked(linked, b->c.level));
+       trans_for_each_path(trans, path)
+               if (!path->cached &&
+                   btree_path_pos_in_node(path, b)) {
+                       enum btree_node_locked_type t =
+                               btree_lock_want(path, b->c.level);
 
-                       t = btree_lock_want(linked, b->c.level);
-                       if (t != BTREE_NODE_UNLOCKED) {
+                       if (path->nodes_locked &&
+                           t != BTREE_NODE_UNLOCKED) {
+                               btree_node_unlock(path, b->c.level);
                                six_lock_increment(&b->c.lock, t);
-                               mark_btree_node_locked(linked, b->c.level, t);
+                               mark_btree_node_locked(path, b->c.level, t);
                        }
 
-                       btree_iter_node_set(linked, b);
-               }
-}
-
-void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
-{
-       struct btree_iter *linked;
-       unsigned level = b->c.level;
-
-       trans_for_each_iter(iter->trans, linked)
-               if (linked->l[level].b == b) {
-                       btree_node_unlock(linked, level);
-                       linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
+                       btree_path_level_init(trans, path, b);
                }
 }
 
@@ -1090,14 +1169,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
  * A btree node has been modified in such a way as to invalidate iterators - fix
  * them:
  */
-void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
 {
-       struct btree_iter *linked;
+       struct btree_path *path;
 
-       trans_for_each_iter_with_node(iter->trans, b, linked)
-               __btree_iter_init(linked, b->c.level);
+       trans_for_each_path_with_node(trans, b, path)
+               __btree_path_level_init(path, b->c.level);
 }
 
+/* Btree path: traverse, set_pos: */
+
 static int lock_root_check_fn(struct six_lock *lock, void *p)
 {
        struct btree *b = container_of(lock, struct btree, c.lock);
@@ -1106,38 +1187,38 @@ static int lock_root_check_fn(struct six_lock *lock, void *p)
        return b == *rootp ? 0 : -1;
 }
 
-static inline int btree_iter_lock_root(struct btree_trans *trans,
-                                      struct btree_iter *iter,
+static inline int btree_path_lock_root(struct btree_trans *trans,
+                                      struct btree_path *path,
                                       unsigned depth_want,
                                       unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
+       struct btree *b, **rootp = &c->btree_roots[path->btree_id].b;
        enum six_lock_type lock_type;
        unsigned i;
 
-       EBUG_ON(iter->nodes_locked);
+       EBUG_ON(path->nodes_locked);
 
        while (1) {
                b = READ_ONCE(*rootp);
-               iter->level = READ_ONCE(b->c.level);
+               path->level = READ_ONCE(b->c.level);
 
-               if (unlikely(iter->level < depth_want)) {
+               if (unlikely(path->level < depth_want)) {
                        /*
                         * the root is at a lower depth than the depth we want:
                         * we got to the end of the btree, or we're walking nodes
                         * greater than some depth and there are no nodes >=
                         * that depth
                         */
-                       iter->level = depth_want;
-                       for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
-                               iter->l[i].b = NULL;
+                       path->level = depth_want;
+                       for (i = path->level; i < BTREE_MAX_DEPTH; i++)
+                               path->l[i].b = NULL;
                        return 1;
                }
 
-               lock_type = __btree_lock_want(iter, iter->level);
-               if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level,
-                                             iter, lock_type,
+               lock_type = __btree_lock_want(path, path->level);
+               if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX,
+                                             path->level, lock_type,
                                              lock_root_check_fn, rootp,
                                              trace_ip))) {
                        if (trans->restarted)
@@ -1146,16 +1227,16 @@ static inline int btree_iter_lock_root(struct btree_trans *trans,
                }
 
                if (likely(b == READ_ONCE(*rootp) &&
-                          b->c.level == iter->level &&
+                          b->c.level == path->level &&
                           !race_fault())) {
-                       for (i = 0; i < iter->level; i++)
-                               iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
-                       iter->l[iter->level].b = b;
-                       for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
-                               iter->l[i].b = NULL;
-
-                       mark_btree_node_locked(iter, iter->level, lock_type);
-                       btree_iter_node_set(iter, b);
+                       for (i = 0; i < path->level; i++)
+                               path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
+                       path->l[path->level].b = b;
+                       for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++)
+                               path->l[i].b = NULL;
+
+                       mark_btree_node_locked(path, path->level, lock_type);
+                       btree_path_level_init(trans, path, b);
                        return 0;
                }
 
@@ -1164,23 +1245,23 @@ static inline int btree_iter_lock_root(struct btree_trans *trans,
 }
 
 noinline
-static int btree_iter_prefetch(struct btree_iter *iter)
+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path)
 {
-       struct bch_fs *c = iter->trans->c;
-       struct btree_iter_level *l = &iter->l[iter->level];
+       struct bch_fs *c = trans->c;
+       struct btree_path_level *l = path_l(path);
        struct btree_node_iter node_iter = l->iter;
        struct bkey_packed *k;
        struct bkey_buf tmp;
        unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
-               ? (iter->level > 1 ? 0 :  2)
-               : (iter->level > 1 ? 1 : 16);
-       bool was_locked = btree_node_locked(iter, iter->level);
+               ? (path->level > 1 ? 0 :  2)
+               : (path->level > 1 ? 1 : 16);
+       bool was_locked = btree_node_locked(path, path->level);
        int ret = 0;
 
        bch2_bkey_buf_init(&tmp);
 
        while (nr && !ret) {
-               if (!bch2_btree_node_relock(iter, iter->level))
+               if (!bch2_btree_node_relock(trans, path, path->level))
                        break;
 
                bch2_btree_node_iter_advance(&node_iter, l->b);
@@ -1189,26 +1270,62 @@ static int btree_iter_prefetch(struct btree_iter *iter)
                        break;
 
                bch2_bkey_buf_unpack(&tmp, c, l->b, k);
-               ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
-                                              iter->level - 1);
+               ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+                                              path->level - 1);
+       }
+
+       if (!was_locked)
+               btree_node_unlock(path, path->level);
+
+       bch2_bkey_buf_exit(&tmp, c);
+       return ret;
+}
+
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+                                struct btree_and_journal_iter *jiter)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+       struct bkey_buf tmp;
+       unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+               ? (path->level > 1 ? 0 :  2)
+               : (path->level > 1 ? 1 : 16);
+       bool was_locked = btree_node_locked(path, path->level);
+       int ret = 0;
+
+       bch2_bkey_buf_init(&tmp);
+
+       while (nr && !ret) {
+               if (!bch2_btree_node_relock(trans, path, path->level))
+                       break;
+
+               bch2_btree_and_journal_iter_advance(jiter);
+               k = bch2_btree_and_journal_iter_peek(jiter);
+               if (!k.k)
+                       break;
+
+               bch2_bkey_buf_reassemble(&tmp, c, k);
+               ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+                                              path->level - 1);
        }
 
        if (!was_locked)
-               btree_node_unlock(iter, iter->level);
+               btree_node_unlock(path, path->level);
 
        bch2_bkey_buf_exit(&tmp, c);
        return ret;
 }
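
Both prefetch variants size their readahead the same way: once the filesystem is started, prefetch is kept cheap (nothing above level 1, two children when descending to leaves); before that, during recovery, it is much more aggressive since the node cache is cold. A sketch of just that heuristic as a hypothetical helper (the name is ours, not bcachefs API):

#include <stdbool.h>

/*
 * Hypothetical helper mirroring the 'nr' computation above: how many
 * sibling child nodes to prefetch when descending from 'level'.
 */
static unsigned prefetch_batch(bool fs_started, unsigned level)
{
	if (fs_started)
		return level > 1 ? 0 : 2;	/* steady state: keep it cheap */
	return level > 1 ? 1 : 16;		/* startup: cache is cold */
}

int main(void)
{
	return prefetch_batch(true, 1) == 2 ? 0 : 1;
}
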
 
-static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
+                                           struct btree_path *path,
                                            unsigned plevel, struct btree *b)
 {
-       struct btree_iter_level *l = &iter->l[plevel];
-       bool locked = btree_node_locked(iter, plevel);
+       struct btree_path_level *l = &path->l[plevel];
+       bool locked = btree_node_locked(path, plevel);
        struct bkey_packed *k;
        struct bch_btree_ptr_v2 *bp;
 
-       if (!bch2_btree_node_relock(iter, plevel))
+       if (!bch2_btree_node_relock(trans, path, plevel))
                return;
 
        k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
@@ -1218,59 +1335,96 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
        bp->mem_ptr = (unsigned long)b;
 
        if (!locked)
-               btree_node_unlock(iter, plevel);
+               btree_node_unlock(path, plevel);
+}
+
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+                                                    struct btree_path *path,
+                                                    unsigned flags,
+                                                    struct bkey_buf *out)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_path_level *l = path_l(path);
+       struct btree_and_journal_iter jiter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+       k = bch2_btree_and_journal_iter_peek(&jiter);
+
+       bch2_bkey_buf_reassemble(out, c, k);
+
+       if (flags & BTREE_ITER_PREFETCH)
+               ret = btree_path_prefetch_j(trans, path, &jiter);
+
+       bch2_btree_and_journal_iter_exit(&jiter);
+       return ret;
 }
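
Until journal replay finishes, the on-disk btree alone is stale, so child lookups go through a combined iterator that merges btree keys with not-yet-replayed journal keys, the journal copy winning at equal positions. A toy model of that merge step over two sorted arrays (illustrative only, not the real bch2_btree_and_journal_iter):

#include <stdio.h>

struct toy_iter { const long *keys; unsigned nr, idx; };

static const long *toy_peek(struct toy_iter *it)
{
	return it->idx < it->nr ? &it->keys[it->idx] : NULL;
}

/* Peek the merged stream: smaller key wins, journal wins ties. */
static const long *merged_peek(struct toy_iter *btree, struct toy_iter *journal)
{
	const long *b = toy_peek(btree);
	const long *j = toy_peek(journal);

	if (b && j)
		return *j <= *b ? j : b;
	return b ? b : j;
}

int main(void)
{
	const long bkeys[] = { 1, 5, 9 }, jkeys[] = { 5, 7 };
	struct toy_iter b = { bkeys, 3, 0 }, j = { jkeys, 2, 0 };

	printf("%ld\n", *merged_peek(&b, &j)); /* 1, from the btree */
	return 0;
}
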
 
-static __always_inline int btree_iter_down(struct btree_trans *trans,
-                                          struct btree_iter *iter,
+static __always_inline int btree_path_down(struct btree_trans *trans,
+                                          struct btree_path *path,
+                                          unsigned flags,
                                           unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter_level *l = &iter->l[iter->level];
+       struct btree_path_level *l = path_l(path);
        struct btree *b;
-       unsigned level = iter->level - 1;
-       enum six_lock_type lock_type = __btree_lock_want(iter, level);
+       unsigned level = path->level - 1;
+       enum six_lock_type lock_type = __btree_lock_want(path, level);
+       bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
        struct bkey_buf tmp;
        int ret;
 
-       EBUG_ON(!btree_node_locked(iter, iter->level));
+       EBUG_ON(!btree_node_locked(path, path->level));
 
        bch2_bkey_buf_init(&tmp);
-       bch2_bkey_buf_unpack(&tmp, c, l->b,
-                        bch2_btree_node_iter_peek(&l->iter, l->b));
 
-       b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip);
+       if (unlikely(!replay_done)) {
+               ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+               if (ret)
+                       goto err;
+       } else {
+               bch2_bkey_buf_unpack(&tmp, c, l->b,
+                                bch2_btree_node_iter_peek(&l->iter, l->b));
+
+               if (flags & BTREE_ITER_PREFETCH) {
+                       ret = btree_path_prefetch(trans, path);
+                       if (ret)
+                               goto err;
+               }
+       }
+
+       b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
        ret = PTR_ERR_OR_ZERO(b);
        if (unlikely(ret))
                goto err;
 
-       mark_btree_node_locked(iter, level, lock_type);
-       btree_iter_node_set(iter, b);
+       mark_btree_node_locked(path, level, lock_type);
+       btree_path_level_init(trans, path, b);
 
-       if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+       if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
            unlikely(b != btree_node_mem_ptr(tmp.k)))
-               btree_node_mem_ptr_set(iter, level + 1, b);
-
-       if (iter->flags & BTREE_ITER_PREFETCH)
-               ret = btree_iter_prefetch(iter);
+               btree_node_mem_ptr_set(trans, path, level + 1, b);
 
-       if (btree_node_read_locked(iter, level + 1))
-               btree_node_unlock(iter, level + 1);
-       iter->level = level;
+       if (btree_node_read_locked(path, level + 1))
+               btree_node_unlock(path, level + 1);
+       path->level = level;
 
-       bch2_btree_iter_verify_locks(iter);
+       bch2_btree_path_verify_locks(path);
 err:
        bch2_bkey_buf_exit(&tmp, c);
        return ret;
 }
 
-static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
+static int btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+                                  unsigned, unsigned long);
 
-static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
+static int __btree_path_traverse_all(struct btree_trans *trans, int ret,
                                     unsigned long trace_ip)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_path *path;
        int i;
 
        if (trans->in_traverse_all)
@@ -1280,20 +1434,20 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret,
 retry_all:
        trans->restarted = false;
 
-       trans_for_each_iter(trans, iter)
-               iter->should_be_locked = false;
+       trans_for_each_path(trans, path)
+               path->should_be_locked = false;
 
-       btree_trans_sort_iters(trans);
+       btree_trans_verify_sorted(trans);
 
        for (i = trans->nr_sorted - 2; i >= 0; --i) {
-               struct btree_iter *iter1 = trans->iters + trans->sorted[i];
-               struct btree_iter *iter2 = trans->iters + trans->sorted[i + 1];
-
-               if (iter1->btree_id == iter2->btree_id &&
-                   iter1->locks_want < iter2->locks_want)
-                       __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
-               else if (!iter1->locks_want && iter2->locks_want)
-                       __bch2_btree_iter_upgrade(iter1, 1);
+               struct btree_path *path1 = trans->paths + trans->sorted[i];
+               struct btree_path *path2 = trans->paths + trans->sorted[i + 1];
+
+               if (path1->btree_id == path2->btree_id &&
+                   path1->locks_want < path2->locks_want)
+                       __bch2_btree_path_upgrade(trans, path1, path2->locks_want);
+               else if (!path1->locks_want && path2->locks_want)
+                       __bch2_btree_path_upgrade(trans, path1, 1);
        }
 
        bch2_trans_unlock(trans);
@@ -1310,66 +1464,89 @@ retry_all:
                } while (ret);
        }
 
-       if (unlikely(ret == -EIO)) {
-               trans->error = true;
+       if (unlikely(ret == -EIO))
                goto out;
-       }
 
        BUG_ON(ret && ret != -EINTR);
 
        /* Now, redo traversals in correct order: */
-       trans_for_each_iter_inorder(trans, iter) {
-               EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
-
-               ret = btree_iter_traverse_one(iter, _THIS_IP_);
-               if (ret)
-                       goto retry_all;
+       i = 0;
+       while (i < trans->nr_sorted) {
+               path = trans->paths + trans->sorted[i];
 
-               EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+               /*
+                * Traversing a path can cause another path to be added at about
+                * the same position:
+                */
+               if (path->uptodate) {
+                       ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_);
+                       if (ret)
+                               goto retry_all;
+               } else {
+                       i++;
+               }
        }
 
-       trans_for_each_iter(trans, iter)
-               BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+       /*
+        * BTREE_ITER_NEED_RELOCK is ok here: if we called bch2_trans_unlock()
+        * and then relock(), relock() won't have relocked this path, because
+        * path->should_be_locked isn't set yet - and that's fine.
+        */
+       trans_for_each_path(trans, path)
+               BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE);
 out:
        bch2_btree_cache_cannibalize_unlock(c);
 
        trans->in_traverse_all = false;
 
-       trace_trans_traverse_all(trans->ip, trace_ip);
+       trace_trans_traverse_all(trans->fn, trace_ip);
        return ret;
 }
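
The traversal loop above deliberately avoids a plain counted loop: traversing one
path can allocate another path at roughly the same sort position, so the index
only advances past entries that are already up to date and re-checks the current
slot otherwise. A generic sketch of that worklist shape (names and types are
illustrative, not bcachefs code):

    #include <stdbool.h>
    #include <stdio.h>

    struct item { bool needs_work; };

    /* may append items, hence nr is passed by pointer */
    static void process(struct item *items, unsigned *nr, unsigned i)
    {
            items[i].needs_work = false;
    }

    static void drain(struct item *items, unsigned *nr)
    {
            unsigned i = 0;

            while (i < *nr) {
                    if (items[i].needs_work)
                            process(items, nr, i);  /* slot i gets re-checked */
                    else
                            i++;
            }
    }

    int main(void)
    {
            struct item items[8] = { { true }, { false }, { true } };
            unsigned nr = 3;

            drain(items, &nr);
            printf("drained %u items\n", nr);
            return 0;
    }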
 
-static int bch2_btree_iter_traverse_all(struct btree_trans *trans)
+static int bch2_btree_path_traverse_all(struct btree_trans *trans)
 {
-       return __btree_iter_traverse_all(trans, 0, _RET_IP_);
+       return __btree_path_traverse_all(trans, 0, _RET_IP_);
 }
 
-static inline bool btree_iter_good_node(struct btree_iter *iter,
+static inline bool btree_path_good_node(struct btree_trans *trans,
+                                       struct btree_path *path,
                                        unsigned l, int check_pos)
 {
-       if (!is_btree_node(iter, l) ||
-           !bch2_btree_node_relock(iter, l))
+       if (!is_btree_node(path, l) ||
+           !bch2_btree_node_relock(trans, path, l))
                return false;
 
-       if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
+       if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b))
                return false;
-       if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
+       if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b))
                return false;
        return true;
 }
 
-static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
+                                                    struct btree_path *path,
                                                     int check_pos)
 {
-       unsigned l = iter->level;
+       unsigned i, l = path->level;
 
-       while (btree_iter_node(iter, l) &&
-              !btree_iter_good_node(iter, l, check_pos)) {
-               btree_node_unlock(iter, l);
-               iter->l[l].b = BTREE_ITER_NO_NODE_UP;
+       while (btree_path_node(path, l) &&
+              !btree_path_good_node(trans, path, l, check_pos)) {
+               btree_node_unlock(path, l);
+               path->l[l].b = BTREE_ITER_NO_NODE_UP;
                l++;
        }
 
+       /* If we need intent locks, take them too: */
+       for (i = l + 1;
+            i < path->locks_want && btree_path_node(path, i);
+            i++)
+               if (!bch2_btree_node_relock(trans, path, i))
+                       while (l <= i) {
+                               btree_node_unlock(path, l);
+                               path->l[l].b = BTREE_ITER_NO_NODE_UP;
+                               l++;
+                       }
+
        return l;
 }
 
@@ -1382,131 +1559,459 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
  * On error, caller (peek_node()/peek_key()) must return NULL; the error is
  * stashed in the iterator and returned from bch2_trans_exit().
  */
-static int btree_iter_traverse_one(struct btree_iter *iter,
+static int btree_path_traverse_one(struct btree_trans *trans,
+                                  struct btree_path *path,
+                                  unsigned flags,
                                   unsigned long trace_ip)
 {
-       struct btree_trans *trans = iter->trans;
-       unsigned l, depth_want = iter->level;
+       unsigned depth_want = path->level;
        int ret = 0;
 
+       if (unlikely(trans->restarted)) {
+               ret = -EINTR;
+               goto out;
+       }
+
        /*
-        * Ensure we obey iter->should_be_locked: if it's set, we can't unlock
-        * and re-traverse the iterator without a transaction restart:
+        * Ensure we obey path->should_be_locked: if it's set, we can't unlock
+        * and re-traverse the path without a transaction restart:
         */
-       if (iter->should_be_locked) {
-               ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR;
+       if (path->should_be_locked) {
+               ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR;
                goto out;
        }
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
-               ret = bch2_btree_iter_traverse_cached(iter);
+       if (path->cached) {
+               ret = bch2_btree_path_traverse_cached(trans, path, flags);
                goto out;
        }
 
-       if (unlikely(iter->level >= BTREE_MAX_DEPTH))
+       if (unlikely(path->level >= BTREE_MAX_DEPTH))
                goto out;
 
-       iter->level = btree_iter_up_until_good_node(iter, 0);
-
-       /* If we need intent locks, take them too: */
-       for (l = iter->level + 1;
-            l < iter->locks_want && btree_iter_node(iter, l);
-            l++)
-               if (!bch2_btree_node_relock(iter, l))
-                       while (iter->level <= l) {
-                               btree_node_unlock(iter, iter->level);
-                               iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
-                               iter->level++;
-                       }
+       path->level = btree_path_up_until_good_node(trans, path, 0);
 
        /*
-        * Note: iter->nodes[iter->level] may be temporarily NULL here - that
+        * Note: path->nodes[path->level] may be temporarily NULL here - that
         * would indicate to other code that we got to the end of the btree;
         * here it indicates that relocking the root failed - it's critical that
-        * btree_iter_lock_root() comes next and that it can't fail
+        * btree_path_lock_root() comes next and that it can't fail
         */
-       while (iter->level > depth_want) {
-               ret = btree_iter_node(iter, iter->level)
-                       ? btree_iter_down(trans, iter, trace_ip)
-                       : btree_iter_lock_root(trans, iter, depth_want, trace_ip);
+       while (path->level > depth_want) {
+               ret = btree_path_node(path, path->level)
+                       ? btree_path_down(trans, path, flags, trace_ip)
+                       : btree_path_lock_root(trans, path, depth_want, trace_ip);
                if (unlikely(ret)) {
                        if (ret == 1) {
                                /*
-                                * Got to the end of the btree (in
-                                * BTREE_ITER_NODES mode)
+                                * No nodes at this level - got to the end of
+                                * the btree:
                                 */
                                ret = 0;
                                goto out;
                        }
 
-                       __bch2_btree_iter_unlock(iter);
-                       iter->level = depth_want;
+                       __bch2_btree_path_unlock(path);
+                       path->level = depth_want;
 
-                       if (ret == -EIO) {
-                               iter->flags |= BTREE_ITER_ERROR;
-                               iter->l[iter->level].b =
+                       if (ret == -EIO)
+                               path->l[path->level].b =
                                        BTREE_ITER_NO_NODE_ERROR;
-                       } else {
-                               iter->l[iter->level].b =
+                       else
+                               path->l[path->level].b =
                                        BTREE_ITER_NO_NODE_DOWN;
-                       }
                        goto out;
                }
        }
 
-       iter->uptodate = BTREE_ITER_NEED_PEEK;
+       path->uptodate = BTREE_ITER_UPTODATE;
 out:
        BUG_ON((ret == -EINTR) != !!trans->restarted);
-       trace_iter_traverse(trans->ip, trace_ip,
-                           btree_iter_type(iter) == BTREE_ITER_CACHED,
-                           iter->btree_id, &iter->real_pos, ret);
-       bch2_btree_iter_verify(iter);
+       bch2_btree_path_verify(trans, path);
        return ret;
 }
 
-static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
-{
-       struct btree_trans *trans = iter->trans;
-       int ret;
+static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long);
 
-       ret =   bch2_trans_cond_resched(trans) ?:
-               btree_iter_traverse_one(iter, _RET_IP_);
-       if (unlikely(ret) && hweight64(trans->iters_linked) == 1) {
-               ret = __btree_iter_traverse_all(trans, ret, _RET_IP_);
-               BUG_ON(ret == -EINTR);
-       }
+int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
+                                         struct btree_path *path, unsigned flags)
+{
+       if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+               return 0;
 
-       return ret;
+       return  bch2_trans_cond_resched(trans) ?:
+               btree_path_traverse_one(trans, path, flags, _RET_IP_);
 }
 
-/*
- * Note:
- * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
- * for internal btree iterator users
- *
- * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
- * btree_iter_traverse() does not:
- */
-static inline int __must_check
-btree_iter_traverse(struct btree_iter *iter)
+static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst,
+                           struct btree_path *src)
 {
-       return iter->uptodate >= BTREE_ITER_NEED_RELOCK
-               ? __bch2_btree_iter_traverse(iter)
-               : 0;
+       unsigned i;
+
+       memcpy(&dst->pos, &src->pos,
+              sizeof(struct btree_path) - offsetof(struct btree_path, pos));
+
+       for (i = 0; i < BTREE_MAX_DEPTH; i++)
+               if (btree_node_locked(dst, i))
+                       six_lock_increment(&dst->l[i].b->c.lock,
+                                          __btree_lock_want(dst, i));
+
+       btree_path_check_sort(trans, dst, 0);
 }
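
btree_path_copy() relies on struct layout: everything from ->pos onwards is
duplicated with a single memcpy(), while the identity fields that precede it
(idx, ref counts) keep the destination's values; the loop then bumps the lock
counts for nodes the source held. A standalone sketch of the offsetof() idiom,
with a made-up struct standing in for struct btree_path:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    struct path {
            unsigned idx;           /* identity: not copied */
            unsigned ref;           /* identity: not copied */
            int      pos;           /* copied from here to the end */
            int      level;
    };

    static void path_copy(struct path *dst, const struct path *src)
    {
            memcpy(&dst->pos, &src->pos,
                   sizeof(struct path) - offsetof(struct path, pos));
    }

    int main(void)
    {
            struct path a = { .idx = 0, .ref = 1, .pos = 10, .level = 2 };
            struct path b = { .idx = 7, .ref = 3 };

            path_copy(&b, &a);
            printf("idx %u ref %u pos %d level %d\n",
                   b.idx, b.ref, b.pos, b.level);  /* idx 7 ref 3 pos 10 level 2 */
            return 0;
    }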
 
-int __must_check
-bch2_btree_iter_traverse(struct btree_iter *iter)
+static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
+                                          bool intent)
 {
-       int ret;
+       struct btree_path *new = btree_path_alloc(trans, src);
 
-       btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
+       btree_path_copy(trans, new, src);
+       __btree_path_get(new, intent);
+       return new;
+}
+
+inline struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *trans,
+                        struct btree_path *path, bool intent,
+                        unsigned long ip)
+{
+       if (path->ref > 1 || path->preserve) {
+               __btree_path_put(path, intent);
+               path = btree_path_clone(trans, path, intent);
+               path->preserve = false;
+#ifdef CONFIG_BCACHEFS_DEBUG
+               path->ip_allocated = ip;
+#endif
+               btree_trans_verify_sorted(trans);
+       }
+
+       return path;
+}
+
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
+                  struct btree_path *path, struct bpos new_pos,
+                  bool intent, unsigned long ip)
+{
+       int cmp = bpos_cmp(new_pos, path->pos);
+       unsigned l = path->level;
+
+       EBUG_ON(trans->restarted);
+       EBUG_ON(!path->ref);
+
+       if (!cmp)
+               return path;
+
+       path = bch2_btree_path_make_mut(trans, path, intent, ip);
+
+       path->pos               = new_pos;
+       path->should_be_locked  = false;
+
+       btree_path_check_sort(trans, path, cmp);
+
+       if (unlikely(path->cached)) {
+               btree_node_unlock(path, 0);
+               path->l[0].b = BTREE_ITER_NO_NODE_CACHED;
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+               goto out;
+       }
 
-       ret = btree_iter_traverse(iter);
+       l = btree_path_up_until_good_node(trans, path, cmp);
+
+       if (btree_path_node(path, l)) {
+               /*
+                * We might have to skip over many keys, or just a few: try
+                * advancing the node iterator, and if we have to skip over too
+                * many keys just reinit it (we also reinit when rewinding,
+                * since rewinding the node iterator is expensive).
+                */
+               if (cmp < 0 ||
+                   !btree_path_advance_to_pos(path, &path->l[l], 8))
+                       __btree_path_level_init(path, l);
+       }
+
+       if (l != path->level) {
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+               __bch2_btree_path_unlock(path);
+       }
+out:
+       bch2_btree_path_verify(trans, path);
+       return path;
+}
+
+/* Btree path: main interface: */
+
+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+       struct btree_path *next;
+
+       next = prev_btree_path(trans, path);
+       if (next && !btree_path_cmp(next, path))
+               return next;
+
+       next = next_btree_path(trans, path);
+       if (next && !btree_path_cmp(next, path))
+               return next;
+
+       return NULL;
+}
+
+static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path)
+{
+       struct btree_path *next;
+
+       next = prev_btree_path(trans, path);
+       if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
+               return next;
+
+       next = next_btree_path(trans, path);
+       if (next && next->level == path->level && path_l(next)->b == path_l(path)->b)
+               return next;
+
+       return NULL;
+}
+
+static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+{
+       __bch2_btree_path_unlock(path);
+       btree_path_list_remove(trans, path);
+       trans->paths_allocated &= ~(1ULL << path->idx);
+}
+
+void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+{
+       struct btree_path *dup;
+
+       EBUG_ON(trans->paths + path->idx != path);
+       EBUG_ON(!path->ref);
+
+       if (!__btree_path_put(path, intent))
+               return;
+
+       /*
+        * Perhaps instead we should check for duplicate paths in traverse_all:
+        */
+       if (path->preserve &&
+           (dup = have_path_at_pos(trans, path))) {
+               dup->preserve = true;
+               path->preserve = false;
+               goto free;
+       }
+
+       if (!path->preserve &&
+           (dup = have_node_at_pos(trans, path)))
+               goto free;
+       return;
+free:
+       if (path->should_be_locked &&
+           !btree_node_locked(dup, path->level))
+               return;
+
+       dup->should_be_locked |= path->should_be_locked;
+       __bch2_path_free(trans, path);
+}
+
+noinline __cold
+void bch2_dump_trans_paths_updates(struct btree_trans *trans)
+{
+       struct btree_path *path;
+       struct btree_insert_entry *i;
+       unsigned idx;
+       char buf1[300], buf2[300];
+
+       btree_trans_verify_sorted(trans);
+
+       trans_for_each_path_inorder(trans, path, idx)
+               printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n",
+                      path->idx, path->ref, path->intent_ref,
+                      path->should_be_locked ? " S" : "",
+                      path->preserve ? " P" : "",
+                      bch2_btree_ids[path->btree_id],
+                      (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1),
+                      path->nodes_locked,
+#ifdef CONFIG_BCACHEFS_DEBUG
+                      (void *) path->ip_allocated
+#else
+                      NULL
+#endif
+                      );
+
+       trans_for_each_update(trans, i) {
+               struct bkey u;
+               struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u);
+
+               printk(KERN_ERR "update: btree %s %pS\n  old %s\n  new %s",
+                      bch2_btree_ids[i->btree_id],
+                      (void *) i->ip_allocated,
+                      (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1),
+                      (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2));
+       }
+}
+
+static struct btree_path *btree_path_alloc(struct btree_trans *trans,
+                                          struct btree_path *pos)
+{
+       struct btree_path *path;
+       unsigned idx;
+
+       if (unlikely(trans->paths_allocated ==
+                    ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) {
+               bch2_dump_trans_paths_updates(trans);
+               panic("trans path overflow\n");
+       }
+
+       idx = __ffs64(~trans->paths_allocated);
+       trans->paths_allocated |= 1ULL << idx;
+
+       path = &trans->paths[idx];
+
+       path->idx               = idx;
+       path->ref               = 0;
+       path->intent_ref        = 0;
+       path->nodes_locked      = 0;
+       path->nodes_intent_locked = 0;
+
+       btree_path_list_add(trans, pos, path);
+       return path;
+}
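
trans->paths_allocated is a 64-bit occupancy bitmap: __ffs64(~mask) yields the
first free slot, and the check above panics once all BTREE_ITER_MAX bits are
set. A self-contained sketch of the same allocator pattern, using the GCC
builtin in place of the kernel's __ffs64() (illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t allocated;

    static int slot_alloc(void)
    {
            if (allocated == ~0ULL)
                    return -1;                      /* all 64 slots in use */

            int idx = __builtin_ctzll(~allocated);  /* first clear bit */
            allocated |= 1ULL << idx;
            return idx;
    }

    static void slot_free(int idx)
    {
            allocated &= ~(1ULL << idx);
    }

    int main(void)
    {
            int a = slot_alloc(), b = slot_alloc();

            slot_free(a);
            printf("%d %d %d\n", a, b, slot_alloc()); /* 0 1 0 */
            return 0;
    }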
+
+struct btree_path *bch2_path_get(struct btree_trans *trans,
+                                enum btree_id btree_id, struct bpos pos,
+                                unsigned locks_want, unsigned level,
+                                unsigned flags, unsigned long ip)
+{
+       struct btree_path *path, *path_pos = NULL;
+       bool cached = flags & BTREE_ITER_CACHED;
+       bool intent = flags & BTREE_ITER_INTENT;
+       int i;
+
+       BUG_ON(trans->restarted);
+
+       trans_for_each_path_inorder(trans, path, i) {
+               if (__btree_path_cmp(path,
+                                    btree_id,
+                                    cached,
+                                    pos,
+                                    level) > 0)
+                       break;
+
+               path_pos = path;
+       }
+
+       if (path_pos &&
+           path_pos->cached    == cached &&
+           path_pos->btree_id  == btree_id &&
+           path_pos->level     == level) {
+               __btree_path_get(path_pos, intent);
+               path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+       } else {
+               path = btree_path_alloc(trans, path_pos);
+               path_pos = NULL;
+
+               __btree_path_get(path, intent);
+               path->pos                       = pos;
+               path->btree_id                  = btree_id;
+               path->cached                    = cached;
+               path->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
+               path->should_be_locked          = false;
+               path->level                     = level;
+               path->locks_want                = locks_want;
+               path->nodes_locked              = 0;
+               path->nodes_intent_locked       = 0;
+               for (i = 0; i < ARRAY_SIZE(path->l); i++)
+                       path->l[i].b            = BTREE_ITER_NO_NODE_INIT;
+#ifdef CONFIG_BCACHEFS_DEBUG
+               path->ip_allocated              = ip;
+#endif
+               btree_trans_verify_sorted(trans);
+       }
+
+       if (!(flags & BTREE_ITER_NOPRESERVE))
+               path->preserve = true;
+
+       if (path->intent_ref)
+               locks_want = max(locks_want, level + 1);
+
+       /*
+        * If the path already has locks_want greater than requested, we don't
+        * downgrade it here: on a transaction restart (e.g. because a btree
+        * node split needed to upgrade locks) we might be putting and getting
+        * the iterator again. Downgrading iterators only happens via
+        * bch2_trans_downgrade(), after a successful transaction commit.
+        */
+
+       locks_want = min(locks_want, BTREE_MAX_DEPTH);
+       if (locks_want > path->locks_want) {
+               path->locks_want = locks_want;
+               btree_path_get_locks(trans, path, true);
+       }
+
+       return path;
+}
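
bch2_path_get() scans the sorted path list for the last path sorting at or
before the request; an exact match on (btree_id, cached, level) is simply
re-referenced and repositioned, anything else becomes the insertion hint for a
fresh allocation. A much-simplified sketch of that lookup (two-field keys
instead of the real comparator):

    #include <stdio.h>

    struct path { int btree_id; int pos; };

    /* last element sorting <= (btree_id, pos), or NULL if none */
    static struct path *find_pos(struct path *paths, int nr,
                                 int btree_id, int pos)
    {
            struct path *best = NULL;

            for (int i = 0; i < nr; i++) {
                    if (paths[i].btree_id > btree_id ||
                        (paths[i].btree_id == btree_id && paths[i].pos > pos))
                            break;
                    best = &paths[i];
            }
            return best;
    }

    int main(void)
    {
            struct path paths[] = { { 0, 5 }, { 1, 2 }, { 1, 8 } };
            struct path *hint = find_pos(paths, 3, 1, 4);

            /* reuse hint if it matches exactly, else allocate right after it */
            printf("hint: btree %d pos %d\n", hint->btree_id, hint->pos);
            return 0;
    }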
+
+inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
+{
+       struct bkey_s_c k;
+
+       if (!path->cached) {
+               struct btree_path_level *l = path_l(path);
+               struct bkey_packed *_k;
+
+               EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+
+               _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
+               k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
+
+               EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
+
+               if (!k.k || bpos_cmp(path->pos, k.k->p))
+                       goto hole;
+       } else {
+               struct bkey_cached *ck = (void *) path->l[0].b;
+
+               EBUG_ON(ck &&
+                       (path->btree_id != ck->key.btree_id ||
+                        bkey_cmp(path->pos, ck->key.pos)));
+
+               /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */
+               if (unlikely(!ck || !ck->valid))
+                       return bkey_s_c_null;
+
+               EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
+
+               k = bkey_i_to_s_c(ck->k);
+       }
+
+       return k;
+hole:
+       bkey_init(u);
+       u->p = path->pos;
+       return (struct bkey_s_c) { u, NULL };
+}
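
Note the contract of bch2_btree_path_peek_slot(): a miss is not an error. When
nothing live exists at path->pos, it fills the caller-supplied bkey with a
zero-size deleted key at that position, so slot iteration yields a key for
every position. A toy version of that hole-synthesis contract (hypothetical
types):

    #include <stdbool.h>
    #include <stdio.h>

    struct key { int pos; bool exists; int val; };

    /* lookup never fails: a miss returns a synthesized hole at `pos` */
    static struct key peek_slot(const struct key *tbl, int nr, int pos)
    {
            for (int i = 0; i < nr; i++)
                    if (tbl[i].pos == pos)
                            return tbl[i];

            return (struct key) { .pos = pos, .exists = false };
    }

    int main(void)
    {
            struct key tbl[] = { { 1, true, 10 }, { 4, true, 40 } };
            struct key k = peek_slot(tbl, 2, 3);

            printf("pos %d: %s\n", k.pos, k.exists ? "key" : "hole");
            return 0;
    }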
+
+/* Btree iterators: */
+
+int __must_check
+__bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+       return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
+}
+
+int __must_check
+bch2_btree_iter_traverse(struct btree_iter *iter)
+{
+       int ret;
+
+       iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
+                                       btree_iter_search_key(iter),
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
+
+       ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags);
        if (ret)
                return ret;
 
-       iter->should_be_locked = true;
+       iter->path->should_be_locked = true;
        return 0;
 }
 
@@ -1514,149 +2019,134 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
 
 struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
 {
-       struct btree *b;
+       struct btree_trans *trans = iter->trans;
+       struct btree *b = NULL;
        int ret;
 
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
+       EBUG_ON(iter->path->cached);
        bch2_btree_iter_verify(iter);
 
-       ret = btree_iter_traverse(iter);
+       ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
        if (ret)
-               return NULL;
+               goto err;
 
-       b = btree_iter_node(iter, iter->level);
+       b = btree_path_node(iter->path, iter->path->level);
        if (!b)
-               return NULL;
+               goto out;
 
        BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
 
-       iter->pos = iter->real_pos = b->key.k.p;
+       bkey_init(&iter->k);
+       iter->k.p = iter->pos = b->key.k.p;
 
+       iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
+       iter->path->should_be_locked = true;
+       BUG_ON(iter->path->uptodate);
+out:
+       bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
 
        return b;
+err:
+       b = ERR_PTR(ret);
+       goto out;
 }
 
 struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
 {
-       struct btree *b;
+       struct btree_trans *trans = iter->trans;
+       struct btree_path *path = iter->path;
+       struct btree *b = NULL;
+       unsigned l;
        int ret;
 
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
+       BUG_ON(trans->restarted);
+       EBUG_ON(iter->path->cached);
        bch2_btree_iter_verify(iter);
 
-       /* already got to end? */
-       if (!btree_iter_node(iter, iter->level))
-               return NULL;
-
-       bch2_trans_cond_resched(iter->trans);
-
-       btree_node_unlock(iter, iter->level);
-       iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
-       iter->level++;
-
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-       ret = btree_iter_traverse(iter);
-       if (ret)
+       /* already at end? */
+       if (!btree_path_node(path, path->level))
                return NULL;
 
        /* got to end? */
-       b = btree_iter_node(iter, iter->level);
-       if (!b)
+       if (!btree_path_node(path, path->level + 1)) {
+               btree_node_unlock(path, path->level);
+               path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
+               path->level++;
                return NULL;
+       }
+
+       if (!bch2_btree_node_relock(trans, path, path->level + 1)) {
+               __bch2_btree_path_unlock(path);
+               path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+               path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS;
+               trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_,
+                                          path->btree_id, &path->pos);
+               btree_trans_restart(trans);
+               ret = -EINTR;
+               goto err;
+       }
+
+       b = btree_path_node(path, path->level + 1);
 
-       if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
+       if (!bpos_cmp(iter->pos, b->key.k.p)) {
+               btree_node_unlock(path, path->level);
+               path->l[path->level].b = BTREE_ITER_NO_NODE_UP;
+               path->level++;
+       } else {
                /*
                 * Haven't gotten to the end of the parent node: go back down to
                 * the next child node
                 */
-               btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
+               path = iter->path =
+                       bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+                                          iter->flags & BTREE_ITER_INTENT,
+                                          btree_iter_ip_allocated(iter));
+
+               path->level = iter->min_depth;
 
-               /* Unlock to avoid screwing up our lock invariants: */
-               btree_node_unlock(iter, iter->level);
+               for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++)
+                       if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED)
+                               btree_node_unlock(path, l);
 
-               iter->level = iter->min_depth;
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+               btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
                bch2_btree_iter_verify(iter);
 
-               ret = btree_iter_traverse(iter);
+               ret = bch2_btree_path_traverse(trans, path, iter->flags);
                if (ret)
-                       return NULL;
+                       goto err;
 
-               b = iter->l[iter->level].b;
+               b = path->l[path->level].b;
        }
 
-       iter->pos = iter->real_pos = b->key.k.p;
+       bkey_init(&iter->k);
+       iter->k.p = iter->pos = b->key.k.p;
 
+       iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
+       iter->path->should_be_locked = true;
+       BUG_ON(iter->path->uptodate);
+out:
+       bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
 
        return b;
+err:
+       b = ERR_PTR(ret);
+       goto out;
 }
 
 /* Iterate across keys (in leaf nodes only) */
 
-static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
-{
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct bpos old_pos = iter->real_pos;
-#endif
-       int cmp = bpos_cmp(new_pos, iter->real_pos);
-       unsigned l = iter->level;
-
-       EBUG_ON(iter->trans->restarted);
-
-       if (!cmp)
-               goto out;
-
-       iter->real_pos = new_pos;
-       iter->should_be_locked = false;
-
-       btree_iter_check_sort(iter->trans, iter);
-
-       if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
-               btree_node_unlock(iter, 0);
-               iter->l[0].b = BTREE_ITER_NO_NODE_CACHED;
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-               return;
-       }
-
-       l = btree_iter_up_until_good_node(iter, cmp);
-
-       if (btree_iter_node(iter, l)) {
-               /*
-                * We might have to skip over many keys, or just a few: try
-                * advancing the node iterator, and if we have to skip over too
-                * many keys just reinit it (or if we're rewinding, since that
-                * is expensive).
-                */
-               if (cmp < 0 ||
-                   !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
-                       __btree_iter_init(iter, l);
-
-               /* Don't leave it locked if we're not supposed to: */
-               if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
-                       btree_node_unlock(iter, l);
-       }
-out:
-       if (l != iter->level)
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-       else
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
-
-       bch2_btree_iter_verify(iter);
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trace_iter_set_search_pos(iter->trans->ip, _RET_IP_,
-                                 iter->btree_id,
-                                 &old_pos, &new_pos, l);
-#endif
-}
-
 inline bool bch2_btree_iter_advance(struct btree_iter *iter)
 {
        struct bpos pos = iter->k.p;
-       bool ret = bpos_cmp(pos, SPOS_MAX) != 0;
+       bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+                   ? bpos_cmp(pos, SPOS_MAX)
+                   : bkey_cmp(pos, SPOS_MAX)) != 0;
 
        if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
                pos = bkey_successor(iter, pos);
@@ -1677,54 +2167,177 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
        return ret;
 }
 
-static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
+static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
+                                                     enum btree_id btree_id,
+                                                     struct bpos pos)
 {
-       struct bpos next_pos = iter->l[0].b->key.k.p;
-       bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0;
+       struct btree_insert_entry *i;
 
-       /*
-        * Typically, we don't want to modify iter->pos here, since that
-        * indicates where we searched from - unless we got to the end of the
-        * btree, in that case we want iter->pos to reflect that:
-        */
-       if (ret)
-               btree_iter_set_search_pos(iter, bpos_successor(next_pos));
-       else
-               bch2_btree_iter_set_pos(iter, SPOS_MAX);
+       trans_for_each_update(trans, i)
+               if ((cmp_int(btree_id,  i->btree_id) ?:
+                    bpos_cmp(pos,      i->k->k.p)) <= 0) {
+                       if (btree_id == i->btree_id)
+                               return i->k;
+                       break;
+               }
 
-       return ret;
+       return NULL;
 }
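
The comparator above chains two comparisons with GCC's `a ?: b` extension: the
position comparison only decides the order when the btree IDs tie. A
self-contained sketch of that idiom (compiles with gcc/clang; cmp_int() here
mirrors the kernel macro):

    #include <stdio.h>

    #define cmp_int(a, b) (((a) > (b)) - ((a) < (b)))

    struct key { int btree_id; int pos; };

    static int key_cmp(struct key a, struct key b)
    {
            return cmp_int(a.btree_id, b.btree_id) ?:
                   cmp_int(a.pos, b.pos);
    }

    int main(void)
    {
            struct key a = { 1, 5 }, b = { 1, 9 };

            printf("%d\n", key_cmp(a, b));  /* -1: same btree, lower pos */
            return 0;
    }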
 
-static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
+static noinline
+struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
+                                         struct btree_path *path)
 {
-       struct bpos next_pos = iter->l[0].b->data->min_key;
-       bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
+       struct journal_keys *keys = &trans->c->journal_keys;
+       size_t idx = bch2_journal_key_search(keys, path->btree_id,
+                                            path->level, path->pos);
 
-       if (ret)
-               btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
-       else
-               bch2_btree_iter_set_pos(iter, POS_MIN);
+       while (idx < keys->nr && keys->d[idx].overwritten)
+               idx++;
 
-       return ret;
+       return (idx < keys->nr &&
+               keys->d[idx].btree_id   == path->btree_id &&
+               keys->d[idx].level      == path->level)
+               ? keys->d[idx].k
+               : NULL;
 }
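
__btree_trans_peek_journal() first positions itself with a search, then steps
over entries flagged as overwritten before checking it is still within the same
btree and level. A stripped-down sketch of that skip-overwritten scan (linear
search standing in for bch2_journal_key_search(), two-field keys):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct jkey { int btree_id; int pos; bool overwritten; };

    static const struct jkey keys[] = {
            { 0, 1, true }, { 0, 2, true }, { 0, 3, false }, { 1, 0, false },
    };
    #define NR (sizeof(keys) / sizeof(keys[0]))

    static const struct jkey *journal_peek(int btree_id, int pos)
    {
            size_t idx = 0;

            /* first key >= (btree_id, pos) */
            while (idx < NR && (keys[idx].btree_id < btree_id ||
                                (keys[idx].btree_id == btree_id &&
                                 keys[idx].pos < pos)))
                    idx++;

            while (idx < NR && keys[idx].overwritten)
                    idx++;

            return idx < NR && keys[idx].btree_id == btree_id
                    ? &keys[idx] : NULL;
    }

    int main(void)
    {
            const struct jkey *k = journal_peek(0, 1);

            printf("%d\n", k ? k->pos : -1);  /* 3: first non-overwritten */
            return 0;
    }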
 
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter,
-                                                     struct bpos pos)
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+                                        struct btree_iter *iter,
+                                        struct bkey_s_c k)
 {
-       struct btree_insert_entry *i;
+       struct bkey_i *next_journal =
+               __btree_trans_peek_journal(trans, iter->path);
 
-       if (!(iter->flags & BTREE_ITER_WITH_UPDATES))
-               return NULL;
+       if (next_journal &&
+           bpos_cmp(next_journal->k.p,
+                    k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+               iter->k = next_journal->k;
+               k = bkey_i_to_s_c(next_journal);
+       }
 
-       trans_for_each_update(iter->trans, i)
-               if ((cmp_int(iter->btree_id,    i->iter->btree_id) ?:
-                    bkey_cmp(pos,              i->k->k.p)) <= 0) {
-                       if (iter->btree_id ==   i->iter->btree_id)
-                               return i->k;
+       return k;
+}
+
+/*
+ * Checks btree key cache for key at iter->pos and returns it if present, or
+ * bkey_s_c_null:
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+       struct btree_trans *trans = iter->trans;
+       struct bch_fs *c = trans->c;
+       struct bkey u;
+       int ret;
+
+       if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+               return bkey_s_c_null;
+
+       if (!iter->key_cache_path)
+               iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+                                                    iter->flags & BTREE_ITER_INTENT, 0,
+                                                    iter->flags|BTREE_ITER_CACHED,
+                                                    _THIS_IP_);
+
+       iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
+
+       ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
+
+       iter->key_cache_path->should_be_locked = true;
+
+       return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+}
+
+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
+{
+       struct btree_trans *trans = iter->trans;
+       struct bkey_i *next_update;
+       struct bkey_s_c k, k2;
+       int ret;
+
+       EBUG_ON(iter->path->cached || iter->path->level);
+       bch2_btree_iter_verify(iter);
+
+       while (1) {
+               iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
+
+               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
+               if (unlikely(ret)) {
+                       /* ensure that iter->k is consistent with iter->pos: */
+                       bch2_btree_iter_set_pos(iter, iter->pos);
+                       k = bkey_s_c_err(ret);
+                       goto out;
+               }
+
+               iter->path->should_be_locked = true;
+
+               k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
+
+               if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+                   k.k &&
+                   (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+                       ret = bkey_err(k2);
+                       if (ret) {
+                               k = k2;
+                               bch2_btree_iter_set_pos(iter, iter->pos);
+                               goto out;
+                       }
+
+                       k = k2;
+                       iter->k = *k.k;
+               }
+
+               if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+                       k = btree_trans_peek_journal(trans, iter, k);
+
+               next_update = iter->flags & BTREE_ITER_WITH_UPDATES
+                       ? btree_trans_peek_updates(trans, iter->btree_id, search_key)
+                       : NULL;
+               if (next_update &&
+                   bpos_cmp(next_update->k.p,
+                            k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+                       iter->k = next_update->k;
+                       k = bkey_i_to_s_c(next_update);
+               }
+
+               if (k.k && bkey_deleted(k.k)) {
+                       /*
+                        * If we've got a whiteout, and it's after the search
+                        * key, advance the search key to the whiteout instead
+                        * of just past it - it might be a btree whiteout with a
+                        * real key at the same position, since in the btree
+                        * deleted keys sort before non-deleted ones.
+                        */
+                       search_key = bpos_cmp(search_key, k.k->p)
+                               ? k.k->p
+                               : bpos_successor(k.k->p);
+                       continue;
+               }
+
+               if (likely(k.k)) {
                        break;
+               } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) {
+                       /* Advance to next leaf node: */
+                       search_key = bpos_successor(iter->path->l[0].b->key.k.p);
+               } else {
+                       /* End of btree: */
+                       bch2_btree_iter_set_pos(iter, SPOS_MAX);
+                       k = bkey_s_c_null;
+                       goto out;
                }
+       }
+out:
+       bch2_btree_iter_verify(iter);
 
-       return NULL;
+       return k;
 }
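
__bch2_btree_iter_peek() layers three overlays on the committed btree - the key
cache, keys still only in the journal, and this transaction's pending updates -
and an overlay key wins whenever its position is <= the btree key (or the btree
node has nothing left). A stripped-down sketch of that merge rule (illustrative
types, single overlay):

    #include <stdio.h>

    struct key { int pos; const char *val; };

    static const struct key *merge_peek(const struct key *btree,
                                        const struct key *update)
    {
            if (!btree)
                    return update;
            if (update && update->pos <= btree->pos)
                    return update;  /* overlay shadows the committed key */
            return btree;
    }

    int main(void)
    {
            struct key b = { 4, "committed" }, u = { 4, "pending" };

            printf("%s\n", merge_peek(&b, &u)->val);  /* pending */
            return 0;
    }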
 
 /**
@@ -1733,42 +2346,79 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter,
  */
 struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
 {
+       struct btree_trans *trans = iter->trans;
        struct bpos search_key = btree_iter_search_key(iter);
-       struct bkey_i *next_update;
        struct bkey_s_c k;
        int ret;
 
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
-       bch2_btree_iter_verify(iter);
+       if (iter->update_path) {
+               bch2_path_put(trans, iter->update_path,
+                             iter->flags & BTREE_ITER_INTENT);
+               iter->update_path = NULL;
+       }
+
        bch2_btree_iter_verify_entry_exit(iter);
-start:
-       next_update = btree_trans_peek_updates(iter, search_key);
-       btree_iter_set_search_pos(iter, search_key);
 
        while (1) {
-               ret = btree_iter_traverse(iter);
-               if (unlikely(ret))
-                       return bkey_s_c_err(ret);
+               k = __bch2_btree_iter_peek(iter, search_key);
+               if (!k.k || bkey_err(k))
+                       goto out;
 
-               k = btree_iter_level_peek(iter, &iter->l[0]);
+               if (iter->update_path &&
+                   bkey_cmp(iter->update_path->pos, k.k->p)) {
+                       bch2_path_put(trans, iter->update_path,
+                                     iter->flags & BTREE_ITER_INTENT);
+                       iter->update_path = NULL;
+               }
 
-               if (next_update &&
-                   bpos_cmp(next_update->k.p, iter->real_pos) <= 0) {
-                       iter->k = next_update->k;
-                       k = bkey_i_to_s_c(next_update);
+               if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+                   (iter->flags & BTREE_ITER_INTENT) &&
+                   !(iter->flags & BTREE_ITER_IS_EXTENTS) &&
+                   !iter->update_path) {
+                       struct bpos pos = k.k->p;
+
+                       if (pos.snapshot < iter->snapshot) {
+                               search_key = bpos_successor(k.k->p);
+                               continue;
+                       }
+
+                       pos.snapshot = iter->snapshot;
+
+                       /*
+                        * advance, same as on exit for iter->path, but only up
+                        * to snapshot
+                        */
+                       __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+                       iter->update_path = iter->path;
+
+                       iter->update_path = bch2_btree_path_set_pos(trans,
+                                               iter->update_path, pos,
+                                               iter->flags & BTREE_ITER_INTENT,
+                                               btree_iter_ip_allocated(iter));
+
+                       BUG_ON(!(iter->update_path->nodes_locked & 1));
+                       iter->update_path->should_be_locked = true;
+               }
+
+               /*
+                * We can never have a key in a leaf node at POS_MAX, so
+                * we don't have to check these successor() calls:
+                */
+               if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
+                   !bch2_snapshot_is_ancestor(trans->c,
+                                              iter->snapshot,
+                                              k.k->p.snapshot)) {
+                       search_key = bpos_successor(k.k->p);
+                       continue;
                }
 
-               if (likely(k.k)) {
-                       if (bkey_deleted(k.k)) {
-                               search_key = bkey_successor(iter, k.k->p);
-                               goto start;
-                       }
-
-                       break;
+               if (bkey_whiteout(k.k) &&
+                   !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+                       search_key = bkey_successor(iter, k.k->p);
+                       continue;
                }
 
-               if (!btree_iter_set_pos_to_next_leaf(iter))
-                       return bkey_s_c_null;
+               break;
        }
 
        /*
@@ -1780,9 +2430,28 @@ start:
        else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
                iter->pos = bkey_start_pos(k.k);
 
+       iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
+                               iter->flags & BTREE_ITER_INTENT,
+                               btree_iter_ip_allocated(iter));
+       BUG_ON(!iter->path->nodes_locked);
+out:
+       if (iter->update_path) {
+               BUG_ON(!(iter->update_path->nodes_locked & 1));
+               iter->update_path->should_be_locked = true;
+       }
+       iter->path->should_be_locked = true;
+
+       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+               iter->pos.snapshot = iter->snapshot;
+
+       ret = bch2_btree_iter_verify_ret(iter, k);
+       if (unlikely(ret)) {
+               bch2_btree_iter_set_pos(iter, iter->pos);
+               k = bkey_s_c_err(ret);
+       }
+
        bch2_btree_iter_verify_entry_exit(iter);
-       bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
+
        return k;
 }
 
@@ -1804,37 +2473,103 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
  */
 struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 {
-       struct btree_iter_level *l = &iter->l[0];
+       struct btree_trans *trans = iter->trans;
+       struct bpos search_key = iter->pos;
+       struct btree_path *saved_path = NULL;
        struct bkey_s_c k;
+       struct bkey saved_k;
+       const struct bch_val *saved_v;
        int ret;
 
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
+       EBUG_ON(iter->path->cached || iter->path->level);
        EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+       if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+               return bkey_s_c_err(-EIO);
+
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
 
-       btree_iter_set_search_pos(iter, iter->pos);
+       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+               search_key.snapshot = U32_MAX;
 
        while (1) {
-               ret = btree_iter_traverse(iter);
+               iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+                                               iter->flags & BTREE_ITER_INTENT,
+                                               btree_iter_ip_allocated(iter));
+
+               ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
                if (unlikely(ret)) {
+                       /* ensure that iter->k is consistent with iter->pos: */
+                       bch2_btree_iter_set_pos(iter, iter->pos);
                        k = bkey_s_c_err(ret);
-                       goto no_key;
+                       goto out;
                }
 
-               k = btree_iter_level_peek(iter, l);
+               k = btree_path_level_peek(trans->c, iter->path,
+                                         &iter->path->l[0], &iter->k);
                if (!k.k ||
                    ((iter->flags & BTREE_ITER_IS_EXTENTS)
-                    ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
-                    : bkey_cmp(k.k->p, iter->pos) > 0))
-                       k = btree_iter_level_prev(iter, l);
+                    ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0
+                    : bpos_cmp(k.k->p, search_key) > 0))
+                       k = btree_path_level_prev(trans->c, iter->path,
+                                                 &iter->path->l[0], &iter->k);
 
-               if (likely(k.k))
-                       break;
+               btree_path_check_sort(trans, iter->path, 0);
+
+               if (likely(k.k)) {
+                       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
+                               if (k.k->p.snapshot == iter->snapshot)
+                                       goto got_key;
+
+                               /*
+                                * If we have a saved candidate, and we're no
+                                * longer at the same _key_ (not pos), return
+                                * that candidate
+                                */
+                               if (saved_path && bkey_cmp(k.k->p, saved_k.p)) {
+                                       bch2_path_put(trans, iter->path,
+                                                     iter->flags & BTREE_ITER_INTENT);
+                                       iter->path = saved_path;
+                                       saved_path = NULL;
+                                       iter->k = saved_k;
+                                       k.v     = saved_v;
+                                       goto got_key;
+                               }
+
+                               if (bch2_snapshot_is_ancestor(iter->trans->c,
+                                                             iter->snapshot,
+                                                             k.k->p.snapshot)) {
+                                       if (saved_path)
+                                               bch2_path_put(trans, saved_path,
+                                                     iter->flags & BTREE_ITER_INTENT);
+                                       saved_path = btree_path_clone(trans, iter->path,
+                                                               iter->flags & BTREE_ITER_INTENT);
+                                       saved_k = *k.k;
+                                       saved_v = k.v;
+                               }
+
+                               search_key = bpos_predecessor(k.k->p);
+                               continue;
+                       }
+got_key:
+                       if (bkey_whiteout(k.k) &&
+                           !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+                               search_key = bkey_predecessor(iter, k.k->p);
+                               if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+                                       search_key.snapshot = U32_MAX;
+                               continue;
+                       }
 
-               if (!btree_iter_set_pos_to_prev_leaf(iter)) {
+                       break;
+               } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) {
+                       /* Advance to previous leaf node: */
+                       search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+               } else {
+                       /* Start of btree: */
+                       bch2_btree_iter_set_pos(iter, POS_MIN);
                        k = bkey_s_c_null;
-                       goto no_key;
+                       goto out;
                }
        }
 
@@ -1843,20 +2578,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
        /* Extents can straddle iter->pos: */
        if (bkey_cmp(k.k->p, iter->pos) < 0)
                iter->pos = k.k->p;
+
+       if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
+               iter->pos.snapshot = iter->snapshot;
 out:
+       if (saved_path)
+               bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
+       iter->path->should_be_locked = true;
+
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
+
        return k;
-no_key:
-       /*
-        * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
-        * then we errored going to the previous leaf - make sure it's
-        * consistent with iter->pos:
-        */
-       bkey_init(&iter->k);
-       iter->k.p = iter->pos;
-       goto out;
 }
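
The snapshots logic above keeps a cloned path plus a saved key/value as a
fallback candidate while it continues scanning older versions of the same
position, and returns the candidate once the scan moves to a different key. A
much-simplified sketch of that remember-a-candidate shape, with plain integer
versions standing in for snapshot ancestry (not the real visibility rules):

    #include <stdio.h>

    struct rec { int key; int version; };

    /* versions of each key are adjacent, newest last */
    static const struct rec recs[] = {
            { 1, 2 }, { 2, 1 }, { 2, 3 }, { 2, 9 },
    };
    #define NR (int)(sizeof(recs) / sizeof(recs[0]))

    /* newest record for `key` with version <= snapshot */
    static const struct rec *peek_prev(int key, int snapshot)
    {
            const struct rec *saved = NULL;

            for (int i = NR - 1; i >= 0; i--) {
                    if (recs[i].key != key && saved)
                            break;          /* left the key: candidate wins */
                    if (recs[i].key == key &&
                        recs[i].version <= snapshot && !saved)
                            saved = &recs[i];
            }
            return saved;
    }

    int main(void)
    {
            const struct rec *r = peek_prev(2, 5);

            printf("%d v%d\n", r->key, r->version);  /* 2 v3 */
            return 0;
    }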
 
 /**
@@ -1873,12 +2606,12 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
 
 struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 {
+       struct btree_trans *trans = iter->trans;
        struct bpos search_key;
        struct bkey_s_c k;
        int ret;
 
-       EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS &&
-               btree_iter_type(iter) != BTREE_ITER_CACHED);
+       EBUG_ON(iter->path->level);
        bch2_btree_iter_verify(iter);
        bch2_btree_iter_verify_entry_exit(iter);
 
@@ -1892,50 +2625,57 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        }
 
        search_key = btree_iter_search_key(iter);
-       btree_iter_set_search_pos(iter, search_key);
+       iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
 
-       ret = btree_iter_traverse(iter);
+       ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
        if (unlikely(ret))
                return bkey_s_c_err(ret);
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED ||
-           !(iter->flags & BTREE_ITER_IS_EXTENTS)) {
+       if ((iter->flags & BTREE_ITER_CACHED) ||
+           !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
                struct bkey_i *next_update;
-               struct bkey_cached *ck;
 
-               switch (btree_iter_type(iter)) {
-               case BTREE_ITER_KEYS:
-                       k = btree_iter_level_peek_all(iter, &iter->l[0]);
-                       EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0);
-                       break;
-               case BTREE_ITER_CACHED:
-                       ck = (void *) iter->l[0].b;
-                       EBUG_ON(iter->btree_id != ck->key.btree_id ||
-                               bkey_cmp(iter->pos, ck->key.pos));
-                       BUG_ON(!ck->valid);
-
-                       k = bkey_i_to_s_c(ck->k);
-                       break;
-               case BTREE_ITER_NODES:
-                       BUG();
+               if ((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+                   (next_update = btree_trans_peek_updates(trans,
+                                               iter->btree_id, search_key)) &&
+                   !bpos_cmp(next_update->k.p, iter->pos)) {
+                       iter->k = next_update->k;
+                       k = bkey_i_to_s_c(next_update);
+                       goto out;
                }
 
-               next_update = btree_trans_peek_updates(iter, search_key);
-               if (next_update &&
-                   (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) {
+               if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+                   (next_update = __btree_trans_peek_journal(trans, iter->path)) &&
+                   !bpos_cmp(next_update->k.p, iter->pos)) {
                        iter->k = next_update->k;
                        k = bkey_i_to_s_c(next_update);
+                       goto out;
                }
+
+               if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+                   (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+                       if (!bkey_err(k))
+                               iter->k = *k.k;
+                       goto out;
+               }
+
+               k = bch2_btree_path_peek_slot(iter->path, &iter->k);
        } else {
-               if ((iter->flags & BTREE_ITER_INTENT)) {
-                       struct btree_iter *child =
-                               btree_iter_child_alloc(iter, _THIS_IP_);
+               struct bpos next;
 
-                       btree_iter_copy(child, iter);
-                       k = bch2_btree_iter_peek(child);
+               if (iter->flags & BTREE_ITER_INTENT) {
+                       struct btree_iter iter2;
 
-                       if (k.k && !bkey_err(k))
-                               iter->k = child->k;
+                       bch2_trans_copy_iter(&iter2, iter);
+                       k = bch2_btree_iter_peek(&iter2);
+
+                       if (k.k && !bkey_err(k)) {
+                               iter->k = iter2.k;
+                               k.k = &iter->k;
+                       }
+                       bch2_trans_iter_exit(trans, &iter2);
                } else {
                        struct bpos pos = iter->pos;
 
@@ -1945,38 +2685,34 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
                if (unlikely(bkey_err(k)))
                        return k;
-       }
 
-       if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) {
-               if (!k.k ||
-                   ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
-                    ? bpos_cmp(iter->pos, k.k->p)
-                    : bkey_cmp(iter->pos, k.k->p))) {
-                       bkey_init(&iter->k);
-                       iter->k.p = iter->pos;
-                       k = (struct bkey_s_c) { &iter->k, NULL };
-               }
-       } else {
-               struct bpos next = k.k ? bkey_start_pos(k.k) : POS_MAX;
+               next = k.k ? bkey_start_pos(k.k) : POS_MAX;
 
                if (bkey_cmp(iter->pos, next) < 0) {
                        bkey_init(&iter->k);
                        iter->k.p = iter->pos;
-                       bch2_key_resize(&iter->k,
-                                       min_t(u64, KEY_SIZE_MAX,
-                                             (next.inode == iter->pos.inode
-                                              ? next.offset
-                                              : KEY_OFFSET_MAX) -
-                                             iter->pos.offset));
+
+                       if (iter->flags & BTREE_ITER_IS_EXTENTS) {
+                               bch2_key_resize(&iter->k,
+                                               min_t(u64, KEY_SIZE_MAX,
+                                                     (next.inode == iter->pos.inode
+                                                      ? next.offset
+                                                      : KEY_OFFSET_MAX) -
+                                                     iter->pos.offset));
+                               EBUG_ON(!iter->k.size);
+                       }
 
                        k = (struct bkey_s_c) { &iter->k, NULL };
-                       EBUG_ON(!k.k->size);
                }
        }
+out:
+       iter->path->should_be_locked = true;
 
        bch2_btree_iter_verify_entry_exit(iter);
        bch2_btree_iter_verify(iter);
-       iter->should_be_locked = true;
+       ret = bch2_btree_iter_verify_ret(iter, k);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
 
        return k;
 }
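For reference, a minimal usage sketch of the reworked slot interface (not part of this patch): the iterator is now a small caller-owned structure, initialized and torn down around the lookup, with the path reference dropped in bch2_trans_iter_exit(). BTREE_ID_inodes and the -ENOENT convention here are illustrative assumptions.

/*
 * Usage sketch, not from this patch: a single-key lookup with the
 * reworked iterator API.
 */
static int lookup_one_key(struct btree_trans *trans, struct bpos pos,
                          struct bkey *out)
{
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, 0);
        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (!ret && k.k->type == KEY_TYPE_deleted)
                ret = -ENOENT;  /* slot exists but holds no value */
        if (!ret)
                *out = *k.k;    /* copy out before dropping the path ref */
        bch2_trans_iter_exit(trans, &iter);
        return ret;
}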
@@ -1997,35 +2733,14 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
        return bch2_btree_iter_peek_slot(iter);
 }
 
-static inline void bch2_btree_iter_init(struct btree_trans *trans,
-                       struct btree_iter *iter, enum btree_id btree_id)
-{
-       struct bch_fs *c = trans->c;
-       unsigned i;
-
-       iter->trans                     = trans;
-       iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
-       iter->btree_id                  = btree_id;
-       iter->real_pos                  = POS_MIN;
-       iter->level                     = 0;
-       iter->min_depth                 = 0;
-       iter->locks_want                = 0;
-       iter->nodes_locked              = 0;
-       iter->nodes_intent_locked       = 0;
-       for (i = 0; i < ARRAY_SIZE(iter->l); i++)
-               iter->l[i].b            = BTREE_ITER_NO_NODE_INIT;
-
-       prefetch(c->btree_roots[btree_id].b);
-}
-
 /* new transactional stuff: */
 
-static inline void btree_iter_verify_sorted_ref(struct btree_trans *trans,
-                                               struct btree_iter *iter)
+static inline void btree_path_verify_sorted_ref(struct btree_trans *trans,
+                                               struct btree_path *path)
 {
-       EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
-       EBUG_ON(trans->sorted[iter->sorted_idx] != iter->idx);
-       EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+       EBUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+       EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
 }
 
 static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
@@ -2034,432 +2749,201 @@ static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans)
        unsigned i;
 
        for (i = 0; i < trans->nr_sorted; i++)
-               btree_iter_verify_sorted_ref(trans, trans->iters + trans->sorted[i]);
+               btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]);
 #endif
 }
 
-static inline void btree_trans_verify_sorted(struct btree_trans *trans)
+static void btree_trans_verify_sorted(struct btree_trans *trans)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
-       struct btree_iter *iter, *prev = NULL;
+       struct btree_path *path, *prev = NULL;
+       unsigned i;
 
-       trans_for_each_iter_inorder(trans, iter)
-               BUG_ON(prev && btree_iter_cmp(prev, iter) > 0);
+       trans_for_each_path_inorder(trans, path, i) {
+               BUG_ON(prev && btree_path_cmp(prev, path) > 0);
+               prev = path;
+       }
 #endif
 }
 
-static inline void btree_iter_swap(struct btree_trans *trans,
-                                  struct btree_iter *l, struct btree_iter *r)
+static inline void btree_path_swap(struct btree_trans *trans,
+                                  struct btree_path *l, struct btree_path *r)
 {
        swap(l->sorted_idx, r->sorted_idx);
        swap(trans->sorted[l->sorted_idx],
             trans->sorted[r->sorted_idx]);
 
-       btree_iter_verify_sorted_ref(trans, l);
-       btree_iter_verify_sorted_ref(trans, r);
+       btree_path_verify_sorted_ref(trans, l);
+       btree_path_verify_sorted_ref(trans, r);
 }
 
-static void btree_trans_sort_iters(struct btree_trans *trans)
+static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path,
+                                 int cmp)
 {
-       bool swapped = false;
-       int i, l = 0, r = trans->nr_sorted;
-
-       while (1) {
-               for (i = l; i + 1 < r; i++) {
-                       if (btree_iter_cmp(trans->iters + trans->sorted[i],
-                                          trans->iters + trans->sorted[i + 1]) > 0) {
-                               swap(trans->sorted[i], trans->sorted[i + 1]);
-                               trans->iters[trans->sorted[i]].sorted_idx = i;
-                               trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
-                               swapped = true;
-                       }
-               }
+       struct btree_path *n;
 
-               if (!swapped)
-                       break;
+       if (cmp <= 0) {
+               n = prev_btree_path(trans, path);
+               if (n && btree_path_cmp(n, path) > 0) {
+                       do {
+                               btree_path_swap(trans, n, path);
+                               n = prev_btree_path(trans, path);
+                       } while (n && btree_path_cmp(n, path) > 0);
 
-               r--;
-               swapped = false;
-
-               for (i = r - 2; i >= l; --i) {
-                       if (btree_iter_cmp(trans->iters + trans->sorted[i],
-                                          trans->iters + trans->sorted[i + 1]) > 0) {
-                               swap(trans->sorted[i],
-                                    trans->sorted[i + 1]);
-                               trans->iters[trans->sorted[i]].sorted_idx = i;
-                               trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1;
-                               swapped = true;
-                       }
+                       goto out;
                }
-
-               if (!swapped)
-                       break;
-
-               l++;
-               swapped = false;
-       }
-
-       btree_trans_verify_sorted_refs(trans);
-       btree_trans_verify_sorted(trans);
-}
-
-static void btree_iter_check_sort(struct btree_trans *trans, struct btree_iter *iter)
-{
-       struct btree_iter *n;
-
-       EBUG_ON(iter->sorted_idx == U8_MAX);
-
-       n = next_btree_iter(trans, iter);
-       if (n && btree_iter_cmp(iter, n) > 0) {
-               do {
-                       btree_iter_swap(trans, iter, n);
-                       n = next_btree_iter(trans, iter);
-               } while (n && btree_iter_cmp(iter, n) > 0);
-
-               return;
        }
 
-       n = prev_btree_iter(trans, iter);
-       if (n && btree_iter_cmp(n, iter) > 0) {
-               do {
-                       btree_iter_swap(trans, n, iter);
-                       n = prev_btree_iter(trans, iter);
-               } while (n && btree_iter_cmp(n, iter) > 0);
+       if (cmp >= 0) {
+               n = next_btree_path(trans, path);
+               if (n && btree_path_cmp(path, n) > 0) {
+                       do {
+                               btree_path_swap(trans, path, n);
+                               n = next_btree_path(trans, path);
+                       } while (n && btree_path_cmp(path, n) > 0);
+               }
        }
-
+out:
        btree_trans_verify_sorted(trans);
 }
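btree_path_check_sort() above is an incremental insertion sort: when a single path's comparison key changes, the path is bubbled toward its correct position in whichever direction cmp indicates, rather than re-sorting the whole array. The same technique in isolation (standalone sketch over plain ints, not bcachefs code):

/* After one element's key changes, bubble it into place instead of
 * re-sorting the whole (already sorted) array. */
static void resort_one(int *a, unsigned n, unsigned i, int dir)
{
        int t;

        if (dir <= 0)                   /* key may have decreased */
                while (i && a[i - 1] > a[i]) {
                        t = a[i - 1]; a[i - 1] = a[i]; a[i] = t;
                        i--;
                }
        if (dir >= 0)                   /* key may have increased */
                while (i + 1 < n && a[i] > a[i + 1]) {
                        t = a[i + 1]; a[i + 1] = a[i]; a[i] = t;
                        i++;
                }
}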
 
-static inline void btree_iter_list_remove(struct btree_trans *trans,
-                                         struct btree_iter *iter)
+static inline void btree_path_list_remove(struct btree_trans *trans,
+                                         struct btree_path *path)
 {
        unsigned i;
 
-       EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
+       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
 
-       array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx);
+       array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
 
-       for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
-               trans->iters[trans->sorted[i]].sorted_idx = i;
+       for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+               trans->paths[trans->sorted[i]].sorted_idx = i;
 
-       iter->sorted_idx = U8_MAX;
+       path->sorted_idx = U8_MAX;
 
        btree_trans_verify_sorted_refs(trans);
 }
 
-static inline void btree_iter_list_add(struct btree_trans *trans,
-                                      struct btree_iter *pos,
-                                      struct btree_iter *iter)
+static inline void btree_path_list_add(struct btree_trans *trans,
+                                      struct btree_path *pos,
+                                      struct btree_path *path)
 {
        unsigned i;
 
        btree_trans_verify_sorted_refs(trans);
 
-       iter->sorted_idx = pos ? pos->sorted_idx : trans->nr_sorted;
+       path->sorted_idx = pos ? pos->sorted_idx + 1 : 0;
 
-       array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx);
+       array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
 
-       for (i = iter->sorted_idx; i < trans->nr_sorted; i++)
-               trans->iters[trans->sorted[i]].sorted_idx = i;
+       for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+               trans->paths[trans->sorted[i]].sorted_idx = i;
 
        btree_trans_verify_sorted_refs(trans);
 }
 
-static void btree_iter_child_free(struct btree_iter *iter)
-{
-       struct btree_iter *child = btree_iter_child(iter);
-
-       if (child) {
-               bch2_trans_iter_free(iter->trans, child);
-               iter->child_idx = U8_MAX;
-       }
-}
-
-static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter,
-                                                unsigned long ip)
-{
-       struct btree_trans *trans = iter->trans;
-       struct btree_iter *child = btree_iter_child(iter);
-
-       if (!child) {
-               child = btree_trans_iter_alloc(trans, iter);
-               child->ip_allocated     = ip;
-               iter->child_idx         = child->idx;
-
-               trans->iters_live       |= 1ULL << child->idx;
-               trans->iters_touched    |= 1ULL << child->idx;
-       }
-
-       return child;
-}
-
-static inline void __bch2_trans_iter_free(struct btree_trans *trans,
-                                         unsigned idx)
-{
-       btree_iter_child_free(&trans->iters[idx]);
-
-       btree_iter_list_remove(trans, &trans->iters[idx]);
-
-       __bch2_btree_iter_unlock(&trans->iters[idx]);
-       trans->iters_linked             &= ~(1ULL << idx);
-       trans->iters_live               &= ~(1ULL << idx);
-       trans->iters_touched            &= ~(1ULL << idx);
-}
-
-int bch2_trans_iter_put(struct btree_trans *trans,
-                       struct btree_iter *iter)
-{
-       int ret;
-
-       if (IS_ERR_OR_NULL(iter))
-               return 0;
-
-       BUG_ON(trans->iters + iter->idx != iter);
-       BUG_ON(!btree_iter_live(trans, iter));
-
-       ret = btree_iter_err(iter);
-
-       if (!(trans->iters_touched & (1ULL << iter->idx)) &&
-           !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
-               __bch2_trans_iter_free(trans, iter->idx);
-
-       trans->iters_live       &= ~(1ULL << iter->idx);
-       return ret;
-}
-
-int bch2_trans_iter_free(struct btree_trans *trans,
-                        struct btree_iter *iter)
-{
-       if (IS_ERR_OR_NULL(iter))
-               return 0;
-
-       set_btree_iter_dontneed(trans, iter);
-
-       return bch2_trans_iter_put(trans, iter);
-}
-
-noinline __cold
-static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
-{
-
-       struct btree_iter *iter;
-       struct btree_insert_entry *i;
-       char buf[100];
-
-       btree_trans_sort_iters(trans);
-
-       trans_for_each_iter_inorder(trans, iter)
-               printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
-                      bch2_btree_ids[iter->btree_id],
-                      (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf),
-                      btree_iter_live(trans, iter) ? " live" : "",
-                      (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
-                      iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
-                      (void *) iter->ip_allocated);
-
-       trans_for_each_update(trans, i) {
-               char buf[300];
-
-               bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k));
-               printk(KERN_ERR "update: btree %s %s\n",
-                      bch2_btree_ids[i->iter->btree_id], buf);
-       }
-       panic("trans iter overflow\n");
-}
-
-static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans,
-                                                struct btree_iter *pos)
-{
-       struct btree_iter *iter;
-       unsigned idx;
-
-       if (unlikely(trans->iters_linked ==
-                    ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
-               btree_trans_iter_alloc_fail(trans);
-
-       idx = __ffs64(~trans->iters_linked);
-       iter = &trans->iters[idx];
-
-       iter->trans             = trans;
-       iter->idx               = idx;
-       iter->child_idx         = U8_MAX;
-       iter->sorted_idx        = U8_MAX;
-       iter->flags             = 0;
-       iter->nodes_locked      = 0;
-       iter->nodes_intent_locked = 0;
-       trans->iters_linked     |= 1ULL << idx;
-
-       btree_iter_list_add(trans, pos, iter);
-       return iter;
-}
-
-static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src)
+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
 {
-       unsigned i;
-
-       __bch2_btree_iter_unlock(dst);
-       btree_iter_child_free(dst);
-
-       memcpy(&dst->flags, &src->flags,
-              sizeof(struct btree_iter) - offsetof(struct btree_iter, flags));
-
-       for (i = 0; i < BTREE_MAX_DEPTH; i++)
-               if (btree_node_locked(dst, i))
-                       six_lock_increment(&dst->l[i].b->c.lock,
-                                          __btree_lock_want(dst, i));
-
-       dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
-       dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
-
-       btree_iter_check_sort(dst->trans, dst);
+       if (iter->path)
+               bch2_path_put(trans, iter->path,
+                             iter->flags & BTREE_ITER_INTENT);
+       if (iter->update_path)
+               bch2_path_put(trans, iter->update_path,
+                             iter->flags & BTREE_ITER_INTENT);
+       if (iter->key_cache_path)
+               bch2_path_put(trans, iter->key_cache_path,
+                             iter->flags & BTREE_ITER_INTENT);
+       iter->path = NULL;
+       iter->update_path = NULL;
+       iter->key_cache_path = NULL;
 }
 
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
-                                        unsigned btree_id, struct bpos pos,
-                                        unsigned locks_want,
-                                        unsigned depth,
-                                        unsigned flags)
+static void __bch2_trans_iter_init(struct btree_trans *trans,
+                                  struct btree_iter *iter,
+                                  unsigned btree_id, struct bpos pos,
+                                  unsigned locks_want,
+                                  unsigned depth,
+                                  unsigned flags,
+                                  unsigned long ip)
 {
-       struct btree_iter *iter, *best = NULL;
-       struct bpos real_pos, pos_min = POS_MIN;
-
        EBUG_ON(trans->restarted);
 
-       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
-           btree_node_type_is_extents(btree_id) &&
-           !(flags & BTREE_ITER_NOT_EXTENTS) &&
-           !(flags & BTREE_ITER_ALL_SNAPSHOTS))
+       if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
+           btree_node_type_is_extents(btree_id))
                flags |= BTREE_ITER_IS_EXTENTS;
 
-       if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+       if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) &&
            !btree_type_has_snapshots(btree_id))
                flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
 
-       if (!(flags & BTREE_ITER_ALL_SNAPSHOTS))
-               pos.snapshot = btree_type_has_snapshots(btree_id)
-                       ? U32_MAX : 0;
-
-       real_pos = pos;
-
-       if ((flags & BTREE_ITER_IS_EXTENTS) &&
-           bkey_cmp(pos, POS_MAX))
-               real_pos = bpos_nosnap_successor(pos);
-
-       trans_for_each_iter(trans, iter) {
-               if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
-                       continue;
-
-               if (iter->btree_id != btree_id)
-                       continue;
-
-               if (best) {
-                       int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos),
-                                          bpos_diff(iter->real_pos, real_pos));
-
-                       if (cmp < 0 ||
-                           ((cmp == 0 && btree_iter_keep(trans, iter))))
-                               continue;
-               }
-
-               best = iter;
-       }
-
-       if (!best) {
-               iter = btree_trans_iter_alloc(trans, NULL);
-               bch2_btree_iter_init(trans, iter, btree_id);
-       } else if (btree_iter_keep(trans, best)) {
-               iter = btree_trans_iter_alloc(trans, best);
-               btree_iter_copy(iter, best);
-       } else {
-               iter = best;
-       }
-
-       trans->iters_live       |= 1ULL << iter->idx;
-       trans->iters_touched    |= 1ULL << iter->idx;
-
-       iter->flags = flags;
-
-       iter->snapshot = pos.snapshot;
+       if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) &&
+           btree_type_has_snapshots(btree_id))
+               flags |= BTREE_ITER_FILTER_SNAPSHOTS;
 
-       /*
-        * If the iterator has locks_want greater than requested, we explicitly
-        * do not downgrade it here - on transaction restart because btree node
-        * split needs to upgrade locks, we might be putting/getting the
-        * iterator again. Downgrading iterators only happens via an explicit
-        * bch2_trans_downgrade().
-        */
-
-       locks_want = min(locks_want, BTREE_MAX_DEPTH);
-       if (locks_want > iter->locks_want) {
-               iter->locks_want = locks_want;
-               btree_iter_get_locks(iter, true, _THIS_IP_);
-       }
+       if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
+               flags |= BTREE_ITER_WITH_JOURNAL;
 
-       while (iter->level != depth) {
-               btree_node_unlock(iter, iter->level);
-               iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
-               iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
-               if (iter->level < depth)
-                       iter->level++;
-               else
-                       iter->level--;
-       }
+       if (!btree_id_cached(trans->c, btree_id)) {
+               flags &= ~BTREE_ITER_CACHED;
+               flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+       } else if (!(flags & BTREE_ITER_CACHED))
+               flags |= BTREE_ITER_WITH_KEY_CACHE;
 
+       iter->trans     = trans;
+       iter->path      = NULL;
+       iter->update_path = NULL;
+       iter->key_cache_path = NULL;
+       iter->btree_id  = btree_id;
        iter->min_depth = depth;
+       iter->flags     = flags;
+       iter->snapshot  = pos.snapshot;
+       iter->pos       = pos;
+       iter->k.type    = KEY_TYPE_deleted;
+       iter->k.p       = pos;
+       iter->k.size    = 0;
+#ifdef CONFIG_BCACHEFS_DEBUG
+       iter->ip_allocated = ip;
+#endif
 
-       bch2_btree_iter_set_pos(iter, pos);
-       btree_iter_set_search_pos(iter, real_pos);
-
-       trace_trans_get_iter(_RET_IP_, trans->ip,
-                            btree_id,
-                            &real_pos, locks_want, iter->uptodate,
-                            best ? &best->real_pos     : &pos_min,
-                            best ? best->locks_want    : U8_MAX,
-                            best ? best->uptodate      : U8_MAX);
-
-       return iter;
+       iter->path = bch2_path_get(trans, btree_id, iter->pos,
+                                  locks_want, depth, flags, ip);
 }
 
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
-                                           enum btree_id btree_id,
-                                           struct bpos pos,
-                                           unsigned locks_want,
-                                           unsigned depth,
-                                           unsigned flags)
+void bch2_trans_iter_init(struct btree_trans *trans,
+                         struct btree_iter *iter,
+                         unsigned btree_id, struct bpos pos,
+                         unsigned flags)
 {
-       struct btree_iter *iter =
-               __bch2_trans_get_iter(trans, btree_id, pos,
-                                     locks_want, depth,
-                                     BTREE_ITER_NODES|
-                                     BTREE_ITER_NOT_EXTENTS|
-                                     BTREE_ITER_ALL_SNAPSHOTS|
-                                     flags);
-
-       BUG_ON(bkey_cmp(iter->pos, pos));
-       BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
-       BUG_ON(iter->level      != depth);
-       BUG_ON(iter->min_depth  != depth);
-       iter->ip_allocated = _RET_IP_;
-
-       return iter;
+       __bch2_trans_iter_init(trans, iter, btree_id, pos,
+                              0, 0, flags, _RET_IP_);
 }
 
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
-                                       struct btree_iter *src)
+void bch2_trans_node_iter_init(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              enum btree_id btree_id,
+                              struct bpos pos,
+                              unsigned locks_want,
+                              unsigned depth,
+                              unsigned flags)
 {
-       struct btree_iter *iter;
-
-       iter = btree_trans_iter_alloc(trans, src);
-       btree_iter_copy(iter, src);
-
-       trans->iters_live |= 1ULL << iter->idx;
-       /*
-        * We don't need to preserve this iter since it's cheap to copy it
-        * again - this will cause trans_iter_put() to free it right away:
-        */
-       set_btree_iter_dontneed(trans, iter);
+       __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth,
+                              BTREE_ITER_NOT_EXTENTS|
+                              __BTREE_ITER_ALL_SNAPSHOTS|
+                              BTREE_ITER_ALL_SNAPSHOTS|
+                              flags, _RET_IP_);
+       BUG_ON(iter->path->locks_want    < min(locks_want, BTREE_MAX_DEPTH));
+       BUG_ON(iter->path->level        != depth);
+       BUG_ON(iter->min_depth          != depth);
+}
 
-       return iter;
+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
+{
+       *dst = *src;
+       if (src->path)
+               __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+       if (src->update_path)
+               __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+       dst->key_cache_path = NULL;
 }
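bch2_trans_copy_iter() now just takes extra references on the source's paths, so a throwaway copy is cheap. The copy/peek/exit pattern used in the BTREE_ITER_INTENT branch of bch2_btree_iter_peek_slot() above generalizes; a sketch (the helper name is hypothetical):

/* Sketch: peek at the next key without disturbing @iter. */
static int peek_next_key(struct btree_trans *trans, struct btree_iter *iter,
                         struct bkey *out)
{
        struct btree_iter copy;
        struct bkey_s_c k;
        int ret;

        bch2_trans_copy_iter(&copy, iter);
        k = bch2_btree_iter_peek(&copy);
        ret = bkey_err(k);
        if (!ret && k.k)
                *out = *k.k;    /* copy: k points at storage owned by @copy */
        bch2_trans_iter_exit(trans, &copy);
        return ret;
}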
 
 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
@@ -2488,7 +2972,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
                trans->mem_bytes = new_bytes;
 
                if (old_bytes) {
-                       trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes);
+                       trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes);
                        btree_trans_restart(trans);
                        return ERR_PTR(-EINTR);
                }
@@ -2500,20 +2984,6 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        return p;
 }
 
-inline void bch2_trans_unlink_iters(struct btree_trans *trans)
-{
-       u64 iters = trans->iters_linked &
-               ~trans->iters_touched &
-               ~trans->iters_live;
-
-       while (iters) {
-               unsigned idx = __ffs64(iters);
-
-               iters &= ~(1ULL << idx);
-               __bch2_trans_iter_free(trans, idx);
-       }
-}
-
 /**
  * bch2_trans_begin() - reset a transaction after an interrupted attempt
  * @trans: transaction to reset
@@ -2524,19 +2994,13 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans)
  */
 void bch2_trans_begin(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
-
-       trans_for_each_iter(trans, iter)
-               iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
-                                BTREE_ITER_SET_POS_AFTER_COMMIT);
+       struct btree_insert_entry *i;
+       struct btree_path *path;
 
-       /*
-        * XXX: we shouldn't be doing this if the transaction was restarted, but
-        * currently we still overflow transaction iterators if we do that
-        * */
-       bch2_trans_unlink_iters(trans);
-       trans->iters_touched &= trans->iters_live;
+       trans_for_each_update(trans, i)
+               __btree_path_put(i->path, true);
 
+       memset(&trans->journal_res, 0, sizeof(trans->journal_res));
        trans->extra_journal_res        = 0;
        trans->nr_updates               = 0;
        trans->mem_top                  = 0;
@@ -2552,48 +3016,59 @@ void bch2_trans_begin(struct btree_trans *trans)
                       (void *) &trans->fs_usage_deltas->memset_start);
        }
 
+       trans_for_each_path(trans, path) {
+               path->should_be_locked = false;
+
+               /*
+                * XXX: we probably shouldn't be doing this if the transaction
+                * was restarted, but currently we still overflow transaction
+                * iterators if we do that
+                */
+               if (!path->ref && !path->preserve)
+                       __bch2_path_free(trans, path);
+               else if (!path->ref)
+                       path->preserve = false;
+       }
+
        bch2_trans_cond_resched(trans);
 
        if (trans->restarted)
-               bch2_btree_iter_traverse_all(trans);
+               bch2_btree_path_traverse_all(trans);
 
        trans->restarted = false;
 }
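bch2_trans_begin() is the reset step of the standard restart loop: on -EINTR the whole transaction body is retried from the top. A sketch of that loop (do_thing() is a placeholder for the transaction body):

static int run_transaction(struct btree_trans *trans)
{
        int ret;

        do {
                bch2_trans_begin(trans);
                ret = do_thing(trans);
        } while (ret == -EINTR);        /* restarted: retry from the top */

        return ret;
}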
 
-static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
 {
-       size_t iters_bytes      = sizeof(struct btree_iter) * BTREE_ITER_MAX;
+       size_t paths_bytes      = sizeof(struct btree_path) * BTREE_ITER_MAX;
        size_t updates_bytes    = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
-       size_t sorted_bytes     = sizeof(u8) * BTREE_ITER_MAX;
        void *p = NULL;
 
        BUG_ON(trans->used_mempool);
 
 #ifdef __KERNEL__
-       p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+       p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
 #endif
        if (!p)
-               p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+               p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
 
-       trans->iters            = p; p += iters_bytes;
+       trans->paths            = p; p += paths_bytes;
        trans->updates          = p; p += updates_bytes;
-       trans->sorted           = p; p += sorted_bytes;
 }
 
-void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
-                    unsigned expected_nr_iters,
-                    size_t expected_mem_bytes)
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
+                      unsigned expected_nr_iters,
+                      size_t expected_mem_bytes,
+                      const char *fn)
        __acquires(&c->btree_trans_barrier)
 {
+       BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key));
+
        memset(trans, 0, sizeof(*trans));
        trans->c                = c;
-       trans->ip               = _RET_IP_;
+       trans->fn               = fn;
 
-       /*
-        * reallocating iterators currently completely breaks
-        * bch2_trans_iter_put(), we always allocate the max:
-        */
-       bch2_trans_alloc_iters(trans, c);
+       bch2_trans_alloc_paths(trans, c);
 
        if (expected_mem_bytes) {
                trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
@@ -2607,62 +3082,67 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
 
        trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
        trans->pid = current->pid;
        mutex_lock(&c->btree_trans_lock);
        list_add(&trans->list, &c->btree_trans_list);
        mutex_unlock(&c->btree_trans_lock);
+}
+
+static void check_btree_paths_leaked(struct btree_trans *trans)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct bch_fs *c = trans->c;
+       struct btree_path *path;
+
+       trans_for_each_path(trans, path)
+               if (path->ref)
+                       goto leaked;
+       return;
+leaked:
+       bch_err(c, "btree paths leaked from %s!", trans->fn);
+       trans_for_each_path(trans, path)
+               if (path->ref)
+                       printk(KERN_ERR "  btree %s %pS\n",
+                              bch2_btree_ids[path->btree_id],
+                              (void *) path->ip_allocated);
+       /* Be noisy about this: */
+       bch2_fatal_error(c);
 #endif
 }
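The bug pattern this check reports is an iterator that is never exited; a sketch (BTREE_ID_inodes is illustrative):

/* Sketch of the bug check_btree_paths_leaked() catches: */
static void leaky(struct btree_trans *trans, struct bpos pos)
{
        struct btree_iter iter;

        bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, 0);
        bch2_btree_iter_peek_slot(&iter);
        /* missing bch2_trans_iter_exit(trans, &iter): the path's ref
         * stays elevated, and bch2_trans_exit() prints this path's
         * btree id and ip_allocated */
}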
 
-int bch2_trans_exit(struct btree_trans *trans)
+void bch2_trans_exit(struct btree_trans *trans)
        __releases(&c->btree_trans_barrier)
 {
+       struct btree_insert_entry *i;
        struct bch_fs *c = trans->c;
 
        bch2_trans_unlock(trans);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       if (trans->iters_live) {
-               struct btree_iter *iter;
-
-               trans_for_each_iter(trans, iter)
-                       btree_iter_child_free(iter);
-       }
-
-       if (trans->iters_live) {
-               struct btree_iter *iter;
+       trans_for_each_update(trans, i)
+               __btree_path_put(i->path, true);
+       trans->nr_updates               = 0;
 
-               bch_err(c, "btree iterators leaked!");
-               trans_for_each_iter(trans, iter)
-                       if (btree_iter_live(trans, iter))
-                               printk(KERN_ERR "  btree %s allocated at %pS\n",
-                                      bch2_btree_ids[iter->btree_id],
-                                      (void *) iter->ip_allocated);
-               /* Be noisy about this: */
-               bch2_fatal_error(c);
-       }
+       check_btree_paths_leaked(trans);
 
-       mutex_lock(&trans->c->btree_trans_lock);
+       mutex_lock(&c->btree_trans_lock);
        list_del(&trans->list);
-       mutex_unlock(&trans->c->btree_trans_lock);
-#endif
+       mutex_unlock(&c->btree_trans_lock);
 
        srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
 
-       bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
+       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
 
        if (trans->fs_usage_deltas) {
                if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
                    REPLICAS_DELTA_LIST_MAX)
                        mempool_free(trans->fs_usage_deltas,
-                                    &trans->c->replicas_delta_pool);
+                                    &c->replicas_delta_pool);
                else
                        kfree(trans->fs_usage_deltas);
        }
 
        if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
-               mempool_free(trans->mem, &trans->c->btree_trans_mem_pool);
+               mempool_free(trans->mem, &c->btree_trans_mem_pool);
        else
                kfree(trans->mem);
 
@@ -2670,74 +3150,70 @@ int bch2_trans_exit(struct btree_trans *trans)
        /*
         * Userspace doesn't have a real percpu implementation:
         */
-       trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+       trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths);
 #endif
 
-       if (trans->iters)
-               mempool_free(trans->iters, &trans->c->btree_iters_pool);
+       if (trans->paths)
+               mempool_free(trans->paths, &c->btree_paths_pool);
 
        trans->mem      = (void *) 0x1;
-       trans->iters    = (void *) 0x1;
-
-       return trans->error ? -EIO : 0;
+       trans->paths    = (void *) 0x1;
 }
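Since bch2_trans_exit() no longer returns an error, the transaction lifecycle becomes the following (sketch; this assumes bch2_trans_init() is the wrapper macro that passes __func__ through to __bch2_trans_init() as fn):

static void example(struct bch_fs *c)
{
        struct btree_trans trans;

        bch2_trans_init(&trans, c, 0, 0);
        /* ... transaction body; errors come from the individual ops ... */
        bch2_trans_exit(&trans);        /* void now - nothing to collect */
}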
 
 static void __maybe_unused
-bch2_btree_iter_node_to_text(struct printbuf *out,
+bch2_btree_path_node_to_text(struct printbuf *out,
                             struct btree_bkey_cached_common *_b,
-                            enum btree_iter_type type)
+                            bool cached)
 {
        pr_buf(out, "    l=%u %s:",
               _b->level, bch2_btree_ids[_b->btree_id]);
-       bch2_bpos_to_text(out, btree_node_pos(_b, type));
+       bch2_bpos_to_text(out, btree_node_pos(_b, cached));
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
+static bool trans_has_locks(struct btree_trans *trans)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               if (btree_iter_type(iter) != BTREE_ITER_CACHED &&
-                   iter->nodes_locked)
+       trans_for_each_path(trans, path)
+               if (path->nodes_locked)
                        return true;
        return false;
 }
-#endif
 
 void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 {
-#ifdef CONFIG_BCACHEFS_DEBUG
        struct btree_trans *trans;
-       struct btree_iter *iter;
+       struct btree_path *path;
        struct btree *b;
+       static char lock_types[] = { 'r', 'i', 'w' };
        unsigned l;
 
        mutex_lock(&c->btree_trans_lock);
        list_for_each_entry(trans, &c->btree_trans_list, list) {
-               if (!trans_has_btree_nodes_locked(trans))
+               if (!trans_has_locks(trans))
                        continue;
 
-               pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
+               pr_buf(out, "%i %s\n", trans->pid, trans->fn);
 
-               trans_for_each_iter(trans, iter) {
-                       if (!iter->nodes_locked)
+               trans_for_each_path(trans, path) {
+                       if (!path->nodes_locked)
                                continue;
 
-                       pr_buf(out, "  iter %u %c %s:",
-                              iter->idx,
-                              btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
-                              bch2_btree_ids[iter->btree_id]);
-                       bch2_bpos_to_text(out, iter->pos);
+                       pr_buf(out, "  path %u %c l=%u %s:",
+                              path->idx,
+                              path->cached ? 'c' : 'b',
+                              path->level,
+                              bch2_btree_ids[path->btree_id]);
+                       bch2_bpos_to_text(out, path->pos);
                        pr_buf(out, "\n");
 
                        for (l = 0; l < BTREE_MAX_DEPTH; l++) {
-                               if (btree_node_locked(iter, l)) {
+                               if (btree_node_locked(path, l)) {
                                        pr_buf(out, "    %s l=%u ",
-                                              btree_node_intent_locked(iter, l) ? "i" : "r", l);
-                                       bch2_btree_iter_node_to_text(out,
-                                                       (void *) iter->l[l].b,
-                                                       btree_iter_type(iter));
+                                              btree_node_intent_locked(path, l) ? "i" : "r", l);
+                                       bch2_btree_path_node_to_text(out,
+                                                       (void *) path->l[l].b,
+                                                       path->cached);
                                        pr_buf(out, "\n");
                                }
                        }
@@ -2745,44 +3221,47 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 
                b = READ_ONCE(trans->locking);
                if (b) {
-                       iter = &trans->iters[trans->locking_iter_idx];
-                       pr_buf(out, "  locking iter %u %c l=%u %s:",
-                              trans->locking_iter_idx,
-                              btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
+                       path = &trans->paths[trans->locking_path_idx];
+                       pr_buf(out, "  locking path %u %c l=%u %c %s:",
+                              trans->locking_path_idx,
+                              path->cached ? 'c' : 'b',
                               trans->locking_level,
+                              lock_types[trans->locking_lock_type],
                               bch2_btree_ids[trans->locking_btree_id]);
                        bch2_bpos_to_text(out, trans->locking_pos);
 
                        pr_buf(out, " node ");
-                       bch2_btree_iter_node_to_text(out,
-                                       (void *) b,
-                                       btree_iter_type(iter));
+                       bch2_btree_path_node_to_text(out,
+                                       (void *) b, path->cached);
                        pr_buf(out, "\n");
                }
        }
        mutex_unlock(&c->btree_trans_lock);
-#endif
 }
 
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
+       if (c->btree_trans_barrier_initialized)
+               cleanup_srcu_struct(&c->btree_trans_barrier);
        mempool_exit(&c->btree_trans_mem_pool);
-       mempool_exit(&c->btree_iters_pool);
-       cleanup_srcu_struct(&c->btree_trans_barrier);
+       mempool_exit(&c->btree_paths_pool);
 }
 
 int bch2_fs_btree_iter_init(struct bch_fs *c)
 {
        unsigned nr = BTREE_ITER_MAX;
+       int ret;
 
        INIT_LIST_HEAD(&c->btree_trans_list);
        mutex_init(&c->btree_trans_lock);
 
-       return  init_srcu_struct(&c->btree_trans_barrier) ?:
-               mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
-                       sizeof(u8) * nr +
-                       sizeof(struct btree_iter) * nr +
+       ret   = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1,
+                       sizeof(struct btree_path) * nr +
                        sizeof(struct btree_insert_entry) * nr) ?:
                mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
-                                         BTREE_TRANS_MEM_MAX);
+                                         BTREE_TRANS_MEM_MAX) ?:
+               init_srcu_struct(&c->btree_trans_barrier);
+       if (!ret)
+               c->btree_trans_barrier_initialized = true;
+       return ret;
 }
index 39124e68e48828f3870522b625d8a46cc5b0a244..759c7b52f4a24f34ddf061735341c27aee49bee1 100644 (file)
 #include "bset.h"
 #include "btree_types.h"
 
-static inline void btree_iter_set_dirty(struct btree_iter *iter,
-                                       enum btree_iter_uptodate u)
+static inline void __btree_path_get(struct btree_path *path, bool intent)
 {
-       iter->uptodate = max_t(unsigned, iter->uptodate, u);
+       path->ref++;
+       path->intent_ref += intent;
 }
 
-static inline struct btree *btree_iter_node(struct btree_iter *iter,
-                                           unsigned level)
+static inline bool __btree_path_put(struct btree_path *path, bool intent)
 {
-       return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
+       EBUG_ON(!path->ref);
+       EBUG_ON(!path->intent_ref && intent);
+       path->intent_ref -= intent;
+       return --path->ref == 0;
 }
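The intent flag must be symmetric between get and put, since intent_ref is accounted separately from ref; a fragment illustrating the pairing:

/* Sketch: get/put must pass the same intent flag. */
bool last;

__btree_path_get(path, true);           /* ref++, intent_ref++ */
/* ... use path ... */
last = __btree_path_put(path, true);    /* ref--, intent_ref--; true when
                                         * this was the last ref, at which
                                         * point bch2_path_put() normally
                                         * frees or keeps the path */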
 
-static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter,
-                                       const struct btree *b, unsigned level)
+static inline void btree_path_set_dirty(struct btree_path *path,
+                                       enum btree_path_uptodate u)
 {
-       /*
-        * We don't compare the low bits of the lock sequence numbers because
-        * @iter might have taken a write lock on @b, and we don't want to skip
-        * the linked iterator if the sequence numbers were equal before taking
-        * that write lock. The lock sequence number is incremented by taking
-        * and releasing write locks and is even when unlocked:
-        */
-       return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
+       path->uptodate = max_t(unsigned, path->uptodate, u);
 }
 
-static inline struct btree *btree_node_parent(struct btree_iter *iter,
-                                             struct btree *b)
+static inline struct btree *btree_path_node(struct btree_path *path,
+                                           unsigned level)
 {
-       return btree_iter_node(iter, b->c.level + 1);
+       return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL;
 }
 
-static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans)
+static inline bool btree_node_lock_seq_matches(const struct btree_path *path,
+                                       const struct btree *b, unsigned level)
 {
-       return hweight64(trans->iters_linked) > 1;
+       /*
+        * We don't compare the low bits of the lock sequence numbers because
+        * @path might have taken a write lock on @b, and we don't want to skip
+        * the linked path if the sequence numbers were equal before taking that
+        * write lock. The lock sequence number is incremented by taking and
+        * releasing write locks and is even when unlocked:
+        */
+       return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1;
 }
 
-static inline int btree_iter_err(const struct btree_iter *iter)
+static inline struct btree *btree_node_parent(struct btree_path *path,
+                                             struct btree *b)
 {
-       return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
+       return btree_path_node(path, b->c.level + 1);
 }
 
-/* Iterate over iters within a transaction: */
+/* Iterate over paths within a transaction: */
 
-static inline struct btree_iter *
-__trans_next_iter(struct btree_trans *trans, unsigned idx)
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned idx)
 {
        u64 l;
 
        if (idx == BTREE_ITER_MAX)
                return NULL;
 
-       l = trans->iters_linked >> idx;
+       l = trans->paths_allocated >> idx;
        if (!l)
                return NULL;
 
        idx += __ffs64(l);
        EBUG_ON(idx >= BTREE_ITER_MAX);
-       EBUG_ON(trans->iters[idx].idx != idx);
-       return &trans->iters[idx];
+       EBUG_ON(trans->paths[idx].idx != idx);
+       return &trans->paths[idx];
 }
 
-#define trans_for_each_iter(_trans, _iter)                             \
-       for (_iter = __trans_next_iter((_trans), 0);                    \
-            (_iter);                                                   \
-            _iter = __trans_next_iter((_trans), (_iter)->idx + 1))
+#define trans_for_each_path(_trans, _path)                             \
+       for (_path = __trans_next_path((_trans), 0);                    \
+            (_path);                                                   \
+            _path = __trans_next_path((_trans), (_path)->idx + 1))
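A typical consumer of the renamed iteration macro (standalone sketch; compare trans_has_locks() later in this patch):

/* Sketch: count the paths currently holding node locks. */
static unsigned count_locked_paths(struct btree_trans *trans)
{
        struct btree_path *path;
        unsigned nr = 0;

        trans_for_each_path(trans, path)
                if (path->nodes_locked)
                        nr++;
        return nr;
}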
 
-static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter)
+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
 {
-       unsigned idx = iter ? iter->sorted_idx + 1 : 0;
+       unsigned idx = path ? path->sorted_idx + 1 : 0;
 
        EBUG_ON(idx > trans->nr_sorted);
 
        return idx < trans->nr_sorted
-               ? trans->iters + trans->sorted[idx]
+               ? trans->paths + trans->sorted[idx]
                : NULL;
 }
 
-static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter)
+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path)
 {
-       EBUG_ON(iter->sorted_idx >= trans->nr_sorted);
-       return iter->sorted_idx
-               ? trans->iters + trans->sorted[iter->sorted_idx - 1]
+       EBUG_ON(path->sorted_idx >= trans->nr_sorted);
+       return path->sorted_idx
+               ? trans->paths + trans->sorted[path->sorted_idx - 1]
                : NULL;
 }
 
-#define trans_for_each_iter_inorder(_trans, _iter)                     \
-       for (_iter = next_btree_iter(trans, NULL);                      \
-            (_iter);                                                   \
-            _iter = next_btree_iter((_trans), (_iter)))
+#define trans_for_each_path_inorder(_trans, _path, _i)                 \
+       for (_i = 0;                                                    \
+            ((_path) = (_trans)->paths + (_trans)->sorted[_i]), (_i) < (_trans)->nr_sorted;\
+            _i++)
 
-static inline bool __iter_has_node(const struct btree_iter *iter,
+static inline bool __path_has_node(const struct btree_path *path,
                                   const struct btree *b)
 {
-       return iter->l[b->c.level].b == b &&
-               btree_node_lock_seq_matches(iter, b, b->c.level);
+       return path->l[b->c.level].b == b &&
+               btree_node_lock_seq_matches(path, b, b->c.level);
 }
 
-static inline struct btree_iter *
-__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b,
+static inline struct btree_path *
+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
                            unsigned idx)
 {
-       struct btree_iter *iter = __trans_next_iter(trans, idx);
+       struct btree_path *path = __trans_next_path(trans, idx);
 
-       while (iter && !__iter_has_node(iter, b))
-               iter = __trans_next_iter(trans, iter->idx + 1);
+       while (path && !__path_has_node(path, b))
+               path = __trans_next_path(trans, path->idx + 1);
 
-       return iter;
+       return path;
 }
 
-#define trans_for_each_iter_with_node(_trans, _b, _iter)               \
-       for (_iter = __trans_next_iter_with_node((_trans), (_b), 0);    \
-            (_iter);                                                   \
-            _iter = __trans_next_iter_with_node((_trans), (_b),        \
-                                                (_iter)->idx + 1))
+#define trans_for_each_path_with_node(_trans, _b, _path)               \
+       for (_path = __trans_next_path_with_node((_trans), (_b), 0);    \
+            (_path);                                                   \
+            _path = __trans_next_path_with_node((_trans), (_b),        \
+                                                (_path)->idx + 1))
+
+struct btree_path * __must_check
+bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
+                        bool, unsigned long);
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+                       struct bpos, bool, unsigned long);
+int __must_check bch2_btree_path_traverse(struct btree_trans *,
+                                         struct btree_path *, unsigned);
+struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+                                unsigned, unsigned, unsigned, unsigned long);
+inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
 
 #ifdef CONFIG_BCACHEFS_DEBUG
-void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *);
-void bch2_btree_trans_verify_locks(struct btree_trans *);
+void bch2_trans_verify_paths(struct btree_trans *);
+void bch2_trans_verify_locks(struct btree_trans *);
+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id,
+                           struct bpos, bool);
 #else
-static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans,
-                                                struct btree *b) {}
-static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {}
+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {}
+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
+                                         struct bpos pos, bool key_cache) {}
 #endif
 
-void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *,
-                                          struct bkey_packed *);
-void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
-                             struct btree_node_iter *, struct bkey_packed *,
-                             unsigned, unsigned);
+void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
+                                     struct btree *, struct bkey_packed *);
+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
+                             struct btree *, struct btree_node_iter *,
+                             struct bkey_packed *, unsigned, unsigned);
 
-bool bch2_btree_iter_relock_intent(struct btree_iter *);
-bool bch2_btree_iter_relock(struct btree_iter *, unsigned long);
+bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
+
+void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
 
 bool bch2_trans_relock(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
@@ -149,35 +169,39 @@ static inline int btree_trans_restart(struct btree_trans *trans)
        return -EINTR;
 }
 
-bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
+bool bch2_btree_node_upgrade(struct btree_trans *,
+                            struct btree_path *, unsigned);
+
+bool __bch2_btree_path_upgrade(struct btree_trans *,
+                              struct btree_path *, unsigned);
 
-static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+static inline bool bch2_btree_path_upgrade(struct btree_trans *trans,
+                                          struct btree_path *path,
                                           unsigned new_locks_want)
 {
        new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
 
-       return iter->locks_want < new_locks_want
-               ? __bch2_btree_iter_upgrade(iter, new_locks_want)
-               : iter->uptodate <= BTREE_ITER_NEED_PEEK;
+       return path->locks_want < new_locks_want
+               ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+               : path->uptodate == BTREE_ITER_UPTODATE;
 }
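A failed upgrade means the required lock ordering could not be satisfied and the transaction must restart; a sketch of the caller-side pattern:

/* Sketch: take intent locks on the leaf and its parent before an
 * update; on failure, restart the transaction. */
if (!bch2_btree_path_upgrade(trans, path, 2))
        return btree_trans_restart(trans);      /* -EINTR */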
 
-void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
+void __bch2_btree_path_downgrade(struct btree_path *, unsigned);
 
-static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+static inline void bch2_btree_path_downgrade(struct btree_path *path)
 {
-       unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT);
+       unsigned new_locks_want = path->level + !!path->intent_ref;
 
-       if (iter->locks_want > new_locks_want)
-               __bch2_btree_iter_downgrade(iter, new_locks_want);
+       if (path->locks_want > new_locks_want)
+               __bch2_btree_path_downgrade(path, new_locks_want);
 }
 
 void bch2_trans_downgrade(struct btree_trans *);
 
-void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
-void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
-
-void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
+void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
 
+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
 int __must_check bch2_btree_iter_traverse(struct btree_iter *);
 
 struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
@@ -196,17 +220,26 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
 bool bch2_btree_iter_advance(struct btree_iter *);
 bool bch2_btree_iter_rewind(struct btree_iter *);
 
-static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
 {
-       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
-               new_pos.snapshot = iter->snapshot;
-
        iter->k.type = KEY_TYPE_deleted;
        iter->k.p.inode         = iter->pos.inode       = new_pos.inode;
        iter->k.p.offset        = iter->pos.offset      = new_pos.offset;
        iter->k.p.snapshot      = iter->pos.snapshot    = new_pos.snapshot;
        iter->k.size = 0;
-       iter->should_be_locked = false;
+}
+
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+       if (unlikely(iter->update_path))
+               bch2_path_put(iter->trans, iter->update_path,
+                             iter->flags & BTREE_ITER_INTENT);
+       iter->update_path = NULL;
+
+       if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+               new_pos.snapshot = iter->snapshot;
+
+       __bch2_btree_iter_set_pos(iter, new_pos);
 }
 
 static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter)
@@ -215,45 +248,62 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it
        iter->pos = bkey_start_pos(&iter->k);
 }
 
-static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx)
+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot)
 {
-       return idx != U8_MAX ? trans->iters + idx : NULL;
+       struct bpos pos = iter->pos;
+
+       iter->snapshot = snapshot;
+       pos.snapshot = snapshot;
+       bch2_btree_iter_set_pos(iter, pos);
 }
 
-static inline struct btree_iter *btree_iter_child(struct btree_iter *iter)
+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *);
+void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *,
+                         unsigned, struct bpos, unsigned);
+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *,
+                              enum btree_id, struct bpos,
+                              unsigned, unsigned, unsigned);
+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
+
+static inline void set_btree_iter_dontneed(struct btree_iter *iter)
 {
-       return idx_to_btree_iter(iter->trans, iter->child_idx);
+       iter->path->preserve = false;
 }
 
-/*
- * Unlocks before scheduling
- * Note: does not revalidate iterator
- */
-static inline int bch2_trans_cond_resched(struct btree_trans *trans)
+void *bch2_trans_kmalloc(struct btree_trans *, size_t);
+void bch2_trans_begin(struct btree_trans *);
+
+static inline struct btree *
+__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter)
 {
-       if (need_resched() || race_fault()) {
-               bch2_trans_unlock(trans);
-               schedule();
-               return bch2_trans_relock(trans) ? 0 : -EINTR;
-       } else {
-               return 0;
-       }
+       struct btree *b;
+
+       while (b = bch2_btree_iter_peek_node(iter),
+              PTR_ERR_OR_ZERO(b) == -EINTR)
+               bch2_trans_begin(trans);
+
+       return b;
 }
 
-#define __for_each_btree_node(_trans, _iter, _btree_id, _start,        \
-                             _locks_want, _depth, _flags, _b)          \
-       for (iter = bch2_trans_get_node_iter((_trans), (_btree_id),     \
-                               _start, _locks_want, _depth, _flags),   \
-            _b = bch2_btree_iter_peek_node(_iter);                     \
-            (_b);                                                      \
-            (_b) = bch2_btree_iter_next_node(_iter))
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start,                \
+                             _locks_want, _depth, _flags, _b, _ret)    \
+       for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \
+                               _start, _locks_want, _depth, _flags);   \
+            (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\
+            !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b);                   \
+            (_b) = bch2_btree_iter_next_node(&(_iter)))
 
 #define for_each_btree_node(_trans, _iter, _btree_id, _start,          \
-                           _flags, _b)                                 \
+                           _flags, _b, _ret)                           \
        __for_each_btree_node(_trans, _iter, _btree_id, _start,         \
-                             0, 0, _flags, _b)
+                             0, 0, _flags, _b, _ret)
+
+static inline int bkey_err(struct bkey_s_c k)
+{
+       return PTR_ERR_OR_ZERO(k.k);
+}
 
-static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
                                                     unsigned flags)
 {
        return flags & BTREE_ITER_SLOTS
@@ -261,92 +311,62 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
                : bch2_btree_iter_peek(iter);
 }
 
-static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
-                                                    unsigned flags)
+static inline int btree_trans_too_many_iters(struct btree_trans *trans)
 {
-       return flags & BTREE_ITER_SLOTS
-               ? bch2_btree_iter_next_slot(iter)
-               : bch2_btree_iter_next(iter);
+       return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2
+               ? -EINTR : 0;
 }
 
-static inline int bkey_err(struct bkey_s_c k)
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+                                  struct btree_iter *iter, unsigned flags)
 {
-       return PTR_ERR_OR_ZERO(k.k);
+       struct bkey_s_c k;
+
+       while (btree_trans_too_many_iters(trans) ||
+              (k = bch2_btree_iter_peek_type(iter, flags),
+               bkey_err(k) == -EINTR))
+               bch2_trans_begin(trans);
+
+       return k;
 }
 
 #define for_each_btree_key(_trans, _iter, _btree_id,                   \
                           _start, _flags, _k, _ret)                    \
-       for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id),       \
-                                          (_start), (_flags)),         \
-            (_k) = __bch2_btree_iter_peek(_iter, _flags);              \
+       for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),      \
+                                 (_start), (_flags));                  \
+            (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
             !((_ret) = bkey_err(_k)) && (_k).k;                        \
-            (_k) = __bch2_btree_iter_next(_iter, _flags))
+            bch2_btree_iter_advance(&(_iter)))
 
-#define for_each_btree_key_continue(_iter, _flags, _k, _ret)           \
-       for ((_k) = __bch2_btree_iter_peek(_iter, _flags);              \
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id,         \
+                          _start, _flags, _k, _ret)                    \
+       for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),      \
+                                 (_start), (_flags));                  \
+            (_k) = bch2_btree_iter_peek_type(&(_iter), _flags),        \
             !((_ret) = bkey_err(_k)) && (_k).k;                        \
-            (_k) = __bch2_btree_iter_next(_iter, _flags))
-
-/* new multiple iterator interface: */
-
-int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
-int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
+            bch2_btree_iter_advance(&(_iter)))
 
-void bch2_trans_unlink_iters(struct btree_trans *);
-
-struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
-                                        struct bpos, unsigned,
-                                        unsigned, unsigned);
-
-static inline struct btree_iter *
-bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
-                   struct bpos pos, unsigned flags)
-{
-       struct btree_iter *iter =
-               __bch2_trans_get_iter(trans, btree_id, pos,
-                                     (flags & BTREE_ITER_INTENT) != 0, 0,
-                                     flags);
-       iter->ip_allocated = _THIS_IP_;
-       return iter;
-}
-
-struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
-                                       struct btree_iter *);
-static inline struct btree_iter *
-bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
-{
-       struct btree_iter *iter =
-               __bch2_trans_copy_iter(trans, src);
-
-       iter->ip_allocated = _THIS_IP_;
-       return iter;
-}
-
-struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
-                               enum btree_id, struct bpos,
-                               unsigned, unsigned, unsigned);
-
-static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter)
-{
-       return (trans->iters_live & (1ULL << iter->idx)) != 0;
-}
+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret)   \
+       for (;                                                          \
+            (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
+            !((_ret) = bkey_err(_k)) && (_k).k;                        \
+            bch2_btree_iter_advance(&(_iter)))
 
-static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter)
-{
-       return btree_iter_live(trans, iter) ||
-               (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
-}
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+       for (;                                                          \
+            (_k) = bch2_btree_iter_peek_type(&(_iter), _flags),        \
+            !((_ret) = bkey_err(_k)) && (_k).k;                        \
+            bch2_btree_iter_advance(&(_iter)))
 
-static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
-{
-       trans->iters_touched &= ~(1ULL << iter->idx);
-}
+/* new multiple iterator interface: */
 
-void bch2_trans_begin(struct btree_trans *);
+void bch2_dump_trans_paths_updates(struct btree_trans *);
+void __bch2_trans_init(struct btree_trans *, struct bch_fs *,
+                      unsigned, size_t, const char *);
+void bch2_trans_exit(struct btree_trans *);
 
-void *bch2_trans_kmalloc(struct btree_trans *, size_t);
-void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
-int bch2_trans_exit(struct btree_trans *);
+#define bch2_trans_init(...)   __bch2_trans_init(__VA_ARGS__, __func__)
 
 void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
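
For reference, a minimal sketch (not part of the commit) of how a caller drives the reworked iterator API declared above: transactions now wrap on-stack iterators, for_each_btree_key() takes the iterator by name rather than by pointer, and bch2_trans_iter_exit() must be called once the loop is done. The function name, btree ID, flags and empty loop body are placeholders:

static int walk_inodes_example(struct bch_fs *c)
{
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);

        for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret)
                ; /* inspect k here; -EINTR restarts are retried via bch2_trans_begin() */

        bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
}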
 
index e327ef39d4329512f3e576b6f2bed239ea1c0272..928aab61bcf6c25877700e61ec1e088523945653 100644
@@ -146,23 +146,32 @@ bkey_cached_reuse(struct btree_key_cache *c)
 }
 
 static struct bkey_cached *
-btree_key_cache_create(struct btree_key_cache *c,
+btree_key_cache_create(struct bch_fs *c,
                       enum btree_id btree_id,
                       struct bpos pos)
 {
+       struct btree_key_cache *bc = &c->btree_key_cache;
        struct bkey_cached *ck;
        bool was_new = true;
 
-       ck = bkey_cached_alloc(c);
+       ck = bkey_cached_alloc(bc);
 
        if (unlikely(!ck)) {
-               ck = bkey_cached_reuse(c);
-               if (unlikely(!ck))
+               ck = bkey_cached_reuse(bc);
+               if (unlikely(!ck)) {
+                       bch_err(c, "error allocating memory for key cache item, btree %s",
+                               bch2_btree_ids[btree_id]);
                        return ERR_PTR(-ENOMEM);
+               }
 
                was_new = false;
        }
 
+       if (btree_id == BTREE_ID_subvolumes)
+               six_lock_pcpu_alloc(&ck->c.lock);
+       else
+               six_lock_pcpu_free(&ck->c.lock);
+
        ck->c.level             = 0;
        ck->c.btree_id          = btree_id;
        ck->key.btree_id        = btree_id;
@@ -170,7 +179,7 @@ btree_key_cache_create(struct btree_key_cache *c,
        ck->valid               = false;
        ck->flags               = 1U << BKEY_CACHED_ACCESSED;
 
-       if (unlikely(rhashtable_lookup_insert_fast(&c->table,
+       if (unlikely(rhashtable_lookup_insert_fast(&bc->table,
                                          &ck->hash,
                                          bch2_btree_key_cache_params))) {
                /* We raced with another fill: */
@@ -180,15 +189,15 @@ btree_key_cache_create(struct btree_key_cache *c,
                        six_unlock_intent(&ck->c.lock);
                        kfree(ck);
                } else {
-                       mutex_lock(&c->lock);
-                       bkey_cached_free(c, ck);
-                       mutex_unlock(&c->lock);
+                       mutex_lock(&bc->lock);
+                       bkey_cached_free(bc, ck);
+                       mutex_unlock(&bc->lock);
                }
 
                return NULL;
        }
 
-       atomic_long_inc(&c->nr_keys);
+       atomic_long_inc(&bc->nr_keys);
 
        six_unlock_write(&ck->c.lock);
 
@@ -196,24 +205,27 @@ btree_key_cache_create(struct btree_key_cache *c,
 }
 
 static int btree_key_cache_fill(struct btree_trans *trans,
-                               struct btree_iter *ck_iter,
+                               struct btree_path *ck_path,
                                struct bkey_cached *ck)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
        struct bkey_s_c k;
        unsigned new_u64s = 0;
        struct bkey_i *new_k = NULL;
+       struct bkey u;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, ck->key.btree_id,
-                                  ck->key.pos, BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
+       path = bch2_path_get(trans, ck->key.btree_id,
+                            ck->key.pos, 0, 0, 0, _THIS_IP_);
+       ret = bch2_btree_path_traverse(trans, path, 0);
        if (ret)
                goto err;
 
-       if (!bch2_btree_node_relock(ck_iter, 0)) {
-               trace_transaction_restart_ip(trans->ip, _THIS_IP_);
+       k = bch2_btree_path_peek_slot(path, &u);
+
+       if (!bch2_btree_node_relock(trans, ck_path, 0)) {
+               trace_trans_restart_relock_key_cache_fill(trans->fn,
+                               _THIS_IP_, ck_path->btree_id, &ck_path->pos);
                ret = btree_trans_restart(trans);
                goto err;
        }
@@ -228,6 +240,8 @@ static int btree_key_cache_fill(struct btree_trans *trans,
                new_u64s = roundup_pow_of_two(new_u64s);
                new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
                if (!new_k) {
+                       bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
+                               bch2_btree_ids[ck->key.btree_id], new_u64s);
                        ret = -ENOMEM;
                        goto err;
                }
@@ -237,7 +251,7 @@ static int btree_key_cache_fill(struct btree_trans *trans,
         * XXX: not allowed to be holding read locks when we take a write lock,
         * currently
         */
-       bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter);
+       bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b);
        if (new_k) {
                kfree(ck->k);
                ck->u64s = new_u64s;
@@ -246,93 +260,91 @@ static int btree_key_cache_fill(struct btree_trans *trans,
 
        bkey_reassemble(ck->k, k);
        ck->valid = true;
-       bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);
+       bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
 
        /* We're not likely to need this iterator again: */
-       set_btree_iter_dontneed(trans, iter);
+       path->preserve = false;
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_path_put(trans, path, 0);
        return ret;
 }
 
 static int bkey_cached_check_fn(struct six_lock *lock, void *p)
 {
        struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
-       const struct btree_iter *iter = p;
+       const struct btree_path *path = p;
 
-       return ck->key.btree_id == iter->btree_id &&
-               !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1;
+       return ck->key.btree_id == path->btree_id &&
+               !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1;
 }
 
 __flatten
-int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path,
+                                   unsigned flags)
 {
-       struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
        struct bkey_cached *ck;
        int ret = 0;
 
-       BUG_ON(iter->level);
+       BUG_ON(path->level);
 
-       iter->l[1].b = NULL;
+       path->l[1].b = NULL;
 
-       if (bch2_btree_node_relock(iter, 0)) {
-               ck = (void *) iter->l[0].b;
+       if (bch2_btree_node_relock(trans, path, 0)) {
+               ck = (void *) path->l[0].b;
                goto fill;
        }
 retry:
-       ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
+       ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos);
        if (!ck) {
-               if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
-                       iter->l[0].b = NULL;
+               if (flags & BTREE_ITER_CACHED_NOCREATE) {
+                       path->l[0].b = NULL;
                        return 0;
                }
 
-               ck = btree_key_cache_create(&c->btree_key_cache,
-                                           iter->btree_id, iter->pos);
+               ck = btree_key_cache_create(c, path->btree_id, path->pos);
                ret = PTR_ERR_OR_ZERO(ck);
                if (ret)
                        goto err;
                if (!ck)
                        goto retry;
 
-               mark_btree_node_locked(iter, 0, SIX_LOCK_intent);
-               iter->locks_want = 1;
+               mark_btree_node_locked(path, 0, SIX_LOCK_intent);
+               path->locks_want = 1;
        } else {
-               enum six_lock_type lock_want = __btree_lock_want(iter, 0);
+               enum six_lock_type lock_want = __btree_lock_want(path, 0);
 
-               if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
-                                    bkey_cached_check_fn, iter, _THIS_IP_)) {
+               if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0,
+                                    lock_want,
+                                    bkey_cached_check_fn, path, _THIS_IP_)) {
                        if (!trans->restarted)
                                goto retry;
 
-                       trace_transaction_restart_ip(trans->ip, _THIS_IP_);
                        ret = -EINTR;
                        goto err;
                }
 
-               if (ck->key.btree_id != iter->btree_id ||
-                   bpos_cmp(ck->key.pos, iter->pos)) {
+               if (ck->key.btree_id != path->btree_id ||
+                   bpos_cmp(ck->key.pos, path->pos)) {
                        six_unlock_type(&ck->c.lock, lock_want);
                        goto retry;
                }
 
-               mark_btree_node_locked(iter, 0, lock_want);
+               mark_btree_node_locked(path, 0, lock_want);
        }
 
-       iter->l[0].lock_seq     = ck->c.lock.state.seq;
-       iter->l[0].b            = (void *) ck;
+       path->l[0].lock_seq     = ck->c.lock.state.seq;
+       path->l[0].b            = (void *) ck;
 fill:
-       if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) {
-               if (!iter->locks_want &&
-                   !!__bch2_btree_iter_upgrade(iter, 1)) {
-                       trace_transaction_restart_ip(trans->ip, _THIS_IP_);
-                       BUG_ON(!trans->restarted);
-                       ret = -EINTR;
+       if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) {
+               if (!path->locks_want &&
+                   !__bch2_btree_path_upgrade(trans, path, 1)) {
+                       trace_transaction_restart_ip(trans->fn, _THIS_IP_);
+                       ret = btree_trans_restart(trans);
                        goto err;
                }
 
-               ret = btree_key_cache_fill(trans, iter, ck);
+               ret = btree_key_cache_fill(trans, path, ck);
                if (ret)
                        goto err;
        }
@@ -340,22 +352,14 @@ fill:
        if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
                set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
 
-       iter->uptodate = BTREE_ITER_NEED_PEEK;
-
-       if ((iter->flags & BTREE_ITER_INTENT) &&
-           !bch2_btree_iter_upgrade(iter, 1)) {
-               BUG_ON(!trans->restarted);
-               ret = -EINTR;
-       }
-
-       BUG_ON(!ret && !btree_node_locked(iter, 0));
+       path->uptodate = BTREE_ITER_UPTODATE;
+       BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0));
 
        return ret;
 err:
        if (ret != -EINTR) {
-               btree_node_unlock(iter, 0);
-               iter->flags |= BTREE_ITER_ERROR;
-               iter->l[0].b = BTREE_ITER_NO_NODE_ERROR;
+               btree_node_unlock(path, 0);
+               path->l[0].b = BTREE_ITER_NO_NODE_ERROR;
        }
        return ret;
 }
@@ -368,40 +372,48 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
-       struct btree_iter *c_iter = NULL, *b_iter = NULL;
+       struct btree_iter c_iter, b_iter;
        struct bkey_cached *ck = NULL;
        int ret;
 
-       b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
-                                    BTREE_ITER_SLOTS|
-                                    BTREE_ITER_INTENT);
-       c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
-                                    BTREE_ITER_CACHED|
-                                    BTREE_ITER_CACHED_NOFILL|
-                                    BTREE_ITER_CACHED_NOCREATE|
-                                    BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(c_iter);
+       bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos,
+                            BTREE_ITER_SLOTS|
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos,
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_CACHED_NOFILL|
+                            BTREE_ITER_CACHED_NOCREATE|
+                            BTREE_ITER_INTENT);
+       b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
+       ret = bch2_btree_iter_traverse(&c_iter);
        if (ret)
                goto out;
 
-       ck = (void *) c_iter->l[0].b;
-       if (!ck ||
-           (journal_seq && ck->journal.seq != journal_seq))
+       ck = (void *) c_iter.path->l[0].b;
+       if (!ck)
                goto out;
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               if (!evict)
-                       goto out;
-               goto evict;
+               if (evict)
+                       goto evict;
+               goto out;
        }
 
+       BUG_ON(!ck->valid);
+
+       if (journal_seq && ck->journal.seq != journal_seq)
+               goto out;
+
        /*
         * Since journal reclaim depends on us making progress here, and the
         * allocator/copygc depend on journal reclaim making progress, we need
         * to be using alloc reserves:
         */
-       ret   = bch2_btree_iter_traverse(b_iter) ?:
-               bch2_trans_update(trans, b_iter, ck->k,
+       ret   = bch2_btree_iter_traverse(&b_iter) ?:
+               bch2_trans_update(trans, &b_iter, ck->k,
+                                 BTREE_UPDATE_KEY_CACHE_RECLAIM|
                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
                                  BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
@@ -423,7 +435,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
        bch2_journal_pin_drop(j, &ck->journal);
        bch2_journal_preres_put(j, &ck->res);
 
-       BUG_ON(!btree_node_locked(c_iter, 0));
+       BUG_ON(!btree_node_locked(c_iter.path, 0));
 
        if (!evict) {
                if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -432,10 +444,10 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                }
        } else {
 evict:
-               BUG_ON(!btree_node_intent_locked(c_iter, 0));
+               BUG_ON(!btree_node_intent_locked(c_iter.path, 0));
 
-               mark_btree_node_unlocked(c_iter, 0);
-               c_iter->l[0].b = NULL;
+               mark_btree_node_unlocked(c_iter.path, 0);
+               c_iter.path->l[0].b = NULL;
 
                six_lock_write(&ck->c.lock, NULL, NULL);
 
@@ -451,8 +463,8 @@ evict:
                mutex_unlock(&c->btree_key_cache.lock);
        }
 out:
-       bch2_trans_iter_put(trans, b_iter);
-       bch2_trans_iter_put(trans, c_iter);
+       bch2_trans_iter_exit(trans, &b_iter);
+       bch2_trans_iter_exit(trans, &c_iter);
        return ret;
 }
 
@@ -503,11 +515,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
 }
 
 bool bch2_btree_insert_key_cached(struct btree_trans *trans,
-                                 struct btree_iter *iter,
+                                 struct btree_path *path,
                                  struct bkey_i *insert)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_cached *ck = (void *) iter->l[0].b;
+       struct bkey_cached *ck = (void *) path->l[0].b;
        bool kick_reclaim = false;
 
        BUG_ON(insert->u64s > ck->u64s);
@@ -664,11 +676,12 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
 
        rcu_read_lock();
        tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
-       for (i = 0; i < tbl->size; i++)
-               rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
-                       bkey_cached_evict(bc, ck);
-                       list_add(&ck->list, &bc->freed);
-               }
+       if (tbl)
+               for (i = 0; i < tbl->size; i++)
+                       rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
+                               bkey_cached_evict(bc, ck);
+                               list_add(&ck->list, &bc->freed);
+                       }
        rcu_read_unlock();
 
        list_for_each_entry_safe(ck, n, &bc->freed, list) {
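
As a usage sketch (hypothetical helper, not in the commit): flushing a single cached key from outside this file now goes through a transaction, e.g. via the bch2_trans_do() wrapper from btree_update.h, which runs the body inside a transaction, retrying restarts, and exits the transaction for the caller:

static int flush_one_example(struct bch_fs *c, enum btree_id id, struct bpos pos)
{
        return bch2_trans_do(c, NULL, NULL, 0,
                             bch2_btree_key_cache_flush(&trans, id, pos));
}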
index 7e2b0a08f745255b3b5c6986f4837d1674e00ec1..b3d241b134539e545a44557afd7fb16ebe87f4cf 100644
@@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
        size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
        size_t max_dirty = 4096 + (nr_keys * 3) / 4;
 
-       return nr_dirty > max_dirty &&
-               test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+       return nr_dirty > max_dirty;
 }
 
 int bch2_btree_key_cache_journal_flush(struct journal *,
@@ -26,10 +25,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *,
 struct bkey_cached *
 bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
 
-int bch2_btree_iter_traverse_cached(struct btree_iter *);
+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
+                                   unsigned);
 
 bool bch2_btree_insert_key_cached(struct btree_trans *,
-                       struct btree_iter *, struct bkey_i *);
+                       struct btree_path *, struct bkey_i *);
 int bch2_btree_key_cache_flush(struct btree_trans *,
                               enum btree_id, struct bpos);
 #ifdef CONFIG_BCACHEFS_DEBUG
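
To put the threshold above in numbers: with nr_keys = 100,000 cached entries, max_dirty = 4096 + (100,000 * 3) / 4 = 79,096, i.e. journal reclaim is only forced to wait once more than ~79k of those entries are dirty. Note that the JOURNAL_RECLAIM_STARTED gate is dropped, so the dirty-count check now applies from the moment the key cache is in use.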
index 7532bcdef96732b44bafac02aeb77ae92a82ebbd..b4434eca0746c7635c10b291534f526661dc4cf6 100644
@@ -21,7 +21,7 @@ enum btree_node_locked_type {
        BTREE_NODE_INTENT_LOCKED        = SIX_LOCK_intent,
 };
 
-static inline int btree_node_locked_type(struct btree_iter *iter,
+static inline int btree_node_locked_type(struct btree_path *path,
                                         unsigned level)
 {
        /*
@@ -30,35 +30,35 @@ static inline int btree_node_locked_type(struct btree_iter *iter,
         * branches:
         */
        return BTREE_NODE_UNLOCKED +
-               ((iter->nodes_locked >> level) & 1) +
-               ((iter->nodes_intent_locked >> level) & 1);
+               ((path->nodes_locked >> level) & 1) +
+               ((path->nodes_intent_locked >> level) & 1);
 }
 
-static inline bool btree_node_intent_locked(struct btree_iter *iter,
+static inline bool btree_node_intent_locked(struct btree_path *path,
                                            unsigned level)
 {
-       return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
+       return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED;
 }
 
-static inline bool btree_node_read_locked(struct btree_iter *iter,
+static inline bool btree_node_read_locked(struct btree_path *path,
                                          unsigned level)
 {
-       return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
+       return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED;
 }
 
-static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
+static inline bool btree_node_locked(struct btree_path *path, unsigned level)
 {
-       return iter->nodes_locked & (1 << level);
+       return path->nodes_locked & (1 << level);
 }
 
-static inline void mark_btree_node_unlocked(struct btree_iter *iter,
+static inline void mark_btree_node_unlocked(struct btree_path *path,
                                            unsigned level)
 {
-       iter->nodes_locked &= ~(1 << level);
-       iter->nodes_intent_locked &= ~(1 << level);
+       path->nodes_locked &= ~(1 << level);
+       path->nodes_intent_locked &= ~(1 << level);
 }
 
-static inline void mark_btree_node_locked(struct btree_iter *iter,
+static inline void mark_btree_node_locked(struct btree_path *path,
                                          unsigned level,
                                          enum six_lock_type type)
 {
@@ -66,52 +66,52 @@ static inline void mark_btree_node_locked(struct btree_iter *iter,
        BUILD_BUG_ON(SIX_LOCK_read   != 0);
        BUILD_BUG_ON(SIX_LOCK_intent != 1);
 
-       iter->nodes_locked |= 1 << level;
-       iter->nodes_intent_locked |= type << level;
+       path->nodes_locked |= 1 << level;
+       path->nodes_intent_locked |= type << level;
 }
 
-static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
+static inline void mark_btree_node_intent_locked(struct btree_path *path,
                                                 unsigned level)
 {
-       mark_btree_node_locked(iter, level, SIX_LOCK_intent);
+       mark_btree_node_locked(path, level, SIX_LOCK_intent);
 }
 
-static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level)
 {
-       return level < iter->locks_want
+       return level < path->locks_want
                ? SIX_LOCK_intent
                : SIX_LOCK_read;
 }
 
 static inline enum btree_node_locked_type
-btree_lock_want(struct btree_iter *iter, int level)
+btree_lock_want(struct btree_path *path, int level)
 {
-       if (level < iter->level)
+       if (level < path->level)
                return BTREE_NODE_UNLOCKED;
-       if (level < iter->locks_want)
+       if (level < path->locks_want)
                return BTREE_NODE_INTENT_LOCKED;
-       if (level == iter->level)
+       if (level == path->level)
                return BTREE_NODE_READ_LOCKED;
        return BTREE_NODE_UNLOCKED;
 }
 
-static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
+static inline void btree_node_unlock(struct btree_path *path, unsigned level)
 {
-       int lock_type = btree_node_locked_type(iter, level);
+       int lock_type = btree_node_locked_type(path, level);
 
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 
        if (lock_type != BTREE_NODE_UNLOCKED)
-               six_unlock_type(&iter->l[level].b->c.lock, lock_type);
-       mark_btree_node_unlocked(iter, level);
+               six_unlock_type(&path->l[level].b->c.lock, lock_type);
+       mark_btree_node_unlocked(path, level);
 }
 
-static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+static inline void __bch2_btree_path_unlock(struct btree_path *path)
 {
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+       btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
 
-       while (iter->nodes_locked)
-               btree_node_unlock(iter, __ffs(iter->nodes_locked));
+       while (path->nodes_locked)
+               btree_node_unlock(path, __ffs(path->nodes_locked));
 }
 
 static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
@@ -128,23 +128,35 @@ static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
        }
 }
 
-/*
- * wrapper around six locks that just traces lock contended time
- */
-static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
-                                         enum six_lock_type type)
+static inline bool btree_node_lock_type(struct btree_trans *trans,
+                                      struct btree_path *path,
+                                      struct btree *b,
+                                      struct bpos pos, unsigned level,
+                                      enum six_lock_type type,
+                                      six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
-       u64 start_time = local_clock();
+       struct bch_fs *c = trans->c;
+       u64 start_time;
+       bool ret;
 
-       six_lock_type(&b->c.lock, type, NULL, NULL);
-       bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
-}
+       if (six_trylock_type(&b->c.lock, type))
+               return true;
 
-static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
-                                       enum six_lock_type type)
-{
-       if (!six_trylock_type(&b->c.lock, type))
-               __btree_node_lock_type(c, b, type);
+       start_time = local_clock();
+
+       trans->locking_path_idx = path->idx;
+       trans->locking_pos      = pos;
+       trans->locking_btree_id = path->btree_id;
+       trans->locking_level    = level;
+       trans->locking_lock_type = type;
+       trans->locking          = b;
+       ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
+       trans->locking = NULL;
+
+       if (ret)
+               bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
+
+       return ret;
 }
 
 /*
@@ -155,11 +167,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
                                             struct btree *b, unsigned level,
                                             enum btree_node_locked_type want)
 {
-       struct btree_iter *iter;
+       struct btree_path *path;
 
-       trans_for_each_iter(trans, iter)
-               if (iter->l[level].b == b &&
-                   btree_node_locked_type(iter, level) >= want) {
+       trans_for_each_path(trans, path)
+               if (path->l[level].b == b &&
+                   btree_node_locked_type(path, level) >= want) {
                        six_lock_increment(&b->c.lock, want);
                        return true;
                }
@@ -167,40 +179,39 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
        return false;
 }
 
-bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
-                           struct btree_iter *, enum six_lock_type,
+bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *,
+                           struct btree *, struct bpos, unsigned,
+                           enum six_lock_type,
                            six_lock_should_sleep_fn, void *,
                            unsigned long);
 
-static inline bool btree_node_lock(struct btree *b,
-                       struct bpos pos, unsigned level,
-                       struct btree_iter *iter,
+static inline bool btree_node_lock(struct btree_trans *trans,
+                       struct btree_path *path,
+                       struct btree *b, struct bpos pos, unsigned level,
                        enum six_lock_type type,
                        six_lock_should_sleep_fn should_sleep_fn, void *p,
                        unsigned long ip)
 {
-       struct btree_trans *trans = iter->trans;
-
        EBUG_ON(level >= BTREE_MAX_DEPTH);
-       EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
+       EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
 
        return likely(six_trylock_type(&b->c.lock, type)) ||
                btree_node_lock_increment(trans, b, level, type) ||
-               __bch2_btree_node_lock(b, pos, level, iter, type,
+               __bch2_btree_node_lock(trans, path, b, pos, level, type,
                                       should_sleep_fn, p, ip);
 }
 
-bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned);
 
-static inline bool bch2_btree_node_relock(struct btree_iter *iter,
-                                         unsigned level)
+static inline bool bch2_btree_node_relock(struct btree_trans *trans,
+                                         struct btree_path *path, unsigned level)
 {
-       EBUG_ON(btree_node_locked(iter, level) &&
-               btree_node_locked_type(iter, level) !=
-               __btree_lock_want(iter, level));
+       EBUG_ON(btree_node_locked(path, level) &&
+               btree_node_locked_type(path, level) !=
+               __btree_lock_want(path, level));
 
-       return likely(btree_node_locked(iter, level)) ||
-               __bch2_btree_node_relock(iter, level);
+       return likely(btree_node_locked(path, level)) ||
+               __bch2_btree_node_relock(trans, path, level);
 }
 
 /*
@@ -208,30 +219,35 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter,
  * succeed:
  */
 static inline void
-bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path,
+                                    struct btree *b)
 {
-       struct btree_iter *linked;
+       struct btree_path *linked;
 
-       EBUG_ON(iter->l[b->c.level].b != b);
-       EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
+       EBUG_ON(path->l[b->c.level].b != b);
+       EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq);
 
-       trans_for_each_iter_with_node(iter->trans, b, linked)
+       trans_for_each_path_with_node(trans, b, linked)
                linked->l[b->c.level].lock_seq += 2;
 
        six_unlock_write(&b->c.lock);
 }
 
-void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
+void bch2_btree_node_unlock_write(struct btree_trans *,
+                       struct btree_path *, struct btree *);
 
-void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
+void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *);
 
-static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
+static inline void bch2_btree_node_lock_write(struct btree_trans *trans,
+                                             struct btree_path *path,
+                                             struct btree *b)
 {
-       EBUG_ON(iter->l[b->c.level].b != b);
-       EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq);
+       EBUG_ON(path->l[b->c.level].b != b);
+       EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq);
+       EBUG_ON(!btree_node_intent_locked(path, b->c.level));
 
        if (unlikely(!six_trylock_write(&b->c.lock)))
-               __bch2_btree_node_lock_write(b, iter);
+               __bch2_btree_node_lock_write(trans, b);
 }
 
 #endif /* _BCACHEFS_BTREE_LOCKING_H */
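
A worked check of the branchless encoding in btree_node_locked_type(): since SIX_LOCK_read = 0 and SIX_LOCK_intent = 1 (pinned by the BUILD_BUG_ONs above), BTREE_NODE_UNLOCKED must sit one below SIX_LOCK_read, and mark_btree_node_locked() sets the per-level nodes_locked bit for both lock types but the nodes_intent_locked bit only for intent locks:

        nodes_locked bit   nodes_intent_locked bit   result
               0                    0                BTREE_NODE_UNLOCKED
               1                    0                BTREE_NODE_READ_LOCKED
               1                    1                BTREE_NODE_INTENT_LOCKED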
index a1e5debf19f3623d10366b399fd944813caa71a4..68272f26f0171f889b3986a981ddb9c771c517f7 100644
@@ -176,52 +176,47 @@ struct btree_node_iter {
        } data[MAX_BSETS];
 };
 
-enum btree_iter_type {
-       BTREE_ITER_KEYS,
-       BTREE_ITER_NODES,
-       BTREE_ITER_CACHED,
-};
-
-#define BTREE_ITER_TYPE                        ((1 << 2) - 1)
-
 /*
  * Iterate over all possible positions, synthesizing deleted keys for holes:
  */
-#define BTREE_ITER_SLOTS               (1 << 2)
+#define BTREE_ITER_SLOTS               (1 << 0)
 /*
  * Indicates that intent locks should be taken on leaf nodes, because we expect
  * to be doing updates:
  */
-#define BTREE_ITER_INTENT              (1 << 3)
+#define BTREE_ITER_INTENT              (1 << 1)
 /*
  * Causes the btree iterator code to prefetch additional btree nodes from disk:
  */
-#define BTREE_ITER_PREFETCH            (1 << 4)
+#define BTREE_ITER_PREFETCH            (1 << 2)
 /*
  * Indicates that this iterator should not be reused until transaction commit,
  * either because a pending update references it or because the update depends
  * on that particular key being locked (e.g. by the str_hash code, for hash
  * table consistency)
  */
-#define BTREE_ITER_KEEP_UNTIL_COMMIT   (1 << 5)
+#define BTREE_ITER_KEEP_UNTIL_COMMIT   (1 << 3)
 /*
  * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
  * @pos or the first key strictly greater than @pos
  */
-#define BTREE_ITER_IS_EXTENTS          (1 << 6)
-#define BTREE_ITER_NOT_EXTENTS         (1 << 7)
-#define BTREE_ITER_ERROR               (1 << 8)
-#define BTREE_ITER_SET_POS_AFTER_COMMIT        (1 << 9)
-#define BTREE_ITER_CACHED_NOFILL       (1 << 10)
-#define BTREE_ITER_CACHED_NOCREATE     (1 << 11)
-#define BTREE_ITER_WITH_UPDATES                (1 << 12)
+#define BTREE_ITER_IS_EXTENTS          (1 << 4)
+#define BTREE_ITER_NOT_EXTENTS         (1 << 5)
+#define BTREE_ITER_CACHED              (1 << 6)
+#define BTREE_ITER_CACHED_NOFILL       (1 << 7)
+#define BTREE_ITER_CACHED_NOCREATE     (1 << 8)
+#define BTREE_ITER_WITH_KEY_CACHE      (1 << 9)
+#define BTREE_ITER_WITH_UPDATES                (1 << 10)
+#define BTREE_ITER_WITH_JOURNAL                (1 << 11)
+#define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 12)
 #define BTREE_ITER_ALL_SNAPSHOTS       (1 << 13)
+#define BTREE_ITER_FILTER_SNAPSHOTS    (1 << 14)
+#define BTREE_ITER_NOPRESERVE          (1 << 15)
 
-enum btree_iter_uptodate {
+enum btree_path_uptodate {
        BTREE_ITER_UPTODATE             = 0,
-       BTREE_ITER_NEED_PEEK            = 1,
-       BTREE_ITER_NEED_RELOCK          = 2,
-       BTREE_ITER_NEED_TRAVERSE        = 3,
+       BTREE_ITER_NEED_RELOCK          = 1,
+       BTREE_ITER_NEED_TRAVERSE        = 2,
 };
 
 #define BTREE_ITER_NO_NODE_GET_LOCKS   ((struct btree *) 1)
@@ -233,74 +228,78 @@ enum btree_iter_uptodate {
 #define BTREE_ITER_NO_NODE_ERROR       ((struct btree *) 7)
 #define BTREE_ITER_NO_NODE_CACHED      ((struct btree *) 8)
 
-/*
- * @pos                        - iterator's current position
- * @level              - current btree depth
- * @locks_want         - btree level below which we start taking intent locks
- * @nodes_locked       - bitmask indicating which nodes in @nodes are locked
- * @nodes_intent_locked        - bitmask indicating which locks are intent locks
- */
-struct btree_iter {
-       struct btree_trans      *trans;
-       unsigned long           ip_allocated;
-
+struct btree_path {
        u8                      idx;
-       u8                      child_idx;
        u8                      sorted_idx;
+       u8                      ref;
+       u8                      intent_ref;
 
        /* btree_iter_copy starts here: */
-       u16                     flags;
-
-       /* When we're filtering by snapshot, the snapshot ID we're looking for: */
-       unsigned                snapshot;
-
        struct bpos             pos;
-       struct bpos             real_pos;
-       struct bpos             pos_after_commit;
 
        enum btree_id           btree_id:4;
-       enum btree_iter_uptodate uptodate:3;
+       bool                    cached:1;
+       bool                    preserve:1;
+       enum btree_path_uptodate uptodate:2;
        /*
-        * True if we've returned a key (and thus are expected to keep it
-        * locked), false after set_pos - for avoiding spurious transaction
-        * restarts in bch2_trans_relock():
+        * When true, failing to relock this path will cause the transaction to
+        * restart:
         */
        bool                    should_be_locked:1;
-       unsigned                level:4,
-                               min_depth:4,
+       unsigned                level:3,
                                locks_want:4,
                                nodes_locked:4,
                                nodes_intent_locked:4;
 
-       struct btree_iter_level {
+       struct btree_path_level {
                struct btree    *b;
                struct btree_node_iter iter;
                u32             lock_seq;
        }                       l[BTREE_MAX_DEPTH];
+#ifdef CONFIG_BCACHEFS_DEBUG
+       unsigned long           ip_allocated;
+#endif
+};
+
+static inline struct btree_path_level *path_l(struct btree_path *path)
+{
+       return path->l + path->level;
+}
+
+/*
+ * @pos                        - iterator's current position
+ * @level              - current btree depth
+ * @locks_want         - btree level below which we start taking intent locks
+ * @nodes_locked       - bitmask indicating which nodes in @nodes are locked
+ * @nodes_intent_locked        - bitmask indicating which locks are intent locks
+ */
+struct btree_iter {
+       struct btree_trans      *trans;
+       struct btree_path       *path;
+       struct btree_path       *update_path;
+       struct btree_path       *key_cache_path;
 
+       enum btree_id           btree_id:4;
+       unsigned                min_depth:4;
+
+       /* btree_iter_copy starts here: */
+       u16                     flags;
+
+       /* When we're filtering by snapshot, the snapshot ID we're looking for: */
+       unsigned                snapshot;
+
+       struct bpos             pos;
+       struct bpos             pos_after_commit;
        /*
         * Current unpacked key - so that bch2_btree_iter_next()/
         * bch2_btree_iter_next_slot() can correctly advance pos.
         */
        struct bkey             k;
+#ifdef CONFIG_BCACHEFS_DEBUG
+       unsigned long           ip_allocated;
+#endif
 };
 
-static inline enum btree_iter_type
-btree_iter_type(const struct btree_iter *iter)
-{
-       return iter->flags & BTREE_ITER_TYPE;
-}
-
-static inline bool btree_iter_is_cached(const struct btree_iter *iter)
-{
-       return btree_iter_type(iter) == BTREE_ITER_CACHED;
-}
-
-static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
-{
-       return iter->l + iter->level;
-}
-
 struct btree_key_cache {
        struct mutex            lock;
        struct rhashtable       table;
@@ -345,9 +344,12 @@ struct btree_insert_entry {
        u8                      bkey_type;
        enum btree_id           btree_id:8;
        u8                      level;
-       unsigned                trans_triggers_run:1;
+       bool                    cached:1;
+       bool                    insert_trigger_run:1;
+       bool                    overwrite_trigger_run:1;
        struct bkey_i           *k;
-       struct btree_iter       *iter;
+       struct btree_path       *path;
+       unsigned long           ip_allocated;
 };
 
 #ifndef CONFIG_LOCKDEP
@@ -368,40 +370,37 @@ struct btree_trans_commit_hook {
 
 struct btree_trans {
        struct bch_fs           *c;
-#ifdef CONFIG_BCACHEFS_DEBUG
+       const char              *fn;
        struct list_head        list;
        struct btree            *locking;
-       unsigned                locking_iter_idx;
+       unsigned                locking_path_idx;
        struct bpos             locking_pos;
        u8                      locking_btree_id;
        u8                      locking_level;
+       u8                      locking_lock_type;
        pid_t                   pid;
-#endif
-       unsigned long           ip;
        int                     srcu_idx;
 
        u8                      nr_sorted;
        u8                      nr_updates;
        bool                    used_mempool:1;
-       bool                    error:1;
        bool                    in_traverse_all:1;
        bool                    restarted:1;
+       bool                    journal_transaction_names:1;
        /*
         * For when bch2_trans_update notices we'll be splitting a compressed
         * extent:
         */
        unsigned                extra_journal_res;
 
-       u64                     iters_linked;
-       u64                     iters_live;
-       u64                     iters_touched;
+       u64                     paths_allocated;
 
        unsigned                mem_top;
        unsigned                mem_bytes;
        void                    *mem;
 
-       u8                      *sorted;
-       struct btree_iter       *iters;
+       u8                      sorted[BTREE_ITER_MAX];
+       struct btree_path       *paths;
        struct btree_insert_entry *updates;
 
        /* update path: */
@@ -605,16 +604,6 @@ static inline bool btree_node_is_extents(struct btree *b)
        return btree_node_type_is_extents(btree_node_type(b));
 }
 
-static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter)
-{
-       return __btree_node_type(iter->level, iter->btree_id);
-}
-
-static inline bool btree_iter_is_extents(struct btree_iter *iter)
-{
-       return btree_node_type_is_extents(btree_iter_key_type(iter));
-}
-
 #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
        ((1U << BKEY_TYPE_extents)|                     \
         (1U << BKEY_TYPE_inodes)|                      \
@@ -624,7 +613,9 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
 
 #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS               \
        ((1U << BKEY_TYPE_alloc)|                       \
-        (1U << BKEY_TYPE_stripes))
+        (1U << BKEY_TYPE_inodes)|                      \
+        (1U << BKEY_TYPE_stripes)|                     \
+        (1U << BKEY_TYPE_snapshots))
 
 #define BTREE_NODE_TYPE_HAS_TRIGGERS                   \
        (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
@@ -647,6 +638,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
 
 enum btree_update_flags {
        __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+       __BTREE_UPDATE_KEY_CACHE_RECLAIM,
 
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
 
@@ -659,6 +651,7 @@ enum btree_update_flags {
 };
 
 #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
 
 #define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
 
@@ -670,8 +663,13 @@ enum btree_update_flags {
 #define BTREE_TRIGGER_NOATOMIC         (1U << __BTREE_TRIGGER_NOATOMIC)
 
 #define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
-       ((1U << KEY_TYPE_stripe)|               \
-        (1U << KEY_TYPE_inode))
+       ((1U << KEY_TYPE_alloc)|                \
+        (1U << KEY_TYPE_alloc_v2)|             \
+        (1U << KEY_TYPE_alloc_v3)|             \
+        (1U << KEY_TYPE_stripe)|               \
+        (1U << KEY_TYPE_inode)|                \
+        (1U << KEY_TYPE_inode_v2)|             \
+        (1U << KEY_TYPE_snapshot))
 
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
@@ -688,11 +686,6 @@ struct btree_root {
        s8                      error;
 };
 
-/*
- * Optional hook that will be called just prior to a btree node update, when
- * we're holding the write lock and we know what key is about to be overwritten:
- */
-
 enum btree_insert_ret {
        BTREE_INSERT_OK,
        /* leaf node needs to be split */
@@ -713,8 +706,4 @@ enum btree_node_sibling {
        btree_next_sib,
 };
 
-typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
-                                                       struct btree *,
-                                                       struct btree_node_iter *);
-
 #endif /* _BCACHEFS_BTREE_TYPES_H */
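
To illustrate the split-out btree_path fields, a small hypothetical debug helper (not in the commit) that walks a transaction's paths with trans_for_each_path() and counts those holding an intent lock at their own level:

static unsigned nr_intent_locked_example(struct btree_trans *trans)
{
        struct btree_path *path;
        unsigned nr = 0;

        trans_for_each_path(trans, path)
                if (btree_node_intent_locked(path, path->level))
                        nr++;
        return nr;
}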
index 217b52e1a1683a7977198acce09e3f9668eab296..d9a406a28f4728b920b74a353f606f56c57e0dc7 100644
@@ -8,10 +8,11 @@
 struct bch_fs;
 struct btree;
 
-void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *,
+void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *,
                                     struct btree *);
-bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
-                               struct btree_node_iter *, struct bkey_i *);
+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
+                               struct btree *, struct btree_node_iter *,
+                               struct bkey_i *);
 void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 
 enum btree_insert_flags {
@@ -60,20 +61,24 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
                     struct disk_reservation *, u64 *, int flags);
 
 int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
-                                 struct bpos, struct bpos, u64 *);
+                                 struct bpos, struct bpos, unsigned, u64 *);
 int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
-                           struct bpos, struct bpos, u64 *);
+                           struct bpos, struct bpos, unsigned, u64 *);
 
 int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *,
-                           __le64, unsigned);
+                           struct btree *, unsigned);
 void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
 int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *,
                               struct btree *, struct bkey_i *, bool);
 int bch2_btree_node_update_key_get_iter(struct btree_trans *,
                                struct btree *, struct bkey_i *, bool);
 
-int bch2_trans_update(struct btree_trans *, struct btree_iter *,
-                     struct bkey_i *, enum btree_update_flags);
+int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
+                            struct bkey_i *, enum btree_update_flags);
+
+int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
+                                  struct bkey_i *, enum btree_update_flags);
+
 void bch2_trans_commit_hook(struct btree_trans *,
                            struct btree_trans_commit_hook *);
 int __bch2_trans_commit(struct btree_trans *);
@@ -119,14 +124,14 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
 #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do)                \
 ({                                                                     \
        struct btree_trans trans;                                       \
-       int _ret, _ret2;                                                \
+       int _ret;                                                       \
                                                                        \
        bch2_trans_init(&trans, (_c), 0, 0);                            \
        _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
                               _do);                                    \
-       _ret2 = bch2_trans_exit(&trans);                                \
+       bch2_trans_exit(&trans);                                        \
                                                                        \
-       _ret ?: _ret2;                                                  \
+       _ret;                                                           \
 })
 
 #define trans_for_each_update(_trans, _i)                              \
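
One consequence of the signature change above: bch2_btree_delete_range() and its _trans variant gain an update_flags word between the end position and the journal-seq pointer, so existing callers grow a 0 argument, e.g. (btree ID and range are placeholders):

        ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, start, end, 0, NULL);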
index c8c3382f48c7e31ded2872dbf34af9057306cff9..088c320493d3c133bd1b61a8832ab3a3b3e6c754 100644
@@ -16,6 +16,7 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
+#include "recovery.h"
 #include "replicas.h"
 #include "super-io.h"
 
@@ -23,8 +24,9 @@
 #include <trace/events/bcachefs.h>
 
 static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
-                                  struct btree_iter *, struct btree *,
+                                  struct btree_path *, struct btree *,
                                   struct keylist *, unsigned);
+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
 
 /* Debug code: */
 
@@ -43,7 +45,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
 
        BUG_ON(!b->c.level);
 
-       if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+       if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
                return;
 
        bch2_btree_node_iter_init_from_start(&iter, b);
@@ -152,38 +154,26 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
 
        clear_btree_node_noevict(b);
 
-       bch2_btree_node_hash_remove(&c->btree_cache, b);
-
        mutex_lock(&c->btree_cache.lock);
        list_move(&b->list, &c->btree_cache.freeable);
        mutex_unlock(&c->btree_cache.lock);
 }
 
-void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
+static void bch2_btree_node_free_inmem(struct btree_trans *trans,
+                                      struct btree *b)
 {
-       struct open_buckets ob = b->ob;
+       struct bch_fs *c = trans->c;
+       struct btree_path *path;
 
-       b->ob.nr = 0;
+       trans_for_each_path(trans, path)
+               BUG_ON(path->l[b->c.level].b == b &&
+                      path->l[b->c.level].lock_seq == b->c.lock.state.seq);
 
-       clear_btree_node_dirty(c, b);
+       six_lock_write(&b->c.lock, NULL, NULL);
 
-       btree_node_lock_type(c, b, SIX_LOCK_write);
+       bch2_btree_node_hash_remove(&c->btree_cache, b);
        __btree_node_free(c, b);
-       six_unlock_write(&b->c.lock);
 
-       bch2_open_buckets_put(c, &ob);
-}
-
-void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
-                               struct btree_iter *iter)
-{
-       struct btree_iter *linked;
-
-       trans_for_each_iter(iter->trans, linked)
-               BUG_ON(linked->l[b->c.level].b == b);
-
-       six_lock_write(&b->c.lock, NULL, NULL);
-       __btree_node_free(c, b);
        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);
 }
@@ -234,12 +224,12 @@ retry:
        if (IS_ERR(wp))
                return ERR_CAST(wp);
 
-       if (wp->sectors_free < c->opts.btree_node_size) {
+       if (wp->sectors_free < btree_sectors(c)) {
                struct open_bucket *ob;
                unsigned i;
 
                open_bucket_for_each(c, &wp->ptrs, ob, i)
-                       if (ob->sectors_free < c->opts.btree_node_size)
+                       if (ob->sectors_free < btree_sectors(c))
                                ob->sectors_free = 0;
 
                bch2_alloc_sectors_done(c, wp);
@@ -247,12 +237,14 @@ retry:
        }
 
        bkey_btree_ptr_v2_init(&tmp.k);
-       bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
+       bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);
 
        bch2_open_bucket_get(c, wp, &ob);
        bch2_alloc_sectors_done(c, wp);
 mem_alloc:
        b = bch2_btree_node_mem_alloc(c);
+       six_unlock_write(&b->c.lock);
+       six_unlock_intent(&b->c.lock);
 
        /* we hold cannibalize_lock: */
        BUG_ON(IS_ERR(b));
@@ -275,6 +267,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 
        b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
+       six_lock_intent(&b->c.lock, NULL, NULL);
+       six_lock_write(&b->c.lock, NULL, NULL);
+
        set_btree_node_accessed(b);
        set_btree_node_dirty(c, b);
        set_btree_node_need_write(b);
@@ -388,7 +383,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
        while (as->nr_prealloc_nodes) {
                struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
-               six_unlock_write(&b->c.lock);
+               six_lock_intent(&b->c.lock, NULL, NULL);
+               six_lock_write(&b->c.lock, NULL, NULL);
 
                if (c->btree_reserve_cache_nr <
                    ARRAY_SIZE(c->btree_reserve_cache)) {
@@ -402,10 +398,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
                        bch2_open_buckets_put(c, &b->ob);
                }
 
-               btree_node_lock_type(c, b, SIX_LOCK_write);
                __btree_node_free(c, b);
                six_unlock_write(&b->c.lock);
-
                six_unlock_intent(&b->c.lock);
        }
 
@@ -413,39 +407,52 @@ static void bch2_btree_reserve_put(struct btree_update *as)
 }
 
 static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
-                                 unsigned flags, struct closure *cl)
+                                 unsigned flags)
 {
        struct bch_fs *c = as->c;
+       struct closure cl;
        struct btree *b;
        int ret;
 
+       closure_init_stack(&cl);
+retry:
+
        BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
 
        /*
         * Protects reaping from the btree node cache and using the btree node
         * open bucket reserve:
+        *
+        * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+        * blocking on this lock:
         */
-       ret = bch2_btree_cache_cannibalize_lock(c, cl);
+       ret = bch2_btree_cache_cannibalize_lock(c, &cl);
        if (ret)
-               return ret;
+               goto err;
 
        while (as->nr_prealloc_nodes < nr_nodes) {
                b = __bch2_btree_node_alloc(c, &as->disk_res,
                                            flags & BTREE_INSERT_NOWAIT
-                                           ? NULL : cl, flags);
+                                           ? NULL : &cl, flags);
                if (IS_ERR(b)) {
                        ret = PTR_ERR(b);
-                       goto err_free;
+                       goto err;
                }
 
                as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
        }
 
        bch2_btree_cache_cannibalize_unlock(c);
+       closure_sync(&cl);
        return 0;
-err_free:
+err:
        bch2_btree_cache_cannibalize_unlock(c);
-       trace_btree_reserve_get_fail(c, nr_nodes, cl);
+       closure_sync(&cl);
+
+       if (ret == -EAGAIN)
+               goto retry;
+
+       trace_btree_reserve_get_fail(c, nr_nodes, &cl);
        return ret;
 }
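
bch2_btree_reserve_get() now owns its closure and retries -EAGAIN internally instead of handing the closure (and the retry) back to its caller. A minimal sketch of the wait-and-retry pattern, with try_alloc() as a hypothetical stand-in for the allocation step:

        struct closure cl;
        int ret;

        closure_init_stack(&cl);
        do {
                ret = try_alloc(&cl);   /* may register waits on &cl */
                closure_sync(&cl);      /* block until those waits complete */
        } while (ret == -EAGAIN);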
 
@@ -466,15 +473,23 @@ static void bch2_btree_update_free(struct btree_update *as)
        bch2_disk_reservation_put(c, &as->disk_res);
        bch2_btree_reserve_put(as);
 
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
+                              as->start_time);
+
        mutex_lock(&c->btree_interior_update_lock);
        list_del(&as->unwritten_list);
        list_del(&as->list);
-       mutex_unlock(&c->btree_interior_update_lock);
 
        closure_debug_destroy(&as->cl);
        mempool_free(as, &c->btree_interior_update_pool);
 
+       /*
+        * Have to do the wakeup with btree_interior_update_lock still held,
+        * since being on btree_interior_update_list is our ref on @c:
+        */
        closure_wake_up(&c->btree_interior_update_wait);
+
+       mutex_unlock(&c->btree_interior_update_lock);
 }
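
Moving the wakeup under btree_interior_update_lock is the point of this hunk: membership on the interior-update lists is what keeps @c pinned for this update. A hedged sketch of the kind of waiter being protected (not necessarily the literal caller):

        /* e.g. shutdown draining interior updates: */
        closure_wait_event(&c->btree_interior_update_wait,
                           list_empty(&c->btree_interior_update_list));

If the wakeup ran after mutex_unlock(), the waiter could see the list go empty, finish teardown, and free @c between list_del() and closure_wake_up().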
 
 static void btree_update_will_delete_key(struct btree_update *as,
@@ -605,8 +620,8 @@ err:
                 * we're in journal error state:
                 */
 
-               btree_node_lock_type(c, b, SIX_LOCK_intent);
-               btree_node_lock_type(c, b, SIX_LOCK_write);
+               six_lock_intent(&b->c.lock, NULL, NULL);
+               six_lock_write(&b->c.lock, NULL, NULL);
                mutex_lock(&c->btree_interior_update_lock);
 
                list_del(&as->write_blocked_list);
@@ -660,7 +675,7 @@ err:
        for (i = 0; i < as->nr_new_nodes; i++) {
                b = as->new_nodes[i];
 
-               btree_node_lock_type(c, b, SIX_LOCK_read);
+               six_lock_read(&b->c.lock, NULL, NULL);
                btree_node_write_if_need(c, b, SIX_LOCK_read);
                six_unlock_read(&b->c.lock);
        }
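
These hunks drop the btree_node_lock_type() wrapper in favor of calling the six-lock primitives directly. A minimal sketch of the acquire/release pairs used above, in the blocking form (no should-sleep callback, hence the NULL, NULL arguments):

        six_lock_read(&b->c.lock, NULL, NULL);          /* shared */
        six_unlock_read(&b->c.lock);

        six_lock_intent(&b->c.lock, NULL, NULL);        /* may later upgrade */
        six_lock_write(&b->c.lock, NULL, NULL);         /* requires intent held */
        six_unlock_write(&b->c.lock);
        six_unlock_intent(&b->c.lock);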
@@ -773,7 +788,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
  * And it adds @b to the list of @as's new nodes, so that we can update sector
  * counts in bch2_btree_update_nodes_written:
  */
-void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
 {
        struct bch_fs *c = as->c;
 
@@ -827,7 +842,7 @@ found:
                closure_put(&as->cl);
 }
 
-void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
 {
        while (b->ob.nr)
                as->open_buckets[as->nr_open_buckets++] =
@@ -839,7 +854,7 @@ void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b
  * nodes and thus outstanding btree_updates - redirect @b's
  * btree_updates to point to this btree_update:
  */
-void bch2_btree_interior_update_will_free_node(struct btree_update *as,
+static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                                               struct btree *b)
 {
        struct bch_fs *c = as->c;
@@ -911,8 +926,11 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        as->nr_old_nodes++;
 }
 
-void bch2_btree_update_done(struct btree_update *as)
+static void bch2_btree_update_done(struct btree_update *as)
 {
+       struct bch_fs *c = as->c;
+       u64 start_time = as->start_time;
+
        BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
 
        if (as->took_gc_lock)
@@ -923,38 +941,39 @@ void bch2_btree_update_done(struct btree_update *as)
 
        continue_at(&as->cl, btree_update_set_nodes_written,
                    as->c->btree_interior_update_worker);
+
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
+                              start_time);
 }
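
Note why start_time and c are copied to locals before continue_at(): continue_at() hands as->cl off to the worker, after which @as may already be freed. A sketch of the ordering this avoids:

        continue_at(&as->cl, btree_update_set_nodes_written,
                    as->c->btree_interior_update_worker);
        /* use-after-free: @as may be gone once continue_at() returns */
        bch2_time_stats_update(&as->c->times[BCH_TIME_btree_interior_update_foreground],
                               as->start_time);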
 
-struct btree_update *
-bch2_btree_update_start(struct btree_iter *iter, unsigned level,
-                       unsigned nr_nodes, unsigned flags)
+static struct btree_update *
+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
+                       unsigned level, unsigned nr_nodes, unsigned flags)
 {
-       struct btree_trans *trans = iter->trans;
        struct bch_fs *c = trans->c;
        struct btree_update *as;
-       struct closure cl;
+       u64 start_time = local_clock();
        int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
                ? BCH_DISK_RESERVATION_NOFAIL : 0;
        int journal_flags = 0;
        int ret = 0;
 
-       BUG_ON(!iter->should_be_locked);
+       BUG_ON(!path->should_be_locked);
 
        if (flags & BTREE_INSERT_JOURNAL_RESERVED)
                journal_flags |= JOURNAL_RES_GET_RESERVED;
-
-       closure_init_stack(&cl);
-retry:
+       if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+               journal_flags |= JOURNAL_RES_GET_NONBLOCK;
 
        /*
         * XXX: figure out how far we might need to split,
         * instead of locking/reserving all the way to the root:
         */
-       if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
-               trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_,
-                                                iter->btree_id,
-                                                &iter->real_pos);
-               return ERR_PTR(-EINTR);
+       if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
+               trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
+                                                path->btree_id, &path->pos);
+               ret = btree_trans_restart(trans);
+               return ERR_PTR(ret);
        }
 
        if (flags & BTREE_INSERT_GC_LOCK_HELD)
@@ -972,9 +991,10 @@ retry:
        memset(as, 0, sizeof(*as));
        closure_init(&as->cl, NULL);
        as->c           = c;
+       as->start_time  = start_time;
        as->mode        = BTREE_INTERIOR_NO_UPDATE;
        as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
-       as->btree_id    = iter->btree_id;
+       as->btree_id    = path->btree_id;
        INIT_LIST_HEAD(&as->list);
        INIT_LIST_HEAD(&as->unwritten_list);
        INIT_LIST_HEAD(&as->write_blocked_list);
@@ -998,43 +1018,34 @@ retry:
        if (ret)
                goto err;
 
+       bch2_trans_unlock(trans);
+
        ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
                                      BTREE_UPDATE_JOURNAL_RES,
-                                     journal_flags|JOURNAL_RES_GET_NONBLOCK);
-       if (ret == -EAGAIN) {
-               bch2_trans_unlock(trans);
-
-               if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
-                       bch2_btree_update_free(as);
-                       btree_trans_restart(trans);
-                       return ERR_PTR(ret);
-               }
-
-               ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
-                               BTREE_UPDATE_JOURNAL_RES,
-                               journal_flags);
-               if (ret) {
-                       trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_);
-                       goto err;
-               }
-
-               if (!bch2_trans_relock(trans)) {
-                       ret = -EINTR;
-                       goto err;
-               }
+                                     journal_flags);
+       if (ret) {
+               bch2_btree_update_free(as);
+               trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
+               btree_trans_restart(trans);
+               return ERR_PTR(ret);
        }
 
        ret = bch2_disk_reservation_get(c, &as->disk_res,
-                       nr_nodes * c->opts.btree_node_size,
+                       nr_nodes * btree_sectors(c),
                        c->opts.metadata_replicas,
                        disk_res_flags);
        if (ret)
                goto err;
 
-       ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
+       ret = bch2_btree_reserve_get(as, nr_nodes, flags);
        if (ret)
                goto err;
 
+       if (!bch2_trans_relock(trans)) {
+               ret = -EINTR;
+               goto err;
+       }
+
        bch2_journal_pin_add(&c->journal,
                             atomic64_read(&c->journal.seq),
                             &as->journal, NULL);
@@ -1042,16 +1053,6 @@ retry:
        return as;
 err:
        bch2_btree_update_free(as);
-
-       if (ret == -EAGAIN) {
-               bch2_trans_unlock(trans);
-               closure_sync(&cl);
-               ret = -EINTR;
-       }
-
-       if (ret == -EINTR && bch2_trans_relock(trans))
-               goto retry;
-
        return ERR_PTR(ret);
 }
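
With the -EAGAIN/closure retry loop gone, a failed bch2_btree_update_start() restarts the transaction and returns an ERR_PTR; callers stop retrying locally. The bch2_btree_node_rewrite() hunk further down shows the resulting caller shape (sketch):

        as = bch2_btree_update_start(trans, path, level, nr_nodes, flags);
        ret = PTR_ERR_OR_ZERO(as);
        if (ret)
                return ret;     /* -EINTR propagates; the outer transaction
                                 * loop (e.g. __bch2_trans_do) retries */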
 
@@ -1092,8 +1093,10 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
  * is nothing new to be done.  This just guarantees that there is a
  * journal write.
  */
-static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
-                               struct btree_iter *iter)
+static void bch2_btree_set_root(struct btree_update *as,
+                               struct btree_trans *trans,
+                               struct btree_path *path,
+                               struct btree *b)
 {
        struct bch_fs *c = as->c;
        struct btree *old;
@@ -1108,7 +1111,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
         * Ensure no one is using the old root while we switch to the
         * new root:
         */
-       bch2_btree_node_lock_write(old, iter);
+       bch2_btree_node_lock_write(trans, path, old);
 
        bch2_btree_set_root_inmem(c, b);
 
@@ -1121,15 +1124,17 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
         * an intent lock on the new root, and any updates that would
         * depend on the new root would have to update the new root.
         */
-       bch2_btree_node_unlock_write(old, iter);
+       bch2_btree_node_unlock_write(trans, path, old);
 }
 
 /* Interior node updates: */
 
-static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
-                                       struct btree_iter *iter,
-                                       struct bkey_i *insert,
-                                       struct btree_node_iter *node_iter)
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
+                                       struct btree_trans *trans,
+                                       struct btree_path *path,
+                                       struct btree *b,
+                                       struct btree_node_iter *node_iter,
+                                       struct bkey_i *insert)
 {
        struct bch_fs *c = as->c;
        struct bkey_packed *k;
@@ -1138,6 +1143,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
        BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
               !btree_ptr_sectors_written(insert));
 
+       if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+               bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
        invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
                bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
        if (invalid) {
@@ -1161,15 +1169,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
               bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
                bch2_btree_node_iter_advance(node_iter, b);
 
-       bch2_btree_bset_insert_key(iter, b, node_iter, insert);
+       bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
        set_btree_node_dirty(c, b);
        set_btree_node_need_write(b);
 }
 
 static void
-__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
-                                 struct btree_iter *iter, struct keylist *keys,
-                                 struct btree_node_iter node_iter)
+__bch2_btree_insert_keys_interior(struct btree_update *as,
+                                 struct btree_trans *trans,
+                                 struct btree_path *path,
+                                 struct btree *b,
+                                 struct btree_node_iter node_iter,
+                                 struct keylist *keys)
 {
        struct bkey_i *insert = bch2_keylist_front(keys);
        struct bkey_packed *k;
@@ -1181,8 +1192,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
                ;
 
        while (!bch2_keylist_empty(keys)) {
-               bch2_insert_fixup_btree_ptr(as, b, iter,
-                               bch2_keylist_front(keys), &node_iter);
+               bch2_insert_fixup_btree_ptr(as, trans, path, b,
+                               &node_iter, bch2_keylist_front(keys));
                bch2_keylist_pop_front(keys);
        }
 }
@@ -1192,8 +1203,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
  * node)
  */
 static struct btree *__btree_split_node(struct btree_update *as,
-                                       struct btree *n1,
-                                       struct btree_iter *iter)
+                                       struct btree *n1)
 {
        struct bkey_format_state s;
        size_t nr_packed = 0, nr_unpacked = 0;
@@ -1308,8 +1318,10 @@ static struct btree *__btree_split_node(struct btree_update *as,
  * nodes that were coalesced, and thus in the middle of a child node post
  * coalescing:
  */
-static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
-                                   struct btree_iter *iter,
+static void btree_split_insert_keys(struct btree_update *as,
+                                   struct btree_trans *trans,
+                                   struct btree_path *path,
+                                   struct btree *b,
                                    struct keylist *keys)
 {
        struct btree_node_iter node_iter;
@@ -1319,7 +1331,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 
        bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
 
-       __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter);
+       __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
 
        /*
         * We can't tolerate whiteouts here - with whiteouts there can be
@@ -1349,18 +1361,17 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
        btree_node_interior_verify(as->c, b);
 }
 
-static void btree_split(struct btree_update *as,
-                       struct btree_trans *trans, struct btree_iter *iter,
-                       struct btree *b, struct keylist *keys,
-                       unsigned flags)
+static void btree_split(struct btree_update *as, struct btree_trans *trans,
+                       struct btree_path *path, struct btree *b,
+                       struct keylist *keys, unsigned flags)
 {
        struct bch_fs *c = as->c;
-       struct btree *parent = btree_node_parent(iter, b);
+       struct btree *parent = btree_node_parent(path, b);
        struct btree *n1, *n2 = NULL, *n3 = NULL;
        u64 start_time = local_clock();
 
        BUG_ON(!parent && (b != btree_node_root(c, b)));
-       BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
+       BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
 
        bch2_btree_interior_update_will_free_node(as, b);
 
@@ -1368,12 +1379,12 @@ static void btree_split(struct btree_update *as,
        bch2_btree_update_add_new_node(as, n1);
 
        if (keys)
-               btree_split_insert_keys(as, n1, iter, keys);
+               btree_split_insert_keys(as, trans, path, n1, keys);
 
        if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
                trace_btree_split(c, b);
 
-               n2 = __btree_split_node(as, n1, iter);
+               n2 = __btree_split_node(as, n1);
 
                bch2_btree_build_aux_trees(n2);
                bch2_btree_build_aux_trees(n1);
@@ -1398,7 +1409,7 @@ static void btree_split(struct btree_update *as,
                        n3->sib_u64s[0] = U16_MAX;
                        n3->sib_u64s[1] = U16_MAX;
 
-                       btree_split_insert_keys(as, n3, iter, &as->parent_keys);
+                       btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
 
                        bch2_btree_node_write(c, n3, SIX_LOCK_intent);
                }
@@ -1418,12 +1429,12 @@ static void btree_split(struct btree_update *as,
 
        if (parent) {
                /* Split a non root node */
-               bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags);
+               bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
        } else if (n3) {
-               bch2_btree_set_root(as, n3, iter);
+               bch2_btree_set_root(as, trans, path, n3);
        } else {
                /* Root filled up but didn't need to be split */
-               bch2_btree_set_root(as, n1, iter);
+               bch2_btree_set_root(as, trans, path, n1);
        }
 
        bch2_btree_update_get_open_buckets(as, n1);
@@ -1432,15 +1443,14 @@ static void btree_split(struct btree_update *as,
        if (n3)
                bch2_btree_update_get_open_buckets(as, n3);
 
-       /* Successful split, update the iterator to point to the new nodes: */
+       /* Successful split, update the path to point to the new nodes: */
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
-       bch2_btree_iter_node_drop(iter, b);
        if (n3)
-               bch2_btree_iter_node_replace(iter, n3);
+               bch2_trans_node_add(trans, n3);
        if (n2)
-               bch2_btree_iter_node_replace(iter, n2);
-       bch2_btree_iter_node_replace(iter, n1);
+               bch2_trans_node_add(trans, n2);
+       bch2_trans_node_add(trans, n1);
 
        /*
         * The old node must be freed (in memory) _before_ unlocking the new
@@ -1448,7 +1458,7 @@ static void btree_split(struct btree_update *as,
         * node after another thread has locked and updated the new node, thus
         * seeing stale data:
         */
-       bch2_btree_node_free_inmem(c, b, iter);
+       bch2_btree_node_free_inmem(trans, b);
 
        if (n3)
                six_unlock_intent(&n3->c.lock);
@@ -1456,26 +1466,32 @@ static void btree_split(struct btree_update *as,
                six_unlock_intent(&n2->c.lock);
        six_unlock_intent(&n1->c.lock);
 
-       bch2_btree_trans_verify_locks(trans);
+       bch2_trans_verify_locks(trans);
 
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
+       bch2_time_stats_update(&c->times[n2
+                              ? BCH_TIME_btree_node_split
+                              : BCH_TIME_btree_node_compact],
                               start_time);
 }
 
 static void
-bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
-                               struct btree_iter *iter, struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+                               struct btree_trans *trans,
+                               struct btree_path *path,
+                               struct btree *b,
+                               struct keylist *keys)
 {
-       struct btree_iter *linked;
+       struct btree_path *linked;
 
-       __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter);
+       __bch2_btree_insert_keys_interior(as, trans, path, b,
+                                         path->l[b->c.level].iter, keys);
 
        btree_update_updated_node(as, b);
 
-       trans_for_each_iter_with_node(iter->trans, b, linked)
+       trans_for_each_path_with_node(trans, b, linked)
                bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
 
-       bch2_btree_trans_verify_iters(iter->trans, b);
+       bch2_trans_verify_paths(trans);
 }
 
 /**
@@ -1490,10 +1506,9 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
  * If a split occurred, this function will return early. This can only happen
  * for leaf nodes -- inserts into interior nodes have to be atomic.
  */
-static void bch2_btree_insert_node(struct btree_update *as,
-                                  struct btree_trans *trans, struct btree_iter *iter,
-                                  struct btree *b, struct keylist *keys,
-                                  unsigned flags)
+static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
+                                  struct btree_path *path, struct btree *b,
+                                  struct keylist *keys, unsigned flags)
 {
        struct bch_fs *c = as->c;
        int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
@@ -1501,21 +1516,21 @@ static void bch2_btree_insert_node(struct btree_update *as,
        int live_u64s_added, u64s_added;
 
        lockdep_assert_held(&c->gc_lock);
-       BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
+       BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
        BUG_ON(!b->c.level);
        BUG_ON(!as || as->b);
        bch2_verify_keylist_sorted(keys);
 
-       bch2_btree_node_lock_for_insert(trans, iter, b);
+       bch2_btree_node_lock_for_insert(trans, path, b);
 
        if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
-               bch2_btree_node_unlock_write(b, iter);
+               bch2_btree_node_unlock_write(trans, path, b);
                goto split;
        }
 
        btree_node_interior_verify(c, b);
 
-       bch2_btree_insert_keys_interior(as, b, iter, keys);
+       bch2_btree_insert_keys_interior(as, trans, path, b, keys);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
        u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1527,48 +1542,48 @@ static void bch2_btree_insert_node(struct btree_update *as,
 
        if (u64s_added > live_u64s_added &&
            bch2_maybe_compact_whiteouts(c, b))
-               bch2_btree_iter_reinit_node(iter, b);
+               bch2_trans_node_reinit_iter(trans, b);
 
-       bch2_btree_node_unlock_write(b, iter);
+       bch2_btree_node_unlock_write(trans, path, b);
 
        btree_node_interior_verify(c, b);
        return;
 split:
-       btree_split(as, trans, iter, b, keys, flags);
+       btree_split(as, trans, path, b, keys, flags);
 }
 
 int bch2_btree_split_leaf(struct btree_trans *trans,
-                         struct btree_iter *iter,
+                         struct btree_path *path,
                          unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter_l(iter)->b;
+       struct btree *b = path_l(path)->b;
        struct btree_update *as;
        unsigned l;
        int ret = 0;
 
-       as = bch2_btree_update_start(iter, iter->level,
+       as = bch2_btree_update_start(trans, path, path->level,
                btree_update_reserve_required(c, b), flags);
        if (IS_ERR(as))
                return PTR_ERR(as);
 
-       btree_split(as, trans, iter, b, NULL, flags);
+       btree_split(as, trans, path, b, NULL, flags);
        bch2_btree_update_done(as);
 
-       for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
-               ret = bch2_foreground_maybe_merge(trans, iter, l, flags);
+       for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
+               ret = bch2_foreground_maybe_merge(trans, path, l, flags);
 
        return ret;
 }
 
 int __bch2_foreground_maybe_merge(struct btree_trans *trans,
-                                 struct btree_iter *iter,
+                                 struct btree_path *path,
                                  unsigned level,
                                  unsigned flags,
                                  enum btree_node_sibling sib)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *sib_iter = NULL;
+       struct btree_path *sib_path = NULL;
        struct btree_update *as;
        struct bkey_format_state new_s;
        struct bkey_format new_f;
@@ -1576,39 +1591,36 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
        struct btree *b, *m, *n, *prev, *next, *parent;
        struct bpos sib_pos;
        size_t sib_u64s;
-       int ret = 0, ret2 = 0;
-
-retry:
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               return ret;
+       u64 start_time = local_clock();
+       int ret = 0;
 
-       BUG_ON(!iter->should_be_locked);
-       BUG_ON(!btree_node_locked(iter, level));
+       BUG_ON(!path->should_be_locked);
+       BUG_ON(!btree_node_locked(path, level));
 
-       b = iter->l[level].b;
+       b = path->l[level].b;
 
        if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
            (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) {
                b->sib_u64s[sib] = U16_MAX;
-               goto out;
+               return 0;
        }
 
        sib_pos = sib == btree_prev_sib
                ? bpos_predecessor(b->data->min_key)
                : bpos_successor(b->data->max_key);
 
-       sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
-                                           sib_pos, U8_MAX, level,
-                                           BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(sib_iter);
+       sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
+                                U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
+       ret = bch2_btree_path_traverse(trans, sib_path, false);
        if (ret)
                goto err;
 
-       m = sib_iter->l[level].b;
+       sib_path->should_be_locked = true;
 
-       if (btree_node_parent(iter, b) !=
-           btree_node_parent(sib_iter, m)) {
+       m = sib_path->l[level].b;
+
+       if (btree_node_parent(path, b) !=
+           btree_node_parent(sib_path, m)) {
                b->sib_u64s[sib] = U16_MAX;
                goto out;
        }
@@ -1659,8 +1671,8 @@ retry:
        if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
                goto out;
 
-       parent = btree_node_parent(iter, b);
-       as = bch2_btree_update_start(iter, level,
+       parent = btree_node_parent(path, b);
+       as = bch2_btree_update_start(trans, path, level,
                         btree_update_reserve_required(c, parent) + 1,
                         flags|
                         BTREE_INSERT_NOFAIL|
@@ -1696,47 +1708,34 @@ retry:
        bch2_keylist_add(&as->parent_keys, &delete);
        bch2_keylist_add(&as->parent_keys, &n->key);
 
-       bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags);
+       bch2_trans_verify_paths(trans);
+
+       bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+
+       bch2_trans_verify_paths(trans);
 
        bch2_btree_update_get_open_buckets(as, n);
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
        six_lock_increment(&m->c.lock, SIX_LOCK_intent);
-       bch2_btree_iter_node_drop(iter, b);
-       bch2_btree_iter_node_drop(iter, m);
 
-       bch2_btree_iter_node_replace(iter, n);
+       bch2_trans_node_add(trans, n);
 
-       bch2_btree_trans_verify_iters(trans, n);
+       bch2_trans_verify_paths(trans);
 
-       bch2_btree_node_free_inmem(c, b, iter);
-       bch2_btree_node_free_inmem(c, m, iter);
+       bch2_btree_node_free_inmem(trans, b);
+       bch2_btree_node_free_inmem(trans, m);
 
        six_unlock_intent(&n->c.lock);
 
        bch2_btree_update_done(as);
-out:
-       bch2_btree_trans_verify_locks(trans);
-       bch2_trans_iter_free(trans, sib_iter);
 
-       /*
-        * Don't downgrade locks here: we're called after successful insert,
-        * and the caller will downgrade locks after a successful insert
-        * anyways (in case e.g. a split was required first)
-        *
-        * And we're also called when inserting into interior nodes in the
-        * split path, and downgrading to read locks in there is potentially
-        * confusing:
-        */
-       return ret ?: ret2;
+       bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
+out:
 err:
-       bch2_trans_iter_put(trans, sib_iter);
-       sib_iter = NULL;
-
-       if (ret == -EINTR && bch2_trans_relock(trans))
-               goto retry;
-
-       goto out;
+       bch2_path_put(trans, sib_path, true);
+       bch2_trans_verify_locks(trans);
+       return ret;
 }
 
 /**
@@ -1744,32 +1743,23 @@ err:
  */
 int bch2_btree_node_rewrite(struct btree_trans *trans,
                            struct btree_iter *iter,
-                           __le64 seq, unsigned flags)
+                           struct btree *b,
+                           unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b, *n, *parent;
+       struct btree *n, *parent;
        struct btree_update *as;
        int ret;
 
        flags |= BTREE_INSERT_NOFAIL;
-retry:
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               goto out;
 
-       b = bch2_btree_iter_peek_node(iter);
-       if (!b || b->data->keys.seq != seq)
-               goto out;
-
-       parent = btree_node_parent(iter, b);
-       as = bch2_btree_update_start(iter, b->c.level,
+       parent = btree_node_parent(iter->path, b);
+       as = bch2_btree_update_start(trans, iter->path, b->c.level,
                (parent
                 ? btree_update_reserve_required(c, parent)
                 : 0) + 1,
                flags);
        ret = PTR_ERR_OR_ZERO(as);
-       if (ret == -EINTR)
-               goto retry;
        if (ret) {
                trace_btree_gc_rewrite_node_fail(c, b);
                goto out;
@@ -1789,23 +1779,22 @@ retry:
 
        if (parent) {
                bch2_keylist_add(&as->parent_keys, &n->key);
-               bch2_btree_insert_node(as, trans, iter, parent,
+               bch2_btree_insert_node(as, trans, iter->path, parent,
                                       &as->parent_keys, flags);
        } else {
-               bch2_btree_set_root(as, n, iter);
+               bch2_btree_set_root(as, trans, iter->path, n);
        }
 
        bch2_btree_update_get_open_buckets(as, n);
 
        six_lock_increment(&b->c.lock, SIX_LOCK_intent);
-       bch2_btree_iter_node_drop(iter, b);
-       bch2_btree_iter_node_replace(iter, n);
-       bch2_btree_node_free_inmem(c, b, iter);
+       bch2_trans_node_add(trans, n);
+       bch2_btree_node_free_inmem(trans, b);
        six_unlock_intent(&n->c.lock);
 
        bch2_btree_update_done(as);
 out:
-       bch2_btree_iter_downgrade(iter);
+       bch2_btree_path_downgrade(iter->path);
        return ret;
 }
 
@@ -1818,20 +1807,38 @@ struct async_btree_rewrite {
        __le64                  seq;
 };
 
+static int async_btree_node_rewrite_trans(struct btree_trans *trans,
+                                         struct async_btree_rewrite *a)
+{
+       struct btree_iter iter;
+       struct btree *b;
+       int ret;
+
+       bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
+                                 BTREE_MAX_DEPTH, a->level, 0);
+       b = bch2_btree_iter_peek_node(&iter);
+       ret = PTR_ERR_OR_ZERO(b);
+       if (ret)
+               goto out;
+
+       if (!b || b->data->keys.seq != a->seq)
+               goto out;
+
+       ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+out:
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
 void async_btree_node_rewrite_work(struct work_struct *work)
 {
        struct async_btree_rewrite *a =
                container_of(work, struct async_btree_rewrite, work);
        struct bch_fs *c = a->c;
-       struct btree_trans trans;
-       struct btree_iter *iter;
 
-       bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos,
-                                       BTREE_MAX_DEPTH, a->level, 0);
-       bch2_btree_node_rewrite(&trans, iter, a->seq, 0);
-       bch2_trans_iter_put(&trans, iter);
-       bch2_trans_exit(&trans);
+       bch2_trans_do(c, NULL, NULL, 0,
+                     async_btree_node_rewrite_trans(&trans, a));
        percpu_ref_put(&c->writes);
        kfree(a);
 }
@@ -1840,9 +1847,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 {
        struct async_btree_rewrite *a;
 
-       if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
-               return;
-
        if (!percpu_ref_tryget(&c->writes))
                return;
 
@@ -1869,7 +1873,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
                                        bool skip_triggers)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter2 = NULL;
+       struct btree_iter iter2 = { NULL };
        struct btree *parent;
        u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
        int ret;
@@ -1897,19 +1901,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
                BUG_ON(ret);
        }
 
-       parent = btree_node_parent(iter, b);
+       parent = btree_node_parent(iter->path, b);
        if (parent) {
-               iter2 = bch2_trans_copy_iter(trans, iter);
+               bch2_trans_copy_iter(&iter2, iter);
 
-               BUG_ON(iter2->level != b->c.level);
-               BUG_ON(bpos_cmp(iter2->pos, new_key->k.p));
+               iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
+                               iter2.flags & BTREE_ITER_INTENT,
+                               _THIS_IP_);
 
-               btree_node_unlock(iter2, iter2->level);
-               iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP;
-               iter2->level++;
+               BUG_ON(iter2.path->level != b->c.level);
+               BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));
 
-               ret   = bch2_btree_iter_traverse(iter2) ?:
-                       bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN);
+               btree_node_unlock(iter2.path, iter2.path->level);
+               path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
+               iter2.path->level++;
+
+               ret   = bch2_btree_iter_traverse(&iter2) ?:
+                       bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
                if (ret)
                        goto err;
        } else {
@@ -1926,12 +1934,13 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
        ret = bch2_trans_commit(trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL|
                                BTREE_INSERT_NOCHECK_RW|
+                               BTREE_INSERT_USE_RESERVE|
                                BTREE_INSERT_JOURNAL_RECLAIM|
                                BTREE_INSERT_JOURNAL_RESERVED);
        if (ret)
                goto err;
 
-       bch2_btree_node_lock_write(b, iter);
+       bch2_btree_node_lock_write(trans, iter->path, b);
 
        if (new_hash) {
                mutex_lock(&c->btree_cache.lock);
@@ -1946,9 +1955,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
                bkey_copy(&b->key, new_key);
        }
 
-       bch2_btree_node_unlock_write(b, iter);
+       bch2_btree_node_unlock_write(trans, iter->path, b);
 out:
-       bch2_trans_iter_put(trans, iter2);
+       bch2_trans_iter_exit(trans, &iter2);
        return ret;
 err:
        if (new_hash) {
@@ -1965,9 +1974,16 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
 {
        struct bch_fs *c = trans->c;
        struct btree *new_hash = NULL;
+       struct btree_path *path = iter->path;
        struct closure cl;
        int ret = 0;
 
+       if (!btree_node_intent_locked(path, b->c.level) &&
+           !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) {
+               btree_trans_restart(trans);
+               return -EINTR;
+       }
+
        closure_init_stack(&cl);
 
        /*
@@ -1986,8 +2002,10 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
                new_hash = bch2_btree_node_mem_alloc(c);
        }
 
+       path->intent_ref++;
        ret = __bch2_btree_node_update_key(trans, iter, b, new_hash,
                                           new_key, skip_triggers);
+       --path->intent_ref;
 
        if (new_hash) {
                mutex_lock(&c->btree_cache.lock);
@@ -2006,18 +2024,18 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
                                        struct btree *b, struct bkey_i *new_key,
                                        bool skip_triggers)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p,
-                                       BTREE_MAX_DEPTH, b->c.level,
-                                       BTREE_ITER_INTENT);
-       ret = bch2_btree_iter_traverse(iter);
+       bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
+                                 BTREE_MAX_DEPTH, b->c.level,
+                                 BTREE_ITER_INTENT);
+       ret = bch2_btree_iter_traverse(&iter);
        if (ret)
                goto out;
 
        /* has node been freed? */
-       if (iter->l[b->c.level].b != b) {
+       if (iter.path->l[b->c.level].b != b) {
                /* node has been freed: */
                BUG_ON(!btree_node_dying(b));
                goto out;
@@ -2025,9 +2043,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
 
        BUG_ON(!btree_node_hashed(b));
 
-       ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers);
+       ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index e88e737ee8134e8365fcd451bba8257f08a4820a..8dc86fa636d680900034d8c1b9efd0c1374d0b15 100644 (file)
@@ -35,6 +35,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
 struct btree_update {
        struct closure                  cl;
        struct bch_fs                   *c;
+       u64                             start_time;
 
        struct list_head                list;
        struct list_head                unwritten_list;
@@ -81,12 +82,12 @@ struct btree_update {
        /* Nodes being freed: */
        struct keylist                  old_keys;
        u64                             _old_keys[BTREE_UPDATE_NODES_MAX *
-                                                 BKEY_BTREE_PTR_VAL_U64s_MAX];
+                                                 BKEY_BTREE_PTR_U64s_MAX];
 
        /* Nodes being added: */
        struct keylist                  new_keys;
        u64                             _new_keys[BTREE_UPDATE_NODES_MAX *
-                                                 BKEY_BTREE_PTR_VAL_U64s_MAX];
+                                                 BKEY_BTREE_PTR_U64s_MAX];
 
        /* New nodes, that will be made reachable by this update: */
        struct btree                    *new_nodes[BTREE_UPDATE_NODES_MAX];
@@ -113,57 +114,39 @@ struct btree_update {
        u64                             inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
 };
 
-void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
-                               struct btree_iter *);
-void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
-
-void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *);
-
 struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
                                                  struct btree *,
                                                  struct bkey_format);
 
-void bch2_btree_update_done(struct btree_update *);
-struct btree_update *
-bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned);
-
-void bch2_btree_interior_update_will_free_node(struct btree_update *,
-                                              struct btree *);
-void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-
-int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned);
+int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
 
-int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *,
+int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
                                  unsigned, unsigned, enum btree_node_sibling);
 
 static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
-                                       struct btree_iter *iter,
+                                       struct btree_path *path,
                                        unsigned level, unsigned flags,
                                        enum btree_node_sibling sib)
 {
        struct btree *b;
 
-       if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
-               return 0;
-
-       if (!bch2_btree_node_relock(iter, level))
-               return 0;
+       EBUG_ON(!btree_node_locked(path, level));
 
-       b = iter->l[level].b;
+       b = path->l[level].b;
        if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
                return 0;
 
-       return __bch2_foreground_maybe_merge(trans, iter, level, flags, sib);
+       return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
 }
 
 static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
-                                             struct btree_iter *iter,
+                                             struct btree_path *path,
                                              unsigned level,
                                              unsigned flags)
 {
-       return  bch2_foreground_maybe_merge_sibling(trans, iter, level, flags,
+       return  bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
                                                    btree_prev_sib) ?:
-               bch2_foreground_maybe_merge_sibling(trans, iter, level, flags,
+               bch2_foreground_maybe_merge_sibling(trans, path, level, flags,
                                                    btree_next_sib);
 }
 
@@ -235,7 +218,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
 {
        ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
                b->whiteout_u64s;
-       ssize_t total = c->opts.btree_node_size << 6;
+       ssize_t total = c->opts.btree_node_size >> 3;
 
        /* Always leave one extra u64 for bch2_varint_decode: */
        used++;
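
The shift change tracks c->opts.btree_node_size switching units, assuming the same sectors-to-bytes conversion the btree_sectors(c) hunks above reflect. Worked through for a 256KiB node:

        /* old: option in 512-byte sectors:  512    << 6 == 32768 u64s
         * new: option in bytes:             262144 >> 3 == 32768 u64s */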
index 7e9909e2dcaf5ef305effe1bfedd9f74625ee710..4b37a4869873998a7301265999480364c2df0429 100644 (file)
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
+#include "recovery.h"
+#include "subvolume.h"
 #include "replicas.h"
 
 #include <linux/prefetch.h>
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+                         struct bkey_i *, enum btree_update_flags);
+
 static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
                                         const struct btree_insert_entry *r)
 {
@@ -29,40 +35,59 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
                 bpos_cmp(l->k->k.p,    r->k->k.p);
 }
 
+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+{
+       return i->path->l + i->level;
+}
+
 static inline bool same_leaf_as_prev(struct btree_trans *trans,
                                     struct btree_insert_entry *i)
 {
        return i != trans->updates &&
-               iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
+               insert_l(&i[0])->b == insert_l(&i[-1])->b;
 }
 
-inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
-                                           struct btree_iter *iter,
-                                           struct btree *b)
+static inline bool same_leaf_as_next(struct btree_trans *trans,
+                                    struct btree_insert_entry *i)
 {
-       struct bch_fs *c = trans->c;
+       return i + 1 < trans->updates + trans->nr_updates &&
+               insert_l(&i[0])->b == insert_l(&i[1])->b;
+}
 
-       bch2_btree_node_lock_write(b, iter);
+static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
+                                                 struct btree_path *path,
+                                                 struct btree *b)
+{
+       struct bch_fs *c = trans->c;
 
-       if (btree_iter_type(iter) == BTREE_ITER_CACHED)
+       if (path->cached)
                return;
 
        if (unlikely(btree_node_just_written(b)) &&
            bch2_btree_post_write_cleanup(c, b))
-               bch2_btree_iter_reinit_node(iter, b);
+               bch2_trans_node_reinit_iter(trans, b);
 
        /*
         * If the last bset has been written, or if it's gotten too big - start
         * a new bset to insert into:
         */
        if (want_new_bset(c, b))
-               bch2_btree_init_next(trans, iter, b);
+               bch2_btree_init_next(trans, b);
+}
+
+void bch2_btree_node_lock_for_insert(struct btree_trans *trans,
+                                    struct btree_path *path,
+                                    struct btree *b)
+{
+       bch2_btree_node_lock_write(trans, path, b);
+       bch2_btree_node_prep_for_write(trans, path, b);
 }
 
 /* Inserting into a given leaf node (last stage of insert): */
 
 /* Handle overwrites and do insert, for non extents: */
-bool bch2_btree_bset_insert_key(struct btree_iter *iter,
+bool bch2_btree_bset_insert_key(struct btree_trans *trans,
+                               struct btree_path *path,
                                struct btree *b,
                                struct btree_node_iter *node_iter,
                                struct bkey_i *insert)
@@ -76,8 +101,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
        EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
        EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
        EBUG_ON(insert->k.u64s >
-               bch_btree_keys_u64s_remaining(iter->trans->c, b));
-       EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+               bch_btree_keys_u64s_remaining(trans->c, b));
 
        k = bch2_btree_node_iter_peek_all(node_iter, b);
        if (k && bkey_cmp_left_packed(b, k, &insert->k.p))
@@ -96,7 +120,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                k->type = KEY_TYPE_deleted;
 
                if (k->needs_whiteout)
-                       push_whiteout(iter->trans->c, b, insert->k.p);
+                       push_whiteout(trans->c, b, insert->k.p);
                k->needs_whiteout = false;
 
                if (k >= btree_bset_last(b)->start) {
@@ -104,7 +128,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                        bch2_bset_delete(b, k, clobber_u64s);
                        goto fix_iter;
                } else {
-                       bch2_btree_iter_fix_key_modified(iter, b, k);
+                       bch2_btree_path_fix_key_modified(trans, b, k);
                }
 
                return true;
@@ -122,7 +146,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
                        clobber_u64s = k->u64s;
                        goto overwrite;
                } else {
-                       bch2_btree_iter_fix_key_modified(iter, b, k);
+                       bch2_btree_path_fix_key_modified(trans, b, k);
                }
        }
 
@@ -132,7 +156,7 @@ overwrite:
        new_u64s = k->u64s;
 fix_iter:
        if (clobber_u64s != new_u64s)
-               bch2_btree_node_iter_fix(iter, b, node_iter, k,
+               bch2_btree_node_iter_fix(trans, path, b, node_iter, k,
                                         clobber_u64s, new_u64s);
        return true;
 }
@@ -144,7 +168,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        struct btree_write *w = container_of(pin, struct btree_write, journal);
        struct btree *b = container_of(w, struct btree, writes[i]);
 
-       btree_node_lock_type(c, b, SIX_LOCK_read);
+       six_lock_read(&b->c.lock, NULL, NULL);
        bch2_btree_node_write_cond(c, b,
                (btree_current_write(b) == w && w->journal.seq == seq));
        six_unlock_read(&b->c.lock);
@@ -176,22 +200,18 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
  * btree_insert_key - insert one key into a leaf node
  */
 static bool btree_insert_key_leaf(struct btree_trans *trans,
-                                 struct btree_iter *iter,
-                                 struct bkey_i *insert)
+                                 struct btree_insert_entry *insert)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter_l(iter)->b;
+       struct btree *b = insert_l(insert)->b;
        struct bset_tree *t = bset_tree_last(b);
        struct bset *i = bset(b, t);
        int old_u64s = bset_u64s(t);
        int old_live_u64s = b->nr.live_u64s;
        int live_u64s_added, u64s_added;
 
-       EBUG_ON(!iter->level &&
-               !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
-
-       if (unlikely(!bch2_btree_bset_insert_key(iter, b,
-                                       &iter_l(iter)->iter, insert)))
+       if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
+                                       &insert_l(insert)->iter, insert->k)))
                return false;
 
        i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
@@ -212,9 +232,8 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 
        if (u64s_added > live_u64s_added &&
            bch2_maybe_compact_whiteouts(c, b))
-               bch2_btree_iter_reinit_node(iter, b);
+               bch2_trans_node_reinit_iter(trans, b);
 
-       trace_btree_insert_key(c, b, insert);
        return true;
 }
 
@@ -225,9 +244,15 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 static inline void btree_insert_entry_checks(struct btree_trans *trans,
                                             struct btree_insert_entry *i)
 {
-       BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos));
-       BUG_ON(i->level         != i->iter->level);
-       BUG_ON(i->btree_id      != i->iter->btree_id);
+       BUG_ON(bpos_cmp(i->k->k.p, i->path->pos));
+       BUG_ON(i->cached        != i->path->cached);
+       BUG_ON(i->level         != i->path->level);
+       BUG_ON(i->btree_id      != i->path->btree_id);
+       EBUG_ON(!i->level &&
+               !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
+               test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
+               i->k->k.p.snapshot &&
+               bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot));
 }
 
 static noinline int
@@ -245,7 +270,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s,
                return ret;
 
        if (!bch2_trans_relock(trans)) {
-               trace_trans_restart_journal_preres_get(trans->ip, trace_ip);
+               trace_trans_restart_journal_preres_get(trans->fn, trace_ip);
                return -EINTR;
        }
 
@@ -267,13 +292,38 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
        return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
 }
 
-static enum btree_insert_ret
+#define JSET_ENTRY_LOG_U64s            4
+
+static noinline void journal_transaction_name(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res);
+       struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+       unsigned u64s = JSET_ENTRY_LOG_U64s - 1;
+       unsigned b, buflen = u64s * sizeof(u64);
+
+       l->entry.u64s           = cpu_to_le16(u64s);
+       l->entry.btree_id       = 0;
+       l->entry.level          = 0;
+       l->entry.type           = BCH_JSET_ENTRY_log;
+       l->entry.pad[0]         = 0;
+       l->entry.pad[1]         = 0;
+       l->entry.pad[2]         = 0;
+       b = min_t(unsigned, strlen(trans->fn), buflen);
+       memcpy(l->d, trans->fn, b);
+       while (b < buflen)
+               l->d[b++] = '\0';
+
+       trans->journal_res.offset       += JSET_ENTRY_LOG_U64s;
+       trans->journal_res.u64s         -= JSET_ENTRY_LOG_U64s;
+}
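
journal_transaction_name() claims a fixed JSET_ENTRY_LOG_U64s = 4 u64s of journal space: one u64 of jset_entry header plus three u64s (24 bytes) of payload holding the transaction's function name, truncated or NUL-padded to exactly that width. A standalone sketch of just the truncate-copy-pad step under those assumptions (the constant name and layout here are stand-ins, not the on-disk format):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define LOG_ENTRY_U64S 4        /* invented stand-in for JSET_ENTRY_LOG_U64s */

static void pack_name(char *dst, const char *fn)
{
        size_t buflen = (LOG_ENTRY_U64S - 1) * sizeof(uint64_t);
        size_t b = strlen(fn);

        if (b > buflen)
                b = buflen;             /* truncate over-long names */
        memcpy(dst, fn, b);
        memset(dst + b, 0, buflen - b); /* NUL-pad the rest */
}

int main(void)
{
        char buf[(LOG_ENTRY_U64S - 1) * sizeof(uint64_t)];

        pack_name(buf, "bch2_inode_create");
        printf("%.24s\n", buf);
        return 0;
}
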
+
+static inline enum btree_insert_ret
 btree_key_can_insert(struct btree_trans *trans,
-                    struct btree_iter *iter,
+                    struct btree *b,
                     unsigned u64s)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter_l(iter)->b;
 
        if (!bch2_btree_node_insert_fits(c, b, u64s))
                return BTREE_INSERT_BTREE_NODE_FULL;
@@ -283,17 +333,18 @@ btree_key_can_insert(struct btree_trans *trans,
 
 static enum btree_insert_ret
 btree_key_can_insert_cached(struct btree_trans *trans,
-                           struct btree_iter *iter,
+                           struct btree_path *path,
                            unsigned u64s)
 {
-       struct bkey_cached *ck = (void *) iter->l[0].b;
+       struct bch_fs *c = trans->c;
+       struct bkey_cached *ck = (void *) path->l[0].b;
        unsigned new_u64s;
        struct bkey_i *new_k;
 
-       BUG_ON(iter->level);
+       EBUG_ON(path->level);
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
-           bch2_btree_key_cache_must_wait(trans->c) &&
+           bch2_btree_key_cache_must_wait(c) &&
            !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
                return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
 
@@ -308,8 +359,11 @@ btree_key_can_insert_cached(struct btree_trans *trans,
 
        new_u64s        = roundup_pow_of_two(u64s);
        new_k           = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS);
-       if (!new_k)
+       if (!new_k) {
+               bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+                       bch2_btree_ids[path->btree_id], new_u64s);
                return -ENOMEM;
+       }
 
        ck->u64s        = new_u64s;
        ck->k           = new_k;
@@ -328,9 +382,9 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
 
        i->k->k.needs_whiteout = false;
 
-       did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED)
-               ? btree_insert_key_leaf(trans, i->iter, i->k)
-               : bch2_btree_insert_key_cached(trans, i->iter, i->k);
+       did_work = !i->cached
+               ? btree_insert_key_leaf(trans, i)
+               : bch2_btree_insert_key_cached(trans, i->path, i->k);
        if (!did_work)
                return;
 
@@ -340,29 +394,33 @@ static inline void do_btree_insert_one(struct btree_trans *trans,
                                      i->level,
                                      i->k);
 
-               bch2_journal_set_has_inode(j, &trans->journal_res,
-                                          i->k->k.p.inode);
-
                if (trans->journal_seq)
                        *trans->journal_seq = trans->journal_res.seq;
        }
 }
 
-static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
+static noinline int bch2_trans_mark_gc(struct btree_trans *trans)
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
+       int ret = 0;
 
        trans_for_each_update(trans, i) {
                /*
                 * XXX: synchronization of cached update triggers with gc
+                * XXX: synchronization of interior node updates with gc
                 */
-               BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED);
+               BUG_ON(i->cached || i->level);
 
-               if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
-                       bch2_mark_update(trans, i->iter, i->k,
-                                        i->flags|BTREE_TRIGGER_GC);
+               if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
+                       ret = bch2_mark_update(trans, i->path, i->k,
+                                              i->flags|BTREE_TRIGGER_GC);
+                       if (ret)
+                               break;
+               }
        }
+
+       return ret;
 }
 
 static inline int
@@ -378,7 +436,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
        int ret;
 
        if (race_fault()) {
-               trace_trans_restart_fault_inject(trans->ip, trace_ip);
+               trace_trans_restart_fault_inject(trans->fn, trace_ip);
                trans->restarted = true;
                return -EINTR;
        }
@@ -405,9 +463,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                        u64s = 0;
 
                u64s += i->k->k.u64s;
-               ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
-                       ? btree_key_can_insert(trans, i->iter, u64s)
-                       : btree_key_can_insert_cached(trans, i->iter, u64s);
+               ret = !i->cached
+                       ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
+                       : btree_key_can_insert_cached(trans, i->path, u64s);
                if (ret) {
                        *stopped_at = i;
                        return ret;
@@ -417,17 +475,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                        marking = true;
        }
 
-       if (marking) {
-               percpu_down_read(&c->mark_lock);
-       }
-
-       /* Must be called under mark_lock: */
-       if (marking && trans->fs_usage_deltas &&
-           !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) {
-               ret = BTREE_INSERT_NEED_MARK_REPLICAS;
-               goto err;
-       }
-
        /*
         * Don't get journal reservation until after we know insert will
         * succeed:
@@ -436,7 +483,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                ret = bch2_trans_journal_res_get(trans,
                                JOURNAL_RES_GET_NONBLOCK);
                if (ret)
-                       goto err;
+                       return ret;
+
+               if (unlikely(trans->journal_transaction_names))
+                       journal_transaction_name(trans);
        } else {
                trans->journal_res.seq = c->journal.replay_journal_seq;
        }
@@ -464,63 +514,139 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
                                i->k->k.version = MAX_VERSION;
        }
 
-       trans_for_each_update(trans, i)
-               if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
-                       bch2_mark_update(trans, i->iter, i->k,
-                                        i->flags);
+       if (trans->fs_usage_deltas &&
+           bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
+               return BTREE_INSERT_NEED_MARK_REPLICAS;
 
-       if (marking && trans->fs_usage_deltas)
-               bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
+       trans_for_each_update(trans, i)
+               if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
+                       ret = bch2_mark_update(trans, i->path, i->k, i->flags);
+                       if (ret)
+                               return ret;
+               }
 
-       if (unlikely(c->gc_pos.phase))
-               bch2_trans_mark_gc(trans);
+       if (unlikely(c->gc_pos.phase)) {
+               ret = bch2_trans_mark_gc(trans);
+               if (ret)
+                       return ret;
+       }
 
        trans_for_each_update(trans, i)
                do_btree_insert_one(trans, i);
-err:
-       if (marking) {
-               percpu_up_read(&c->mark_lock);
-       }
 
        return ret;
 }
 
-static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter)
+static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path)
 {
-       struct btree_insert_entry *i;
-       struct btree *b = iter_l(iter)->b;
-       struct bkey_s_c old;
-       int u64s_delta = 0;
-       int ret;
+       unsigned l;
 
-       /*
-        * Inserting directly into interior nodes is an uncommon operation with
-        * various weird edge cases: also, a lot of things about
-        * BTREE_ITER_NODES iters need to be audited
-        */
-       if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS))
-               return 0;
+       for (l = 0; l < BTREE_MAX_DEPTH; l++)
+               if (btree_node_read_locked(path, l))
+                       BUG_ON(!bch2_btree_node_upgrade(trans, path, l));
+}
+
+static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path)
+{
+       struct btree *b = path_l(path)->b;
+
+       do {
+               if (path->nodes_locked &&
+                   path->nodes_locked != path->nodes_intent_locked)
+                       path_upgrade_readers(trans, path);
+       } while ((path = prev_btree_path(trans, path)) &&
+                path_l(path)->b == b);
+}
+
+/*
+ * Check for nodes that we have both read and intent locks on, and upgrade the
+ * readers to intent:
+ */
+static inline void normalize_read_intent_locks(struct btree_trans *trans)
+{
+       struct btree_path *path;
+       unsigned i, nr_read = 0, nr_intent = 0;
+
+       trans_for_each_path_inorder(trans, path, i) {
+               struct btree_path *next = i + 1 < trans->nr_sorted
+                       ? trans->paths + trans->sorted[i + 1]
+                       : NULL;
+
+               if (path->nodes_locked) {
+                       if (path->nodes_intent_locked)
+                               nr_intent++;
+                       else
+                               nr_read++;
+               }
+
+               if (!next || path_l(path)->b != path_l(next)->b) {
+                       if (nr_read && nr_intent)
+                               upgrade_readers(trans, path);
+
+                       nr_read = nr_intent = 0;
+               }
+       }
+
+       bch2_trans_verify_locks(trans);
+}
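
normalize_read_intent_locks() relies on the in-order path traversal: paths pointing at the same leaf are adjacent, so one pass can count read and intent holders per node and upgrade the readers whenever a node has both. A toy model of that grouping pass, with invented types standing in for btree_path:

#include <stdbool.h>
#include <stdio.h>

struct toy_path { int node; bool intent; };

/* paths are sorted, so equal nodes are adjacent */
static void normalize(struct toy_path *p, int n)
{
        int i, start = 0, nr_read = 0, nr_intent = 0;

        for (i = 0; i < n; i++) {
                if (p[i].intent)
                        nr_intent++;
                else
                        nr_read++;

                if (i + 1 == n || p[i + 1].node != p[i].node) {
                        if (nr_read && nr_intent)       /* mixed: upgrade */
                                for (int j = start; j <= i; j++)
                                        p[j].intent = true;
                        start = i + 1;
                        nr_read = nr_intent = 0;
                }
        }
}

int main(void)
{
        struct toy_path p[] = { { 1, false }, { 1, true }, { 2, false } };

        normalize(p, 3);
        for (int i = 0; i < 3; i++)
                printf("node %d intent %d\n", p[i].node, p[i].intent);
        return 0;
}
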
+
+static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
+{
+       struct btree_path *path;
+       unsigned i;
+
+       trans_for_each_path_inorder(trans, path, i) {
+               //if (path == pos)
+               //      break;
 
-       BUG_ON(iter->level);
+               if (path->nodes_locked != path->nodes_intent_locked &&
+                   !bch2_btree_path_upgrade(trans, path, path->level + 1))
+                       return true;
+       }
+
+       return false;
+}
+
+static inline int trans_lock_write(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
 
        trans_for_each_update(trans, i) {
-               if (iter_l(i->iter)->b != b)
+               if (same_leaf_as_prev(trans, i))
                        continue;
 
-               old = bch2_btree_iter_peek_slot(i->iter);
-               ret = bkey_err(old);
-               if (ret)
-                       return ret;
+               if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
+                       if (have_conflicting_read_lock(trans, i->path))
+                               goto fail;
 
-               u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
-               u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+                       btree_node_lock_type(trans, i->path,
+                                            insert_l(i)->b,
+                                            i->path->pos, i->level,
+                                            SIX_LOCK_write, NULL, NULL);
+               }
+
+               bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
        }
 
-       if (u64s_delta > 0)
-               return 0;
+       return 0;
+fail:
+       while (--i >= trans->updates) {
+               if (same_leaf_as_prev(trans, i))
+                       continue;
 
-       return bch2_foreground_maybe_merge(trans, iter,
-                               iter->level, trans->flags);
+               bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
+       }
+
+       trace_trans_restart_would_deadlock_write(trans->fn);
+       return btree_trans_restart(trans);
+}
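
trans_lock_write() is an all-or-nothing acquire: try each leaf's write lock without blocking, and if a trylock fails while a conflicting read lock could deadlock us, release everything taken so far in reverse order and restart the transaction. The same shape with plain pthreads (a sketch only; no six-locks or restart machinery):

#include <errno.h>
#include <pthread.h>

static int lock_all(pthread_mutex_t **locks, int n)
{
        int i;

        for (i = 0; i < n; i++)
                if (pthread_mutex_trylock(locks[i])) {
                        while (--i >= 0)        /* unwind in reverse */
                                pthread_mutex_unlock(locks[i]);
                        return -EINTR;          /* caller restarts */
                }
        return 0;
}

int main(void)
{
        pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t *locks[] = { &a, &b };

        if (!lock_all(locks, 2)) {
                pthread_mutex_unlock(&b);
                pthread_mutex_unlock(&a);
        }
        return 0;
}
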
+
+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_update(trans, i)
+               bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
 }
 
 /*
@@ -532,29 +658,53 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
-       struct btree_iter *iter;
-       int ret;
+       struct bkey_s_c old;
+       int ret, u64s_delta = 0;
 
        trans_for_each_update(trans, i) {
-               struct btree *b;
+               const char *invalid = bch2_bkey_invalid(c,
+                               bkey_i_to_s_c(i->k), i->bkey_type);
+               if (invalid) {
+                       char buf[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+                       bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
+                                           buf, trans->fn, (void *) i->ip_allocated, invalid);
+                       return -EINVAL;
+               }
+               btree_insert_entry_checks(trans, i);
+       }
 
-               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+       trans_for_each_update(trans, i) {
+               struct bkey u;
 
-               if (btree_iter_type(i->iter) == BTREE_ITER_CACHED)
+               /*
+                * peek_slot() doesn't yet work on iterators that point to
+                * interior nodes:
+                */
+               if (i->cached || i->level)
                        continue;
 
-               b = iter_l(i->iter)->b;
-               if (b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
-                   b->sib_u64s[1] < c->btree_foreground_merge_threshold) {
-                       ret = maybe_do_btree_merge(trans, i->iter);
-                       if (unlikely(ret))
-                               return ret;
+               old = bch2_btree_path_peek_slot(i->path, &u);
+               ret = bkey_err(old);
+               if (unlikely(ret))
+                       return ret;
+
+               u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0;
+               u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0;
+
+               if (!same_leaf_as_next(trans, i)) {
+                       if (u64s_delta <= 0) {
+                               ret = bch2_foreground_maybe_merge(trans, i->path,
+                                                       i->level, trans->flags);
+                               if (unlikely(ret))
+                                       return ret;
+                       }
+
+                       u64s_delta = 0;
                }
        }
 
-       trans_for_each_update(trans, i)
-               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
-
        ret = bch2_journal_preres_get(&c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
                        JOURNAL_RES_GET_NONBLOCK|
@@ -566,52 +716,21 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        if (unlikely(ret))
                return ret;
 
-       /*
-        * Can't be holding any read locks when we go to take write locks:
-        * another thread could be holding an intent lock on the same node we
-        * have a read lock on, and it'll block trying to take a write lock
-        * (because we hold a read lock) and it could be blocking us by holding
-        * its own read lock (while we're trying to take write locks).
-        *
-        * note - this must be done after bch2_trans_journal_preres_get_cold()
-        * or anything else that might call bch2_trans_relock(), since that
-        * would just retake the read locks:
-        */
-       trans_for_each_iter(trans, iter)
-               if (iter->nodes_locked != iter->nodes_intent_locked &&
-                   !bch2_btree_iter_upgrade(iter, 1)) {
-                       trace_trans_restart_upgrade(trans->ip, trace_ip,
-                                                   iter->btree_id,
-                                                   &iter->real_pos);
-                       trans->restarted = true;
-                       return -EINTR;
-               }
+       normalize_read_intent_locks(trans);
 
-       trans_for_each_update(trans, i) {
-               const char *invalid = bch2_bkey_invalid(c,
-                               bkey_i_to_s_c(i->k), i->bkey_type);
-               if (invalid) {
-                       char buf[200];
-
-                       bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
-                       bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid);
-                       bch2_fatal_error(c);
-               }
-               btree_insert_entry_checks(trans, i);
-       }
-       bch2_btree_trans_verify_locks(trans);
-
-       trans_for_each_update(trans, i)
-               if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_lock_for_insert(trans, i->iter,
-                                       iter_l(i->iter)->b);
+       ret = trans_lock_write(trans);
+       if (unlikely(ret))
+               return ret;
 
        ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
 
+       if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+               bch2_drop_overwrites_from_journal(trans);
+
        trans_for_each_update(trans, i)
                if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
-                                                            i->iter);
+                       bch2_btree_node_unlock_write_inlined(trans, i->path,
+                                                       insert_l(i)->b);
 
        if (!ret && trans->journal_pin)
                bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
@@ -650,14 +769,13 @@ int bch2_trans_commit_error(struct btree_trans *trans,
 
        switch (ret) {
        case BTREE_INSERT_BTREE_NODE_FULL:
-               ret = bch2_btree_split_leaf(trans, i->iter, trans->flags);
+               ret = bch2_btree_split_leaf(trans, i->path, trans->flags);
                if (!ret)
                        return 0;
 
                if (ret == -EINTR)
-                       trace_trans_restart_btree_node_split(trans->ip, trace_ip,
-                                                            i->iter->btree_id,
-                                                            &i->iter->real_pos);
+                       trace_trans_restart_btree_node_split(trans->fn, trace_ip,
+                                               i->btree_id, &i->path->pos);
                break;
        case BTREE_INSERT_NEED_MARK_REPLICAS:
                bch2_trans_unlock(trans);
@@ -669,7 +787,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (bch2_trans_relock(trans))
                        return 0;
 
-               trace_trans_restart_mark_replicas(trans->ip, trace_ip);
+               trace_trans_restart_mark_replicas(trans->fn, trace_ip);
                ret = -EINTR;
                break;
        case BTREE_INSERT_NEED_JOURNAL_RES:
@@ -689,13 +807,13 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (bch2_trans_relock(trans))
                        return 0;
 
-               trace_trans_restart_journal_res_get(trans->ip, trace_ip);
+               trace_trans_restart_journal_res_get(trans->fn, trace_ip);
                ret = -EINTR;
                break;
        case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
                bch2_trans_unlock(trans);
 
-               trace_trans_blocked_journal_reclaim(trans->ip, trace_ip);
+               trace_trans_blocked_journal_reclaim(trans->fn, trace_ip);
 
                wait_event_freezable(c->journal.reclaim_wait,
                                     (ret = journal_reclaim_wait_done(c)));
@@ -705,7 +823,7 @@ int bch2_trans_commit_error(struct btree_trans *trans,
                if (bch2_trans_relock(trans))
                        return 0;
 
-               trace_trans_restart_journal_reclaim(trans->ip, trace_ip);
+               trace_trans_restart_journal_reclaim(trans->fn, trace_ip);
                ret = -EINTR;
                break;
        default:
@@ -714,7 +832,9 @@ int bch2_trans_commit_error(struct btree_trans *trans,
        }
 
        BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted);
-       BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL));
+       BUG_ON(ret == -ENOSPC &&
+              !(trans->flags & BTREE_INSERT_NOWAIT) &&
+              (trans->flags & BTREE_INSERT_NOFAIL));
 
        return ret;
 }
@@ -725,7 +845,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
        struct bch_fs *c = trans->c;
        int ret;
 
-       if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
+       if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) ||
+           test_bit(BCH_FS_STARTED, &c->flags))
                return -EROFS;
 
        bch2_trans_unlock(trans);
@@ -734,125 +855,128 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
        if (ret)
                return ret;
 
+       if (!bch2_trans_relock(trans))
+               return -EINTR;
+
        percpu_ref_get(&c->writes);
        return 0;
 }
 
-static int extent_handle_overwrites(struct btree_trans *trans,
-                                   struct btree_insert_entry *i)
+static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i,
+                          bool overwrite)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_iter *iter, *update_iter;
-       struct bpos start = bkey_start_pos(&i->k->k);
-       struct bkey_i *update;
-       struct bkey_s_c k;
-       int ret = 0, compressed_sectors;
-
-       iter = bch2_trans_get_iter(trans, i->btree_id, start,
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_WITH_UPDATES|
-                                  BTREE_ITER_NOT_EXTENTS);
-       k = bch2_btree_iter_peek(iter);
-       if (!k.k || (ret = bkey_err(k)))
-               goto out;
-
-       if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) {
-               update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-               if ((ret = PTR_ERR_OR_ZERO(update)))
-                       goto out;
+       struct bkey             _deleted = KEY(0, 0, 0);
+       struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
+       struct bkey_s_c         old;
+       struct bkey             unpacked;
+       int ret = 0;
 
-               bkey_reassemble(update, k);
+       if ((i->flags & BTREE_TRIGGER_NORUN) ||
+           !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
+               return 0;
 
-               if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) {
-                       update_iter = bch2_trans_copy_iter(trans, iter);
-                       ret = bch2_btree_delete_at(trans, update_iter, i->flags);
-                       bch2_trans_iter_put(trans, update_iter);
+       if (!overwrite) {
+               if (i->insert_trigger_run)
+                       return 0;
 
-                       if (ret)
-                               goto out;
+               BUG_ON(i->overwrite_trigger_run);
+               i->insert_trigger_run = true;
+       } else {
+               if (i->overwrite_trigger_run)
+                       return 0;
 
-                       i->k = update;
-                       goto next;
-               }
+               BUG_ON(!i->insert_trigger_run);
+               i->overwrite_trigger_run = true;
        }
 
-       if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k)))
-               goto next;
-
-       while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) {
-               /*
-                * If we're going to be splitting a compressed extent, note it
-                * so that __bch2_trans_commit() can increase our disk
-                * reservation:
-                */
-               if (bkey_cmp(bkey_start_pos(k.k), start) < 0 &&
-                   bkey_cmp(k.k->p, i->k->k.p) > 0 &&
-                   (compressed_sectors = bch2_bkey_sectors_compressed(k)))
-                       trans->extra_journal_res += compressed_sectors;
+       old = bch2_btree_path_peek_slot(i->path, &unpacked);
+       _deleted.p = i->path->pos;
+
+       if (overwrite) {
+               ret = bch2_trans_mark_key(trans, old, deleted,
+                               BTREE_TRIGGER_OVERWRITE|i->flags);
+       } else if (old.k->type == i->k->k.type &&
+           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+               i->overwrite_trigger_run = true;
+               ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
+                               BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
+       } else {
+               ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
+                               BTREE_TRIGGER_INSERT|i->flags);
+       }
 
-               if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
-                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-                       if ((ret = PTR_ERR_OR_ZERO(update)))
-                               goto out;
+       if (ret == -EINTR)
+               trace_trans_restart_mark(trans->fn, _RET_IP_,
+                                        i->btree_id, &i->path->pos);
+       return ret ?: 1;
+}
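
run_one_trigger() dispatches three ways: on the overwrite pass the old key is marked against a deleted key; on the insert pass, when old and new keys share a type that is in BTREE_TRIGGER_WANTS_OLD_AND_NEW, one combined INSERT|OVERWRITE call handles both and the overwrite pass is marked done; otherwise a plain insert is marked against a deleted old key. A compressed sketch of just that dispatch, with invented flag values and a mark() stand-in for bch2_trans_mark_key():

#include <stdbool.h>

enum { T_INSERT = 1, T_OVERWRITE = 2 };

static int mark(int flags) { (void) flags; return 0; }

static int dispatch(bool overwrite_pass, bool same_type,
                    bool wants_old_and_new)
{
        if (overwrite_pass)
                return mark(T_OVERWRITE);           /* old key -> deleted */
        if (same_type && wants_old_and_new)
                return mark(T_INSERT|T_OVERWRITE);  /* one combined call */
        return mark(T_INSERT);                      /* deleted -> new key */
}
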
 
-                       bkey_reassemble(update, k);
+static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
+                             struct btree_insert_entry *btree_id_start)
+{
+       struct btree_insert_entry *i;
+       bool trans_trigger_run;
+       int ret, overwrite;
 
-                       bch2_cut_back(start, update);
+       for (overwrite = 0; overwrite < 2; overwrite++) {
 
-                       update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p,
-                                                         BTREE_ITER_NOT_EXTENTS|
-                                                         BTREE_ITER_INTENT);
-                       ret = bch2_btree_iter_traverse(update_iter);
-                       if (ret) {
-                               bch2_trans_iter_put(trans, update_iter);
-                               goto out;
+               /*
+                * Running triggers will append more updates to the list of updates as
+                * we're walking it:
+                */
+               do {
+                       trans_trigger_run = false;
+
+                       for (i = btree_id_start;
+                            i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+                            i++) {
+                               ret = run_one_trigger(trans, i, overwrite);
+                               if (ret < 0)
+                                       return ret;
+                               if (ret)
+                                       trans_trigger_run = true;
                        }
+               } while (trans_trigger_run);
+       }
 
-                       bch2_trans_update(trans, update_iter, update,
-                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
-                                         i->flags);
-                       bch2_trans_iter_put(trans, update_iter);
-               }
-
-               if (bkey_cmp(k.k->p, i->k->k.p) <= 0) {
-                       update_iter = bch2_trans_copy_iter(trans, iter);
-                       ret = bch2_btree_delete_at(trans, update_iter,
-                                                  i->flags);
-                       bch2_trans_iter_put(trans, update_iter);
-
-                       if (ret)
-                               goto out;
-               }
+       return 0;
+}
 
-               if (bkey_cmp(k.k->p, i->k->k.p) > 0) {
-                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
-                       if ((ret = PTR_ERR_OR_ZERO(update)))
-                               goto out;
+static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
+{
+       struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+       unsigned btree_id = 0;
+       int ret = 0;
 
-                       bkey_reassemble(update, k);
-                       bch2_cut_front(i->k->k.p, update);
+       /*
+        * For a given btree, this algorithm runs insert triggers before
+        * overwrite triggers: this is so that when extents are being moved
+        * (e.g. by FALLOC_FL_INSERT_RANGE), we don't drop references before
+        * they are re-added.
+        */
+       for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
+               while (btree_id_start < trans->updates + trans->nr_updates &&
+                      btree_id_start->btree_id < btree_id)
+                       btree_id_start++;
 
-                       bch2_trans_update(trans, iter, update, i->flags);
-                       goto out;
-               }
-next:
-               k = bch2_btree_iter_next(iter);
-               if (!k.k || (ret = bkey_err(k)))
-                       goto out;
+               ret = run_btree_triggers(trans, btree_id, btree_id_start);
+               if (ret)
+                       return ret;
        }
 
-       bch2_bkey_merge(c, bkey_i_to_s(i->k), k);
-out:
-       bch2_trans_iter_put(trans, iter);
+       trans_for_each_update(trans, i)
+               BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
+                      (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
+                      (!i->insert_trigger_run || !i->overwrite_trigger_run));
 
-       return ret;
+       return 0;
 }
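
Because a trigger can append further updates while the list is being walked, run_btree_triggers() keeps sweeping until a pass runs nothing; the run-once flags make each sweep idempotent. The fixpoint driver in miniature (toy types; unlike the real code, the list does not actually grow here):

#include <stdbool.h>
#include <stdio.h>

struct toy_update { bool insert_run, overwrite_run; };

static int run_trigger(struct toy_update *u, bool overwrite)
{
        bool *flag = overwrite ? &u->overwrite_run : &u->insert_run;

        if (*flag)
                return 0;       /* already ran: nothing to do */
        *flag = true;
        return 1;               /* did work: caller must re-sweep */
}

int main(void)
{
        struct toy_update u[2] = { 0 };
        int nr = 2;
        bool again;

        for (int overwrite = 0; overwrite < 2; overwrite++)
                do {
                        again = false;
                        /* in the real code, nr can grow during this sweep */
                        for (int i = 0; i < nr; i++)
                                if (run_trigger(&u[i], overwrite))
                                        again = true;
                } while (again);

        printf("%d %d\n", u[0].insert_run, u[0].overwrite_run);
        return 0;
}
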
 
 int __bch2_trans_commit(struct btree_trans *trans)
 {
+       struct bch_fs *c = trans->c;
        struct btree_insert_entry *i = NULL;
-       struct btree_iter *iter;
-       bool trans_trigger_run;
        unsigned u64s;
        int ret = 0;
 
@@ -861,77 +985,62 @@ int __bch2_trans_commit(struct btree_trans *trans)
                goto out_reset;
 
        if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
-               lockdep_assert_held(&trans->c->gc_lock);
+               lockdep_assert_held(&c->gc_lock);
 
        memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
 
        trans->journal_u64s             = trans->extra_journal_entry_u64s;
        trans->journal_preres_u64s      = 0;
 
+       trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+
+       if (trans->journal_transaction_names)
+               trans->journal_u64s += JSET_ENTRY_LOG_U64s;
+
        if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
-           unlikely(!percpu_ref_tryget(&trans->c->writes))) {
+           unlikely(!percpu_ref_tryget(&c->writes))) {
                ret = bch2_trans_commit_get_rw_cold(trans);
                if (ret)
                        goto out_reset;
        }
 
 #ifdef CONFIG_BCACHEFS_DEBUG
+       /*
+        * if BTREE_TRIGGER_NORUN is set, it means we're probably being called
+        * from the key cache flush code:
+        */
        trans_for_each_update(trans, i)
-               if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
+               if (!i->cached &&
                    !(i->flags & BTREE_TRIGGER_NORUN))
                        bch2_btree_key_cache_verify_clean(trans,
                                        i->btree_id, i->k->k.p);
 #endif
 
-       /*
-        * Running triggers will append more updates to the list of updates as
-        * we're walking it:
-        */
-       do {
-               trans_trigger_run = false;
-
-               trans_for_each_update(trans, i) {
-                       if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
-                           !i->trans_triggers_run) {
-                               i->trans_triggers_run = true;
-                               trans_trigger_run = true;
-
-                               ret = bch2_trans_mark_update(trans, i->iter,
-                                                            i->k, i->flags);
-                               if (unlikely(ret)) {
-                                       if (ret == -EINTR)
-                                               trace_trans_restart_mark(trans->ip, _RET_IP_,
-                                                                        i->iter->btree_id,
-                                                                        &i->iter->pos);
-                                       goto out;
-                               }
-                       }
-               }
-       } while (trans_trigger_run);
+       ret = bch2_trans_commit_run_triggers(trans);
+       if (ret)
+               goto out;
 
        trans_for_each_update(trans, i) {
-               BUG_ON(!i->iter->should_be_locked);
+               BUG_ON(!i->path->should_be_locked);
 
-               if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
-                       trace_trans_restart_upgrade(trans->ip, _RET_IP_,
-                                                   i->iter->btree_id,
-                                                   &i->iter->pos);
-                       trans->restarted = true;
-                       ret = -EINTR;
+               if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) {
+                       trace_trans_restart_upgrade(trans->fn, _RET_IP_,
+                                                   i->btree_id, &i->path->pos);
+                       ret = btree_trans_restart(trans);
                        goto out;
                }
 
-               BUG_ON(!btree_node_intent_locked(i->iter, i->level));
+               BUG_ON(!btree_node_intent_locked(i->path, i->level));
 
                u64s = jset_u64s(i->k->k.u64s);
-               if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
+               if (i->cached &&
                    likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
                        trans->journal_preres_u64s += u64s;
                trans->journal_u64s += u64s;
        }
 
        if (trans->extra_journal_res) {
-               ret = bch2_disk_reservation_add(trans->c, trans->disk_res,
+               ret = bch2_disk_reservation_add(c, trans->disk_res,
                                trans->extra_journal_res,
                                (trans->flags & BTREE_INSERT_NOFAIL)
                                ? BCH_DISK_RESERVATION_NOFAIL : 0);
@@ -945,21 +1054,19 @@ retry:
        ret = do_bch2_trans_commit(trans, &i, _RET_IP_);
 
        /* make sure we didn't drop or screw up locks: */
-       bch2_btree_trans_verify_locks(trans);
+       bch2_trans_verify_locks(trans);
 
        if (ret)
                goto err;
-
-       trans_for_each_iter(trans, iter)
-               if (btree_iter_live(trans, iter) &&
-                   (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT))
-                       bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
 out:
-       bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
+       bch2_journal_preres_put(&c->journal, &trans->journal_preres);
 
        if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
-               percpu_ref_put(&trans->c->writes);
+               percpu_ref_put(&c->writes);
 out_reset:
+       trans_for_each_update(trans, i)
+               bch2_path_put(trans, i->path, true);
+
        trans->extra_journal_res        = 0;
        trans->nr_updates               = 0;
        trans->hooks                    = NULL;
@@ -982,53 +1089,319 @@ err:
        goto retry;
 }
 
-int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
-                     struct bkey_i *k, enum btree_update_flags flags)
+static int check_pos_snapshot_overwritten(struct btree_trans *trans,
+                                         enum btree_id id,
+                                         struct bpos pos)
 {
-       struct btree_insert_entry *i, n = (struct btree_insert_entry) {
-               .flags          = flags,
-               .bkey_type      = __btree_node_type(iter->level, iter->btree_id),
-               .btree_id       = iter->btree_id,
-               .level          = iter->level,
-               .iter           = iter,
-               .k              = k
-       };
-       bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0;
-       int ret = 0;
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
 
-       BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
-       BUG_ON(!iter->should_be_locked);
+       if (!btree_type_has_snapshots(id))
+               return 0;
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-       trans_for_each_update(trans, i)
-               BUG_ON(i != trans->updates &&
-                      btree_insert_entry_cmp(i - 1, i) >= 0);
-#endif
+       if (!snapshot_t(c, pos.snapshot)->children[0])
+               return 0;
 
-       if (is_extent) {
-               ret = extent_handle_overwrites(trans, &n);
+       bch2_trans_iter_init(trans, &iter, id, pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while (1) {
+               k = bch2_btree_iter_prev(&iter);
+               ret = bkey_err(k);
                if (ret)
-                       return ret;
+                       break;
 
-               iter->pos_after_commit = k->k.p;
-               iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
+               if (!k.k)
+                       break;
 
-               if (bkey_deleted(&n.k->k))
-                       return 0;
+               if (bkey_cmp(pos, k.k->p))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
+                       ret = 1;
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+int bch2_trans_update_extent(struct btree_trans *trans,
+                            struct btree_iter *orig_iter,
+                            struct bkey_i *insert,
+                            enum btree_update_flags flags)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter, update_iter;
+       struct bpos start = bkey_start_pos(&insert->k);
+       struct bkey_i *update;
+       struct bkey_s_c k;
+       enum btree_id btree_id = orig_iter->btree_id;
+       int ret = 0, compressed_sectors;
 
-               n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p,
-                                            BTREE_ITER_INTENT|
-                                            BTREE_ITER_NOT_EXTENTS);
-               ret = bch2_btree_iter_traverse(n.iter);
-               bch2_trans_iter_put(trans, n.iter);
+       bch2_trans_iter_init(trans, &iter, btree_id, start,
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES|
+                            BTREE_ITER_NOT_EXTENTS);
+       k = bch2_btree_iter_peek(&iter);
+       if ((ret = bkey_err(k)))
+               goto err;
+       if (!k.k)
+               goto out;
 
+       if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+               /*
+                * We can't merge extents if they belong to interior snapshot
+                * tree nodes, and there's a snapshot in which one extent is
+                * visible and the other is not - i.e. if visibility is
+                * different.
+                *
+                * Instead of checking if visibility of the two extents is
+                * different, for now we just check if either has been
+                * overwritten:
+                */
+               ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+               if (ret < 0)
+                       goto err;
                if (ret)
-                       return ret;
+                       goto nomerge1;
+
+               ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+               if (ret < 0)
+                       goto err;
+               if (ret)
+                       goto nomerge1;
+
+               update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+               if ((ret = PTR_ERR_OR_ZERO(update)))
+                       goto err;
+
+               bkey_reassemble(update, k);
+
+               if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) {
+                       ret = bch2_btree_delete_at(trans, &iter, flags);
+                       if (ret)
+                               goto err;
+
+                       insert = update;
+                       goto next;
+               }
+       }
+nomerge1:
+       ret = 0;
+       if (!bkey_cmp(k.k->p, start))
+               goto next;
+
+       while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) {
+               bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0;
+               bool back_split  = bkey_cmp(k.k->p, insert->k.p) > 0;
+
+               /*
+                * If we're going to be splitting a compressed extent, note it
+                * so that __bch2_trans_commit() can increase our disk
+                * reservation:
+                */
+               if (((front_split && back_split) ||
+                    ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) &&
+                   (compressed_sectors = bch2_bkey_sectors_compressed(k)))
+                       trans->extra_journal_res += compressed_sectors;
+
+               if (front_split) {
+                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+
+                       bkey_reassemble(update, k);
+
+                       bch2_cut_back(start, update);
+
+                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_ALL_SNAPSHOTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                                 flags);
+                       bch2_trans_iter_exit(trans, &update_iter);
+
+                       if (ret)
+                               goto err;
+               }
+
+               if (k.k->p.snapshot != insert->k.p.snapshot &&
+                   (front_split || back_split)) {
+                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+
+                       bkey_reassemble(update, k);
+
+                       bch2_cut_front(start, update);
+                       bch2_cut_back(insert->k.p, update);
+
+                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_ALL_SNAPSHOTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                                 flags);
+                       bch2_trans_iter_exit(trans, &update_iter);
+                       if (ret)
+                               goto err;
+               }
+
+               if (bkey_cmp(k.k->p, insert->k.p) <= 0) {
+                       update = bch2_trans_kmalloc(trans, sizeof(*update));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+
+                       bkey_init(&update->k);
+                       update->k.p = k.k->p;
+
+                       if (insert->k.p.snapshot != k.k->p.snapshot) {
+                               update->k.p.snapshot = insert->k.p.snapshot;
+                               update->k.type = KEY_TYPE_whiteout;
+                       }
+
+                       bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                                 flags);
+                       bch2_trans_iter_exit(trans, &update_iter);
+
+                       if (ret)
+                               goto err;
+               }
+
+               if (back_split) {
+                       update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+                       if ((ret = PTR_ERR_OR_ZERO(update)))
+                               goto err;
+
+                       bkey_reassemble(update, k);
+                       bch2_cut_front(insert->k.p, update);
+
+                       ret = bch2_trans_update_by_path(trans, iter.path, update,
+                                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                                                 flags);
+                       if (ret)
+                               goto err;
+                       goto out;
+               }
+next:
+               k = bch2_btree_iter_next(&iter);
+               if ((ret = bkey_err(k)))
+                       goto err;
+               if (!k.k)
+                       goto out;
+       }
+
+       if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+               ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p);
+               if (ret < 0)
+                       goto err;
+               if (ret)
+                       goto nomerge2;
+
+               ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p);
+               if (ret < 0)
+                       goto err;
+               if (ret)
+                       goto nomerge2;
+
+               bch2_bkey_merge(c, bkey_i_to_s(insert), k);
+       }
+nomerge2:
+       ret = 0;
+out:
+       if (!bkey_deleted(&insert->k)) {
+               /*
+                * Rewinding iterators is expensive: get a new one and the one
+                * Rewinding iterators is expensive: get a new one instead; the
+                * old one, which points to the start of insert, is what it will
+                * be cloned from:
+               bch2_trans_iter_exit(trans, &iter);
+               bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p,
+                                    BTREE_ITER_NOT_EXTENTS|
+                                    BTREE_ITER_INTENT);
+               ret   = bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(trans, &iter, insert, flags);
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
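
The overwrite walk above reduces to interval arithmetic: for an existing extent [old_start, old_end) and an insert [ins_start, ins_end), a front split keeps [old_start, ins_start) and a back split keeps [ins_end, old_end), each emitted only when non-empty; when both are needed on a compressed extent, the extra journal reservation noted above covers the second copy. A standalone sketch with bare sector numbers (snapshots and compression ignored):

#include <stdio.h>

struct ext { unsigned start, end; };    /* half-open [start, end) */

static void overwrite(struct ext old, struct ext ins,
                      struct ext *front, struct ext *back)
{
        /* a split piece is only emitted when non-empty (end > start) */
        front->start = old.start;
        front->end   = ins.start > old.start ? ins.start : old.start;
        back->start  = ins.end < old.end ? ins.end : old.end;
        back->end    = old.end;
}

int main(void)
{
        struct ext front, back;

        overwrite((struct ext){ 0, 100 }, (struct ext){ 20, 60 },
                  &front, &back);
        /* prints front [0,20) back [60,100): both splits needed */
        printf("front [%u,%u) back [%u,%u)\n",
               front.start, front.end, back.start, back.end);
        return 0;
}
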
+
+/*
+ * When deleting, check if we need to emit a whiteout (because we're overwriting
+ * something in an ancestor snapshot)
+ */
+static int need_whiteout_for_snapshot(struct btree_trans *trans,
+                                     enum btree_id btree_id, struct bpos pos)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u32 snapshot = pos.snapshot;
+       int ret;
+
+       if (!bch2_snapshot_parent(trans->c, pos.snapshot))
+               return 0;
+
+       pos.snapshot++;
+
+       for_each_btree_key_norestart(trans, iter, btree_id, pos,
+                          BTREE_ITER_ALL_SNAPSHOTS|
+                          BTREE_ITER_NOPRESERVE, k, ret) {
+               if (bkey_cmp(k.k->p, pos))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(trans->c, snapshot,
+                                             k.k->p.snapshot)) {
+                       ret = !bkey_whiteout(k.k);
+                       break;
+               }
        }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
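
need_whiteout_for_snapshot() walks keys at the same position in other snapshots and answers: does a key survive in an ancestor snapshot, so that a plain delete would wrongly re-expose it? A toy model with a parent-pointer snapshot table (invented and much simpler than bcachefs's snapshot tree; whiteout keys in ancestors are ignored here):

#include <stdbool.h>
#include <stdio.h>

#define NR_SNAPSHOTS 8

static unsigned parent[NR_SNAPSHOTS] = { 0, 0, 1, 1 }; /* 0 = root */

static bool is_ancestor(unsigned anc, unsigned id)
{
        while (id && id != anc)
                id = parent[id];
        return id == anc;
}

/* Deleting at (pos, snap): do keys at pos in ancestor snapshots survive,
 * forcing us to write a whiteout instead of just deleting? */
static bool need_whiteout(unsigned snap, const unsigned *key_snaps, int nr)
{
        for (int i = 0; i < nr; i++)
                if (key_snaps[i] != snap && is_ancestor(key_snaps[i], snap))
                        return true;
        return false;
}

int main(void)
{
        unsigned keys_at_pos[] = { 1 };  /* same pos, snapshot 1 */

        /* snapshot 2's parent is 1, so deleting in 2 needs a whiteout */
        printf("%d\n", need_whiteout(2, keys_at_pos, 1));
        return 0;
}
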
+
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+                         struct bkey_i *k, enum btree_update_flags flags)
+{
+       struct btree_insert_entry *i, n;
 
-       BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS);
+       BUG_ON(!path->should_be_locked);
 
-       n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
+       BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+       BUG_ON(bpos_cmp(k->k.p, path->pos));
+
+       n = (struct btree_insert_entry) {
+               .flags          = flags,
+               .bkey_type      = __btree_node_type(path->level, path->btree_id),
+               .btree_id       = path->btree_id,
+               .level          = path->level,
+               .cached         = path->cached,
+               .path           = path,
+               .k              = k,
+               .ip_allocated   = _RET_IP_,
+       };
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+       trans_for_each_update(trans, i)
+               BUG_ON(i != trans->updates &&
+                      btree_insert_entry_cmp(i - 1, i) >= 0);
+#endif
 
        /*
         * Pending updates are kept sorted: first, find position of new update,
@@ -1040,27 +1413,80 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
 
        if (i < trans->updates + trans->nr_updates &&
            !btree_insert_entry_cmp(&n, i)) {
-               BUG_ON(i->trans_triggers_run);
+               BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
 
-               /*
-                * This is a hack to ensure that inode creates update the btree,
-                * not the key cache, which helps with cache coherency issues in
-                * other areas:
-                */
-               if (btree_iter_type(n.iter) == BTREE_ITER_CACHED &&
-                   btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
-                       i->k = n.k;
-                       i->flags = n.flags;
-               } else {
-                       *i = n;
-               }
+               bch2_path_put(trans, i->path, true);
+               *i = n;
        } else
                array_insert_item(trans->updates, trans->nr_updates,
                                  i - trans->updates, n);
 
+       __btree_path_get(n.path, true);
        return 0;
 }
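
trans->updates stays sorted by btree_insert_entry_cmp(), so bch2_trans_update_by_path() either replaces an existing entry at the same position or shifts the tail and inserts. The same find-then-replace-or-insert step over a plain array, with an integer standing in for the (btree_id, pos) key:

#include <stdio.h>
#include <string.h>

#define MAX_UPDATES 8   /* the real code BUG_ONs at BTREE_ITER_MAX */

static int keys[MAX_UPDATES];
static int nr;

static void update(int key)
{
        int i = 0;

        while (i < nr && keys[i] < key)         /* find sorted position */
                i++;

        if (i < nr && keys[i] == key) {
                keys[i] = key;                  /* same pos: replace */
                return;
        }
        memmove(&keys[i + 1], &keys[i], (nr - i) * sizeof(keys[0]));
        keys[i] = key;                          /* else: shift and insert */
        nr++;
}

int main(void)
{
        update(5); update(3); update(5); update(9);
        for (int i = 0; i < nr; i++)
                printf("%d ", keys[i]);         /* prints: 3 5 9 */
        printf("\n");
        return 0;
}
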
 
+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+                                  struct bkey_i *k, enum btree_update_flags flags)
+{
+       struct btree_path *path = iter->update_path ?: iter->path;
+       struct bkey_cached *ck;
+       int ret;
+
+       if (iter->flags & BTREE_ITER_IS_EXTENTS)
+               return bch2_trans_update_extent(trans, iter, k, flags);
+
+       if (bkey_deleted(&k->k) &&
+           !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+           (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
+               ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+               if (unlikely(ret < 0))
+                       return ret;
+
+               if (ret)
+                       k->k.type = KEY_TYPE_whiteout;
+       }
+
+       if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+           !path->cached &&
+           !path->level &&
+           btree_id_cached(trans->c, path->btree_id)) {
+               if (!iter->key_cache_path ||
+                   !iter->key_cache_path->should_be_locked ||
+                   bpos_cmp(iter->key_cache_path->pos, k->k.p)) {
+                       if (!iter->key_cache_path)
+                               iter->key_cache_path =
+                                       bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                                     BTREE_ITER_INTENT|
+                                                     BTREE_ITER_CACHED, _THIS_IP_);
+
+                       iter->key_cache_path =
+                               bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+                                                       iter->flags & BTREE_ITER_INTENT,
+                                                       _THIS_IP_);
+
+                       ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+                                                      BTREE_ITER_CACHED|
+                                                      BTREE_ITER_CACHED_NOFILL);
+                       if (unlikely(ret))
+                               return ret;
+
+                       ck = (void *) iter->key_cache_path->l[0].b;
+
+                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                               trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
+                               btree_trans_restart(trans);
+                               return -EINTR;
+                       }
+
+                       iter->key_cache_path->should_be_locked = true;
+               }
+
+               path = iter->key_cache_path;
+       }
+
+       return bch2_trans_update_by_path(trans, path, k, flags);
+}
+
 void bch2_trans_commit_hook(struct btree_trans *trans,
                            struct btree_trans_commit_hook *h)
 {
@@ -1071,15 +1497,14 @@ void bch2_trans_commit_hook(struct btree_trans *trans,
 int __bch2_btree_insert(struct btree_trans *trans,
                        enum btree_id id, struct bkey_i *k)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
-                                  BTREE_ITER_INTENT);
-
-       ret   = bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(trans, iter, k, 0);
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+                            BTREE_ITER_INTENT);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, k, 0);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1115,18 +1540,21 @@ int bch2_btree_delete_at(struct btree_trans *trans,
 
 int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
                                  struct bpos start, struct bpos end,
+                                 unsigned iter_flags,
                                  u64 *journal_seq)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
+       bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags);
 retry:
        while ((bch2_trans_begin(trans),
-              (k = bch2_btree_iter_peek(iter)).k) &&
+              (k = bch2_btree_iter_peek(&iter)).k) &&
               !(ret = bkey_err(k)) &&
-              bkey_cmp(iter->pos, end) < 0) {
+              bkey_cmp(iter.pos, end) < 0) {
+               struct disk_reservation disk_res =
+                       bch2_disk_reservation_init(trans->c, 0);
                struct bkey_i delete;
 
                bkey_init(&delete.k);
@@ -1145,9 +1573,9 @@ retry:
                 * (bch2_btree_iter_peek() does guarantee that iter.pos >=
                 * bkey_start_pos(k.k)).
                 */
-               delete.k.p = iter->pos;
+               delete.k.p = iter.pos;
 
-               if (btree_node_type_is_extents(iter->btree_id)) {
+               if (iter.flags & BTREE_ITER_IS_EXTENTS) {
                        unsigned max_sectors =
                                KEY_SIZE_MAX & (~0 << trans->c->block_bits);
 
@@ -1155,18 +1583,17 @@ retry:
                        bch2_key_resize(&delete.k, max_sectors);
                        bch2_cut_back(end, &delete);
 
-                       ret = bch2_extent_trim_atomic(&delete, iter);
+                       ret = bch2_extent_trim_atomic(trans, &iter, &delete);
                        if (ret)
                                break;
                }
 
-               ret   = bch2_trans_update(trans, iter, &delete, 0) ?:
-                       bch2_trans_commit(trans, NULL, journal_seq,
+               ret   = bch2_trans_update(trans, &iter, &delete, 0) ?:
+                       bch2_trans_commit(trans, &disk_res, journal_seq,
                                        BTREE_INSERT_NOFAIL);
+               bch2_disk_reservation_put(trans->c, &disk_res);
                if (ret)
                        break;
-
-               bch2_trans_cond_resched(trans);
        }
 
        if (ret == -EINTR) {
@@ -1174,7 +1601,7 @@ retry:
                goto retry;
        }
 
-       bch2_trans_iter_free(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
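Note the loop shape above: bch2_trans_begin() is the first operand of the comma expression in the while condition, so it runs at the top of every pass, and the -EINTR handler only has to clear ret and jump back to retry. A hedged skeleton of the same idiom; example_scan() and example_process_one() are hypothetical:

	static int example_scan(struct btree_trans *trans,
				struct btree_iter *iter, struct bpos end)
	{
		struct bkey_s_c k;
		int ret = 0;
	retry:
		while ((bch2_trans_begin(trans),
		       (k = bch2_btree_iter_peek(iter)).k) &&
		       !(ret = bkey_err(k)) &&
		       bkey_cmp(iter->pos, end) < 0) {
			/*
			 * Each pass must make progress, e.g. by committing a
			 * deletion so the next peek moves on, as
			 * bch2_btree_delete_range_trans() does above:
			 */
			ret = example_process_one(trans, iter, k);
			if (ret)
				break;
		}

		if (ret == -EINTR) {
			ret = 0;
			goto retry;
		}
		return ret;
	}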
 
@@ -1185,8 +1612,10 @@ retry:
  */
 int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                            struct bpos start, struct bpos end,
+                           unsigned iter_flags,
                            u64 *journal_seq)
 {
        return bch2_trans_do(c, NULL, journal_seq, 0,
-                            bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
+                            bch2_btree_delete_range_trans(&trans, id, start, end,
+                                                          iter_flags, journal_seq));
 }
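Both delete_range variants now take iter_flags, letting callers control iteration behaviour on the btree being cleared. A hedged usage sketch; the btree id, bounds and flags here are illustrative only:

	int example_wipe_xattrs(struct bch_fs *c)
	{
		return bch2_btree_delete_range(c, BTREE_ID_xattrs,
					       POS_MIN, POS_MAX,
					       0, NULL);
	}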
index 76945e50e4b15a4f660ed3080db79fe7d822722c..eb0eaa983dc9f665c3a0c384f49c3a1b1f605160 100644 (file)
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "ec.h"
 #include "error.h"
+#include "inode.h"
 #include "movinggc.h"
+#include "recovery.h"
 #include "reflink.h"
 #include "replicas.h"
+#include "subvolume.h"
 
 #include <linux/preempt.h>
 #include <trace/events/bcachefs.h>
@@ -40,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
        }
 }
 
-/*
- * Clear journal_seq_valid for buckets for which it's not needed, to prevent
- * wraparound:
- */
-void bch2_bucket_seq_cleanup(struct bch_fs *c)
-{
-       u64 journal_seq = atomic64_read(&c->journal.seq);
-       u16 last_seq_ondisk = c->journal.last_seq_ondisk;
-       struct bch_dev *ca;
-       struct bucket_array *buckets;
-       struct bucket *g;
-       struct bucket_mark m;
-       unsigned i;
-
-       if (journal_seq - c->last_bucket_seq_cleanup <
-           (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
-               return;
-
-       c->last_bucket_seq_cleanup = journal_seq;
-
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets) {
-                       bucket_cmpxchg(g, m, ({
-                               if (!m.journal_seq_valid ||
-                                   bucket_needs_journal_commit(m, last_seq_ondisk))
-                                       break;
-
-                               m.journal_seq_valid = 0;
-                       }));
-               }
-               up_read(&ca->bucket_lock);
-       }
-}
-
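For context on the deletion above: bucket_mark.journal_seq was only BUCKET_JOURNAL_SEQ_BITS wide, so bch2_bucket_seq_cleanup() had to periodically clear journal_seq_valid before the truncated sequence numbers could wrap; the buckets_waiting_for_journal table added elsewhere in this snapshot tracks full 64-bit sequence numbers instead. A hedged arithmetic note, assuming BUCKET_JOURNAL_SEQ_BITS == 16:

	/* the removed guard ran at most once per quarter wrap period */
	unsigned threshold = 1U << (16 - 2);	/* 16384 journal entries */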
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
        struct bch_fs_usage *usage;
@@ -114,6 +81,8 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
                                                  unsigned journal_seq,
                                                  bool gc)
 {
+       BUG_ON(!gc && !journal_seq);
+
        return this_cpu_ptr(gc
                            ? ca->usage_gc
                            : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
@@ -139,6 +108,9 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
                                                unsigned journal_seq,
                                                bool gc)
 {
+       percpu_rwsem_assert_held(&c->mark_lock);
+       BUG_ON(!gc && !journal_seq);
+
        return this_cpu_ptr(gc
                            ? c->usage_gc
                            : c->usage[journal_seq & JOURNAL_BUF_MASK]);
@@ -315,8 +287,8 @@ static inline int is_unavailable_bucket(struct bucket_mark m)
 static inline int bucket_sectors_fragmented(struct bch_dev *ca,
                                            struct bucket_mark m)
 {
-       return bucket_sectors_used(m)
-               ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
+       return m.dirty_sectors
+               ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors)
                : 0;
 }
 
@@ -332,13 +304,6 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
                : m.data_type;
 }
 
-static bool bucket_became_unavailable(struct bucket_mark old,
-                                     struct bucket_mark new)
-{
-       return is_available_bucket(old) &&
-              !is_available_bucket(new);
-}
-
 static inline void account_bucket(struct bch_fs_usage *fs_usage,
                                  struct bch_dev_usage *dev_usage,
                                  enum bch_data_type type,
@@ -357,8 +322,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
        struct bch_fs_usage *fs_usage;
        struct bch_dev_usage *u;
 
-       percpu_rwsem_assert_held(&c->mark_lock);
-
        preempt_disable();
        fs_usage = fs_usage_ptr(c, journal_seq, gc);
        u = dev_usage_ptr(ca, journal_seq, gc);
@@ -404,25 +367,48 @@ static inline int __update_replicas(struct bch_fs *c,
        return 0;
 }
 
-static inline int update_replicas(struct bch_fs *c,
+static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
                        struct bch_replicas_entry *r, s64 sectors,
                        unsigned journal_seq, bool gc)
 {
        struct bch_fs_usage __percpu *fs_usage;
-       int idx = bch2_replicas_entry_idx(c, r);
+       int idx, ret = 0;
+       char buf[200];
 
-       if (idx < 0)
-               return -1;
+       percpu_down_read(&c->mark_lock);
+
+       idx = bch2_replicas_entry_idx(c, r);
+       if (idx < 0 &&
+           (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+            fsck_err(c, "no replicas entry\n"
+                     "  while marking %s",
+                     (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) {
+               percpu_up_read(&c->mark_lock);
+               ret = bch2_mark_replicas(c, r);
+               if (ret)
+                       return ret;
+
+               percpu_down_read(&c->mark_lock);
+               idx = bch2_replicas_entry_idx(c, r);
+       }
+       if (idx < 0) {
+               ret = -1;
+               goto err;
+       }
 
        preempt_disable();
        fs_usage = fs_usage_ptr(c, journal_seq, gc);
        fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
        fs_usage->replicas[idx]         += sectors;
        preempt_enable();
-       return 0;
+err:
+fsck_err:
+       percpu_up_read(&c->mark_lock);
+       return ret;
 }
 
 static inline int update_cached_sectors(struct bch_fs *c,
+                       struct bkey_s_c k,
                        unsigned dev, s64 sectors,
                        unsigned journal_seq, bool gc)
 {
@@ -430,7 +416,7 @@ static inline int update_cached_sectors(struct bch_fs *c,
 
        bch2_replicas_entry_cached(&r.e, dev);
 
-       return update_replicas(c, &r.e, sectors, journal_seq, gc);
+       return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
 }
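update_replicas() above now drops c->mark_lock to call bch2_mark_replicas() (which may allocate and grow the replicas table), then re-looks the entry up after retaking the lock, because the index computed earlier is stale. A generic, self-contained restatement of that pattern; all names here are illustrative:

	#include <pthread.h>

	static int lookup_or_insert(pthread_rwlock_t *lock,
				    int (*lookup)(void), int (*insert)(void))
	{
		int idx;

		pthread_rwlock_rdlock(lock);
		idx = lookup();
		if (idx < 0) {
			pthread_rwlock_unlock(lock);
			if (insert())		/* may reallocate the table */
				return -1;
			pthread_rwlock_rdlock(lock);
			idx = lookup();		/* stale index: re-derive under the lock */
		}
		pthread_rwlock_unlock(lock);
		return idx;
	}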
 
 static struct replicas_delta_list *
@@ -496,19 +482,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
        update_replicas_list(trans, &r.e, sectors);
 }
 
-#define do_mark_fn(fn, c, pos, flags, ...)                             \
-({                                                                     \
-       int gc, ret = 0;                                                \
-                                                                       \
-       percpu_rwsem_assert_held(&c->mark_lock);                        \
-                                                                       \
-       for (gc = 0; gc < 2 && !ret; gc++)                              \
-               if (!gc == !(flags & BTREE_TRIGGER_GC) ||               \
-                   (gc && gc_visited(c, pos)))                         \
-                       ret = fn(c, __VA_ARGS__, gc);                   \
-       ret;                                                            \
-})
-
 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, bool owned_by_allocator)
 {
@@ -522,20 +495,19 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
        BUG_ON(owned_by_allocator == old.owned_by_allocator);
 }
 
-static int bch2_mark_alloc(struct bch_fs *c,
+static int bch2_mark_alloc(struct btree_trans *trans,
                           struct bkey_s_c old, struct bkey_s_c new,
-                          u64 journal_seq, unsigned flags)
+                          unsigned flags)
 {
        bool gc = flags & BTREE_TRIGGER_GC;
-       struct bkey_alloc_unpacked u;
+       u64 journal_seq = trans->journal_res.seq;
+       struct bch_fs *c = trans->c;
+       struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
+       struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new);
        struct bch_dev *ca;
        struct bucket *g;
        struct bucket_mark old_m, m;
-
-       /* We don't do anything for deletions - do we?: */
-       if (new.k->type != KEY_TYPE_alloc &&
-           new.k->type != KEY_TYPE_alloc_v2)
-               return 0;
+       int ret = 0;
 
        /*
         * alloc btree is read in by bch2_alloc_read, not gc:
@@ -544,35 +516,66 @@ static int bch2_mark_alloc(struct bch_fs *c,
            !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
                return 0;
 
-       ca = bch_dev_bkey_exists(c, new.k->p.inode);
+       if ((flags & BTREE_TRIGGER_INSERT) &&
+           !old_u.data_type != !new_u.data_type &&
+           new.k->type == KEY_TYPE_alloc_v3) {
+               struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
+               u64 old_journal_seq = le64_to_cpu(v->journal_seq);
+
+               BUG_ON(!journal_seq);
 
-       if (new.k->p.offset >= ca->mi.nbuckets)
+               /*
+                * If the btree updates referring to a bucket weren't flushed
+                * before the bucket became empty again, then we don't have
+                * to wait on a journal flush before we can reuse the bucket:
+                */
+               new_u.journal_seq = !new_u.data_type &&
+                       (journal_seq == old_journal_seq ||
+                        bch2_journal_noflush_seq(&c->journal, old_journal_seq))
+                       ? 0 : journal_seq;
+               v->journal_seq = cpu_to_le64(new_u.journal_seq);
+       }
+
+       if (old_u.data_type && !new_u.data_type && new_u.journal_seq) {
+               ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+                               c->journal.flushed_seq_ondisk,
+                               new_u.dev, new_u.bucket,
+                               new_u.journal_seq);
+               if (ret) {
+                       bch2_fs_fatal_error(c,
+                               "error setting bucket_needs_journal_commit: %i", ret);
+                       return ret;
+               }
+       }
+
+       ca = bch_dev_bkey_exists(c, new_u.dev);
+
+       if (new_u.bucket >= ca->mi.nbuckets)
                return 0;
 
-       g = __bucket(ca, new.k->p.offset, gc);
-       u = bch2_alloc_unpack(new);
+       percpu_down_read(&c->mark_lock);
+       if (!gc && new_u.gen != old_u.gen)
+               *bucket_gen(ca, new_u.bucket) = new_u.gen;
+
+       g = __bucket(ca, new_u.bucket, gc);
 
        old_m = bucket_cmpxchg(g, m, ({
-               m.gen                   = u.gen;
-               m.data_type             = u.data_type;
-               m.dirty_sectors         = u.dirty_sectors;
-               m.cached_sectors        = u.cached_sectors;
-               m.stripe                = u.stripe != 0;
-
-               if (journal_seq) {
-                       m.journal_seq_valid     = 1;
-                       m.journal_seq           = journal_seq;
-               }
+               m.gen                   = new_u.gen;
+               m.data_type             = new_u.data_type;
+               m.dirty_sectors         = new_u.dirty_sectors;
+               m.cached_sectors        = new_u.cached_sectors;
+               m.stripe                = new_u.stripe != 0;
        }));
 
        bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
 
-       g->io_time[READ]        = u.read_time;
-       g->io_time[WRITE]       = u.write_time;
-       g->oldest_gen           = u.oldest_gen;
+       g->io_time[READ]        = new_u.read_time;
+       g->io_time[WRITE]       = new_u.write_time;
+       g->oldest_gen           = new_u.oldest_gen;
        g->gen_valid            = 1;
-       g->stripe               = u.stripe;
-       g->stripe_redundancy    = u.stripe_redundancy;
+       g->stripe               = new_u.stripe;
+       g->stripe_redundancy    = new_u.stripe_redundancy;
+       percpu_up_read(&c->mark_lock);
 
        /*
         * need to know if we're getting called from the invalidate path or
@@ -581,13 +584,15 @@ static int bch2_mark_alloc(struct bch_fs *c,
 
        if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
            old_m.cached_sectors) {
-               if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors,
-                                         journal_seq, gc)) {
+               ret = update_cached_sectors(c, new, ca->dev_idx,
+                                           -old_m.cached_sectors,
+                                           journal_seq, gc);
+               if (ret) {
                        bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
-                       return -1;
+                       return ret;
                }
 
-               trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+               trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
                                 old_m.cached_sectors);
        }
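The comment above (in bch2_mark_alloc()) encodes when an emptied bucket may be reused without waiting on a journal flush. A hedged restatement as a standalone helper; the names are local to this sketch:

	#include <stdint.h>

	/* returns the journal seq the bucket must wait on; 0 means none */
	static uint64_t bucket_reuse_seq(int now_empty, uint64_t cur_seq,
					 uint64_t old_seq, int noflush)
	{
		if (now_empty && (cur_seq == old_seq || noflush))
			return 0;	/* its updates never hit a flushed entry */
		return cur_seq;		/* recorded via bch2_set_bucket_needs_journal_commit() */
	}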
 
@@ -604,17 +609,27 @@ static int bch2_mark_alloc(struct bch_fs *c,
        overflow;                                               \
 })
 
-static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                      size_t b, enum bch_data_type data_type,
-                                      unsigned sectors, bool gc)
+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+                              size_t b, enum bch_data_type data_type,
+                              unsigned sectors, struct gc_pos pos,
+                              unsigned flags)
 {
-       struct bucket *g = __bucket(ca, b, gc);
+       struct bucket *g;
        struct bucket_mark old, new;
        bool overflow;
 
+       BUG_ON(!(flags & BTREE_TRIGGER_GC));
        BUG_ON(data_type != BCH_DATA_sb &&
               data_type != BCH_DATA_journal);
 
+       /*
+        * Backup superblock might be past the end of our normal usable space:
+        */
+       if (b >= ca->mi.nbuckets)
+               return;
+
+       percpu_down_read(&c->mark_lock);
+       g = gc_bucket(ca, b);
        old = bucket_cmpxchg(g, new, ({
                new.data_type   = data_type;
                overflow = checked_add(new.dirty_sectors, sectors);
@@ -632,88 +647,70 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                bch2_data_types[old.data_type ?: data_type],
                old.dirty_sectors, sectors);
 
-       if (c)
-               bch2_dev_usage_update(c, ca, old, new, 0, gc);
-
-       return 0;
-}
-
-void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
-                              size_t b, enum bch_data_type type,
-                              unsigned sectors, struct gc_pos pos,
-                              unsigned flags)
-{
-       BUG_ON(type != BCH_DATA_sb &&
-              type != BCH_DATA_journal);
-
-       /*
-        * Backup superblock might be past the end of our normal usable space:
-        */
-       if (b >= ca->mi.nbuckets)
-               return;
-
-       if (likely(c)) {
-               do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
-                          ca, b, type, sectors);
-       } else {
-               __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
-       }
+       bch2_dev_usage_update(c, ca, old, new, 0, true);
+       percpu_up_read(&c->mark_lock);
 }
 
 static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p)
 {
-       return p.crc.compression_type
-               ? DIV_ROUND_UP(sectors * p.crc.compressed_size,
+       EBUG_ON(sectors < 0);
+
+       return p.crc.compression_type &&
+               p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible
+               ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size,
                               p.crc.uncompressed_size)
                : sectors;
 }
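ptr_disk_sectors() above charges a compressed extent for its on-disk footprint by scaling the live sectors by compressed_size/uncompressed_size, rounding up. A self-contained worked example with illustrative numbers:

	#include <stdio.h>

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	int main(void)
	{
		unsigned long long sectors = 120;	/* live sectors in the key */
		unsigned long long compressed = 32, uncompressed = 128;

		/* mirrors the compressed branch of ptr_disk_sectors() */
		printf("%llu\n", DIV_ROUND_UP(sectors * compressed, uncompressed));
		/* prints 30 */
		return 0;
	}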
 
-static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
+static int check_bucket_ref(struct bch_fs *c,
+                           struct bkey_s_c k,
                            const struct bch_extent_ptr *ptr,
                            s64 sectors, enum bch_data_type ptr_data_type,
-                           u8 bucket_gen, u8 bucket_data_type,
+                           u8 b_gen, u8 bucket_data_type,
                            u16 dirty_sectors, u16 cached_sectors)
 {
-       size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+       size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
        u16 bucket_sectors = !ptr->cached
                ? dirty_sectors
                : cached_sectors;
        char buf[200];
 
-       if (gen_after(ptr->gen, bucket_gen)) {
+       if (gen_after(ptr->gen, b_gen)) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
                        "while marking %s",
-                       ptr->dev, bucket_nr, bucket_gen,
+                       ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
                        ptr->gen,
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
                return -EIO;
        }
 
-       if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+       if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
                        "while marking %s",
-                       ptr->dev, bucket_nr, bucket_gen,
+                       ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
                        ptr->gen,
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
                return -EIO;
        }
 
-       if (bucket_gen != ptr->gen && !ptr->cached) {
+       if (b_gen != ptr->gen && !ptr->cached) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                       "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
+                       "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n"
                        "while marking %s",
-                       ptr->dev, bucket_nr, bucket_gen,
+                       ptr->dev, bucket_nr, b_gen,
+                       *bucket_gen(ca, bucket_nr),
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
                        ptr->gen,
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
                return -EIO;
        }
 
-       if (bucket_gen != ptr->gen)
+       if (b_gen != ptr->gen)
                return 1;
 
        if (bucket_data_type && ptr_data_type &&
@@ -721,7 +718,7 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
-                       ptr->dev, bucket_nr, bucket_gen,
+                       ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type],
                        bch2_data_types[ptr_data_type],
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
@@ -732,7 +729,7 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
                        "while marking %s",
-                       ptr->dev, bucket_nr, bucket_gen,
+                       ptr->dev, bucket_nr, b_gen,
                        bch2_data_types[bucket_data_type ?: ptr_data_type],
                        bucket_sectors, sectors,
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
@@ -742,54 +739,68 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
        return 0;
 }
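check_bucket_ref() above compares 8-bit generation numbers with gen_after()/gen_cmp(), which are assumed to do modular arithmetic ((s8)(a - b)) so ordering survives u8 wraparound while the distance stays under 128. A self-contained illustration of that assumption:

	#include <assert.h>
	#include <stdint.h>

	static int gen_cmp(uint8_t a, uint8_t b)
	{
		return (int8_t) (a - b);	/* modular: tolerant of u8 wraparound */
	}

	int main(void)
	{
		assert(gen_cmp(2, 250) > 0);	/* 2 is "after" 250 across the wrap */
		assert(gen_cmp(250, 2) < 0);
		return 0;
	}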
 
-static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
-                            unsigned ptr_idx,
-                            u64 journal_seq, unsigned flags)
+static int mark_stripe_bucket(struct btree_trans *trans,
+                             struct bkey_s_c k,
+                             unsigned ptr_idx,
+                             unsigned flags)
 {
+       struct bch_fs *c = trans->c;
+       u64 journal_seq = trans->journal_res.seq;
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned nr_data = s->nr_blocks - s->nr_redundant;
        bool parity = ptr_idx >= nr_data;
+       enum bch_data_type data_type = parity ? BCH_DATA_parity : 0;
+       s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
        const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
-       bool gc = flags & BTREE_TRIGGER_GC;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-       struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+       struct bucket *g;
        struct bucket_mark new, old;
        char buf[200];
-       int ret;
+       int ret = 0;
+
+       BUG_ON(!(flags & BTREE_TRIGGER_GC));
 
-       if (g->stripe && g->stripe != k.k->p.offset) {
+       /* XXX: doesn't handle deletion */
+
+       percpu_down_read(&c->mark_lock);
+       g = PTR_GC_BUCKET(ca, ptr);
+
+       if (g->mark.dirty_sectors ||
+           (g->stripe && g->stripe != k.k->p.offset)) {
                bch2_fs_inconsistent(c,
                              "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
                              ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
                              (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-               return -EINVAL;
+               ret = -EINVAL;
+               goto err;
        }
 
        old = bucket_cmpxchg(g, new, ({
-               ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
+               ret = check_bucket_ref(c, k, ptr, sectors, data_type,
+                                      new.gen, new.data_type,
                                       new.dirty_sectors, new.cached_sectors);
                if (ret)
-                       return ret;
+                       goto err;
 
-               if (parity) {
-                       new.data_type           = BCH_DATA_parity;
-                       new.dirty_sectors       = le16_to_cpu(s->sectors);
-               }
+               new.dirty_sectors += sectors;
+               if (data_type)
+                       new.data_type           = data_type;
 
-               if (journal_seq) {
-                       new.journal_seq_valid   = 1;
-                       new.journal_seq         = journal_seq;
-               }
+               new.stripe = true;
        }));
 
        g->stripe               = k.k->p.offset;
        g->stripe_redundancy    = s->nr_redundant;
 
-       bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
+       bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+err:
+       percpu_up_read(&c->mark_lock);
+
        return ret;
 }
 
-static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+static int __mark_pointer(struct btree_trans *trans,
+                         struct bkey_s_c k,
                          const struct bch_extent_ptr *ptr,
                          s64 sectors, enum bch_data_type ptr_data_type,
                          u8 bucket_gen, u8 *bucket_data_type,
@@ -798,7 +809,7 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
        u16 *dst_sectors = !ptr->cached
                ? dirty_sectors
                : cached_sectors;
-       int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
+       int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type,
                                   bucket_gen, *bucket_data_type,
                                   *dirty_sectors, *cached_sectors);
 
@@ -811,38 +822,41 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
        return 0;
 }
 
-static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_mark_pointer(struct btree_trans *trans,
+                            struct bkey_s_c k,
                             struct extent_ptr_decoded p,
                             s64 sectors, enum bch_data_type data_type,
-                            u64 journal_seq, unsigned flags)
+                            unsigned flags)
 {
-       bool gc = flags & BTREE_TRIGGER_GC;
+       u64 journal_seq = trans->journal_res.seq;
+       struct bch_fs *c = trans->c;
        struct bucket_mark old, new;
        struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-       struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
+       struct bucket *g;
        u8 bucket_data_type;
        u64 v;
-       int ret;
+       int ret = 0;
+
+       BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+       percpu_down_read(&c->mark_lock);
+       g = PTR_GC_BUCKET(ca, &p.ptr);
 
        v = atomic64_read(&g->_mark.v);
        do {
                new.v.counter = old.v.counter = v;
                bucket_data_type = new.data_type;
 
-               ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
+               ret = __mark_pointer(trans, k, &p.ptr, sectors,
+                                    data_type, new.gen,
                                     &bucket_data_type,
                                     &new.dirty_sectors,
                                     &new.cached_sectors);
                if (ret)
-                       return ret;
+                       goto err;
 
                new.data_type = bucket_data_type;
 
-               if (journal_seq) {
-                       new.journal_seq_valid = 1;
-                       new.journal_seq = journal_seq;
-               }
-
                if (flags & BTREE_TRIGGER_NOATOMIC) {
                        g->_mark = new;
                        break;
@@ -851,25 +865,32 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
                              old.v.counter,
                              new.v.counter)) != old.v.counter);
 
-       bch2_dev_usage_update(c, ca, old, new, journal_seq, gc);
-
-       BUG_ON(!gc && bucket_became_unavailable(old, new));
+       bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+err:
+       percpu_up_read(&c->mark_lock);
 
-       return 0;
+       return ret;
 }
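bch2_mark_pointer() above updates the packed 64-bit bucket_mark with a read/modify/compare-exchange loop. A generic, self-contained sketch of the same idiom in C11 atomics:

	#include <stdatomic.h>
	#include <stdint.h>

	static void add_sectors(_Atomic uint64_t *mark, uint16_t sectors)
	{
		uint64_t old = atomic_load(mark), new;

		do {
			/* stand-in for unpacking, editing and repacking the mark */
			new = old + sectors;
		} while (!atomic_compare_exchange_weak(mark, &old, new));
		/* on failure, old is reloaded and the loop recomputes new */
	}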
 
-static int bch2_mark_stripe_ptr(struct bch_fs *c,
+static int bch2_mark_stripe_ptr(struct btree_trans *trans,
+                               struct bkey_s_c k,
                                struct bch_extent_stripe_ptr p,
                                enum bch_data_type data_type,
                                s64 sectors,
-                               unsigned journal_seq, unsigned flags)
+                               unsigned flags)
 {
-       bool gc = flags & BTREE_TRIGGER_GC;
+       struct bch_fs *c = trans->c;
        struct bch_replicas_padded r;
-       struct stripe *m;
-       unsigned i, blocks_nonempty = 0;
+       struct gc_stripe *m;
 
-       m = genradix_ptr(&c->stripes[gc], p.idx);
+       BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+       m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
+       if (!m) {
+               bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+                       (u64) p.idx);
+               return -ENOMEM;
+       }
 
        spin_lock(&c->ec_stripes_heap_lock);
 
@@ -884,30 +905,21 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
        m->block_sectors[p.block] += sectors;
 
        r = m->r;
-
-       for (i = 0; i < m->nr_blocks; i++)
-               blocks_nonempty += m->block_sectors[i] != 0;
-
-       if (m->blocks_nonempty != blocks_nonempty) {
-               m->blocks_nonempty = blocks_nonempty;
-               if (!gc)
-                       bch2_stripes_heap_update(c, m, p.idx);
-       }
-
        spin_unlock(&c->ec_stripes_heap_lock);
 
        r.e.data_type = data_type;
-       update_replicas(c, &r.e, sectors, journal_seq, gc);
+       update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
 
        return 0;
 }
 
-static int bch2_mark_extent(struct bch_fs *c,
+static int bch2_mark_extent(struct btree_trans *trans,
                            struct bkey_s_c old, struct bkey_s_c new,
-                           unsigned journal_seq, unsigned flags)
+                           unsigned flags)
 {
-       bool gc = flags & BTREE_TRIGGER_GC;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+       u64 journal_seq = trans->journal_res.seq;
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
@@ -916,17 +928,13 @@ static int bch2_mark_extent(struct bch_fs *c,
                ? BCH_DATA_btree
                : BCH_DATA_user;
        s64 sectors = bkey_is_btree_ptr(k.k)
-               ? c->opts.btree_node_size
+               ? btree_sectors(c)
                : k.k->size;
        s64 dirty_sectors = 0;
        bool stale;
        int ret;
 
-       BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
-              (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
-
-       if (flags & BTREE_TRIGGER_OVERWRITE)
-               sectors = -sectors;
+       BUG_ON(!(flags & BTREE_TRIGGER_GC));
 
        r.e.data_type   = data_type;
        r.e.nr_devs     = 0;
@@ -935,27 +943,31 @@ static int bch2_mark_extent(struct bch_fs *c,
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                s64 disk_sectors = ptr_disk_sectors(sectors, p);
 
-               ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type,
-                                       journal_seq, flags);
+               if (flags & BTREE_TRIGGER_OVERWRITE)
+                       disk_sectors = -disk_sectors;
+
+               ret = bch2_mark_pointer(trans, k, p, disk_sectors,
+                                       data_type, flags);
                if (ret < 0)
                        return ret;
 
                stale = ret > 0;
 
                if (p.ptr.cached) {
-                       if (!stale)
-                               if (update_cached_sectors(c, p.ptr.dev, disk_sectors,
-                                                         journal_seq, gc)) {
+                       if (!stale) {
+                               ret = update_cached_sectors(c, k, p.ptr.dev,
+                                               disk_sectors, journal_seq, true);
+                               if (ret) {
                                        bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
-                                       return -1;
-
+                                       return ret;
                                }
+                       }
                } else if (!p.has_ec) {
                        dirty_sectors          += disk_sectors;
                        r.e.devs[r.e.nr_devs++] = p.ptr.dev;
                } else {
-                       ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
-                                       disk_sectors, journal_seq, flags);
+                       ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
+                                       disk_sectors, flags);
                        if (ret)
                                return ret;
 
@@ -969,182 +981,245 @@ static int bch2_mark_extent(struct bch_fs *c,
        }
 
        if (r.e.nr_devs) {
-               if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) {
+               ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
+               if (ret) {
                        char buf[200];
 
                        bch2_bkey_val_to_text(&PBUF(buf), c, k);
                        bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
-                       return -1;
+                       return ret;
                }
        }
 
        return 0;
 }
 
-static int bch2_mark_stripe(struct bch_fs *c,
-                       struct bkey_s_c old, struct bkey_s_c new,
-                       u64 journal_seq, unsigned flags)
+static int bch2_mark_stripe(struct btree_trans *trans,
+                           struct bkey_s_c old, struct bkey_s_c new,
+                           unsigned flags)
 {
        bool gc = flags & BTREE_TRIGGER_GC;
-       size_t idx = new.k->p.offset;
+       u64 journal_seq = trans->journal_res.seq;
+       struct bch_fs *c = trans->c;
+       u64 idx = new.k->p.offset;
        const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
                ? bkey_s_c_to_stripe(old).v : NULL;
        const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
                ? bkey_s_c_to_stripe(new).v : NULL;
-       struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
        unsigned i;
        int ret;
 
        BUG_ON(gc && old_s);
 
-       if (!m || (old_s && !m->alive)) {
-               bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
-                                   idx);
-               bch2_inconsistent_error(c);
-               return -1;
-       }
+       if (!gc) {
+               struct stripe *m = genradix_ptr(&c->stripes, idx);
 
-       if (!new_s) {
-               spin_lock(&c->ec_stripes_heap_lock);
-               bch2_stripes_heap_del(c, m, idx);
-               spin_unlock(&c->ec_stripes_heap_lock);
+               if (!m || (old_s && !m->alive)) {
+                       char buf1[200], buf2[200];
 
-               memset(m, 0, sizeof(*m));
+                       bch2_bkey_val_to_text(&PBUF(buf1), c, old);
+                       bch2_bkey_val_to_text(&PBUF(buf2), c, new);
+                       bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+                                           "old %s\n"
+                                           "new %s", idx, buf1, buf2);
+                       bch2_inconsistent_error(c);
+                       return -1;
+               }
+
+               if (!new_s) {
+                       spin_lock(&c->ec_stripes_heap_lock);
+                       bch2_stripes_heap_del(c, m, idx);
+                       spin_unlock(&c->ec_stripes_heap_lock);
+
+                       memset(m, 0, sizeof(*m));
+               } else {
+                       m->alive        = true;
+                       m->sectors      = le16_to_cpu(new_s->sectors);
+                       m->algorithm    = new_s->algorithm;
+                       m->nr_blocks    = new_s->nr_blocks;
+                       m->nr_redundant = new_s->nr_redundant;
+                       m->blocks_nonempty = 0;
+
+                       for (i = 0; i < new_s->nr_blocks; i++)
+                               m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+
+                       spin_lock(&c->ec_stripes_heap_lock);
+                       bch2_stripes_heap_update(c, m, idx);
+                       spin_unlock(&c->ec_stripes_heap_lock);
+               }
        } else {
+               struct gc_stripe *m =
+                       genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+               if (!m) {
+                       bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+                               idx);
+                       return -ENOMEM;
+               }
+               /*
+                * This will be wrong when we bring back runtime gc: we should
+                * be unmarking the old key and then marking the new key
+                */
                m->alive        = true;
                m->sectors      = le16_to_cpu(new_s->sectors);
-               m->algorithm    = new_s->algorithm;
                m->nr_blocks    = new_s->nr_blocks;
                m->nr_redundant = new_s->nr_redundant;
-               m->blocks_nonempty = 0;
-
-               for (i = 0; i < new_s->nr_blocks; i++) {
-                       m->block_sectors[i] =
-                               stripe_blockcount_get(new_s, i);
-                       m->blocks_nonempty += !!m->block_sectors[i];
 
+               for (i = 0; i < new_s->nr_blocks; i++)
                        m->ptrs[i] = new_s->ptrs[i];
-               }
 
                bch2_bkey_to_replicas(&m->r.e, new);
 
-               if (!gc) {
-                       spin_lock(&c->ec_stripes_heap_lock);
-                       bch2_stripes_heap_update(c, m, idx);
-                       spin_unlock(&c->ec_stripes_heap_lock);
-               }
-       }
-
-       if (gc) {
                /*
                 * gc recalculates this field from stripe ptr
                 * references:
                 */
                memset(m->block_sectors, 0, sizeof(m->block_sectors));
-               m->blocks_nonempty = 0;
 
                for (i = 0; i < new_s->nr_blocks; i++) {
-                       ret = mark_stripe_bucket(c, new, i, journal_seq, flags);
+                       ret = mark_stripe_bucket(trans, new, i, flags);
                        if (ret)
                                return ret;
                }
 
-               if (update_replicas(c, &m->r.e,
-                                   ((s64) m->sectors * m->nr_redundant),
-                                   journal_seq, gc)) {
+               ret = update_replicas(c, new, &m->r.e,
+                                     ((s64) m->sectors * m->nr_redundant),
+                                     journal_seq, gc);
+               if (ret) {
                        char buf[200];
 
                        bch2_bkey_val_to_text(&PBUF(buf), c, new);
                        bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
-                       return -1;
+                       return ret;
                }
        }
 
        return 0;
 }
 
-static int bch2_mark_inode(struct bch_fs *c,
-                       struct bkey_s_c old, struct bkey_s_c new,
-                       u64 journal_seq, unsigned flags)
+static int bch2_mark_inode(struct btree_trans *trans,
+                          struct bkey_s_c old, struct bkey_s_c new,
+                          unsigned flags)
 {
+       struct bch_fs *c = trans->c;
        struct bch_fs_usage __percpu *fs_usage;
+       u64 journal_seq = trans->journal_res.seq;
 
-       preempt_disable();
-       fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
-       fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
-       fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
-       preempt_enable();
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v;
+
+               BUG_ON(!journal_seq);
+               BUG_ON(new.k->type != KEY_TYPE_inode_v2);
+
+               v->bi_journal_seq = cpu_to_le64(journal_seq);
+       }
+
+       if (flags & BTREE_TRIGGER_GC) {
+               percpu_down_read(&c->mark_lock);
+               preempt_disable();
+
+               fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+               fs_usage->nr_inodes += bkey_is_inode(new.k);
+               fs_usage->nr_inodes -= bkey_is_inode(old.k);
+
+               preempt_enable();
+               percpu_up_read(&c->mark_lock);
+       }
        return 0;
 }
 
-static int bch2_mark_reservation(struct bch_fs *c,
-                       struct bkey_s_c old, struct bkey_s_c new,
-                       u64 journal_seq, unsigned flags)
+static int bch2_mark_reservation(struct btree_trans *trans,
+                                struct bkey_s_c old, struct bkey_s_c new,
+                                unsigned flags)
 {
-       struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bch_fs_usage __percpu *fs_usage;
        unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
        s64 sectors = (s64) k.k->size;
 
+       BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
        if (flags & BTREE_TRIGGER_OVERWRITE)
                sectors = -sectors;
        sectors *= replicas;
 
+       percpu_down_read(&c->mark_lock);
        preempt_disable();
-       fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
+
+       fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
        replicas = clamp_t(unsigned, replicas, 1,
                           ARRAY_SIZE(fs_usage->persistent_reserved));
 
        fs_usage->reserved                              += sectors;
        fs_usage->persistent_reserved[replicas - 1]     += sectors;
+
        preempt_enable();
+       percpu_up_read(&c->mark_lock);
 
        return 0;
 }
 
 static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p,
-                                u64 idx, unsigned flags, size_t *r_idx)
+                                u64 *idx, unsigned flags, size_t r_idx)
 {
        struct reflink_gc *r;
        int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+       s64 ret = 0;
 
-       while (1) {
-               if (*r_idx >= c->reflink_gc_nr)
-                       goto not_found;
-               r = genradix_ptr(&c->reflink_gc_table, *r_idx);
-               BUG_ON(!r);
+       if (r_idx >= c->reflink_gc_nr)
+               goto not_found;
 
-               if (idx < r->offset)
-                       break;
-               (*r_idx)++;
-       }
+       r = genradix_ptr(&c->reflink_gc_table, r_idx);
+       if (*idx < r->offset - r->size)
+               goto not_found;
 
        BUG_ON((s64) r->refcount + add < 0);
 
        r->refcount += add;
-       return r->offset - idx;
+       *idx = r->offset;
+       return 0;
 not_found:
-       bch2_fs_inconsistent(c,
-               "%llu:%llu len %u points to nonexistent indirect extent %llu",
-               p.k->p.inode, p.k->p.offset, p.k->size, idx);
-       bch2_inconsistent_error(c);
-       return -EIO;
+       *idx = U64_MAX;
+       ret = -EIO;
+
+       /*
+        * XXX: we're replacing the entire reflink pointer with an error
+        * key, we should just be replacing the part that was missing:
+        */
+       if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu",
+                    p.k->p.inode, p.k->p.offset, p.k->size, *idx)) {
+               struct bkey_i_error new;
+
+               bkey_init(&new.k);
+               new.k.type      = KEY_TYPE_error;
+               new.k.p         = p.k->p;
+               new.k.size      = p.k->size;
+               ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i);
+       }
+fsck_err:
+       return ret;
 }
 
-static int bch2_mark_reflink_p(struct bch_fs *c,
-                       struct bkey_s_c old, struct bkey_s_c new,
-                       u64 journal_seq, unsigned flags)
+static int bch2_mark_reflink_p(struct btree_trans *trans,
+                              struct bkey_s_c old, struct bkey_s_c new,
+                              unsigned flags)
 {
-       struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
        struct reflink_gc *ref;
        size_t l, r, m;
        u64 idx = le64_to_cpu(p.v->idx);
-       unsigned sectors = p.k->size;
-       s64 ret = 0;
+       u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+       int ret = 0;
 
-       BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
-              (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
+       BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+       if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) {
+               idx -= le32_to_cpu(p.v->front_pad);
+               end += le32_to_cpu(p.v->back_pad);
+       }
 
        l = 0;
        r = c->reflink_gc_nr;
@@ -1158,107 +1233,72 @@ static int bch2_mark_reflink_p(struct bch_fs *c,
                        r = m;
        }
 
-       while (sectors) {
-               ret = __bch2_mark_reflink_p(c, p, idx, flags, &l);
-               if (ret < 0)
-                       return ret;
+       while (idx < end && !ret)
+               ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++);
 
-               ret = min_t(s64, ret, sectors);
-               idx     += ret;
-               sectors -= ret;
-       }
-
-       return 0;
+       return ret;
 }
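bch2_mark_reflink_p() above locates the first reflink_gc entry covering idx with a lower-bound binary search over the table (sorted by entry offset), then walks forward one entry per __bch2_mark_reflink_p() call. A self-contained sketch of the search invariant; the predicate here is illustrative, the real code compares against r->offset:

	#include <stddef.h>

	/* first i with offsets[i] > idx, or nr if none */
	static size_t lower_bound(const unsigned long long *offsets, size_t nr,
				  unsigned long long idx)
	{
		size_t l = 0, r = nr, m;

		while (l < r) {
			m = l + (r - l) / 2;
			if (offsets[m] <= idx)
				l = m + 1;
			else
				r = m;
		}
		return l;
	}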
 
-static int bch2_mark_key_locked(struct bch_fs *c,
-                  struct bkey_s_c old,
-                  struct bkey_s_c new,
-                  u64 journal_seq, unsigned flags)
+int bch2_mark_key(struct btree_trans *trans,
+                 struct bkey_s_c old,
+                 struct bkey_s_c new,
+                 unsigned flags)
 {
-       struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
-
-       BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
 
        switch (k.k->type) {
        case KEY_TYPE_alloc:
        case KEY_TYPE_alloc_v2:
-               return bch2_mark_alloc(c, old, new, journal_seq, flags);
+       case KEY_TYPE_alloc_v3:
+               return bch2_mark_alloc(trans, old, new, flags);
        case KEY_TYPE_btree_ptr:
        case KEY_TYPE_btree_ptr_v2:
        case KEY_TYPE_extent:
        case KEY_TYPE_reflink_v:
-               return bch2_mark_extent(c, old, new, journal_seq, flags);
+               return bch2_mark_extent(trans, old, new, flags);
        case KEY_TYPE_stripe:
-               return bch2_mark_stripe(c, old, new, journal_seq, flags);
+               return bch2_mark_stripe(trans, old, new, flags);
        case KEY_TYPE_inode:
-               return bch2_mark_inode(c, old, new, journal_seq, flags);
+       case KEY_TYPE_inode_v2:
+               return bch2_mark_inode(trans, old, new, flags);
        case KEY_TYPE_reservation:
-               return bch2_mark_reservation(c, old, new, journal_seq, flags);
+               return bch2_mark_reservation(trans, old, new, flags);
        case KEY_TYPE_reflink_p:
-               return bch2_mark_reflink_p(c, old, new, journal_seq, flags);
+               return bch2_mark_reflink_p(trans, old, new, flags);
+       case KEY_TYPE_snapshot:
+               return bch2_mark_snapshot(trans, old, new, flags);
        default:
                return 0;
        }
 }
 
-int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags)
-{
-       struct bkey deleted = KEY(0, 0, 0);
-       struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
-       int ret;
-
-       percpu_down_read(&c->mark_lock);
-       ret = bch2_mark_key_locked(c, old, new, 0, flags);
-       percpu_up_read(&c->mark_lock);
-
-       return ret;
-}
-
-int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter,
+int bch2_mark_update(struct btree_trans *trans, struct btree_path *path,
                     struct bkey_i *new, unsigned flags)
 {
-       struct bch_fs           *c = trans->c;
        struct bkey             _deleted = KEY(0, 0, 0);
        struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
        struct bkey_s_c         old;
-       int iter_flags, ret;
+       struct bkey             unpacked;
+       int ret;
+
+       _deleted.p = path->pos;
 
        if (unlikely(flags & BTREE_TRIGGER_NORUN))
                return 0;
 
-       if (!btree_node_type_needs_gc(iter->btree_id))
+       if (!btree_node_type_needs_gc(path->btree_id))
                return 0;
 
-       if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) {
-               iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES;
-               iter->flags &= ~BTREE_ITER_WITH_UPDATES;
-
-               old = bch2_btree_iter_peek_slot(iter);
-               iter->flags |= iter_flags;
-
-               ret = bkey_err(old);
-               if (ret)
-                       return ret;
-       } else {
-               /*
-                * If BTREE_ITER_CACHED_NOFILL was used, we better not be
-                * running triggers that do anything on removal (alloc btree):
-                */
-               old = deleted;
-       }
+       old = bch2_btree_path_peek_slot(path, &unpacked);
 
        if (old.k->type == new->k.type &&
            ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
-               ret   = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
-                               trans->journal_res.seq,
+               ret   = bch2_mark_key(trans, old, bkey_i_to_s_c(new),
                                BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
        } else {
-               ret   = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new),
-                               trans->journal_res.seq,
+               ret   = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new),
                                BTREE_TRIGGER_INSERT|flags) ?:
-                       bch2_mark_key_locked(c, old, deleted,
-                               trans->journal_res.seq,
+                       bch2_mark_key(trans, old, deleted,
                                BTREE_TRIGGER_OVERWRITE|flags);
        }
 
@@ -1283,23 +1323,14 @@ void fs_usage_apply_warn(struct btree_trans *trans,
                pr_err("%s", buf);
                pr_err("overlapping with");
 
-               if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
-                       struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
-                       struct bkey_s_c k;
-                       int ret;
-
-                       for_each_btree_key_continue(copy, 0, k, ret) {
-                               if (btree_node_type_is_extents(i->iter->btree_id)
-                                   ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
-                                   : bkey_cmp(i->k->k.p, k.k->p))
-                                       break;
+               if (!i->cached) {
+                       struct bkey u;
+                       struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u);
 
-                               bch2_bkey_val_to_text(&PBUF(buf), c, k);
-                               pr_err("%s", buf);
-                       }
-                       bch2_trans_iter_put(trans, copy);
+                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+                       pr_err("%s", buf);
                } else {
-                       struct bkey_cached *ck = (void *) i->iter->l[0].b;
+                       struct bkey_cached *ck = (void *) i->path->l[0].b;
 
                        if (ck->valid) {
                                bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
@@ -1310,21 +1341,20 @@ void fs_usage_apply_warn(struct btree_trans *trans,
        __WARN();
 }
 
-void bch2_trans_fs_usage_apply(struct btree_trans *trans,
-                              struct replicas_delta_list *deltas)
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+                             struct replicas_delta_list *deltas)
 {
        struct bch_fs *c = trans->c;
        static int warned_disk_usage = 0;
        bool warn = false;
        unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
-       struct replicas_delta *d = deltas->d;
+       struct replicas_delta *d = deltas->d, *d2;
        struct replicas_delta *top = (void *) deltas->d + deltas->used;
        struct bch_fs_usage *dst;
        s64 added = 0, should_not_have_added;
        unsigned i;
 
-       percpu_rwsem_assert_held(&c->mark_lock);
-
+       percpu_down_read(&c->mark_lock);
        preempt_disable();
        dst = fs_usage_ptr(c, trans->journal_res.seq, false);
 
@@ -1336,7 +1366,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
                        added += d->delta;
                }
 
-               BUG_ON(__update_replicas(c, dst, &d->r, d->delta));
+               if (__update_replicas(c, dst, &d->r, d->delta))
+                       goto need_mark;
        }
 
        dst->nr_inodes += deltas->nr_inodes;
@@ -1371,101 +1402,71 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
        }
 
        preempt_enable();
+       percpu_up_read(&c->mark_lock);
 
        if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
                fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added);
+       return 0;
+need_mark:
+       /* revert changes: */
+       for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2))
+               BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta));
+
+       preempt_enable();
+       percpu_up_read(&c->mark_lock);
+       return -1;
 }
 
 /* trans_mark: */
 
-static struct btree_iter *trans_get_update(struct btree_trans *trans,
-                           enum btree_id btree_id, struct bpos pos,
-                           struct bkey_s_c *k)
-{
-       struct btree_insert_entry *i;
-
-       trans_for_each_update(trans, i)
-               if (i->iter->btree_id == btree_id &&
-                   (btree_node_type_is_extents(btree_id)
-                    ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
-                      bkey_cmp(pos, i->k->k.p) < 0
-                    : !bkey_cmp(pos, i->iter->pos))) {
-                       *k = bkey_i_to_s_c(i->k);
-
-                       /* ugly hack.. */
-                       BUG_ON(btree_iter_live(trans, i->iter));
-                       trans->iters_live |= 1ULL << i->iter->idx;
-                       return i->iter;
-               }
-
-       return NULL;
-}
-
-static struct bkey_alloc_buf *
-bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
                              const struct bch_extent_ptr *ptr,
                              struct bkey_alloc_unpacked *u)
 {
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-       struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
-       struct bucket *g;
-       struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_alloc_buf *a;
        int ret;
 
-       a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
-       if (IS_ERR(a))
-               return a;
-
-       iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k);
-       if (iter) {
-               *u = bch2_alloc_unpack(k);
-       } else {
-               iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos,
-                                          BTREE_ITER_CACHED|
-                                          BTREE_ITER_CACHED_NOFILL|
-                                          BTREE_ITER_INTENT);
-               ret = bch2_btree_iter_traverse(iter);
-               if (ret) {
-                       bch2_trans_iter_put(trans, iter);
-                       return ERR_PTR(ret);
-               }
-
-               percpu_down_read(&c->mark_lock);
-               g = bucket(ca, pos.offset);
-               *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
-               percpu_up_read(&c->mark_lock);
+       bch2_trans_iter_init(trans, iter, BTREE_ID_alloc,
+                            POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)),
+                            BTREE_ITER_WITH_UPDATES|
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret) {
+               bch2_trans_iter_exit(trans, iter);
+               return ret;
        }
 
-       *_iter = iter;
-       return a;
+       *u = bch2_alloc_unpack(k);
+       return 0;
 }
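
This is the new on-stack iterator lifecycle: bch2_trans_iter_init() initializes a caller-owned struct btree_iter, bch2_btree_iter_peek_slot() reads the slot (BTREE_ITER_WITH_UPDATES makes it see keys from this transaction's pending updates, replacing the old trans_get_update() scan), and bch2_trans_iter_exit() must run on every path. A hedged sketch of a caller following the same shape (read_bucket_gen() is made up; the bch2_* calls are the ones used above):

static int read_bucket_gen(struct btree_trans *trans, struct bpos pos, u8 *gen)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos,
			     BTREE_ITER_CACHED|BTREE_ITER_INTENT);
	k = bch2_btree_iter_peek_slot(&iter);
	ret = bkey_err(k);
	if (!ret)
		*gen = bch2_alloc_unpack(k).gen; /* copy out before exit */
	bch2_trans_iter_exit(trans, &iter);	 /* always paired with init */
	return ret;
}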
 
 static int bch2_trans_mark_pointer(struct btree_trans *trans,
                        struct bkey_s_c k, struct extent_ptr_decoded p,
                        s64 sectors, enum bch_data_type data_type)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_alloc_unpacked u;
-       struct bkey_alloc_buf *a;
        int ret;
 
-       a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
-       if (IS_ERR(a))
-               return PTR_ERR(a);
+       ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+       if (ret)
+               return ret;
 
-       ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
+       ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type,
+                            u.gen, &u.data_type,
                             &u.dirty_sectors, &u.cached_sectors);
        if (ret)
                goto out;
 
-       bch2_alloc_pack(c, a, u);
-       bch2_trans_update(trans, iter, &a->k, 0);
+       ret = bch2_alloc_write(trans, &iter, &u, 0);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1474,16 +1475,16 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                        s64 sectors, enum bch_data_type data_type)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_stripe *s;
        struct bch_replicas_padded r;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_WITH_UPDATES);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -1514,13 +1515,16 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
        stripe_blockcount_set(&s->v, p.ec.block,
                stripe_blockcount_get(&s->v, p.ec.block) +
                sectors);
-       bch2_trans_update(trans, iter, &s->k_i, 0);
+
+       ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
+       if (ret)
+               goto err;
 
        bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
        r.e.data_type = data_type;
        update_replicas_list(trans, &r.e, sectors);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1536,18 +1540,12 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
                ? BCH_DATA_btree
                : BCH_DATA_user;
        s64 sectors = bkey_is_btree_ptr(k.k)
-               ? c->opts.btree_node_size
+               ? btree_sectors(c)
                : k.k->size;
        s64 dirty_sectors = 0;
        bool stale;
        int ret;
 
-       BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
-              (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
-
-       if (flags & BTREE_TRIGGER_OVERWRITE)
-               sectors = -sectors;
-
        r.e.data_type   = data_type;
        r.e.nr_devs     = 0;
        r.e.nr_required = 1;
@@ -1555,6 +1553,9 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                s64 disk_sectors = ptr_disk_sectors(sectors, p);
 
+               if (flags & BTREE_TRIGGER_OVERWRITE)
+                       disk_sectors = -disk_sectors;
+
                ret = bch2_trans_mark_pointer(trans, k, p,
                                        disk_sectors, data_type);
                if (ret < 0)
@@ -1585,54 +1586,79 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
        return 0;
 }
 
-static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
-                                           struct bkey_s_c_stripe s,
-                                           unsigned idx, bool deleting)
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+                                        struct bkey_s_c_stripe s,
+                                        unsigned idx, bool deleting)
 {
        struct bch_fs *c = trans->c;
        const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
-       struct bkey_alloc_buf *a;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_alloc_unpacked u;
-       bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+       enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+               ? BCH_DATA_parity : 0;
+       s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
        int ret = 0;
 
-       a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
-       if (IS_ERR(a))
-               return PTR_ERR(a);
-
-       if (parity) {
-               s64 sectors = le16_to_cpu(s.v->sectors);
+       if (deleting)
+               sectors = -sectors;
 
-               if (deleting)
-                       sectors = -sectors;
+       ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+       if (ret)
+               return ret;
 
-               u.dirty_sectors += sectors;
-               u.data_type = u.dirty_sectors
-                       ? BCH_DATA_parity
-                       : 0;
-       }
+       ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type,
+                              u.gen, u.data_type,
+                              u.dirty_sectors, u.cached_sectors);
+       if (ret)
+               goto err;
 
        if (!deleting) {
-               if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
-                               "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
-                               iter->pos.inode, iter->pos.offset, u.gen,
+               if (bch2_fs_inconsistent_on(u.stripe ||
+                                           u.stripe_redundancy, c,
+                               "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
+                               iter.pos.inode, iter.pos.offset, u.gen,
+                               bch2_data_types[u.data_type],
+                               u.dirty_sectors,
                                u.stripe, s.k->p.offset)) {
                        ret = -EIO;
                        goto err;
                }
 
+               if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c,
+                               "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+                               iter.pos.inode, iter.pos.offset, u.gen,
+                               bch2_data_types[u.data_type],
+                               u.dirty_sectors,
+                               s.k->p.offset)) {
+                       ret = -EIO;
+                       goto err;
+               }
+
                u.stripe                = s.k->p.offset;
                u.stripe_redundancy     = s.v->nr_redundant;
        } else {
+               if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset ||
+                                           u.stripe_redundancy != s.v->nr_redundant, c,
+                               "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+                               iter.pos.inode, iter.pos.offset, u.gen,
+                               s.k->p.offset, u.stripe)) {
+                       ret = -EIO;
+                       goto err;
+               }
+
                u.stripe                = 0;
                u.stripe_redundancy     = 0;
        }
 
-       bch2_alloc_pack(c, a, u);
-       bch2_trans_update(trans, iter, &a->k, 0);
+       u.dirty_sectors += sectors;
+       if (data_type)
+               u.data_type = !deleting ? data_type : 0;
+
+       ret = bch2_alloc_write(trans, &iter, &u, 0);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
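
A worked example of the parity-block classification used above (standalone C, made-up numbers): a stripe with nr_blocks = 6 and nr_redundant = 2 keeps data in blocks 0-3 and parity in blocks 4-5, and only parity blocks carry sectors here.

#include <assert.h>

int main(void)
{
	unsigned nr_blocks = 6, nr_redundant = 2, idx;

	for (idx = 0; idx < nr_blocks; idx++) {
		int parity = idx >= nr_blocks - nr_redundant;

		/* blocks 4 and 5 are the parity blocks */
		assert(parity == (idx >= 4));
	}
	return 0;
}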
 
@@ -1643,7 +1669,7 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
        struct bkey_s_c_stripe old_s = { .k = NULL };
        struct bkey_s_c_stripe new_s = { .k = NULL };
        struct bch_replicas_padded r;
-       unsigned i;
+       unsigned i, nr_blocks;
        int ret = 0;
 
        if (old.k->type == KEY_TYPE_stripe)
@@ -1661,18 +1687,17 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
                    new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
                return 0;
 
+       BUG_ON(new_s.k && old_s.k &&
+              (new_s.v->nr_blocks      != old_s.v->nr_blocks ||
+               new_s.v->nr_redundant   != old_s.v->nr_redundant));
+
+       nr_blocks = new_s.k ? new_s.v->nr_blocks : old_s.v->nr_blocks;
+
        if (new_s.k) {
                s64 sectors = le16_to_cpu(new_s.v->sectors);
 
                bch2_bkey_to_replicas(&r.e, new);
                update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
-
-               for (i = 0; i < new_s.v->nr_blocks; i++) {
-                       ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
-                                                              i, false);
-                       if (ret)
-                               return ret;
-               }
        }
 
        if (old_s.k) {
@@ -1680,12 +1705,25 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
 
                bch2_bkey_to_replicas(&r.e, old);
                update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+       }
 
-               for (i = 0; i < old_s.v->nr_blocks; i++) {
-                       ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
-                                                              i, true);
+       for (i = 0; i < nr_blocks; i++) {
+               if (new_s.k && old_s.k &&
+                   !memcmp(&new_s.v->ptrs[i],
+                           &old_s.v->ptrs[i],
+                           sizeof(new_s.v->ptrs[i])))
+                       continue;
+
+               if (new_s.k) {
+                       ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false);
                        if (ret)
-                               return ret;
+                               break;
+               }
+
+               if (old_s.k) {
+                       ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true);
+                       if (ret)
+                               break;
                }
        }
 
@@ -1697,8 +1735,7 @@ static int bch2_trans_mark_inode(struct btree_trans *trans,
                                 struct bkey_s_c new,
                                 unsigned flags)
 {
-       int nr = (new.k->type == KEY_TYPE_inode) -
-               (old.k->type == KEY_TYPE_inode);
+       int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
 
        if (nr) {
                struct replicas_delta_list *d =
@@ -1716,9 +1753,6 @@ static int bch2_trans_mark_reservation(struct btree_trans *trans,
        s64 sectors = (s64) k.k->size;
        struct replicas_delta_list *d;
 
-       BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
-              (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE));
-
        if (flags & BTREE_TRIGGER_OVERWRITE)
                sectors = -sectors;
        sectors *= replicas;
@@ -1734,20 +1768,21 @@ static int bch2_trans_mark_reservation(struct btree_trans *trans,
 
 static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
                        struct bkey_s_c_reflink_p p,
-                       u64 idx, unsigned flags)
+                       u64 *idx, unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i *n;
        __le64 *refcount;
        int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
-       s64 ret;
+       char buf[200];
+       int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_WITH_UPDATES);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -1761,15 +1796,38 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
 
        refcount = bkey_refcount(n);
        if (!refcount) {
+               bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
                bch2_fs_inconsistent(c,
-                       "%llu:%llu len %u points to nonexistent indirect extent %llu",
-                       p.k->p.inode, p.k->p.offset, p.k->size, idx);
-               bch2_inconsistent_error(c);
+                       "nonexistent indirect extent at %llu while marking\n  %s",
+                       *idx, buf);
                ret = -EIO;
                goto err;
        }
 
-       BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE));
+       if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+               bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c);
+               bch2_fs_inconsistent(c,
+                       "indirect extent refcount underflow at %llu while marking\n  %s",
+                       *idx, buf);
+               ret = -EIO;
+               goto err;
+       }
+
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+               u64 pad;
+
+               pad = max_t(s64, le32_to_cpu(v->front_pad),
+                           le64_to_cpu(v->idx) - bkey_start_offset(k.k));
+               BUG_ON(pad > U32_MAX);
+               v->front_pad = cpu_to_le32(pad);
+
+               pad = max_t(s64, le32_to_cpu(v->back_pad),
+                           k.k->p.offset - p.k->size - le64_to_cpu(v->idx));
+               BUG_ON(pad > U32_MAX);
+               v->back_pad = cpu_to_le32(pad);
+       }
+
        le64_add_cpu(refcount, add);
 
        if (!*refcount) {
@@ -1777,14 +1835,14 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
                set_bkey_val_u64s(&n->k, 0);
        }
 
-       bch2_btree_iter_set_pos_to_extent_start(iter);
-       ret = bch2_trans_update(trans, iter, n, 0);
+       bch2_btree_iter_set_pos_to_extent_start(&iter);
+       ret = bch2_trans_update(trans, &iter, n, 0);
        if (ret)
                goto err;
 
-       ret = k.k->p.offset - idx;
+       *idx = k.k->p.offset;
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1792,29 +1850,29 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
                                     struct bkey_s_c k, unsigned flags)
 {
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-       u64 idx = le64_to_cpu(p.v->idx);
-       unsigned sectors = p.k->size;
-       s64 ret = 0;
+       u64 idx, end_idx;
+       int ret = 0;
 
-       while (sectors) {
-               ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags);
-               if (ret < 0)
-                       return ret;
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
 
-               ret = min_t(s64, ret, sectors);
-               idx     += ret;
-               sectors -= ret;
+               v->front_pad = v->back_pad = 0;
        }
 
-       return 0;
+       idx     = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+       end_idx = le64_to_cpu(p.v->idx) + p.k->size +
+               le32_to_cpu(p.v->back_pad);
+
+       while (idx < end_idx && !ret)
+               ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
+
+       return ret;
 }
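
The front_pad/back_pad fields widen the range of indirect-extent indexes a reflink pointer keeps pinned, and the loop above walks that padded range. A standalone worked example with made-up values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t v_idx	   = 1000;	/* p.v->idx */
	uint64_t size	   = 8;		/* p.k->size, in sectors */
	uint32_t front_pad = 2, back_pad = 3;

	uint64_t idx	 = v_idx - front_pad;		/* 998 */
	uint64_t end_idx = v_idx + size + back_pad;	/* 1011 */

	assert(idx == 998 && end_idx == 1011);
	/* __bch2_trans_mark_reflink_p() is then called over [998, 1011) */
	return 0;
}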
 
 int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
                        struct bkey_s_c new, unsigned flags)
 {
-       struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
-
-       BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
+       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
 
        switch (k.k->type) {
        case KEY_TYPE_btree_ptr:
@@ -1825,6 +1883,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
        case KEY_TYPE_stripe:
                return bch2_trans_mark_stripe(trans, old, new, flags);
        case KEY_TYPE_inode:
+       case KEY_TYPE_inode_v2:
                return bch2_trans_mark_inode(trans, old, new, flags);
        case KEY_TYPE_reservation:
                return bch2_trans_mark_reservation(trans, k, flags);
@@ -1835,64 +1894,14 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
        }
 }
 
-int bch2_trans_mark_update(struct btree_trans *trans,
-                          struct btree_iter *iter,
-                          struct bkey_i *new,
-                          unsigned flags)
-{
-       struct bkey             _deleted = KEY(0, 0, 0);
-       struct bkey_s_c         deleted = (struct bkey_s_c) { &_deleted, NULL };
-       struct bkey_s_c         old;
-       int iter_flags, ret;
-
-       if (unlikely(flags & BTREE_TRIGGER_NORUN))
-               return 0;
-
-       if (!btree_node_type_needs_gc(iter->btree_id))
-               return 0;
-
-
-       if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) {
-               iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES;
-               iter->flags &= ~BTREE_ITER_WITH_UPDATES;
-
-               old = bch2_btree_iter_peek_slot(iter);
-               iter->flags |= iter_flags;
-
-               ret = bkey_err(old);
-               if (ret)
-                       return ret;
-       } else {
-               /*
-                * If BTREE_ITER_CACHED_NOFILL was used, we better not be
-                * running triggers that do anything on removal (alloc btree):
-                */
-               old = deleted;
-       }
-
-       if (old.k->type == new->k.type &&
-           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
-               ret   = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
-                               BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
-       } else {
-               ret   = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new),
-                               BTREE_TRIGGER_INSERT|flags) ?:
-                       bch2_trans_mark_key(trans, old, deleted,
-                               BTREE_TRIGGER_OVERWRITE|flags);
-       }
-
-       return ret;
-}
-
 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                                    struct bch_dev *ca, size_t b,
                                    enum bch_data_type type,
                                    unsigned sectors)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_alloc_unpacked u;
-       struct bkey_alloc_buf *a;
        struct bch_extent_ptr ptr = {
                .dev = ca->dev_idx,
                .offset = bucket_to_sector(ca, b),
@@ -1905,15 +1914,15 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
        if (b >= ca->mi.nbuckets)
                return 0;
 
-       a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
-       if (IS_ERR(a))
-               return PTR_ERR(a);
+       ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+       if (ret)
+               return ret;
 
        if (u.data_type && u.data_type != type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
                        "while marking %s",
-                       iter->pos.inode, iter->pos.offset, u.gen,
+                       iter.pos.inode, iter.pos.offset, u.gen,
                        bch2_data_types[u.data_type],
                        bch2_data_types[type],
                        bch2_data_types[type]);
@@ -1924,10 +1933,11 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
        u.data_type     = type;
        u.dirty_sectors = sectors;
 
-       bch2_alloc_pack(c, a, u);
-       bch2_trans_update(trans, iter, &a->k, 0);
+       ret = bch2_alloc_write(trans, &iter, &u, 0);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -2092,20 +2102,29 @@ static void buckets_free_rcu(struct rcu_head *rcu)
                container_of(rcu, struct bucket_array, rcu);
 
        kvpfree(buckets,
-               sizeof(struct bucket_array) +
+               sizeof(*buckets) +
                buckets->nbuckets * sizeof(struct bucket));
 }
 
+static void bucket_gens_free_rcu(struct rcu_head *rcu)
+{
+       struct bucket_gens *buckets =
+               container_of(rcu, struct bucket_gens, rcu);
+
+       kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+}
+
 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 {
        struct bucket_array *buckets = NULL, *old_buckets = NULL;
+       struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
        unsigned long *buckets_nouse = NULL;
        alloc_fifo      free[RESERVE_NR];
        alloc_fifo      free_inc;
        alloc_heap      alloc_heap;
 
        size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
-                            ca->mi.bucket_size / c->opts.btree_node_size);
+                            ca->mi.bucket_size / btree_sectors(c));
        /* XXX: these should be tunable */
        size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
        size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 6);
@@ -2122,9 +2141,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
                                            nbuckets * sizeof(struct bucket),
                                            GFP_KERNEL|__GFP_ZERO)) ||
-           !(buckets_nouse     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
-                                           sizeof(unsigned long),
+           !(bucket_gens       = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
                                            GFP_KERNEL|__GFP_ZERO)) ||
+           (c->opts.buckets_nouse &&
+            !(buckets_nouse    = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+                                           sizeof(unsigned long),
+                                           GFP_KERNEL|__GFP_ZERO))) ||
            !init_fifo(&free[RESERVE_MOVINGGC],
                       copygc_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
@@ -2134,6 +2156,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
        buckets->first_bucket   = ca->mi.first_bucket;
        buckets->nbuckets       = nbuckets;
+       bucket_gens->first_bucket = ca->mi.first_bucket;
+       bucket_gens->nbuckets   = nbuckets;
 
        bch2_copygc_stop(c);
 
@@ -2144,6 +2168,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        }
 
        old_buckets = bucket_array(ca);
+       old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
 
        if (resize) {
                size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
@@ -2151,13 +2176,19 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                memcpy(buckets->b,
                       old_buckets->b,
                       n * sizeof(struct bucket));
-               memcpy(buckets_nouse,
-                      ca->buckets_nouse,
-                      BITS_TO_LONGS(n) * sizeof(unsigned long));
+               memcpy(bucket_gens->b,
+                      old_bucket_gens->b,
+                      n);
+               if (buckets_nouse)
+                       memcpy(buckets_nouse,
+                              ca->buckets_nouse,
+                              BITS_TO_LONGS(n) * sizeof(unsigned long));
        }
 
        rcu_assign_pointer(ca->buckets[0], buckets);
-       buckets = old_buckets;
+       rcu_assign_pointer(ca->bucket_gens, bucket_gens);
+       buckets         = old_buckets;
+       bucket_gens     = old_bucket_gens;
 
        swap(ca->buckets_nouse, buckets_nouse);
 
@@ -2191,8 +2222,10 @@ err:
                free_fifo(&free[i]);
        kvpfree(buckets_nouse,
                BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+       if (bucket_gens)
+               call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
        if (buckets)
-               call_rcu(&old_buckets->rcu, buckets_free_rcu);
+               call_rcu(&buckets->rcu, buckets_free_rcu);
 
        return ret;
 }
@@ -2207,6 +2240,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
                free_fifo(&ca->free[i]);
        kvpfree(ca->buckets_nouse,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+       kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+               sizeof(struct bucket_gens) + ca->mi.nbuckets);
        kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
                sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));
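
Note the allocation sizes above: bucket generations are one byte per bucket, so struct bucket_gens is sized with just "+ nbuckets", not "nbuckets * sizeof(...)". A standalone shape check (the struct name here is illustrative):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct gens_example {
	uint16_t first_bucket;
	size_t	 nbuckets;
	uint8_t	 b[];		/* one generation byte per bucket */
};

int main(void)
{
	size_t nbuckets = 1024;
	struct gens_example *g = malloc(sizeof(*g) + nbuckets);

	assert(g);
	g->nbuckets = nbuckets;
	memset(g->b, 0, g->nbuckets);
	free(g);
	return 0;
}
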
index 0f544b62fc908f3fea5fd54c866741277d89dbf6..7c6c59c7762c55cb4626af78aff84d55a9c5ded1 100644 (file)
@@ -53,11 +53,34 @@ static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
        return buckets->b + b;
 }
 
+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
+{
+       return __bucket(ca, b, true);
+}
+
 static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
 {
        return __bucket(ca, b, false);
 }
 
+static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
+{
+       return rcu_dereference_check(ca->bucket_gens,
+                                    !ca->fs ||
+                                    percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+                                    lockdep_is_held(&ca->fs->gc_lock) ||
+                                    lockdep_is_held(&ca->bucket_lock));
+}
+
+static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
+{
+       struct bucket_gens *gens = bucket_gens(ca);
+
+       BUG_ON(b < gens->first_bucket || b >= gens->nbuckets);
+       return gens->b + b;
+}
+
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree.
@@ -74,11 +97,10 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
        return sector_to_bucket(ca, ptr->offset);
 }
 
-static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
-                                       const struct bch_extent_ptr *ptr,
-                                       bool gc)
+static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
+                                          const struct bch_extent_ptr *ptr)
 {
-       return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
+       return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr));
 }
 
 static inline enum bch_data_type ptr_data_type(const struct bkey *k,
@@ -91,18 +113,6 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k,
        return ptr->cached ? BCH_DATA_cached : BCH_DATA_user;
 }
 
-static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
-                                                const struct bch_extent_ptr *ptr)
-{
-       struct bucket_mark m;
-
-       rcu_read_lock();
-       m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark);
-       rcu_read_unlock();
-
-       return m;
-}
-
 static inline int gen_cmp(u8 a, u8 b)
 {
        return (s8) (a - b);
@@ -122,28 +132,22 @@ static inline int gen_after(u8 a, u8 b)
 static inline u8 ptr_stale(struct bch_dev *ca,
                           const struct bch_extent_ptr *ptr)
 {
-       return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
-}
+       u8 ret;
 
-/* bucket gc marks */
+       rcu_read_lock();
+       ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen);
+       rcu_read_unlock();
 
-static inline unsigned bucket_sectors_used(struct bucket_mark mark)
-{
-       return mark.dirty_sectors + mark.cached_sectors;
+       return ret;
 }
 
+/* bucket gc marks */
+
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
        return !mark.dirty_sectors && !mark.stripe;
 }
 
-static inline bool bucket_needs_journal_commit(struct bucket_mark m,
-                                              u16 last_seq_ondisk)
-{
-       return m.journal_seq_valid &&
-               ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
-}
-
 /* Device usage: */
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
@@ -218,7 +222,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
 
 /* key/bucket marking: */
 
-void bch2_bucket_seq_cleanup(struct bch_fs *);
 void bch2_fs_usage_initialize(struct bch_fs *);
 
 void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
@@ -226,16 +229,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
                               size_t, enum bch_data_type, unsigned,
                               struct gc_pos, unsigned);
 
-int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned);
+int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned);
 
-int bch2_mark_update(struct btree_trans *, struct btree_iter *,
+int bch2_mark_update(struct btree_trans *, struct btree_path *,
                     struct bkey_i *, unsigned);
 
 int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
                        struct bkey_s_c, unsigned);
-int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
-                          struct bkey_i *insert, unsigned);
-void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
+int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
 
 int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
                                    size_t, enum bch_data_type, unsigned);
index b2de2995c5e7f5b157a0f0359a1eac8b6bc5672d..2c73dc60b838f08f42da26ded8c6b8a0358a4b20 100644 (file)
@@ -15,18 +15,9 @@ struct bucket_mark {
        u8              gen;
        u8              data_type:3,
                        owned_by_allocator:1,
-                       journal_seq_valid:1,
                        stripe:1;
        u16             dirty_sectors;
        u16             cached_sectors;
-
-       /*
-        * low bits of journal sequence number when this bucket was most
-        * recently modified: if journal_seq_valid is set, this bucket can't be
-        * reused until the journal sequence number written to disk is >= the
-        * bucket's journal sequence number:
-        */
-       u16             journal_seq;
        };
        };
 };
@@ -39,7 +30,6 @@ struct bucket {
 
        u64                             io_time[2];
        u8                              oldest_gen;
-       u8                              gc_gen;
        unsigned                        gen_valid:1;
        u8                              stripe_redundancy;
        u32                             stripe;
@@ -52,6 +42,13 @@ struct bucket_array {
        struct bucket           b[];
 };
 
+struct bucket_gens {
+       struct rcu_head         rcu;
+       u16                     first_bucket;
+       size_t                  nbuckets;
+       u8                      b[];
+};
+
 struct bch_dev_usage {
        u64                     buckets_ec;
        u64                     buckets_unavailable;
diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c
new file mode 100644 (file)
index 0000000..2e5b955
--- /dev/null
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "buckets_waiting_for_journal.h"
+#include <linux/random.h>
+
+static inline struct bucket_hashed *
+bucket_hash(struct buckets_waiting_for_journal_table *t,
+           unsigned hash_seed_idx, u64 dev_bucket)
+{
+       unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]);
+
+       BUG_ON(!is_power_of_2(t->size));
+
+       return t->d + (h & (t->size - 1));
+}
+
+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size)
+{
+       unsigned i;
+
+       t->size = size;
+       for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++)
+               get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i]));
+       memset(t->d, 0, sizeof(t->d[0]) * size);
+}
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+                                     u64 flushed_seq,
+                                     unsigned dev, u64 bucket)
+{
+       struct buckets_waiting_for_journal_table *t;
+       u64 dev_bucket = (u64) dev << 56 | bucket;
+       bool ret = false;
+       unsigned i;
+
+       mutex_lock(&b->lock);
+       t = b->t;
+
+       for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+               struct bucket_hashed *h = bucket_hash(t, i, dev_bucket);
+
+               if (h->dev_bucket == dev_bucket) {
+                       ret = h->journal_seq > flushed_seq;
+                       break;
+               }
+       }
+
+       mutex_unlock(&b->lock);
+
+       return ret;
+}
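
The table key packs the device index and bucket number into a single u64: 8 bits of device, 56 bits of bucket. A standalone round-trip check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	unsigned dev	= 3;
	uint64_t bucket = 0x123456789abULL;
	uint64_t dev_bucket = (uint64_t) dev << 56 | bucket;

	assert(dev_bucket >> 56 == dev);
	assert((dev_bucket & ((1ULL << 56) - 1)) == bucket);
	return 0;
}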
+
+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t,
+                               struct bucket_hashed *new,
+                               u64 flushed_seq)
+{
+       struct bucket_hashed *last_evicted = NULL;
+       unsigned tries, i;
+
+       for (tries = 0; tries < 10; tries++) {
+               struct bucket_hashed *old, *victim = NULL;
+
+               for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) {
+                       old = bucket_hash(t, i, new->dev_bucket);
+
+                       if (old->dev_bucket == new->dev_bucket ||
+                           old->journal_seq <= flushed_seq) {
+                               *old = *new;
+                               return true;
+                       }
+
+                       if (last_evicted != old)
+                               victim = old;
+               }
+
+               /* hashed to same slot 3 times: */
+               if (!victim)
+                       break;
+
+               /* Failed to find an empty slot: */
+               swap(*new, *victim);
+               last_evicted = victim;
+       }
+
+       return false;
+}
+
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b,
+                                        u64 flushed_seq,
+                                        unsigned dev, u64 bucket,
+                                        u64 journal_seq)
+{
+       struct buckets_waiting_for_journal_table *t, *n;
+       struct bucket_hashed tmp, new = {
+               .dev_bucket     = (u64) dev << 56 | bucket,
+               .journal_seq    = journal_seq,
+       };
+       size_t i, new_size, nr_elements = 1, nr_rehashes = 0;
+       int ret = 0;
+
+       mutex_lock(&b->lock);
+
+       if (likely(bucket_table_insert(b->t, &new, flushed_seq)))
+               goto out;
+
+       t = b->t;
+       for (i = 0; i < t->size; i++)
+               nr_elements += t->d[i].journal_seq > flushed_seq;
+
+       new_size = nr_elements < t->size / 3 ? t->size : t->size * 2;
+
+       n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL);
+       if (!n) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+retry_rehash:
+       nr_rehashes++;
+       bucket_table_init(n, new_size);
+
+       tmp = new;
+       BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq));
+
+       for (i = 0; i < t->size; i++) {
+               if (t->d[i].journal_seq <= flushed_seq)
+                       continue;
+
+               tmp = t->d[i];
+               if (!bucket_table_insert(n, &tmp, flushed_seq))
+                       goto retry_rehash;
+       }
+
+       b->t = n;
+       kvfree(t);
+
+       pr_debug("took %zu rehashes, table at %zu/%zu elements",
+                nr_rehashes, nr_elements, b->t->size);
+out:
+       mutex_unlock(&b->lock);
+
+       return ret;
+}
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c)
+{
+       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+       kvfree(b->t);
+}
+
+#define INITIAL_TABLE_SIZE     8
+
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c)
+{
+       struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;
+
+       mutex_init(&b->lock);
+
+       b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL);
+       if (!b->t)
+               return -ENOMEM;
+
+       bucket_table_init(b->t, INITIAL_TABLE_SIZE);
+       return 0;
+}
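
This table replaces the old per-bucket journal_seq_valid/journal_seq fields (removed from struct bucket_mark further down). A hypothetical caller, using only the functions this file exports:

static void example(struct bch_fs *c, unsigned dev, u64 bucket,
		    u64 flushed_seq, u64 journal_seq)
{
	struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal;

	/* record that the bucket can't be reused until journal_seq is flushed: */
	if (bch2_set_bucket_needs_journal_commit(b, flushed_seq,
						 dev, bucket, journal_seq))
		return;	/* -ENOMEM: couldn't grow the table */

	/* later, the allocator checks whether the bucket is still pinned: */
	if (bch2_bucket_needs_journal_commit(b, flushed_seq, dev, bucket)) {
		/* skip this bucket until the journal catches up */
	}
}
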
diff --git a/libbcachefs/buckets_waiting_for_journal.h b/libbcachefs/buckets_waiting_for_journal.h
new file mode 100644 (file)
index 0000000..d2ae19c
--- /dev/null
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_H
+
+#include "buckets_waiting_for_journal_types.h"
+
+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+                                     u64, unsigned, u64);
+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *,
+                                        u64, unsigned, u64, u64);
+
+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *);
+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *);
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */
diff --git a/libbcachefs/buckets_waiting_for_journal_types.h b/libbcachefs/buckets_waiting_for_journal_types.h
new file mode 100644 (file)
index 0000000..fea7f94
--- /dev/null
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H
+
+#include <linux/siphash.h>
+
+struct bucket_hashed {
+       u64                     dev_bucket;
+       u64                     journal_seq;
+};
+
+struct buckets_waiting_for_journal_table {
+       size_t                  size;
+       siphash_key_t           hash_seeds[3];
+       struct bucket_hashed    d[];
+};
+
+struct buckets_waiting_for_journal {
+       struct mutex            lock;
+       struct buckets_waiting_for_journal_table *t;
+};
+
+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */
index db68a78276cfaf1a7b2630146390b00dc1fe2295..aa26588ed5edf0e193c10cc7e3d6a0d25f49a8e9 100644 (file)
@@ -568,8 +568,11 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       if (!dev)
+               return -EINVAL;
+
        for_each_online_member(ca, c, i)
-               if (ca->disk_sb.bdev->bd_dev == dev) {
+               if (ca->dev == dev) {
                        percpu_ref_put(&ca->io_ref);
                        return i;
                }
index d20924e579bffd9d3b34b1dee7e35857eb22d346..a1d89923d361a976aeed0f7de3b7ebb0658df495 100644 (file)
@@ -35,18 +35,18 @@ struct bch2_checksum_state {
 static void bch2_checksum_init(struct bch2_checksum_state *state)
 {
        switch (state->type) {
-       case BCH_CSUM_NONE:
-       case BCH_CSUM_CRC32C:
-       case BCH_CSUM_CRC64:
+       case BCH_CSUM_none:
+       case BCH_CSUM_crc32c:
+       case BCH_CSUM_crc64:
                state->seed = 0;
                break;
-       case BCH_CSUM_CRC32C_NONZERO:
+       case BCH_CSUM_crc32c_nonzero:
                state->seed = U32_MAX;
                break;
-       case BCH_CSUM_CRC64_NONZERO:
+       case BCH_CSUM_crc64_nonzero:
                state->seed = U64_MAX;
                break;
-       case BCH_CSUM_XXHASH:
+       case BCH_CSUM_xxhash:
                xxh64_reset(&state->h64state, 0);
                break;
        default:
@@ -57,15 +57,15 @@ static void bch2_checksum_init(struct bch2_checksum_state *state)
 static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
 {
        switch (state->type) {
-       case BCH_CSUM_NONE:
-       case BCH_CSUM_CRC32C:
-       case BCH_CSUM_CRC64:
+       case BCH_CSUM_none:
+       case BCH_CSUM_crc32c:
+       case BCH_CSUM_crc64:
                return state->seed;
-       case BCH_CSUM_CRC32C_NONZERO:
+       case BCH_CSUM_crc32c_nonzero:
                return state->seed ^ U32_MAX;
-       case BCH_CSUM_CRC64_NONZERO:
+       case BCH_CSUM_crc64_nonzero:
                return state->seed ^ U64_MAX;
-       case BCH_CSUM_XXHASH:
+       case BCH_CSUM_xxhash:
                return xxh64_digest(&state->h64state);
        default:
                BUG();
@@ -75,17 +75,17 @@ static u64 bch2_checksum_final(const struct bch2_checksum_state *state)
 static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len)
 {
        switch (state->type) {
-       case BCH_CSUM_NONE:
+       case BCH_CSUM_none:
                return;
-       case BCH_CSUM_CRC32C_NONZERO:
-       case BCH_CSUM_CRC32C:
+       case BCH_CSUM_crc32c_nonzero:
+       case BCH_CSUM_crc32c:
                state->seed = crc32c(state->seed, data, len);
                break;
-       case BCH_CSUM_CRC64_NONZERO:
-       case BCH_CSUM_CRC64:
+       case BCH_CSUM_crc64_nonzero:
+       case BCH_CSUM_crc64:
                state->seed = crc64_be(state->seed, data, len);
                break;
-       case BCH_CSUM_XXHASH:
+       case BCH_CSUM_xxhash:
                xxh64_update(&state->h64state, data, len);
                break;
        default:
@@ -161,12 +161,12 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
                              struct nonce nonce, const void *data, size_t len)
 {
        switch (type) {
-       case BCH_CSUM_NONE:
-       case BCH_CSUM_CRC32C_NONZERO:
-       case BCH_CSUM_CRC64_NONZERO:
-       case BCH_CSUM_CRC32C:
-       case BCH_CSUM_XXHASH:
-       case BCH_CSUM_CRC64: {
+       case BCH_CSUM_none:
+       case BCH_CSUM_crc32c_nonzero:
+       case BCH_CSUM_crc64_nonzero:
+       case BCH_CSUM_crc32c:
+       case BCH_CSUM_xxhash:
+       case BCH_CSUM_crc64: {
                struct bch2_checksum_state state;
 
                state.type = type;
@@ -177,8 +177,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
                return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
        }
 
-       case BCH_CSUM_CHACHA20_POLY1305_80:
-       case BCH_CSUM_CHACHA20_POLY1305_128: {
+       case BCH_CSUM_chacha20_poly1305_80:
+       case BCH_CSUM_chacha20_poly1305_128: {
                SHASH_DESC_ON_STACK(desc, c->poly1305);
                u8 digest[POLY1305_DIGEST_SIZE];
                struct bch_csum ret = { 0 };
@@ -212,13 +212,13 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
        struct bio_vec bv;
 
        switch (type) {
-       case BCH_CSUM_NONE:
+       case BCH_CSUM_none:
                return (struct bch_csum) { 0 };
-       case BCH_CSUM_CRC32C_NONZERO:
-       case BCH_CSUM_CRC64_NONZERO:
-       case BCH_CSUM_CRC32C:
-       case BCH_CSUM_XXHASH:
-       case BCH_CSUM_CRC64: {
+       case BCH_CSUM_crc32c_nonzero:
+       case BCH_CSUM_crc64_nonzero:
+       case BCH_CSUM_crc32c:
+       case BCH_CSUM_xxhash:
+       case BCH_CSUM_crc64: {
                struct bch2_checksum_state state;
 
                state.type = type;
@@ -238,8 +238,8 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
                return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) };
        }
 
-       case BCH_CSUM_CHACHA20_POLY1305_80:
-       case BCH_CSUM_CHACHA20_POLY1305_128: {
+       case BCH_CSUM_chacha20_poly1305_80:
+       case BCH_CSUM_chacha20_poly1305_128: {
                SHASH_DESC_ON_STACK(desc, c->poly1305);
                u8 digest[POLY1305_DIGEST_SIZE];
                struct bch_csum ret = { 0 };
@@ -407,16 +407,12 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 }
 
 #ifdef __KERNEL__
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+static int __bch2_request_key(char *key_description, struct bch_key *key)
 {
-       char key_description[60];
        struct key *keyring_key;
        const struct user_key_payload *ukp;
        int ret;
 
-       snprintf(key_description, sizeof(key_description),
-                "bcachefs:%pUb", &sb->user_uuid);
-
        keyring_key = request_key(&key_type_logon, key_description, NULL);
        if (IS_ERR(keyring_key))
                return PTR_ERR(keyring_key);
@@ -436,16 +432,10 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
 }
 #else
 #include <keyutils.h>
-#include <uuid/uuid.h>
 
-int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+static int __bch2_request_key(char *key_description, struct bch_key *key)
 {
        key_serial_t key_id;
-       char key_description[60];
-       char uuid[40];
-
-       uuid_unparse_lower(sb->user_uuid.b, uuid);
-       sprintf(key_description, "bcachefs:%s", uuid);
 
        key_id = request_key("user", key_description, NULL,
                             KEY_SPEC_USER_KEYRING);
@@ -459,6 +449,17 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
 }
 #endif
 
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+       char key_description[60];
+       char uuid[40];
+
+       uuid_unparse_lower(sb->user_uuid.b, uuid);
+       sprintf(key_description, "bcachefs:%s", uuid);
+
+       return __bch2_request_key(key_description, key);
+}
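
Both the kernel and userspace paths now build the same key description: "bcachefs:" followed by the lowercase filesystem UUID. A standalone illustration of the resulting string (example UUID is made up):

#include <stdio.h>

int main(void)
{
	char key_description[60];
	const char *uuid = "01234567-89ab-cdef-0123-456789abcdef";

	snprintf(key_description, sizeof(key_description),
		 "bcachefs:%s", uuid);
	printf("%s\n", key_description);
	/* prints: bcachefs:01234567-89ab-cdef-0123-456789abcdef */
	return 0;
}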
+
 int bch2_decrypt_sb_key(struct bch_fs *c,
                        struct bch_sb_field_crypt *crypt,
                        struct bch_key *key)
index 6841fb16568a652779be0ed795768b5d5d8c79e9..f5c1a609c5c42fe408a5896498c1aeed748679ff 100644 (file)
@@ -13,9 +13,9 @@ static inline bool bch2_checksum_mergeable(unsigned type)
 {
 
        switch (type) {
-       case BCH_CSUM_NONE:
-       case BCH_CSUM_CRC32C:
-       case BCH_CSUM_CRC64:
+       case BCH_CSUM_none:
+       case BCH_CSUM_crc32c:
+       case BCH_CSUM_crc64:
                return true;
        default:
                return false;
@@ -78,13 +78,13 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
 {
        switch (type) {
        case BCH_CSUM_OPT_none:
-            return BCH_CSUM_NONE;
+            return BCH_CSUM_none;
        case BCH_CSUM_OPT_crc32c:
-            return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
+            return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero;
        case BCH_CSUM_OPT_crc64:
-            return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
+            return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero;
        case BCH_CSUM_OPT_xxhash:
-            return BCH_CSUM_XXHASH;
+            return BCH_CSUM_xxhash;
        default:
             BUG();
        }
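
For reference, two concrete mappings implied by the switch above; the bool argument distinguishes data checksums from metadata checksums (sketch assumes the enums in this header):

static enum bch_csum_type example_data_csum_type(void)
{
	/* data extents use the plain crc32c variant: */
	return bch2_csum_opt_to_type(BCH_CSUM_OPT_crc32c, true);  /* BCH_CSUM_crc32c */
}

static enum bch_csum_type example_meta_csum_type(void)
{
	/* metadata uses the nonzero-seeded variant: */
	return bch2_csum_opt_to_type(BCH_CSUM_OPT_crc32c, false); /* BCH_CSUM_crc32c_nonzero */
}
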
@@ -95,8 +95,8 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
 {
        if (c->sb.encryption_type)
                return c->opts.wide_macs
-                       ? BCH_CSUM_CHACHA20_POLY1305_128
-                       : BCH_CSUM_CHACHA20_POLY1305_80;
+                       ? BCH_CSUM_chacha20_poly1305_128
+                       : BCH_CSUM_chacha20_poly1305_80;
 
        return bch2_csum_opt_to_type(opt, true);
 }
@@ -104,7 +104,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
 static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
 {
        if (c->sb.encryption_type)
-               return BCH_CSUM_CHACHA20_POLY1305_128;
+               return BCH_CSUM_chacha20_poly1305_128;
 
        return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
 }
index f63651d291e53de8737c52254fe222bf07512ed3..8e4179d8dc2764d883916cbca64d4a11deb98d4a 100644 (file)
@@ -26,7 +26,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
 {
        void *b;
 
-       BUG_ON(size > c->sb.encoded_extent_max << 9);
+       BUG_ON(size > c->opts.encoded_extent_max);
 
        b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
        if (b)
@@ -68,7 +68,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
        struct page **pages = NULL;
        void *data;
 
-       BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
+       BUG_ON(start.bi_size > c->opts.encoded_extent_max);
 
        if (!PageHighMem(bio_iter_page(bio, start)) &&
            bio_phys_contig(bio, start))
@@ -231,8 +231,8 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
        BUG_ON(!bio->bi_vcnt);
        BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
 
-       if (crc->uncompressed_size      > c->sb.encoded_extent_max ||
-           crc->compressed_size        > c->sb.encoded_extent_max) {
+       if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
+           crc->compressed_size << 9   > c->opts.encoded_extent_max) {
                bch_err(c, "error rewriting existing data: extent too big");
                return -EIO;
        }
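
encoded_extent_max now lives in c->opts in bytes rather than in c->sb in 512-byte sectors, hence the << 9 conversions when comparing against crc fields, which stay in sectors. A standalone arithmetic check with made-up values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	unsigned crc_compressed_size = 256;		/* sectors */
	uint64_t encoded_extent_max  = 128 << 10;	/* 128 KiB, in bytes */

	/* 256 sectors * 512 bytes/sector == 128 KiB: exactly at the limit */
	assert(((uint64_t) crc_compressed_size << 9) == encoded_extent_max);
	return 0;
}
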
@@ -272,8 +272,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
        size_t dst_len = crc.uncompressed_size << 9;
        int ret = -ENOMEM;
 
-       if (crc.uncompressed_size       > c->sb.encoded_extent_max ||
-           crc.compressed_size         > c->sb.encoded_extent_max)
+       if (crc.uncompressed_size << 9  > c->opts.encoded_extent_max ||
+           crc.compressed_size << 9    > c->opts.encoded_extent_max)
                return -EIO;
 
        dst_data = dst_len == dst_iter.bi_size
@@ -376,7 +376,7 @@ static unsigned __bio_compress(struct bch_fs *c,
        BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
 
        /* If it's only one block, don't bother trying to compress: */
-       if (bio_sectors(src) <= c->opts.block_size)
+       if (src->bi_iter.bi_size <= c->opts.block_size)
                return 0;
 
        dst_data = bio_map_or_bounce(c, dst, WRITE);
@@ -466,7 +466,7 @@ unsigned bch2_bio_compress(struct bch_fs *c,
 
        /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
        src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
-                                    c->sb.encoded_extent_max << 9);
+                                    c->opts.encoded_extent_max);
        /* Don't generate a bigger output than input: */
        dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
 
@@ -544,10 +544,9 @@ void bch2_fs_compress_exit(struct bch_fs *c)
 
 static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 {
-       size_t max_extent = c->sb.encoded_extent_max << 9;
        size_t decompress_workspace_size = 0;
        bool decompress_workspace_needed;
-       ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
+       ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0);
        struct {
                unsigned        feature;
                unsigned        type;
@@ -579,14 +578,14 @@ have_compressed:
 
        if (!mempool_initialized(&c->compression_bounce[READ])) {
                ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
-                                                 1, max_extent);
+                                                 1, c->opts.encoded_extent_max);
                if (ret)
                        goto out;
        }
 
        if (!mempool_initialized(&c->compression_bounce[WRITE])) {
                ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
-                                                 1, max_extent);
+                                                 1, c->opts.encoded_extent_max);
                if (ret)
                        goto out;
        }
index b0a8eb58a7a755c33097a54e252ec92e9b9d0579..ee5b7f6967965fa47f31136edb834af6d31e9264 100644 (file)
@@ -243,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 {
        struct dump_iter *i = file->private_data;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int err;
 
@@ -260,10 +260,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, i->id, i->from,
-                                  BTREE_ITER_PREFETCH|
-                                  BTREE_ITER_ALL_SNAPSHOTS);
-       k = bch2_btree_iter_peek(iter);
+       bch2_trans_iter_init(&trans, &iter, i->id, i->from,
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       k = bch2_btree_iter_peek(&iter);
 
        while (k.k && !(err = bkey_err(k))) {
                bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
@@ -272,8 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
                i->buf[i->bytes] = '\n';
                i->bytes++;
 
-               k = bch2_btree_iter_next(iter);
-               i->from = iter->pos;
+               k = bch2_btree_iter_next(&iter);
+               i->from = iter.pos;
 
                err = flush_buf(i);
                if (err)
@@ -282,7 +282,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
                if (!i->size)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 
@@ -301,7 +301,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
 {
        struct dump_iter *i = file->private_data;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        int err;
 
@@ -318,7 +318,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       for_each_btree_node(&trans, iter, i->id, i->from, 0, b) {
+       for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) {
                bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
                i->bytes = strlen(i->buf);
                err = flush_buf(i);
@@ -336,7 +336,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
                if (!i->size)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
 
@@ -355,7 +355,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
 {
        struct dump_iter *i = file->private_data;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct btree *prev_node = NULL;
        int err;
@@ -373,11 +373,13 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
 
        bch2_trans_init(&trans, i->c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, i->id, i->from,
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(err = bkey_err(k))) {
-               struct btree_iter_level *l = &iter->l[0];
+               struct btree_path_level *l = &iter.path->l[0];
                struct bkey_packed *_k =
                        bch2_btree_node_iter_peek(&l->iter, l->b);
 
@@ -396,8 +398,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
                if (err)
                        break;
 
-               bch2_btree_iter_advance(iter);
-               i->from = iter->pos;
+               bch2_btree_iter_advance(&iter);
+               i->from = iter.pos;
 
                err = flush_buf(i);
                if (err)
@@ -406,6 +408,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
                if (!i->size)
                        break;
        }
+       bch2_trans_iter_exit(&trans, &iter);
+
        bch2_trans_exit(&trans);
 
        return err < 0 ? err : i->ret;
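Note: the debug.c hunks above apply the commit-wide iterator conversion: btree iterators move from transaction-owned pointers obtained with bch2_trans_get_iter() and released with bch2_trans_iter_put() to caller-owned structs on the stack, paired with bch2_trans_iter_init()/bch2_trans_iter_exit(). A sketch of the new lifecycle, using only calls that appear in this commit (c is a struct bch_fs *; btree id and position are assumed valid):

        static void example_dump_keys(struct bch_fs *c)
        {
                struct btree_trans trans;
                struct btree_iter iter;
                struct bkey_s_c k;

                bch2_trans_init(&trans, c, 0, 0);
                bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, POS_MIN,
                                     BTREE_ITER_PREFETCH);

                k = bch2_btree_iter_peek(&iter);
                /* ... consume k, step forward with bch2_btree_iter_next(&iter) ... */

                bch2_trans_iter_exit(&trans, &iter);    /* replaces bch2_trans_iter_put() */
                bch2_trans_exit(&trans);
        }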
index 02b29681f695e09c30dcf8d3cc50fbff1c938ce9..6f699b736b348e366b6c1e43567cf7a65c78b8a4 100644 (file)
@@ -8,6 +8,7 @@
 #include "fs.h"
 #include "keylist.h"
 #include "str_hash.h"
+#include "subvolume.h"
 
 #include <linux/dcache.h>
 
@@ -63,6 +64,15 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
        return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
 }
 
+static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
+{
+       struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+
+       if (d.v->d_type == DT_SUBVOL)
+               return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol;
+       return true;
+}
+
 const struct bch_hash_desc bch2_dirent_hash_desc = {
        .btree_id       = BTREE_ID_dirents,
        .key_type       = KEY_TYPE_dirent,
@@ -70,6 +80,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
        .hash_bkey      = dirent_hash_bkey,
        .cmp_key        = dirent_cmp_key,
        .cmp_bkey       = dirent_cmp_bkey,
+       .is_visible     = dirent_is_visible,
 };
 
 const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
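Note: the new .is_visible hook gives the generic hashed-btree lookup code a way to filter dirents by subvolume: a DT_SUBVOL entry is only visible from its parent subvolume, so the same dirents btree can be shared across subvolumes. Roughly, and only as an illustration of the idea (the real check lives in str_hash.h, which is not shown in this section), a lookup behaves like:

        /* hypothetical filtering step inside the hash lookup loop: */
        if (desc.is_visible && !desc.is_visible(inum, k))
                continue;       /* entry belongs to another subvolume; skip it */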
@@ -99,7 +110,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
        if (memchr(d.v->d_name, '/', len))
                return "invalid name";
 
-       if (le64_to_cpu(d.v->d_inum) == d.k->p.inode)
+       if (d.v->d_type != DT_SUBVOL &&
+           le64_to_cpu(d.v->d_inum) == d.k->p.inode)
                return "dirent points to own directory";
 
        return NULL;
@@ -112,14 +124,16 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
 
        bch_scnmemcpy(out, d.v->d_name,
                      bch2_dirent_name_bytes(d));
-       pr_buf(out, " -> %llu type %s", d.v->d_inum,
-              d.v->d_type < DT_MAX
-              ? bch2_d_types[d.v->d_type]
-              : "(bad d_type)");
+       pr_buf(out, " -> %llu type %s",
+              d.v->d_type != DT_SUBVOL
+              ? le64_to_cpu(d.v->d_inum)
+              : le32_to_cpu(d.v->d_child_subvol),
+              bch2_d_type_str(d.v->d_type));
 }
 
 static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
-                               u8 type, const struct qstr *name, u64 dst)
+                               subvol_inum dir, u8 type,
+                               const struct qstr *name, u64 dst)
 {
        struct bkey_i_dirent *dirent;
        unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
@@ -135,7 +149,14 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 
        bkey_dirent_init(&dirent->k_i);
        dirent->k.u64s = u64s;
-       dirent->v.d_inum = cpu_to_le64(dst);
+
+       if (type != DT_SUBVOL) {
+               dirent->v.d_inum = cpu_to_le64(dst);
+       } else {
+               dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
+               dirent->v.d_child_subvol = cpu_to_le32(dst);
+       }
+
        dirent->v.d_type = type;
 
        memcpy(dirent->v.d_name, name->name, name->len);
@@ -149,21 +170,21 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
        return dirent;
 }
 
-int bch2_dirent_create(struct btree_trans *trans,
-                      u64 dir_inum, const struct bch_hash_info *hash_info,
+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
+                      const struct bch_hash_info *hash_info,
                       u8 type, const struct qstr *name, u64 dst_inum,
                       u64 *dir_offset, int flags)
 {
        struct bkey_i_dirent *dirent;
        int ret;
 
-       dirent = dirent_create_key(trans, type, name, dst_inum);
+       dirent = dirent_create_key(trans, dir, type, name, dst_inum);
        ret = PTR_ERR_OR_ZERO(dirent);
        if (ret)
                return ret;
 
        ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
-                           dir_inum, &dirent->k_i, flags);
+                           dir, &dirent->k_i, flags);
        *dir_offset = dirent->k.p.offset;
 
        return ret;
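Note: nearly every dirent interface in this commit is re-keyed from a bare u64 inode number to subvol_inum, a (subvolume, inode) pair, since an inode number alone is no longer unique once subvolumes share a btree. The pair type is defined elsewhere in the commit; from its usage here (dir.subvol, dir.inum) it is, in sketch form:

        typedef struct {
                u32     subvol;         /* subvolume ID */
                u64     inum;           /* inode number within that subvolume */
        } subvol_inum;                  /* layout assumed from usage in this file */

        /* e.g. the root directory of subvolume 1 (root_inum is illustrative): */
        subvol_inum dir = { .subvol = 1, .inum = root_inum };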
@@ -176,82 +197,130 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
        dst->v.d_type = src.v->d_type;
 }
 
+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+                           struct bkey_s_c_dirent d, subvol_inum *target)
+{
+       struct bch_subvolume s;
+       int ret = 0;
+
+       if (d.v->d_type == DT_SUBVOL &&
+           le32_to_cpu(d.v->d_parent_subvol) != dir.subvol)
+               return 1;
+
+       if (likely(d.v->d_type != DT_SUBVOL)) {
+               target->subvol  = dir.subvol;
+               target->inum    = le64_to_cpu(d.v->d_inum);
+       } else {
+               target->subvol  = le32_to_cpu(d.v->d_child_subvol);
+
+               ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
+
+               target->inum    = le64_to_cpu(s.inode);
+       }
+
+       return ret;
+}
+
 int bch2_dirent_rename(struct btree_trans *trans,
-                      u64 src_dir, struct bch_hash_info *src_hash,
-                      u64 dst_dir, struct bch_hash_info *dst_hash,
-                      const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
-                      const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
-                      enum bch_rename_mode mode)
+               subvol_inum src_dir, struct bch_hash_info *src_hash,
+               subvol_inum dst_dir, struct bch_hash_info *dst_hash,
+               const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+               const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+               enum bch_rename_mode mode)
 {
-       struct btree_iter *src_iter = NULL, *dst_iter = NULL;
-       struct bkey_s_c old_src, old_dst;
+       struct btree_iter src_iter = { NULL };
+       struct btree_iter dst_iter = { NULL };
+       struct bkey_s_c old_src, old_dst = bkey_s_c_null;
        struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
        struct bpos dst_pos =
-               POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
+               POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
+       unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
        int ret = 0;
 
-       *src_inum = *dst_inum = 0;
+       if (src_dir.subvol != dst_dir.subvol)
+               return -EXDEV;
 
-       /*
-        * Lookup dst:
-        *
-        * Note that in BCH_RENAME mode, we're _not_ checking if
-        * the target already exists - we're relying on the VFS
-        * to do that check for us for correctness:
-        */
-       dst_iter = mode == BCH_RENAME
-               ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
-                                dst_hash, dst_dir, dst_name)
-               : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
-                                  dst_hash, dst_dir, dst_name,
-                                  BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(dst_iter);
-       if (ret)
-               goto out;
+       memset(src_inum, 0, sizeof(*src_inum));
+       memset(dst_inum, 0, sizeof(*dst_inum));
 
-       old_dst = bch2_btree_iter_peek_slot(dst_iter);
-       ret = bkey_err(old_dst);
+       /* Lookup src: */
+       ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
+                              src_hash, src_dir, src_name,
+                              BTREE_ITER_INTENT);
        if (ret)
                goto out;
 
-       if (mode != BCH_RENAME)
-               *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
-       if (mode != BCH_RENAME_EXCHANGE)
-               *src_offset = dst_iter->pos.offset;
-
-       /* Lookup src: */
-       src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
-                                   src_hash, src_dir, src_name,
-                                   BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(src_iter);
+       old_src = bch2_btree_iter_peek_slot(&src_iter);
+       ret = bkey_err(old_src);
        if (ret)
                goto out;
 
-       old_src = bch2_btree_iter_peek_slot(src_iter);
-       ret = bkey_err(old_src);
+       ret = bch2_dirent_read_target(trans, src_dir,
+                       bkey_s_c_to_dirent(old_src), src_inum);
        if (ret)
                goto out;
 
-       *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
+       src_type = bkey_s_c_to_dirent(old_src).v->d_type;
+
+       if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+
+       /* Lookup dst: */
+       if (mode == BCH_RENAME) {
+               /*
+                * Note that we're _not_ checking if the target already exists -
+                * we're relying on the VFS to do that check for us for
+                * correctness:
+                */
+               ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
+                                    dst_hash, dst_dir, dst_name);
+               if (ret)
+                       goto out;
+       } else {
+               ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
+                                      dst_hash, dst_dir, dst_name,
+                                      BTREE_ITER_INTENT);
+               if (ret)
+                       goto out;
+
+               old_dst = bch2_btree_iter_peek_slot(&dst_iter);
+               ret = bkey_err(old_dst);
+               if (ret)
+                       goto out;
+
+               ret = bch2_dirent_read_target(trans, dst_dir,
+                               bkey_s_c_to_dirent(old_dst), dst_inum);
+               if (ret)
+                       goto out;
+
+               dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
+
+               if (dst_type == DT_SUBVOL) {
+                       ret = -EOPNOTSUPP;
+                       goto out;
+               }
+       }
+
+       if (mode != BCH_RENAME_EXCHANGE)
+               *src_offset = dst_iter.pos.offset;
 
        /* Create new dst key: */
-       new_dst = dirent_create_key(trans, 0, dst_name, 0);
+       new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
        ret = PTR_ERR_OR_ZERO(new_dst);
        if (ret)
                goto out;
 
        dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
-       new_dst->k.p = dst_iter->pos;
+       new_dst->k.p = dst_iter.pos;
 
        /* Create new src key: */
        if (mode == BCH_RENAME_EXCHANGE) {
-               new_src = dirent_create_key(trans, 0, src_name, 0);
+               new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
                ret = PTR_ERR_OR_ZERO(new_src);
                if (ret)
                        goto out;
 
                dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
-               new_src->k.p = src_iter->pos;
+               new_src->k.p = src_iter.pos;
        } else {
                new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
                ret = PTR_ERR_OR_ZERO(new_src);
@@ -259,10 +328,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
                        goto out;
 
                bkey_init(&new_src->k);
-               new_src->k.p = src_iter->pos;
+               new_src->k.p = src_iter.pos;
 
-               if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
-                   bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
+               if (bkey_cmp(dst_pos, src_iter.pos) <= 0 &&
+                   bkey_cmp(src_iter.pos, dst_iter.pos) < 0) {
                        /*
                         * We have a hash collision for the new dst key,
                         * and new_src - the key we're deleting - is between
@@ -275,10 +344,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
                                 * If we're not overwriting, we can just insert
                                 * new_dst at the src position:
                                 */
-                               new_dst->k.p = src_iter->pos;
-                               bch2_trans_update(trans, src_iter,
-                                                 &new_dst->k_i, 0);
-                               goto out_set_offset;
+                               new_src = new_dst;
+                               new_src->k.p = src_iter.pos;
+                               goto out_set_src;
                        } else {
                                /* If we're overwriting, we can't insert new_dst
                                 * at a different slot because it has to
@@ -290,7 +358,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
                } else {
                        /* Check if we need a whiteout to delete src: */
                        ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
-                                                      src_hash, src_iter);
+                                                      src_hash, &src_iter);
                        if (ret < 0)
                                goto out;
 
@@ -299,75 +367,112 @@ int bch2_dirent_rename(struct btree_trans *trans,
                }
        }
 
-       bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
-       bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
-out_set_offset:
+       ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
+       if (ret)
+               goto out;
+out_set_src:
+
+       /*
+        * If we're deleting a subvolume, we need to really delete the dirent,
+        * not just emit a whiteout in the current snapshot:
+        */
+       if (src_type == DT_SUBVOL) {
+               bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+               ret = bch2_btree_iter_traverse(&src_iter);
+               if (ret)
+                       goto out;
+
+               new_src->k.p = src_iter.pos;
+               src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+       }
+
+       ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
+       if (ret)
+               goto out;
+
        if (mode == BCH_RENAME_EXCHANGE)
                *src_offset = new_src->k.p.offset;
        *dst_offset = new_dst->k.p.offset;
 out:
-       bch2_trans_iter_put(trans, src_iter);
-       bch2_trans_iter_put(trans, dst_iter);
+       bch2_trans_iter_exit(trans, &src_iter);
+       bch2_trans_iter_exit(trans, &dst_iter);
        return ret;
 }
 
-int bch2_dirent_delete_at(struct btree_trans *trans,
-                         const struct bch_hash_info *hash_info,
-                         struct btree_iter *iter)
-{
-       return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                  hash_info, iter);
-}
-
-struct btree_iter *
-__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum,
-                          const struct bch_hash_info *hash_info,
-                          const struct qstr *name, unsigned flags)
-{
-       return bch2_hash_lookup(trans, bch2_dirent_hash_desc,
-                               hash_info, dir_inum, name, flags);
-}
-
-u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
-                      const struct bch_hash_info *hash_info,
-                      const struct qstr *name)
+int __bch2_dirent_lookup_trans(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              subvol_inum dir,
+                              const struct bch_hash_info *hash_info,
+                              const struct qstr *name, subvol_inum *inum,
+                              unsigned flags)
 {
-       struct btree_trans trans;
-       struct btree_iter *iter;
        struct bkey_s_c k;
-       u64 inum = 0;
-       int ret = 0;
+       struct bkey_s_c_dirent d;
+       u32 snapshot;
+       int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+       if (ret)
+               return ret;
 
-       iter = __bch2_dirent_lookup_trans(&trans, dir_inum,
-                                         hash_info, name, 0);
-       ret = PTR_ERR_OR_ZERO(iter);
+       ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+                              hash_info, dir, name, flags);
        if (ret)
-               goto out;
+               return ret;
 
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
-               goto out;
+               goto err;
 
-       inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
-       bch2_trans_iter_put(&trans, iter);
-out:
-       BUG_ON(ret == -EINTR);
+       d = bkey_s_c_to_dirent(k);
+
+       ret = bch2_dirent_read_target(trans, dir, d, inum);
+       if (ret > 0)
+               ret = -ENOENT;
+err:
+       if (ret)
+               bch2_trans_iter_exit(trans, iter);
+
+       return ret;
+}
+
+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
+                      const struct bch_hash_info *hash_info,
+                      const struct qstr *name, subvol_inum *inum)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
+                                         name, inum, 0);
+       if (ret == -EINTR)
+               goto retry;
+       if (!ret)
+               bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
-       return inum;
+       return ret;
 }
 
-int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
-       for_each_btree_key(trans, iter, BTREE_ID_dirents,
-                          POS(dir_inum, 0), 0, k, ret) {
-               if (k.k->p.inode > dir_inum)
+       ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
+       if (ret)
+               return ret;
+
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents,
+                          SPOS(dir.inum, 0, snapshot), 0, k, ret) {
+               if (k.k->p.inode > dir.inum)
                        break;
 
                if (k.k->type == KEY_TYPE_dirent) {
@@ -375,24 +480,32 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
                        break;
                }
        }
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        return ret;
 }
 
-int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
+int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_dirent dirent;
+       subvol_inum target;
+       u32 snapshot;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_dirents,
-                          POS(inum, ctx->pos), 0, k, ret) {
-               if (k.k->p.inode > inum)
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
+                          SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) {
+               if (k.k->p.inode > inum.inum)
                        break;
 
                if (k.k->type != KEY_TYPE_dirent)
@@ -400,6 +513,12 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
 
                dirent = bkey_s_c_to_dirent(k);
 
+               ret = bch2_dirent_read_target(&trans, inum, dirent, &target);
+               if (ret < 0)
+                       break;
+               if (ret)
+                       continue;
+
                /*
                 * XXX: dir_emit() can fault and block, while we're holding
                 * locks
@@ -407,14 +526,25 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
                ctx->pos = dirent.k->p.offset;
                if (!dir_emit(ctx, dirent.v->d_name,
                              bch2_dirent_name_bytes(dirent),
-                             le64_to_cpu(dirent.v->d_inum),
-                             dirent.v->d_type))
+                             target.inum,
+                             vfs_d_type(dirent.v->d_type)))
                        break;
                ctx->pos = dirent.k->p.offset + 1;
+
+               /*
+                * read_target looks up subvolumes; we can overflow btree paths
+                * if the directory has many subvolumes in it
+                */
+               ret = btree_trans_too_many_iters(&trans);
+               if (ret)
+                       break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
-       ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
 
        return ret;
 }
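Note: bch2_dirent_lookup() and bch2_readdir() above show the transaction-restart idiom this commit standardizes: begin the transaction, resolve the subvolume to a snapshot ID, iterate with snapshot-qualified positions (SPOS) using a _norestart loop, and retry the whole body when -EINTR signals a transaction restart. Condensed into a sketch:

        static int example_walk_dir(struct bch_fs *c, subvol_inum inum)
        {
                struct btree_trans trans;
                struct btree_iter iter;
                struct bkey_s_c k;
                u32 snapshot;
                int ret;

                bch2_trans_init(&trans, c, 0, 0);
        retry:
                bch2_trans_begin(&trans);

                ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
                if (ret)
                        goto err;

                for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents,
                                   SPOS(inum.inum, 0, snapshot), 0, k, ret) {
                        /* per-key work; must be safe to redo after a restart */
                }
                bch2_trans_iter_exit(&trans, &iter);
        err:
                if (ret == -EINTR)      /* restart: locks were dropped, state is stale */
                        goto retry;
                bch2_trans_exit(&trans);
                return ret;
        }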
index e1d8ce377d43755cd5584edf1afa05d0ba65495e..1bb4d802bc1db1ea8bb79a454074a51afb595324 100644 (file)
@@ -29,13 +29,17 @@ static inline unsigned dirent_val_u64s(unsigned len)
                            sizeof(u64));
 }
 
-int bch2_dirent_create(struct btree_trans *, u64,
+int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
+                           struct bkey_s_c_dirent, subvol_inum *);
+
+int bch2_dirent_create(struct btree_trans *, subvol_inum,
                       const struct bch_hash_info *, u8,
                       const struct qstr *, u64, u64 *, int);
 
-int bch2_dirent_delete_at(struct btree_trans *,
-                         const struct bch_hash_info *,
-                         struct btree_iter *);
+static inline unsigned vfs_d_type(unsigned type)
+{
+       return type == DT_SUBVOL ? DT_DIR : type;
+}
 
 enum bch_rename_mode {
        BCH_RENAME,
@@ -44,20 +48,20 @@ enum bch_rename_mode {
 };
 
 int bch2_dirent_rename(struct btree_trans *,
-                      u64, struct bch_hash_info *,
-                      u64, struct bch_hash_info *,
-                      const struct qstr *, u64 *, u64 *,
-                      const struct qstr *, u64 *, u64 *,
+                      subvol_inum, struct bch_hash_info *,
+                      subvol_inum, struct bch_hash_info *,
+                      const struct qstr *, subvol_inum *, u64 *,
+                      const struct qstr *, subvol_inum *, u64 *,
                       enum bch_rename_mode);
 
-struct btree_iter *
-__bch2_dirent_lookup_trans(struct btree_trans *, u64,
-                          const struct bch_hash_info *,
-                          const struct qstr *, unsigned);
-u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
-                      const struct qstr *);
+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+                              subvol_inum, const struct bch_hash_info *,
+                              const struct qstr *, subvol_inum *, unsigned);
+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
+                      const struct bch_hash_info *,
+                      const struct qstr *, subvol_inum *);
 
-int bch2_empty_dir_trans(struct btree_trans *, u64);
-int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
 
 #endif /* _BCACHEFS_DIRENT_H */
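Note: vfs_d_type() exists because DT_SUBVOL is internal to bcachefs; the VFS and userspace have no such d_type, so subvolume roots are reported as ordinary directories. Its one call site in this commit is readdir:

        /* from bch2_readdir() above: DT_SUBVOL is mapped to DT_DIR on the way out */
        dir_emit(ctx, dirent.v->d_name, bch2_dirent_name_bytes(dirent),
                 target.inum, vfs_d_type(dirent.v->d_type));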
index c52b6faac9b4982c3a4e5b201b21de6ee3293f95..6c84297ef265f0810f71f3288a8d8fbab69e936e 100644 (file)
@@ -17,24 +17,20 @@ static int group_cmp(const void *_l, const void *_r)
                strncmp(l->label, r->label, sizeof(l->label));
 }
 
-static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
-                                               struct bch_sb_field *f)
+static int bch2_sb_disk_groups_validate(struct bch_sb *sb,
+                                       struct bch_sb_field *f,
+                                       struct printbuf *err)
 {
        struct bch_sb_field_disk_groups *groups =
                field_to_type(f, disk_groups);
        struct bch_disk_group *g, *sorted = NULL;
-       struct bch_sb_field_members *mi;
-       struct bch_member *m;
-       unsigned i, nr_groups, len;
-       const char *err = NULL;
-
-       mi              = bch2_sb_get_members(sb);
-       groups          = bch2_sb_get_disk_groups(sb);
-       nr_groups       = disk_groups_nr(groups);
+       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+       unsigned nr_groups = disk_groups_nr(groups);
+       unsigned i, len;
+       int ret = -EINVAL;
 
-       for (m = mi->members;
-            m < mi->members + sb->nr_devices;
-            m++) {
+       for (i = 0; i < sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
                unsigned g;
 
                if (!BCH_MEMBER_GROUP(m))
@@ -42,45 +38,53 @@ static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
 
                g = BCH_MEMBER_GROUP(m) - 1;
 
-               if (g >= nr_groups ||
-                   BCH_GROUP_DELETED(&groups->entries[g]))
-                       return "disk has invalid group";
+               if (g >= nr_groups) {
+                       pr_buf(err, "disk %u has invalid label %u (have %u)",
+                              i, g, nr_groups);
+                       return -EINVAL;
+               }
+
+               if (BCH_GROUP_DELETED(&groups->entries[g])) {
+                       pr_buf(err, "disk %u has deleted label %u", i, g);
+                       return -EINVAL;
+               }
        }
 
        if (!nr_groups)
-               return NULL;
+               return 0;
+
+       for (i = 0; i < nr_groups; i++) {
+               g = groups->entries + i;
 
-       for (g = groups->entries;
-            g < groups->entries + nr_groups;
-            g++) {
                if (BCH_GROUP_DELETED(g))
                        continue;
 
                len = strnlen(g->label, sizeof(g->label));
                if (!len) {
-                       err = "group with empty label";
-                       goto err;
+                       pr_buf(err, "label %u empty", i);
+                       return -EINVAL;
                }
        }
 
        sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
        if (!sorted)
-               return "cannot allocate memory";
+               return -ENOMEM;
 
        memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
        sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
 
-       for (i = 0; i + 1 < nr_groups; i++)
-               if (!BCH_GROUP_DELETED(sorted + i) &&
-                   !group_cmp(sorted + i, sorted + i + 1)) {
-                       err = "duplicate groups";
+       for (g = sorted; g + 1 < sorted + nr_groups; g++)
+               if (!BCH_GROUP_DELETED(g) &&
+                   !group_cmp(&g[0], &g[1])) {
+                       pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g));
+                       bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label)));
                        goto err;
                }
 
-       err = NULL;
+       ret = 0;
 err:
        kfree(sorted);
-       return err;
+       return ret;
 }
 
 static void bch2_sb_disk_groups_to_text(struct printbuf *out,
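Note: superblock-field validation changes convention here: instead of returning a const char * message (NULL on success), validators return an errno and format specifics into a caller-supplied printbuf, so errors can name the offending device or label. A hypothetical validator under the new convention looks like:

        static int my_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
                                     struct printbuf *err)
        {
                unsigned nr = 0;        /* hypothetical: entries found in @f */

                if (!nr) {
                        pr_buf(err, "field has no entries");    /* specifics, not a fixed string */
                        return -EINVAL;
                }
                return 0;
        }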
index 328e0429b5d77329d04ed061ebade45536648fcd..9b45640e75dc1ed6194b969cc1eca9ecccdf3465 100644 (file)
@@ -15,6 +15,7 @@
 #include "io.h"
 #include "keylist.h"
 #include "recovery.h"
+#include "replicas.h"
 #include "super-io.h"
 #include "util.h"
 
@@ -142,8 +143,8 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
 }
 
 /* returns blocknr in stripe that we matched: */
-static int bkey_matches_stripe(struct bch_stripe *s,
-                              struct bkey_s_c k)
+static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
+                                               struct bkey_s_c k, unsigned *block)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
@@ -152,10 +153,12 @@ static int bkey_matches_stripe(struct bch_stripe *s,
        bkey_for_each_ptr(ptrs, ptr)
                for (i = 0; i < nr_data; i++)
                        if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr,
-                                                     le16_to_cpu(s->sectors)))
-                               return i;
+                                                     le16_to_cpu(s->sectors))) {
+                               *block = i;
+                               return ptr;
+                       }
 
-       return -1;
+       return NULL;
 }
 
 static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
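Note: bkey_matches_stripe() now hands back the matching extent pointer itself (with the block index through an out-parameter) rather than just the block number, so callers can inspect pointer flags before deciding to stripe an extent. The pattern, as used by ec_stripe_update_ptrs() below:

        unsigned block;
        const struct bch_extent_ptr *ptr_c =
                bkey_matches_stripe(&s->key.v, k, &block);

        /* cached pointers shouldn't be erasure coded: */
        if (!ptr_c || ptr_c->cached) {
                bch2_btree_iter_advance(&iter);
                continue;
        }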
@@ -429,13 +432,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
 static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes,
+                            POS(0, idx), BTREE_ITER_SLOTS);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -445,6 +449,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip
        }
        bkey_reassemble(&stripe->key.k_i, k);
 err:
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -542,29 +547,29 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
                free_heap(&n);
        }
 
-       if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
+       if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
                return -ENOMEM;
 
        if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
-           !genradix_ptr_alloc(&c->stripes[1], idx, gfp))
+           !genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
                return -ENOMEM;
 
        return 0;
 }
 
-static int ec_stripe_mem_alloc(struct bch_fs *c,
+static int ec_stripe_mem_alloc(struct btree_trans *trans,
                               struct btree_iter *iter)
 {
        size_t idx = iter->pos.offset;
        int ret = 0;
 
-       if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
+       if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN))
                return ret;
 
-       bch2_trans_unlock(iter->trans);
+       bch2_trans_unlock(trans);
        ret = -EINTR;
 
-       if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
+       if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL))
                return ret;
 
        return -ENOMEM;
@@ -591,13 +596,13 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
 {
        struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
 
-       genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i;
+       genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i;
 }
 
 static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
 {
        ec_stripes_heap *h = &c->ec_stripes_heap;
-       struct stripe *m = genradix_ptr(&c->stripes[0], idx);
+       struct stripe *m = genradix_ptr(&c->stripes, idx);
 
        BUG_ON(!m->alive);
        BUG_ON(m->heap_idx >= h->used);
@@ -672,7 +677,7 @@ static int ec_stripe_delete(struct bch_fs *c, size_t idx)
        return bch2_btree_delete_range(c, BTREE_ID_stripes,
                                       POS(0, idx),
                                       POS(0, idx + 1),
-                                      NULL);
+                                      0, NULL);
 }
 
 static void ec_stripe_delete_work(struct work_struct *work)
@@ -689,7 +694,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
                        break;
                }
 
-               bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx);
+               bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx);
                spin_unlock(&c->ec_stripes_heap_lock);
 
                if (ec_stripe_delete(c, idx))
@@ -699,27 +704,23 @@ static void ec_stripe_delete_work(struct work_struct *work)
 
 /* stripe creation: */
 
-static int ec_stripe_bkey_insert(struct bch_fs *c,
+static int ec_stripe_bkey_insert(struct btree_trans *trans,
                                 struct bkey_i_stripe *stripe,
                                 struct disk_reservation *res)
 {
-       struct btree_trans trans;
-       struct btree_iter *iter;
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bpos min_pos = POS(0, 1);
        struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-retry:
-       bch2_trans_begin(&trans);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos,
+       for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos,
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
                if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
                        if (start_pos.offset) {
                                start_pos = min_pos;
-                               bch2_btree_iter_set_pos(iter, start_pos);
+                               bch2_btree_iter_set_pos(&iter, start_pos);
                                continue;
                        }
 
@@ -733,41 +734,36 @@ retry:
 
        goto err;
 found_slot:
-       start_pos = iter->pos;
+       start_pos = iter.pos;
 
-       ret = ec_stripe_mem_alloc(c, iter);
+       ret = ec_stripe_mem_alloc(trans, &iter);
        if (ret)
                goto err;
 
-       stripe->k.p = iter->pos;
+       stripe->k.p = iter.pos;
 
-       ret   = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?:
-               bch2_trans_commit(&trans, res, NULL,
-                               BTREE_INSERT_NOFAIL);
-err:
-       bch2_trans_iter_put(&trans, iter);
+       ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0);
 
-       if (ret == -EINTR)
-               goto retry;
-
-       c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1;
-       bch2_trans_exit(&trans);
+       c->ec_stripe_hint = start_pos.offset;
+err:
+       bch2_trans_iter_exit(trans, &iter);
 
        return ret;
 }
 
 static int ec_stripe_bkey_update(struct btree_trans *trans,
-                                struct bkey_i_stripe *new)
+                                struct bkey_i_stripe *new,
+                                struct disk_reservation *res)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        const struct bch_stripe *existing;
        unsigned i;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_stripes,
-                                  new->k.p, BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes,
+                            new->k.p, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -790,9 +786,9 @@ static int ec_stripe_bkey_update(struct btree_trans *trans,
                stripe_blockcount_set(&new->v, i,
                        stripe_blockcount_get(existing, i));
 
-       ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+       ret = bch2_trans_update(trans, &iter, &new->k_i, 0);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -820,10 +816,11 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
                                 struct bkey *pos)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_extent e;
        struct bkey_buf sk;
+       struct bpos next_pos;
        int ret = 0, dev, block;
 
        bch2_bkey_buf_init(&sk);
@@ -831,23 +828,29 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
 
        /* XXX this doesn't support the reflink btree */
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  bkey_start_pos(pos),
-                                  BTREE_ITER_INTENT);
-
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            bkey_start_pos(pos),
+                            BTREE_ITER_INTENT);
+retry:
+       while (bch2_trans_begin(&trans),
+              (k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
+               const struct bch_extent_ptr *ptr_c;
                struct bch_extent_ptr *ptr, *ec_ptr = NULL;
 
                if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
-               block = bkey_matches_stripe(&s->key.v, k);
-               if (block < 0) {
-                       bch2_btree_iter_advance(iter);
+               ptr_c = bkey_matches_stripe(&s->key.v, k, &block);
+               /*
+                * It doesn't generally make sense to erasure code cached ptrs:
+                * XXX: should we be incrementing a counter?
+                */
+               if (!ptr_c || ptr_c->cached) {
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
@@ -862,17 +865,21 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
 
                extent_stripe_ptr_add(e, s, ec_ptr, block);
 
-               bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
-               ret   = bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(&trans, iter, sk.k, 0) ?:
+               bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k));
+               next_pos = sk.k->k.p;
+
+               ret   = bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(&trans, &iter, sk.k, 0) ?:
                        bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
-               if (ret == -EINTR)
-                       ret = 0;
+               if (!ret)
+                       bch2_btree_iter_set_pos(&iter, next_pos);
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       if (ret == -EINTR)
+               goto retry;
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
@@ -938,10 +945,10 @@ static void ec_stripe_create(struct ec_stripe_new *s)
                goto err_put_writes;
        }
 
-       ret = s->have_existing_stripe
-               ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
-                               ec_stripe_bkey_update(&trans, &s->new_stripe.key))
-               : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res);
+       ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
+                           s->have_existing_stripe
+                           ? ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res)
+                           : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res));
        if (ret) {
                bch_err(c, "error creating stripe: error creating stripe key");
                goto err_put_writes;
@@ -956,7 +963,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        }
 
        spin_lock(&c->ec_stripes_heap_lock);
-       m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
+       m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset);
 
        BUG_ON(m->on_heap);
        bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
@@ -1056,22 +1063,20 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
        if (!ob)
                return NULL;
 
-       ca      = bch_dev_bkey_exists(c, ob->ptr.dev);
+       ca      = bch_dev_bkey_exists(c, ob->dev);
        offset  = ca->mi.bucket_size - ob->sectors_free;
 
        return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
 }
 
-void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
-                            struct bpos pos, unsigned sectors)
+void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob,
+                            struct bkey *k)
 {
-       struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
-       struct ec_stripe_new *ec;
+       struct ec_stripe_new *ec = ob->ec;
 
-       if (!ob)
+       if (!ec)
                return;
 
-       ec = ob->ec;
        mutex_lock(&ec->lock);
 
        if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
@@ -1081,8 +1086,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
        }
 
        bkey_init(&ec->keys.top->k);
-       ec->keys.top->k.p       = pos;
-       bch2_key_resize(&ec->keys.top->k, sectors);
+       ec->keys.top->k.p       = k->p;
+       ec->keys.top->k.size    = k->size;
        bch2_keylist_push(&ec->keys);
 
        mutex_unlock(&ec->lock);
@@ -1147,8 +1152,8 @@ static void ec_stripe_key_init(struct bch_fs *c,
        s->v.algorithm                  = 0;
        s->v.nr_blocks                  = nr_data + nr_parity;
        s->v.nr_redundant               = nr_parity;
-       s->v.csum_granularity_bits      = ilog2(c->sb.encoded_extent_max);
-       s->v.csum_type                  = BCH_CSUM_CRC32C;
+       s->v.csum_granularity_bits      = ilog2(c->opts.encoded_extent_max >> 9);
+       s->v.csum_type                  = BCH_CSUM_crc32c;
        s->v.pad                        = 0;
 
        while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
@@ -1266,16 +1271,15 @@ found:
        return h;
 }
 
-static enum bucket_alloc_ret
-new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
-                        struct closure *cl)
+static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
+                                   struct closure *cl)
 {
        struct bch_devs_mask devs = h->devs;
        struct open_bucket *ob;
        struct open_buckets buckets;
        unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
        bool have_cache = true;
-       enum bucket_alloc_ret ret = ALLOC_SUCCESS;
+       int ret = 0;
 
        for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
                if (test_bit(i, h->s->blocks_gotten)) {
@@ -1314,7 +1318,7 @@ new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
                        BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
 
                        h->s->blocks[j] = buckets.v[i];
-                       h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+                       h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob);
                        __set_bit(j, h->s->blocks_gotten);
                }
 
@@ -1342,7 +1346,7 @@ new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
                        BUG_ON(j >= h->s->nr_data);
 
                        h->s->blocks[j] = buckets.v[i];
-                       h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
+                       h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob);
                        __set_bit(j, h->s->blocks_gotten);
                }
 
@@ -1375,7 +1379,7 @@ static s64 get_existing_stripe(struct bch_fs *c,
                        continue;
 
                stripe_idx = h->data[heap_idx].idx;
-               m = genradix_ptr(&c->stripes[0], stripe_idx);
+               m = genradix_ptr(&c->stripes, stripe_idx);
 
                if (m->algorithm        == head->algo &&
                    m->nr_redundant     == head->redundancy &&
@@ -1510,7 +1514,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
 
 err:
        bch2_ec_stripe_head_put(c, h);
-       return ERR_PTR(-ret);
+       return ERR_PTR(ret);
 }
 
 void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
@@ -1531,7 +1535,7 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
                                continue;
 
                        ob = c->open_buckets + h->s->blocks[i];
-                       if (ob->ptr.dev == ca->dev_idx)
+                       if (ob->dev == ca->dev_idx)
                                goto found;
                }
                goto unlock;
@@ -1549,145 +1553,59 @@ void bch2_stripes_heap_start(struct bch_fs *c)
        struct genradix_iter iter;
        struct stripe *m;
 
-       genradix_for_each(&c->stripes[0], iter, m)
+       genradix_for_each(&c->stripes, iter, m)
                if (m->alive)
                        bch2_stripes_heap_insert(c, m, iter.pos);
 }
 
-static int __bch2_stripe_write_key(struct btree_trans *trans,
-                                  struct btree_iter *iter,
-                                  struct stripe *m,
-                                  size_t idx,
-                                  struct bkey_i_stripe *new_key)
+int bch2_stripes_read(struct bch_fs *c)
 {
-       const struct bch_stripe *v;
+       struct btree_trans trans;
+       struct btree_iter iter;
        struct bkey_s_c k;
+       const struct bch_stripe *s;
+       struct stripe *m;
        unsigned i;
        int ret;
 
-       bch2_btree_iter_set_pos(iter, POS(0, idx));
-
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
-       if (ret)
-               return ret;
-
-       if (k.k->type != KEY_TYPE_stripe)
-               return -EIO;
-
-       v = bkey_s_c_to_stripe(k).v;
-       for (i = 0; i < v->nr_blocks; i++)
-               if (m->block_sectors[i] != stripe_blockcount_get(v, i))
-                       goto write;
-       return 0;
-write:
-       bkey_reassemble(&new_key->k_i, k);
-
-       for (i = 0; i < new_key->v.nr_blocks; i++)
-               stripe_blockcount_set(&new_key->v, i,
-                                     m->block_sectors[i]);
-
-       return bch2_trans_update(trans, iter, &new_key->k_i, 0);
-}
-
-int bch2_stripes_write(struct bch_fs *c, unsigned flags)
-{
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       struct genradix_iter giter;
-       struct bkey_i_stripe *new_key;
-       struct stripe *m;
-       int ret = 0;
-
-       new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
-       BUG_ON(!new_key);
-
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN,
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
-       genradix_for_each(&c->stripes[0], giter, m) {
-               if (!m->alive)
+       for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               if (k.k->type != KEY_TYPE_stripe)
                        continue;
 
-               ret = __bch2_trans_do(&trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|flags,
-                       __bch2_stripe_write_key(&trans, iter, m,
-                                       giter.pos, new_key));
-
+               ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
                if (ret)
                        break;
-       }
-       bch2_trans_iter_put(&trans, iter);
 
-       bch2_trans_exit(&trans);
+               s = bkey_s_c_to_stripe(k).v;
 
-       kfree(new_key);
-
-       return ret;
-}
+               m = genradix_ptr(&c->stripes, k.k->p.offset);
+               m->alive        = true;
+               m->sectors      = le16_to_cpu(s->sectors);
+               m->algorithm    = s->algorithm;
+               m->nr_blocks    = s->nr_blocks;
+               m->nr_redundant = s->nr_redundant;
+               m->blocks_nonempty = 0;
 
-static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k)
-{
-       int ret = 0;
+               for (i = 0; i < s->nr_blocks; i++)
+                       m->blocks_nonempty += !!stripe_blockcount_get(s, i);
 
-       if (k.k->type == KEY_TYPE_stripe)
-               ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?:
-                       bch2_mark_key(c, k,
-                                     BTREE_TRIGGER_INSERT|
-                                     BTREE_TRIGGER_NOATOMIC);
+               spin_lock(&c->ec_stripes_heap_lock);
+               bch2_stripes_heap_update(c, m, k.k->p.offset);
+               spin_unlock(&c->ec_stripes_heap_lock);
+       }
+       bch2_trans_iter_exit(&trans, &iter);
 
-       return ret;
-}
+       bch2_trans_exit(&trans);
 
-int bch2_stripes_read(struct bch_fs *c)
-{
-       int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes,
-                                             bch2_stripes_read_fn);
        if (ret)
                bch_err(c, "error reading stripes: %i", ret);
 
        return ret;
 }
 
-int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
-{
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bkey_s_c k;
-       size_t i, idx = 0;
-       int ret = 0;
-
-       bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0);
-
-       k = bch2_btree_iter_prev(iter);
-       if (!IS_ERR_OR_NULL(k.k))
-               idx = k.k->p.offset + 1;
-
-       bch2_trans_iter_put(&trans, iter);
-       ret = bch2_trans_exit(&trans);
-       if (ret)
-               return ret;
-
-       if (!idx)
-               return 0;
-
-       if (!gc &&
-           !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
-                      GFP_KERNEL))
-               return -ENOMEM;
-#if 0
-       ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL);
-#else
-       for (i = 0; i < idx; i++)
-               if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
-                       return -ENOMEM;
-#endif
-       return 0;
-}
-
 void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
 {
        ec_stripes_heap *h = &c->ec_stripes_heap;
@@ -1696,7 +1614,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
 
        spin_lock(&c->ec_stripes_heap_lock);
        for (i = 0; i < min_t(size_t, h->used, 20); i++) {
-               m = genradix_ptr(&c->stripes[0], h->data[i].idx);
+               m = genradix_ptr(&c->stripes, h->data[i].idx);
 
                pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
                       h->data[i].blocks_nonempty,
@@ -1754,7 +1672,7 @@ void bch2_fs_ec_exit(struct bch_fs *c)
        BUG_ON(!list_empty(&c->ec_stripe_new_list));
 
        free_heap(&c->ec_stripes_heap);
-       genradix_free(&c->stripes[0]);
+       genradix_free(&c->stripes);
        bioset_exit(&c->ec_bioset);
 }
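Note: the two-element c->stripes[] array (runtime copy and GC copy) is gone: GC now keeps its own radix tree of the smaller struct gc_stripe (see ec_types.h below), and bch2_stripes_read() rebuilds runtime stripe state by walking the stripes btree directly instead of going through bch2_btree_and_journal_walk(). After the split, access looks like (idx is a stripe index assumed valid):

        struct stripe *m    = genradix_ptr(&c->stripes, idx);          /* runtime state */
        struct gc_stripe *g = genradix_ptr(&c->gc_stripes, idx);       /* GC's mirror */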
 
index e79626b59509a3082ecef7f06acba9afbabeb21f..78d468c7680a2f167070297392dbcabba1204f95 100644 (file)
@@ -108,7 +108,7 @@ static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
                                         le16_to_cpu(s->sectors));
 }
 
-static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m,
+static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m,
                                             struct extent_ptr_decoded p)
 {
        unsigned nr_data = m->nr_blocks - m->nr_redundant;
@@ -193,8 +193,8 @@ struct ec_stripe_head {
 int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
 
 void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
-void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
-                            struct bpos, unsigned);
+void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *,
+                            struct bkey *);
 
 void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
 void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
@@ -216,9 +216,6 @@ void bch2_ec_flush_new_stripes(struct bch_fs *);
 void bch2_stripes_heap_start(struct bch_fs *);
 
 int bch2_stripes_read(struct bch_fs *);
-int bch2_stripes_write(struct bch_fs *, unsigned);
-
-int bch2_ec_mem_alloc(struct bch_fs *, bool);
 
 void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
 void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
index 3fc31222459a81fd92ec2fbdc1225d5bf9bedbfe..edd93da663c1f50b110ee764aa6df1290aa6c687 100644 (file)
@@ -21,6 +21,15 @@ struct stripe {
        unsigned                alive:1; /* does a corresponding key exist in stripes btree? */
        unsigned                on_heap:1;
        u8                      blocks_nonempty;
+};
+
+struct gc_stripe {
+       u16                     sectors;
+
+       u8                      nr_blocks;
+       u8                      nr_redundant;
+
+       unsigned                alive:1; /* does a corresponding key exist in stripes btree? */
        u16                     block_sectors[BCH_BKEY_PTRS_MAX];
        struct bch_extent_ptr   ptrs[BCH_BKEY_PTRS_MAX];
 
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
new file mode 100644 (file)
index 0000000..f7d1291
--- /dev/null
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ERRCODE_H
+#define _BCACHEFS_ERRCODE_H
+
+enum {
+       /* Bucket allocator: */
+       OPEN_BUCKETS_EMPTY =    2048,
+       FREELIST_EMPTY,         /* Allocator thread not keeping up */
+       INSUFFICIENT_DEVICES,
+};
+
+#endif /* _BCACHEFS_ERRCODE_H */
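Note: errcode.h opens a namespace for bcachefs-private error codes. Starting them at 2048 keeps their negations clear of every real errno, so they can flow through the same int and ERR_PTR() plumbing as -ENOMEM and friends; that is why the ec.c hunk above now returns ERR_PTR(ret) instead of ERR_PTR(-ret). Intended use, sketched with a hypothetical call site:

        /* hypothetical allocator path: */
        if (!nr_open_buckets)                   /* nr_open_buckets: illustrative */
                return -OPEN_BUCKETS_EMPTY;     /* -2048: unambiguously not an errno */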
index 2cea694575e99a4df6c4af282c76cbc77230f06e..8279a9ba76a5c5e91524512d31e966e566cb240e 100644 (file)
@@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
                return false;
        case BCH_ON_ERROR_ro:
                if (bch2_fs_emergency_read_only(c))
-                       bch_err(c, "emergency read only");
+                       bch_err(c, "inconsistency detected - emergency read only");
                return true;
        case BCH_ON_ERROR_panic:
                panic(bch2_fmt(c, "panic after error"));
@@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c)
 void bch2_fatal_error(struct bch_fs *c)
 {
        if (bch2_fs_emergency_read_only(c))
-               bch_err(c, "emergency read only");
+               bch_err(c, "fatal error - emergency read only");
 }
 
 void bch2_io_error_work(struct work_struct *work)
index 4a8dd085f7fb80b2650f5bab433cae077343d1e3..58b2c96f450c9ba8a4787431665e5e42b7b9833f 100644 (file)
@@ -58,10 +58,10 @@ static int count_iters_for_insert(struct btree_trans *trans,
                u64 idx = le64_to_cpu(p.v->idx);
                unsigned sectors = bpos_min(*end, p.k->p).offset -
                        bkey_start_offset(p.k);
-               struct btree_iter *iter;
+               struct btree_iter iter;
                struct bkey_s_c r_k;
 
-               for_each_btree_key(trans, iter,
+               for_each_btree_key_norestart(trans, iter,
                                   BTREE_ID_reflink, POS(0, idx + offset),
                                   BTREE_ITER_SLOTS, r_k, ret2) {
                        if (bkey_cmp(bkey_start_pos(r_k.k),
@@ -83,8 +83,8 @@ static int count_iters_for_insert(struct btree_trans *trans,
                                break;
                        }
                }
+               bch2_trans_iter_exit(trans, &iter);
 
-               bch2_trans_iter_put(trans, iter);
                break;
        }
        }
@@ -94,12 +94,12 @@ static int count_iters_for_insert(struct btree_trans *trans,
 
 #define EXTENT_ITERS_MAX       (BTREE_ITER_MAX / 3)
 
-int bch2_extent_atomic_end(struct btree_iter *iter,
+int bch2_extent_atomic_end(struct btree_trans *trans,
+                          struct btree_iter *iter,
                           struct bkey_i *insert,
                           struct bpos *end)
 {
-       struct btree_trans *trans = iter->trans;
-       struct btree_iter *copy;
+       struct btree_iter copy;
        struct bkey_s_c k;
        unsigned nr_iters = 0;
        int ret;
@@ -118,9 +118,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
        if (ret < 0)
                return ret;
 
-       copy = bch2_trans_copy_iter(trans, iter);
+       bch2_trans_copy_iter(&copy, iter);
 
-       for_each_btree_key_continue(copy, 0, k, ret) {
+       for_each_btree_key_continue_norestart(copy, 0, k, ret) {
                unsigned offset = 0;
 
                if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
@@ -149,31 +149,21 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
                        break;
        }
 
-       bch2_trans_iter_put(trans, copy);
+       bch2_trans_iter_exit(trans, &copy);
        return ret < 0 ? ret : 0;
 }
 
-int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+int bch2_extent_trim_atomic(struct btree_trans *trans,
+                           struct btree_iter *iter,
+                           struct bkey_i *k)
 {
        struct bpos end;
        int ret;
 
-       ret = bch2_extent_atomic_end(iter, k, &end);
+       ret = bch2_extent_atomic_end(trans, iter, k, &end);
        if (ret)
                return ret;
 
        bch2_cut_back(end, k);
        return 0;
 }
-
-int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
-{
-       struct bpos end;
-       int ret;
-
-       ret = bch2_extent_atomic_end(iter, k, &end);
-       if (ret)
-               return ret;
-
-       return !bkey_cmp(end, k->k.p);
-}
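The conversion above is the pattern repeated through the rest of this snapshot: btree iterators stop being heap-allocated handles returned by the transaction layer and become plain on-stack structs, bch2_trans_iter_put() becomes bch2_trans_iter_exit(), and the _norestart iteration variants let the caller observe transaction restarts instead of looping internally. The new shape, as a sketch:

    struct btree_iter iter;
    struct bkey_s_c k;
    int ret = 0;

    for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink,
                                 POS(0, idx), BTREE_ITER_SLOTS, k, ret) {
            /* ... examine k, break early if done ... */
    }
    bch2_trans_iter_exit(trans, &iter);     /* always paired, even on error */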
diff --git a/libbcachefs/extent_update.h b/libbcachefs/extent_update.h
index 2fa4602967e04f5cf02033a19d5d134ba90dcf8c..6f5cf449361a7f1aa6661086c43110c8b2e15455 100644
@@ -4,9 +4,9 @@
 
 #include "bcachefs.h"
 
-int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
-                          struct bpos *);
-int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *,
+                          struct bkey_i *, struct bpos *);
+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *,
+                           struct bkey_i *);
 
 #endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 563e13057f5f2411cef336f4f5644becec9f058a..44c584e9adaa8691a6335e10b70dab66e9a2bd0d 100644
@@ -303,7 +303,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 
                        if (lp.crc.csum_type &&
                            lp.crc.uncompressed_size +
-                           rp.crc.uncompressed_size > c->sb.encoded_extent_max)
+                           rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
                                return false;
 
                        if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
@@ -480,7 +480,7 @@ restart_narrow_pointers:
 
        bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
                if (can_narrow_crc(p.crc, n)) {
-                       bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+                       __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
                        p.ptr.offset += p.crc.offset;
                        p.crc = n;
                        bch2_extent_ptr_decoded_append(k, &p);
@@ -612,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
        return false;
 }
 
-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
-                               unsigned nr_replicas, bool compressed)
-{
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bpos end = pos;
-       struct bkey_s_c k;
-       bool ret = true;
-       int err;
-
-       end.offset += size;
-
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
-                          BTREE_ITER_SLOTS, k, err) {
-               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
-                       break;
-
-               if (nr_replicas > bch2_bkey_replicas(c, k) ||
-                   (!compressed && bch2_bkey_sectors_compressed(k))) {
-                       ret = false;
-                       break;
-               }
-       }
-       bch2_trans_iter_put(&trans, iter);
-
-       bch2_trans_exit(&trans);
-
-       return ret;
-}
-
 unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -817,41 +785,85 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
        return i;
 }
 
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
-                                          struct bch_extent_ptr *ptr)
+static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+       union bch_extent_entry *next = extent_entry_next(entry);
+
+       /* stripes have ptrs, but their layout doesn't work with this code */
+       BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+       memmove_u64s_down(entry, next,
+                         (u64 *) bkey_val_end(k) - (u64 *) next);
+       k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
+/*
+ * Returns pointer to the next entry after the one being dropped:
+ */
+union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
+                                            struct bch_extent_ptr *ptr)
 {
        struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       union bch_extent_entry *dst, *src, *prev;
+       union bch_extent_entry *entry = to_entry(ptr), *next;
+       union bch_extent_entry *ret = entry;
        bool drop_crc = true;
 
        EBUG_ON(ptr < &ptrs.start->ptr ||
                ptr >= &ptrs.end->ptr);
        EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
 
-       src = extent_entry_next(to_entry(ptr));
-       if (src != ptrs.end &&
-           !extent_entry_is_crc(src))
-               drop_crc = false;
-
-       dst = to_entry(ptr);
-       while ((prev = extent_entry_prev(ptrs, dst))) {
-               if (extent_entry_is_ptr(prev))
+       for (next = extent_entry_next(entry);
+            next != ptrs.end;
+            next = extent_entry_next(next)) {
+               if (extent_entry_is_crc(next)) {
                        break;
-
-               if (extent_entry_is_crc(prev)) {
-                       if (drop_crc)
-                               dst = prev;
+               } else if (extent_entry_is_ptr(next)) {
+                       drop_crc = false;
                        break;
                }
+       }
 
-               dst = prev;
+       extent_entry_drop(k, entry);
+
+       while ((entry = extent_entry_prev(ptrs, entry))) {
+               if (extent_entry_is_ptr(entry))
+                       break;
+
+               if ((extent_entry_is_crc(entry) && drop_crc) ||
+                   extent_entry_is_stripe_ptr(entry)) {
+                       ret = (void *) ret - extent_entry_bytes(entry);
+                       extent_entry_drop(k, entry);
+               }
        }
 
-       memmove_u64s_down(dst, src,
-                         (u64 *) ptrs.end - (u64 *) src);
-       k.k->u64s -= (u64 *) src - (u64 *) dst;
+       return ret;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+                                          struct bch_extent_ptr *ptr)
+{
+       bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
+       union bch_extent_entry *ret =
+               __bch2_bkey_drop_ptr(k, ptr);
+
+       /*
+        * If we deleted all the dirty pointers and there's still cached
+        * pointers, we could set the cached pointers to dirty if they're not
+        * stale - but to do that correctly we'd need to grab an open_bucket
+        * reference so that we don't race with bucket reuse:
+        */
+       if (have_dirty &&
+           !bch2_bkey_dirty_devs(k.s_c).nr) {
+               k.k->type = KEY_TYPE_error;
+               set_bkey_val_u64s(k.k, 0);
+               ret = NULL;
+       } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
+               k.k->type = KEY_TYPE_deleted;
+               set_bkey_val_u64s(k.k, 0);
+               ret = NULL;
+       }
 
-       return dst;
+       return ret;
 }
 
 void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
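The pointer-drop path is now split: __bch2_bkey_drop_ptr() is the raw primitive that removes the entry (plus any crc or stripe_ptr entries that existed only for it), while the bch2_bkey_drop_ptr() wrapper also demotes the key once nothing useful remains, to KEY_TYPE_error if the last dirty pointer is gone (the data is lost) or KEY_TYPE_deleted if only cached pointers were left. A sketch of a device-removal style caller, using the existing bch2_bkey_drop_ptrs() helper macro that is built on the wrapper:

    /* illustrative: drop every pointer to one device */
    struct bch_extent_ptr *ptr;

    bch2_bkey_drop_ptrs(bkey_i_to_s(k), ptr, ptr->dev == dev_idx);
    /* the key may now be KEY_TYPE_error: the data is unreachable, but
     * its size/position survive for error reporting on later reads */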
@@ -921,10 +933,6 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
                ptr->cached &&
                ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
 
-       /* will only happen if all pointers were cached: */
-       if (!bch2_bkey_nr_ptrs(k.s_c))
-               k.k->type = KEY_TYPE_deleted;
-
        return bkey_deleted(k.k);
 }
 
@@ -961,12 +969,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                case BCH_EXTENT_ENTRY_crc128:
                        crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-                       pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+                       pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
                               crc.compressed_size,
                               crc.uncompressed_size,
                               crc.offset, crc.nonce,
-                              crc.csum_type,
-                              crc.compression_type);
+                              bch2_csum_types[crc.csum_type],
+                              bch2_compression_types[crc.compression_type]);
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
                        ec = &entry->stripe_ptr;
@@ -1030,7 +1038,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
 
        if (k.k->type == KEY_TYPE_btree_ptr ||
            k.k->type == KEY_TYPE_btree_ptr_v2)
-               size_ondisk = c->opts.btree_node_size;
+               size_ondisk = btree_sectors(c);
 
        bkey_extent_entry_for_each(ptrs, entry) {
                if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 43cef0a3bdf3870f47afc849f5f54700e3664824..9c2567274a2b8d286707d6b1b3594b3d04007ac3 100644
@@ -78,12 +78,12 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
 
 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
 {
-       switch (extent_entry_type(e)) {
-       case BCH_EXTENT_ENTRY_ptr:
-               return true;
-       default:
-               return false;
-       }
+       return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
+{
+       return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
 }
 
 static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
@@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
 bool bch2_bkey_is_incompressible(struct bkey_s_c);
 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
 
 unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
@@ -579,6 +578,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
 void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
 void bch2_extent_ptr_decoded_append(struct bkey_i *,
                                    struct extent_ptr_decoded *);
+union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
+                                            struct bch_extent_ptr *);
 union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
                                           struct bch_extent_ptr *);
 
diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h
index 26d5cad7e6a5fd37b8640130c319f932def8e317..05429c9631cdad6eced17ff7638cd61651e12bf5 100644
  *
  * With one based indexing each level of the tree starts at a power of two -
  * good for cacheline alignment:
- *
- * Size parameter is treated as if we were using 0 based indexing, however:
- * valid nodes, and inorder indices, are in the range [1..size) - that is, there
- * are actually size - 1 elements
  */
 
 static inline unsigned eytzinger1_child(unsigned i, unsigned child)
@@ -42,12 +38,12 @@ static inline unsigned eytzinger1_right_child(unsigned i)
 
 static inline unsigned eytzinger1_first(unsigned size)
 {
-       return rounddown_pow_of_two(size - 1);
+       return rounddown_pow_of_two(size);
 }
 
 static inline unsigned eytzinger1_last(unsigned size)
 {
-       return rounddown_pow_of_two(size) - 1;
+       return rounddown_pow_of_two(size + 1) - 1;
 }
 
 /*
@@ -62,13 +58,13 @@ static inline unsigned eytzinger1_last(unsigned size)
 
 static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 {
-       EBUG_ON(i >= size);
+       EBUG_ON(i > size);
 
-       if (eytzinger1_right_child(i) < size) {
+       if (eytzinger1_right_child(i) <= size) {
                i = eytzinger1_right_child(i);
 
-               i <<= __fls(size) - __fls(i);
-               i >>= i >= size;
+               i <<= __fls(size + 1) - __fls(i);
+               i >>= i > size;
        } else {
                i >>= ffz(i) + 1;
        }
@@ -78,14 +74,14 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 
 static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
 {
-       EBUG_ON(i >= size);
+       EBUG_ON(i > size);
 
-       if (eytzinger1_left_child(i) < size) {
+       if (eytzinger1_left_child(i) <= size) {
                i = eytzinger1_left_child(i) + 1;
 
-               i <<= __fls(size) - __fls(i);
+               i <<= __fls(size + 1) - __fls(i);
                i -= 1;
-               i >>= i >= size;
+               i >>= i > size;
        } else {
                i >>= __ffs(i) + 1;
        }
@@ -95,17 +91,17 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
 
 static inline unsigned eytzinger1_extra(unsigned size)
 {
-       return (size - rounddown_pow_of_two(size - 1)) << 1;
+       return (size + 1 - rounddown_pow_of_two(size)) << 1;
 }
 
 static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
                                              unsigned extra)
 {
        unsigned b = __fls(i);
-       unsigned shift = __fls(size - 1) - b;
+       unsigned shift = __fls(size) - b;
        int s;
 
-       EBUG_ON(!i || i >= size);
+       EBUG_ON(!i || i > size);
 
        i  ^= 1U << b;
        i <<= 1;
@@ -130,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
        unsigned shift;
        int s;
 
-       EBUG_ON(!i || i >= size);
+       EBUG_ON(!i || i > size);
 
        /*
         * sign bit trick:
@@ -144,7 +140,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
        shift = __ffs(i);
 
        i >>= shift + 1;
-       i  |= 1U << (__fls(size - 1) - shift);
+       i  |= 1U << (__fls(size) - shift);
 
        return i;
 }
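The convention change running through these helpers: size is now the number of elements, with valid indices in [1..size] inclusive, where previously the valid range was the half-open [1..size). A worked traversal under the new convention (tree[] and process() are illustrative):

    unsigned i;

    /* in-order (sorted) walk; for nr == 7 this visits 4 2 5 1 6 3 7 */
    for (i = eytzinger1_first(nr); i; i = eytzinger1_next(i, nr))
            process(tree[i]);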
@@ -185,39 +181,39 @@ static inline unsigned eytzinger0_right_child(unsigned i)
 
 static inline unsigned eytzinger0_first(unsigned size)
 {
-       return eytzinger1_first(size + 1) - 1;
+       return eytzinger1_first(size) - 1;
 }
 
 static inline unsigned eytzinger0_last(unsigned size)
 {
-       return eytzinger1_last(size + 1) - 1;
+       return eytzinger1_last(size) - 1;
 }
 
 static inline unsigned eytzinger0_next(unsigned i, unsigned size)
 {
-       return eytzinger1_next(i + 1, size + 1) - 1;
+       return eytzinger1_next(i + 1, size) - 1;
 }
 
 static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
 {
-       return eytzinger1_prev(i + 1, size + 1) - 1;
+       return eytzinger1_prev(i + 1, size) - 1;
 }
 
 static inline unsigned eytzinger0_extra(unsigned size)
 {
-       return eytzinger1_extra(size + 1);
+       return eytzinger1_extra(size);
 }
 
 static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
                                               unsigned extra)
 {
-       return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
+       return __eytzinger1_to_inorder(i + 1, size, extra) - 1;
 }
 
 static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
                                               unsigned extra)
 {
-       return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
+       return __inorder_to_eytzinger1(i + 1, size, extra) - 1;
 }
 
 static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
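With both layers now agreeing that size means element count, the 0-based wrappers only translate indices by one and pass size through untouched, instead of compensating with size + 1. The equivalent 0-based walk (sketch; note eytzinger0_next() yields (unsigned) -1 past the last element, since the 1-based layer returned 0):

    unsigned i;

    for (i = eytzinger0_first(nr);
         i != (unsigned) -1;
         i = eytzinger0_next(i, nr))
            process(tree[i]);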
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 2189a11ccad8d42e89a3749f7b19836d76f9a053..d543480be111796702d9f5c5f2ced168f4815e16 100644
 #include "dirent.h"
 #include "fs-common.h"
 #include "inode.h"
+#include "subvolume.h"
 #include "xattr.h"
 
 #include <linux/posix_acl.h>
 
-int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+{
+       return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+}
+
+int bch2_create_trans(struct btree_trans *trans,
+                     subvol_inum dir,
                      struct bch_inode_unpacked *dir_u,
                      struct bch_inode_unpacked *new_inode,
                      const struct qstr *name,
                      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
                      struct posix_acl *default_acl,
-                     struct posix_acl *acl)
+                     struct posix_acl *acl,
+                     subvol_inum snapshot_src,
+                     unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *dir_iter = NULL;
-       struct btree_iter *inode_iter = NULL;
-       struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
+       struct btree_iter dir_iter = { NULL };
+       struct btree_iter inode_iter = { NULL };
+       subvol_inum new_inum = dir;
        u64 now = bch2_current_time(c);
        u64 cpu = raw_smp_processor_id();
-       u64 dir_offset = 0;
+       u64 dir_target;
+       u32 snapshot;
+       unsigned dir_type = mode_to_type(mode);
        int ret;
 
-       dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(dir_iter);
+       ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
        if (ret)
                goto err;
 
-       bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
-
-       if (!name)
-               new_inode->bi_flags |= BCH_INODE_UNLINKED;
-
-       inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       if (default_acl) {
-               ret = bch2_set_acl_trans(trans, new_inode, &hash,
-                                        default_acl, ACL_TYPE_DEFAULT);
+       if (!(flags & BCH_CREATE_SNAPSHOT)) {
+               /* Normal create path - allocate a new inode: */
+               bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
+
+               if (flags & BCH_CREATE_TMPFILE)
+                       new_inode->bi_flags |= BCH_INODE_UNLINKED;
+
+               ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu);
                if (ret)
                        goto err;
+
+               snapshot_src = (subvol_inum) { 0 };
+       } else {
+               /*
+                * Creating a snapshot - we're not allocating a new inode, but
+                * we do have to lookup the root inode of the subvolume we're
+                * snapshotting and update it (in the new snapshot):
+                */
+
+               if (!snapshot_src.inum) {
+                       /* Inode wasn't specified, just snapshot: */
+                       struct bch_subvolume s;
+
+                       ret = bch2_subvolume_get(trans, snapshot_src.subvol, true,
+                                                BTREE_ITER_CACHED, &s);
+                       if (ret)
+                               goto err;
+
+                       snapshot_src.inum = le64_to_cpu(s.inode);
+               }
+
+               ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src,
+                                     BTREE_ITER_INTENT);
+               if (ret)
+                       goto err;
+
+               if (new_inode->bi_subvol != snapshot_src.subvol) {
+                       /* Not a subvolume root: */
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               /*
+                * If we're not root, we have to own the subvolume being
+                * snapshotted:
+                */
+               if (uid && new_inode->bi_uid != uid) {
+                       ret = -EPERM;
+                       goto err;
+               }
+
+               flags |= BCH_CREATE_SUBVOL;
        }
 
-       if (acl) {
-               ret = bch2_set_acl_trans(trans, new_inode, &hash,
-                                        acl, ACL_TYPE_ACCESS);
+       new_inum.inum   = new_inode->bi_inum;
+       dir_target      = new_inode->bi_inum;
+
+       if (flags & BCH_CREATE_SUBVOL) {
+               u32 new_subvol, dir_snapshot;
+
+               ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+                                           snapshot_src.subvol,
+                                           &new_subvol, &snapshot,
+                                           (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
+               if (ret)
+                       goto err;
+
+               new_inode->bi_parent_subvol     = dir.subvol;
+               new_inode->bi_subvol            = new_subvol;
+               new_inum.subvol                 = new_subvol;
+               dir_target                      = new_subvol;
+               dir_type                        = DT_SUBVOL;
+
+               ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot);
+               if (ret)
+                       goto err;
+
+               bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
+               ret = bch2_btree_iter_traverse(&dir_iter);
                if (ret)
                        goto err;
        }
 
-       if (name) {
+       if (!(flags & BCH_CREATE_SNAPSHOT)) {
+               if (default_acl) {
+                       ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+                                                default_acl, ACL_TYPE_DEFAULT);
+                       if (ret)
+                               goto err;
+               }
+
+               if (acl) {
+                       ret = bch2_set_acl_trans(trans, new_inum, new_inode,
+                                                acl, ACL_TYPE_ACCESS);
+                       if (ret)
+                               goto err;
+               }
+       }
+
+       if (!(flags & BCH_CREATE_TMPFILE)) {
                struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
-               dir_u->bi_mtime = dir_u->bi_ctime = now;
+               u64 dir_offset;
 
-               if (S_ISDIR(new_inode->bi_mode))
+               if (is_subdir_for_nlink(new_inode))
                        dir_u->bi_nlink++;
+               dir_u->bi_mtime = dir_u->bi_ctime = now;
 
-               ret = bch2_inode_write(trans, dir_iter, dir_u);
+               ret = bch2_inode_write(trans, &dir_iter, dir_u);
                if (ret)
                        goto err;
 
-               ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
-                                        mode_to_type(new_inode->bi_mode),
-                                        name, new_inode->bi_inum,
+               ret = bch2_dirent_create(trans, dir, &dir_hash,
+                                        dir_type,
+                                        name,
+                                        dir_target,
                                         &dir_offset,
                                         BCH_HASH_SET_MUST_CREATE);
                if (ret)
                        goto err;
-       }
 
-       if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
-               new_inode->bi_dir               = dir_u->bi_inum;
-               new_inode->bi_dir_offset        = dir_offset;
+               if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
+                       new_inode->bi_dir               = dir_u->bi_inum;
+                       new_inode->bi_dir_offset        = dir_offset;
+               }
        }
 
-       /* XXX use bch2_btree_iter_set_snapshot() */
-       inode_iter->snapshot = U32_MAX;
-       bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
+       inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+       bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
 
-       ret   = bch2_btree_iter_traverse(inode_iter) ?:
-               bch2_inode_write(trans, inode_iter, new_inode);
+       ret   = bch2_btree_iter_traverse(&inode_iter) ?:
+               bch2_inode_write(trans, &inode_iter, new_inode);
 err:
-       bch2_trans_iter_put(trans, inode_iter);
-       bch2_trans_iter_put(trans, dir_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
+       bch2_trans_iter_exit(trans, &dir_iter);
        return ret;
 }
 
-int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
-                   u64 inum, struct bch_inode_unpacked *dir_u,
-                   struct bch_inode_unpacked *inode_u, const struct qstr *name)
+int bch2_link_trans(struct btree_trans *trans,
+                   subvol_inum dir,  struct bch_inode_unpacked *dir_u,
+                   subvol_inum inum, struct bch_inode_unpacked *inode_u,
+                   const struct qstr *name)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
+       struct btree_iter dir_iter = { NULL };
+       struct btree_iter inode_iter = { NULL };
        struct bch_hash_info dir_hash;
        u64 now = bch2_current_time(c);
        u64 dir_offset = 0;
        int ret;
 
-       inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       if (dir.subvol != inum.subvol)
+               return -EXDEV;
+
+       ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        inode_u->bi_ctime = now;
        bch2_inode_nlink_inc(inode_u);
 
-       dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
-       ret = PTR_ERR_OR_ZERO(dir_iter);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
@@ -121,84 +214,110 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
 
        dir_hash = bch2_hash_info_init(c, dir_u);
 
-       ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
+       ret = bch2_dirent_create(trans, dir, &dir_hash,
                                 mode_to_type(inode_u->bi_mode),
-                                name, inum, &dir_offset,
+                                name, inum.inum, &dir_offset,
                                 BCH_HASH_SET_MUST_CREATE);
        if (ret)
                goto err;
 
        if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
-               inode_u->bi_dir         = dir_inum;
+               inode_u->bi_dir         = dir.inum;
                inode_u->bi_dir_offset  = dir_offset;
        }
 
-       ret =   bch2_inode_write(trans, dir_iter, dir_u) ?:
-               bch2_inode_write(trans, inode_iter, inode_u);
+       ret =   bch2_inode_write(trans, &dir_iter, dir_u) ?:
+               bch2_inode_write(trans, &inode_iter, inode_u);
 err:
-       bch2_trans_iter_put(trans, dir_iter);
-       bch2_trans_iter_put(trans, inode_iter);
+       bch2_trans_iter_exit(trans, &dir_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
        return ret;
 }
 
 int bch2_unlink_trans(struct btree_trans *trans,
-                     u64 dir_inum, struct bch_inode_unpacked *dir_u,
+                     subvol_inum dir,
+                     struct bch_inode_unpacked *dir_u,
                      struct bch_inode_unpacked *inode_u,
-                     const struct qstr *name)
+                     const struct qstr *name,
+                     bool deleting_snapshot)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
-                         *inode_iter = NULL;
+       struct btree_iter dir_iter = { NULL };
+       struct btree_iter dirent_iter = { NULL };
+       struct btree_iter inode_iter = { NULL };
        struct bch_hash_info dir_hash;
-       u64 inum, now = bch2_current_time(c);
+       subvol_inum inum;
+       u64 now = bch2_current_time(c);
        struct bkey_s_c k;
        int ret;
 
-       dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(dir_iter);
+       ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        dir_hash = bch2_hash_info_init(c, dir_u);
 
-       dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
-                                                name, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(dirent_iter);
+       ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+                                        name, &inum, BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       k = bch2_btree_iter_peek_slot(dirent_iter);
-       ret = bkey_err(k);
+       ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum,
+                             BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
+       if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+               ret = bch2_empty_dir_trans(trans, inum);
+               if (ret)
+                       goto err;
+       }
 
-       inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
-       if (ret)
+       if (deleting_snapshot && !inode_u->bi_subvol) {
+               ret = -ENOENT;
                goto err;
+       }
+
+       if (deleting_snapshot || inode_u->bi_subvol) {
+               ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
+               if (ret)
+                       goto err;
 
-       if (inode_u->bi_dir             == k.k->p.inode &&
-           inode_u->bi_dir_offset      == k.k->p.offset) {
+               k = bch2_btree_iter_peek_slot(&dirent_iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               /*
+                * If we're deleting a subvolume, we need to really delete the
+                * dirent, not just emit a whiteout in the current snapshot:
+                */
+               bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot);
+               ret = bch2_btree_iter_traverse(&dirent_iter);
+               if (ret)
+                       goto err;
+       } else {
+               bch2_inode_nlink_dec(inode_u);
+       }
+
+       if (inode_u->bi_dir             == dirent_iter.pos.inode &&
+           inode_u->bi_dir_offset      == dirent_iter.pos.offset) {
                inode_u->bi_dir         = 0;
                inode_u->bi_dir_offset  = 0;
        }
 
        dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
-       dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
-       bch2_inode_nlink_dec(inode_u);
-
-       ret =   (S_ISDIR(inode_u->bi_mode)
-                ? bch2_empty_dir_trans(trans, inum)
-                : 0) ?:
-               bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
-               bch2_inode_write(trans, dir_iter, dir_u) ?:
-               bch2_inode_write(trans, inode_iter, inode_u);
+       dir_u->bi_nlink -= is_subdir_for_nlink(inode_u);
+
+       ret =   bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+                                   &dir_hash, &dirent_iter,
+                                   BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+               bch2_inode_write(trans, &dir_iter, dir_u) ?:
+               bch2_inode_write(trans, &inode_iter, inode_u);
 err:
-       bch2_trans_iter_put(trans, inode_iter);
-       bch2_trans_iter_put(trans, dirent_iter);
-       bch2_trans_iter_put(trans, dir_iter);
+       bch2_trans_iter_exit(trans, &inode_iter);
+       bch2_trans_iter_exit(trans, &dirent_iter);
+       bch2_trans_iter_exit(trans, &dir_iter);
        return ret;
 }
 
@@ -210,6 +329,7 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
        bool ret = false;
 
        for (id = 0; id < Inode_opt_nr; id++) {
+               /* Skip attributes that were explicitly set on this inode */
                if (dst_u->bi_fields_set & (1 << id))
                        continue;
 
@@ -227,8 +347,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
 }
 
 int bch2_rename_trans(struct btree_trans *trans,
-                     u64 src_dir, struct bch_inode_unpacked *src_dir_u,
-                     u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
+                     subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
+                     subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
                      struct bch_inode_unpacked *src_inode_u,
                      struct bch_inode_unpacked *dst_inode_u,
                      const struct qstr *src_name,
@@ -236,25 +356,27 @@ int bch2_rename_trans(struct btree_trans *trans,
                      enum bch_rename_mode mode)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
-       struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
+       struct btree_iter src_dir_iter = { NULL };
+       struct btree_iter dst_dir_iter = { NULL };
+       struct btree_iter src_inode_iter = { NULL };
+       struct btree_iter dst_inode_iter = { NULL };
        struct bch_hash_info src_hash, dst_hash;
-       u64 src_inode, src_offset, dst_inode, dst_offset;
+       subvol_inum src_inum, dst_inum;
+       u64 src_offset, dst_offset;
        u64 now = bch2_current_time(c);
        int ret;
 
-       src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
-                                      BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(src_dir_iter);
+       ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir,
+                             BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
        src_hash = bch2_hash_info_init(c, src_dir_u);
 
-       if (dst_dir != src_dir) {
-               dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
-                                              BTREE_ITER_INTENT);
-               ret = PTR_ERR_OR_ZERO(dst_dir_iter);
+       if (dst_dir.inum        != src_dir.inum ||
+           dst_dir.subvol      != src_dir.subvol) {
+               ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir,
+                                     BTREE_ITER_INTENT);
                if (ret)
                        goto err;
 
@@ -267,22 +389,20 @@ int bch2_rename_trans(struct btree_trans *trans,
        ret = bch2_dirent_rename(trans,
                                 src_dir, &src_hash,
                                 dst_dir, &dst_hash,
-                                src_name, &src_inode, &src_offset,
-                                dst_name, &dst_inode, &dst_offset,
+                                src_name, &src_inum, &src_offset,
+                                dst_name, &dst_inum, &dst_offset,
                                 mode);
        if (ret)
                goto err;
 
-       src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
-                                        BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(src_inode_iter);
+       ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum,
+                             BTREE_ITER_INTENT);
        if (ret)
                goto err;
 
-       if (dst_inode) {
-               dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
-                                                BTREE_ITER_INTENT);
-               ret = PTR_ERR_OR_ZERO(dst_inode_iter);
+       if (dst_inum.inum) {
+               ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum,
+                                     BTREE_ITER_INTENT);
                if (ret)
                        goto err;
        }
@@ -312,7 +432,7 @@ int bch2_rename_trans(struct btree_trans *trans,
                }
 
                if (S_ISDIR(dst_inode_u->bi_mode) &&
-                   bch2_empty_dir_trans(trans, dst_inode)) {
+                   bch2_empty_dir_trans(trans, dst_inum)) {
                        ret = -ENOTEMPTY;
                        goto err;
                }
@@ -331,12 +451,12 @@ int bch2_rename_trans(struct btree_trans *trans,
                goto err;
        }
 
-       if (S_ISDIR(src_inode_u->bi_mode)) {
+       if (is_subdir_for_nlink(src_inode_u)) {
                src_dir_u->bi_nlink--;
                dst_dir_u->bi_nlink++;
        }
 
-       if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
+       if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) {
                dst_dir_u->bi_nlink--;
                src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
        }
@@ -347,28 +467,28 @@ int bch2_rename_trans(struct btree_trans *trans,
        src_dir_u->bi_mtime             = now;
        src_dir_u->bi_ctime             = now;
 
-       if (src_dir != dst_dir) {
+       if (src_dir.inum != dst_dir.inum) {
                dst_dir_u->bi_mtime     = now;
                dst_dir_u->bi_ctime     = now;
        }
 
        src_inode_u->bi_ctime           = now;
 
-       if (dst_inode)
+       if (dst_inum.inum)
                dst_inode_u->bi_ctime   = now;
 
-       ret =   bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
-               (src_dir != dst_dir
-                ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
+       ret =   bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?:
+               (src_dir.inum != dst_dir.inum
+                ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u)
                 : 0 ) ?:
-               bch2_inode_write(trans, src_inode_iter, src_inode_u) ?:
-               (dst_inode
-                ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
+               bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?:
+               (dst_inum.inum
+                ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u)
                 : 0 );
 err:
-       bch2_trans_iter_put(trans, dst_inode_iter);
-       bch2_trans_iter_put(trans, src_inode_iter);
-       bch2_trans_iter_put(trans, dst_dir_iter);
-       bch2_trans_iter_put(trans, src_dir_iter);
+       bch2_trans_iter_exit(trans, &dst_inode_iter);
+       bch2_trans_iter_exit(trans, &src_inode_iter);
+       bch2_trans_iter_exit(trans, &dst_dir_iter);
+       bch2_trans_iter_exit(trans, &src_dir_iter);
        return ret;
 }
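All four helpers above now address files as subvol_inum (subvolume ID, inode number) pairs, and bch2_create_trans() drives its cases off the new flags word: plain create, tmpfile-style unlinked create (BCH_CREATE_TMPFILE replaces the old name == NULL convention), fresh subvolume, or snapshot of an existing one. A hedged sketch of a snapshot-creating caller (surrounding transaction setup elided; with snapshot_src.inum left zero the helper resolves the source subvolume's root inode itself):

    ret = bch2_create_trans(trans, dir, &dir_u, &new_inode,
                            name, uid, gid, S_IFDIR|0755, 0,
                            NULL, NULL,   /* ACLs are skipped on this path */
                            (subvol_inum) { .subvol = src_subvol },
                            BCH_CREATE_SNAPSHOT|BCH_CREATE_SNAPSHOT_RO);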
diff --git a/libbcachefs/fs-common.h b/libbcachefs/fs-common.h
index 2273b7961c9be6ab2d8960fda7f86068d4e44cad..dde2378595143fb6d7bbfda126b26bdaef17317c 100644
@@ -4,27 +4,33 @@
 
 struct posix_acl;
 
-int bch2_create_trans(struct btree_trans *, u64,
+#define BCH_CREATE_TMPFILE             (1U << 0)
+#define BCH_CREATE_SUBVOL              (1U << 1)
+#define BCH_CREATE_SNAPSHOT            (1U << 2)
+#define BCH_CREATE_SNAPSHOT_RO         (1U << 3)
+
+int bch2_create_trans(struct btree_trans *, subvol_inum,
                      struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
                      const struct qstr *,
                      uid_t, gid_t, umode_t, dev_t,
                      struct posix_acl *,
-                     struct posix_acl *);
+                     struct posix_acl *,
+                     subvol_inum, unsigned);
 
-int bch2_link_trans(struct btree_trans *, u64,
-                   u64, struct bch_inode_unpacked *,
-                   struct bch_inode_unpacked *,
+int bch2_link_trans(struct btree_trans *,
+                   subvol_inum, struct bch_inode_unpacked *,
+                   subvol_inum, struct bch_inode_unpacked *,
                    const struct qstr *);
 
-int bch2_unlink_trans(struct btree_trans *,
-                     u64, struct bch_inode_unpacked *,
+int bch2_unlink_trans(struct btree_trans *, subvol_inum,
+                     struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
-                     const struct qstr *);
+                     const struct qstr *, bool);
 
 int bch2_rename_trans(struct btree_trans *,
-                     u64, struct bch_inode_unpacked *,
-                     u64, struct bch_inode_unpacked *,
+                     subvol_inum, struct bch_inode_unpacked *,
+                     subvol_inum, struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
                      struct bch_inode_unpacked *,
                      const struct qstr *,
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 3333f6166bf2c13531b915b0f36024a8699f5de7..1d0871f63e4e71402874373a982262d74b00c24c 100644
@@ -223,6 +223,9 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
                return;
 
        mutex_lock(&inode->ei_quota_lock);
+       BUG_ON((s64) inode->v.i_blocks + sectors < 0);
+       inode->v.i_blocks += sectors;
+
 #ifdef CONFIG_BCACHEFS_QUOTA
        if (quota_res && sectors > 0) {
                BUG_ON(sectors > quota_res->sectors);
@@ -234,7 +237,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
                bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
        }
 #endif
-       inode->v.i_blocks += sectors;
        mutex_unlock(&inode->ei_quota_lock);
 }
 
@@ -243,24 +245,26 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
 /* stored in page->private: */
 
 struct bch_page_sector {
-       /* Uncompressed, fully allocated replicas: */
-       unsigned                nr_replicas:3;
+       /* Uncompressed, fully allocated replicas (or on-disk reservation): */
+       unsigned                nr_replicas:4;
 
-       /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
-       unsigned                replicas_reserved:3;
+       /* Owns PAGE_SECTORS * replicas_reserved sized in-memory reservation: */
+       unsigned                replicas_reserved:4;
 
        /* i_sectors: */
        enum {
                SECTOR_UNALLOCATED,
                SECTOR_RESERVED,
                SECTOR_DIRTY,
+               SECTOR_DIRTY_RESERVED,
                SECTOR_ALLOCATED,
-       }                       state:2;
+       }                       state:8;
 };
 
 struct bch_page_state {
        spinlock_t              lock;
        atomic_t                write_count;
+       bool                    uptodate;
        struct bch_page_sector  s[PAGE_SECTORS];
 };
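The per-sector state machine gains a fourth state so a dirtied page can remember that its backing space was already reserved by fallocate; the widened bitfields just make room for the extra values. A summary of the transitions implemented further down (sketch):

    /*
     * dirtying (bch2_set_page_dirty):
     *   UNALLOCATED    -> DIRTY            i_blocks/quota += sectors
     *   RESERVED       -> DIRTY_RESERVED   already accounted at fallocate time
     * cleaning (bch2_clear_page_bits):
     *   DIRTY          -> UNALLOCATED      i_blocks/quota -= sectors
     *   DIRTY_RESERVED -> RESERVED         the reservation is kept
     * fallocate over cached pages (mark_pagecache_reserved):
     *   UNALLOCATED    -> RESERVED
     *   DIRTY          -> DIRTY_RESERVED   i_blocks -= sectors; the fallocate
     *                                      path accounts them instead
     */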
 
@@ -311,6 +315,212 @@ static struct bch_page_state *bch2_page_state_create(struct page *page,
        return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
 }
 
+static unsigned bkey_to_sector_state(const struct bkey *k)
+{
+       if (k->type == KEY_TYPE_reservation)
+               return SECTOR_RESERVED;
+       if (bkey_extent_is_allocation(k))
+               return SECTOR_ALLOCATED;
+       return SECTOR_UNALLOCATED;
+}
+
+static void __bch2_page_state_set(struct page *page,
+                                 unsigned pg_offset, unsigned pg_len,
+                                 unsigned nr_ptrs, unsigned state)
+{
+       struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL);
+       unsigned i;
+
+       BUG_ON(pg_offset >= PAGE_SECTORS);
+       BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+       spin_lock(&s->lock);
+
+       for (i = pg_offset; i < pg_offset + pg_len; i++) {
+               s->s[i].nr_replicas = nr_ptrs;
+               s->s[i].state = state;
+       }
+
+       if (i == PAGE_SECTORS)
+               s->uptodate = true;
+
+       spin_unlock(&s->lock);
+}
+
+static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum,
+                              struct page **pages, unsigned nr_pages)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT;
+       unsigned pg_idx = 0;
+       u32 snapshot;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+                          SPOS(inum.inum, offset, snapshot),
+                          BTREE_ITER_SLOTS, k, ret) {
+               unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
+               unsigned state = bkey_to_sector_state(k.k);
+
+               while (pg_idx < nr_pages) {
+                       struct page *page = pages[pg_idx];
+                       u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+                       u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+                       unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start;
+                       unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start;
+
+                       BUG_ON(k.k->p.offset < pg_start);
+                       BUG_ON(bkey_start_offset(k.k) > pg_end);
+
+                       if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate)
+                               __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state);
+
+                       if (k.k->p.offset < pg_end)
+                               break;
+                       pg_idx++;
+               }
+
+               if (pg_idx == nr_pages)
+                       break;
+       }
+
+       offset = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
+       bch2_trans_exit(&trans);
+
+       return ret;
+}
+
+static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k)
+{
+       struct bvec_iter iter;
+       struct bio_vec bv;
+       unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+               ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
+       unsigned state = bkey_to_sector_state(k.k);
+
+       bio_for_each_segment(bv, bio, iter)
+               __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9,
+                                     bv.bv_len >> 9, nr_ptrs, state);
+}
+
+static void mark_pagecache_unallocated(struct bch_inode_info *inode,
+                                      u64 start, u64 end)
+{
+       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+       struct pagevec pvec;
+
+       if (end <= start)
+               return;
+
+       pagevec_init(&pvec);
+
+       do {
+               unsigned nr_pages, i, j;
+
+               nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+                                               &index, end_index);
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+                       u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+                       unsigned pg_offset = max(start, pg_start) - pg_start;
+                       unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
+                       struct bch_page_state *s;
+
+                       BUG_ON(end <= pg_start);
+                       BUG_ON(pg_offset >= PAGE_SECTORS);
+                       BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+                       lock_page(page);
+                       s = bch2_page_state(page);
+
+                       if (s) {
+                               spin_lock(&s->lock);
+                               for (j = pg_offset; j < pg_offset + pg_len; j++)
+                                       s->s[j].nr_replicas = 0;
+                               spin_unlock(&s->lock);
+                       }
+
+                       unlock_page(page);
+               }
+               pagevec_release(&pvec);
+       } while (index <= end_index);
+}
+
+static void mark_pagecache_reserved(struct bch_inode_info *inode,
+                                   u64 start, u64 end)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+       pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
+       struct pagevec pvec;
+       s64 i_sectors_delta = 0;
+
+       if (end <= start)
+               return;
+
+       pagevec_init(&pvec);
+
+       do {
+               unsigned nr_pages, i, j;
+
+               nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
+                                               &index, end_index);
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+                       u64 pg_start = page->index << PAGE_SECTORS_SHIFT;
+                       u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT;
+                       unsigned pg_offset = max(start, pg_start) - pg_start;
+                       unsigned pg_len = min(end, pg_end) - pg_offset - pg_start;
+                       struct bch_page_state *s;
+
+                       BUG_ON(end <= pg_start);
+                       BUG_ON(pg_offset >= PAGE_SECTORS);
+                       BUG_ON(pg_offset + pg_len > PAGE_SECTORS);
+
+                       lock_page(page);
+                       s = bch2_page_state(page);
+
+                       if (s) {
+                               spin_lock(&s->lock);
+                               for (j = pg_offset; j < pg_offset + pg_len; j++)
+                                       switch (s->s[j].state) {
+                                       case SECTOR_UNALLOCATED:
+                                               s->s[j].state = SECTOR_RESERVED;
+                                               break;
+                                       case SECTOR_DIRTY:
+                                               s->s[j].state = SECTOR_DIRTY_RESERVED;
+                                               i_sectors_delta--;
+                                               break;
+                                       default:
+                                               break;
+                                       }
+                               spin_unlock(&s->lock);
+                       }
+
+                       unlock_page(page);
+               }
+               pagevec_release(&pvec);
+       } while (index <= end_index);
+
+       i_sectors_acct(c, inode, NULL, i_sectors_delta);
+}
+
 static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
 {
        /* XXX: this should not be open coded */
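bch2_page_state_set() above primes per-page sector state for a whole readahead batch with one btree walk, clamping each extent against each page. The clamping arithmetic, worked through once (assuming PAGE_SECTORS == 8, i.e. 4K pages and 512-byte sectors):

    /*
     * page->index == 2  =>  pg_start = 16, pg_end = 24  (file sectors)
     * extent covering file sectors [14, 21):
     *   pg_offset = max(14, 16) - 16     = 0  first affected sector in page
     *   pg_len    = min(21, 24) - 0 - 16 = 5  page sectors 0..4 are set
     * 21 < pg_end, so the extent ends inside this page: break out and
     * fetch the next extent rather than advancing to the next page
     */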
@@ -395,6 +605,8 @@ static int bch2_page_reservation_get(struct bch_fs *c,
        if (!s)
                return -ENOMEM;
 
+       BUG_ON(!s->uptodate);
+
        for (i = round_down(offset, block_bytes(c)) >> 9;
             i < round_up(offset + len, block_bytes(c)) >> 9;
             i++) {
@@ -449,16 +661,22 @@ static void bch2_clear_page_bits(struct page *page)
                disk_res.sectors += s->s[i].replicas_reserved;
                s->s[i].replicas_reserved = 0;
 
-               if (s->s[i].state == SECTOR_DIRTY) {
-                       dirty_sectors++;
+               switch (s->s[i].state) {
+               case SECTOR_DIRTY:
                        s->s[i].state = SECTOR_UNALLOCATED;
+                       --dirty_sectors;
+                       break;
+               case SECTOR_DIRTY_RESERVED:
+                       s->s[i].state = SECTOR_RESERVED;
+                       break;
+               default:
+                       break;
                }
        }
 
        bch2_disk_reservation_put(c, &disk_res);
 
-       if (dirty_sectors)
-               i_sectors_acct(c, inode, NULL, -dirty_sectors);
+       i_sectors_acct(c, inode, NULL, dirty_sectors);
 
        bch2_page_state_release(page);
 }
@@ -491,16 +709,22 @@ static void bch2_set_page_dirty(struct bch_fs *c,
                s->s[i].replicas_reserved += sectors;
                res->disk.sectors -= sectors;
 
-               if (s->s[i].state == SECTOR_UNALLOCATED)
+               switch (s->s[i].state) {
+               case SECTOR_UNALLOCATED:
+                       s->s[i].state = SECTOR_DIRTY;
                        dirty_sectors++;
-
-               s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
+                       break;
+               case SECTOR_RESERVED:
+                       s->s[i].state = SECTOR_DIRTY_RESERVED;
+                       break;
+               default:
+                       break;
+               }
        }
 
        spin_unlock(&s->lock);
 
-       if (dirty_sectors)
-               i_sectors_acct(c, inode, &res->quota, dirty_sectors);
+       i_sectors_acct(c, inode, &res->quota, dirty_sectors);
 
        if (!PageDirty(page))
                __set_page_dirty_nobuffers(page);
@@ -554,7 +778,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
        struct bch2_page_reservation res;
        unsigned len;
        loff_t isize;
-       int ret = VM_FAULT_LOCKED;
+       int ret;
 
        bch2_page_reservation_init(c, inode, &res);
 
@@ -580,6 +804,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
 
        len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
 
+       if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+               if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) {
+                       unlock_page(page);
+                       ret = VM_FAULT_SIGBUS;
+                       goto out;
+               }
+       }
+
        if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
                unlock_page(page);
                ret = VM_FAULT_SIGBUS;
@@ -590,6 +822,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
        bch2_page_reservation_put(c, inode, &res);
 
        wait_for_stable_page(page);
+       ret = VM_FAULT_LOCKED;
 out:
        bch2_pagecache_add_put(&inode->ei_pagecache_lock);
        sb_end_pagefault(inode->v.i_sb);
@@ -703,29 +936,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter)
        return iter->pages[iter->idx];
 }
 
-static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
-{
-       struct bvec_iter iter;
-       struct bio_vec bv;
-       unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
-               ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
-       unsigned state = k.k->type == KEY_TYPE_reservation
-               ? SECTOR_RESERVED
-               : SECTOR_ALLOCATED;
-
-       bio_for_each_segment(bv, bio, iter) {
-               struct bch_page_state *s = bch2_page_state(bv.bv_page);
-               unsigned i;
-
-               for (i = bv.bv_offset >> 9;
-                    i < (bv.bv_offset + bv.bv_len) >> 9;
-                    i++) {
-                       s->s[i].nr_replicas = nr_ptrs;
-                       s->s[i].state = state;
-               }
-       }
-}
-
 static bool extent_partial_reads_expensive(struct bkey_s_c k)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -745,7 +955,7 @@ static void readpage_bio_extend(struct readpages_iter *iter,
 {
        while (bio_sectors(bio) < sectors_this_extent &&
               bio->bi_vcnt < bio->bi_max_vecs) {
-               pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
+               pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT;
                struct page *page = readpage_iter_next(iter);
                int ret;
 
@@ -786,23 +996,35 @@ static void readpage_bio_extend(struct readpages_iter *iter,
        }
 }
 
-static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
-                      struct bch_read_bio *rbio, u64 inum,
+static void bchfs_read(struct btree_trans *trans,
+                      struct bch_read_bio *rbio,
+                      subvol_inum inum,
                       struct readpages_iter *readpages_iter)
 {
        struct bch_fs *c = trans->c;
+       struct btree_iter iter;
        struct bkey_buf sk;
        int flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE;
+       u32 snapshot;
        int ret = 0;
 
        rbio->c = c;
        rbio->start_time = local_clock();
+       rbio->subvol = inum.subvol;
 
        bch2_bkey_buf_init(&sk);
 retry:
        bch2_trans_begin(trans);
+       iter = (struct btree_iter) { NULL };
 
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
+                            SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot),
+                            BTREE_ITER_SLOTS);
        while (1) {
                struct bkey_s_c k;
                unsigned bytes, sectors, offset_into_extent;
@@ -817,15 +1039,15 @@ retry:
                        break;
                }
 
-               bch2_btree_iter_set_pos(iter,
-                               POS(inum, rbio->bio.bi_iter.bi_sector));
+               bch2_btree_iter_set_pos(&iter,
+                               POS(inum.inum, rbio->bio.bi_iter.bi_sector));
 
-               k = bch2_btree_iter_peek_slot(iter);
+               k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
                        break;
 
-               offset_into_extent = iter->pos.offset -
+               offset_into_extent = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
@@ -840,8 +1062,6 @@ retry:
 
                sectors = min(sectors, k.k->size - offset_into_extent);
 
-               bch2_trans_unlock(trans);
-
                if (readpages_iter)
                        readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
                                            extent_partial_reads_expensive(k));
@@ -852,10 +1072,9 @@ retry:
                if (rbio->bio.bi_iter.bi_size == bytes)
                        flags |= BCH_READ_LAST_FRAGMENT;
 
-               if (bkey_extent_is_allocation(k.k))
-                       bch2_add_page_sectors(&rbio->bio, k);
+               bch2_bio_page_state_set(&rbio->bio, k);
 
-               bch2_read_extent(trans, rbio, iter->pos,
+               bch2_read_extent(trans, rbio, iter.pos,
                                 data_btree, k, offset_into_extent, flags);
 
                if (flags & BCH_READ_LAST_FRAGMENT)
@@ -863,13 +1082,19 @@ retry:
 
                swap(rbio->bio.bi_iter.bi_size, bytes);
                bio_advance(&rbio->bio, bytes);
+
+               ret = btree_trans_too_many_iters(trans);
+               if (ret)
+                       break;
        }
+err:
+       bch2_trans_iter_exit(trans, &iter);
 
        if (ret == -EINTR)
                goto retry;
 
        if (ret) {
-               bch_err_inum_ratelimited(c, inum,
+               bch_err_inum_ratelimited(c, inum.inum,
                                "read error %i from btree lookup", ret);
                rbio->bio.bi_status = BLK_STS_IOERR;
                bio_endio(&rbio->bio);
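
bchfs_read() now owns its iterator and follows the snapshot-aware transaction shape this series uses throughout: begin the transaction, resolve the subvolume to a snapshot ID, position the iterator with SPOS(), exit the iterator, and retry from the top on -EINTR (transaction restart). A condensed sketch of that shape, using only names from the hunk above (bcachefs internals, so it compiles only in-tree):

static int snapshot_lookup_sketch(struct btree_trans *trans,
				  subvol_inum inum, u64 sector)
{
	struct btree_iter iter = { NULL };
	u32 snapshot;
	int ret;
retry:
	bch2_trans_begin(trans);

	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
			     SPOS(inum.inum, sector, snapshot),
			     BTREE_ITER_SLOTS);
	/* ... peek/advance over extents here ... */
err:
	bch2_trans_iter_exit(trans, &iter);
	if (ret == -EINTR)	/* transaction restart: retry from the top */
		goto retry;
	return ret;
}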
@@ -884,7 +1109,6 @@ void bch2_readahead(struct readahead_control *ractl)
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
        struct btree_trans trans;
-       struct btree_iter *iter;
        struct page *page;
        struct readpages_iter readpages_iter;
        int ret;
@@ -893,8 +1117,6 @@ void bch2_readahead(struct readahead_control *ractl)
        BUG_ON(ret);
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
-                                  BTREE_ITER_SLOTS);
 
        bch2_pagecache_add_get(&inode->ei_pagecache_lock);
 
@@ -911,41 +1133,34 @@ void bch2_readahead(struct readahead_control *ractl)
                readpages_iter.idx++;
 
                bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
-               rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
+               rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT;
                rbio->bio.bi_end_io = bch2_readpages_end_io;
                BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
 
-               bchfs_read(&trans, iter, rbio, inode->v.i_ino,
+               bchfs_read(&trans, rbio, inode_inum(inode),
                           &readpages_iter);
        }
 
        bch2_pagecache_add_put(&inode->ei_pagecache_lock);
 
-       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        kfree(readpages_iter.pages);
 }
 
 static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
-                            u64 inum, struct page *page)
+                            subvol_inum inum, struct page *page)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
 
        bch2_page_state_create(page, __GFP_NOFAIL);
 
        bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC);
        rbio->bio.bi_iter.bi_sector =
-               (sector_t) page->index << PAGE_SECTOR_SHIFT;
+               (sector_t) page->index << PAGE_SECTORS_SHIFT;
        BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
-                                  BTREE_ITER_SLOTS);
-
-       bchfs_read(&trans, iter, rbio, inum, NULL);
-
-       bch2_trans_iter_put(&trans, iter);
+       bchfs_read(&trans, rbio, inum, NULL);
        bch2_trans_exit(&trans);
 }
 
@@ -959,7 +1174,7 @@ int bch2_readpage(struct file *file, struct page *page)
        rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts);
        rbio->bio.bi_end_io = bch2_readpages_end_io;
 
-       __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+       __bchfs_readpage(c, rbio, inode_inum(inode), page);
        return 0;
 }
 
@@ -982,7 +1197,7 @@ static int bch2_read_single_page(struct page *page,
        rbio->bio.bi_private = &done;
        rbio->bio.bi_end_io = bch2_read_single_page_end_io;
 
-       __bchfs_readpage(c, rbio, inode->v.i_ino, page);
+       __bchfs_readpage(c, rbio, inode_inum(inode), page);
        wait_for_completion(&done);
 
        ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -1063,7 +1278,7 @@ static void bch2_writepage_io_done(struct closure *cl)
         * racing with fallocate can cause us to add fewer sectors than
         * expected - but we shouldn't add more sectors than expected:
         */
-       BUG_ON(io->op.i_sectors_delta > 0);
+       WARN_ON(io->op.i_sectors_delta > 0);
 
        /*
         * (error (due to going RO) halfway through a page can screw that up
@@ -1122,10 +1337,10 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
        op                      = &w->io->op;
        bch2_write_op_init(op, c, w->opts);
        op->target              = w->opts.foreground_target;
-       op_journal_seq_set(op, &inode->ei_journal_seq);
        op->nr_replicas         = nr_replicas;
        op->res.nr_replicas     = nr_replicas;
        op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
+       op->subvol              = inode->ei_subvol;
        op->pos                 = POS(inode->v.i_ino, sector);
        op->wbio.bio.bi_iter.bi_sector = sector;
        op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
@@ -1168,16 +1383,16 @@ static int __bch2_writepage(struct page *page,
 do_io:
        s = bch2_page_state_create(page, __GFP_NOFAIL);
 
-       ret = bch2_get_page_disk_reservation(c, inode, page, true);
-       if (ret) {
-               SetPageError(page);
-               mapping_set_error(page->mapping, ret);
-               unlock_page(page);
-               return 0;
-       }
+       /*
+        * Things get really hairy with errors during writeback:
+        */
+       ret = bch2_get_page_disk_reservation(c, inode, page, false);
+       BUG_ON(ret);
 
        /* Before unlocking the page, get copy of reservations: */
+       spin_lock(&s->lock);
        orig = *s;
+       spin_unlock(&s->lock);
 
        for (i = 0; i < PAGE_SECTORS; i++) {
                if (s->s[i].state < SECTOR_DIRTY)
@@ -1210,7 +1425,7 @@ do_io:
 
        offset = 0;
        while (1) {
-               unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
+               unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0;
                u64 sector;
 
                while (offset < PAGE_SECTORS &&
@@ -1220,16 +1435,15 @@ do_io:
                if (offset == PAGE_SECTORS)
                        break;
 
-               sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
-
                while (offset + sectors < PAGE_SECTORS &&
-                      orig.s[offset + sectors].state >= SECTOR_DIRTY)
+                      orig.s[offset + sectors].state >= SECTOR_DIRTY) {
+                       reserved_sectors += orig.s[offset + sectors].replicas_reserved;
+                       dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY;
                        sectors++;
-
-               for (i = offset; i < offset + sectors; i++) {
-                       reserved_sectors += orig.s[i].replicas_reserved;
-                       dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
                }
+               BUG_ON(!sectors);
+
+               sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset;
 
                if (w->io &&
                    (w->io->op.res.nr_replicas != nr_replicas_this_write ||
@@ -1346,6 +1560,12 @@ readpage:
        if (ret)
                goto err;
 out:
+       if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+               ret = bch2_page_state_set(c, inode_inum(inode), &page, 1);
+               if (ret)
+                       goto err;
+       }
+
        ret = bch2_page_reservation_get(c, inode, page, res,
                                        offset, len, true);
        if (ret) {
@@ -1475,20 +1695,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
        }
 
        while (reserved < len) {
-               struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+               unsigned i = (offset + reserved) >> PAGE_SHIFT;
+               struct page *page = pages[i];
                unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
                unsigned pg_len = min_t(unsigned, len - reserved,
                                        PAGE_SIZE - pg_offset);
-retry_reservation:
-               ret = bch2_page_reservation_get(c, inode, page, &res,
-                                               pg_offset, pg_len, true);
 
-               if (ret && !PageUptodate(page)) {
-                       ret = bch2_read_single_page(page, mapping);
-                       if (!ret)
-                               goto retry_reservation;
+               if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) {
+                       ret = bch2_page_state_set(c, inode_inum(inode),
+                                                 pages + i, nr_pages - i);
+                       if (ret)
+                               goto out;
                }
 
+               ret = bch2_page_reservation_get(c, inode, page, &res,
+                                               pg_offset, pg_len, true);
                if (ret)
                        goto out;
 
@@ -1504,8 +1725,8 @@ retry_reservation:
                unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
                unsigned pg_len = min_t(unsigned, len - copied,
                                        PAGE_SIZE - pg_offset);
-               unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
-                                               iter, pg_offset, pg_len);
+               unsigned pg_copied = copy_page_from_iter_atomic(page,
+                                               pg_offset, pg_len, iter);
 
                if (!pg_copied)
                        break;
@@ -1518,7 +1739,6 @@ retry_reservation:
                }
 
                flush_dcache_page(page);
-               iov_iter_advance(iter, pg_copied);
                copied += pg_copied;
 
                if (pg_copied != pg_len)
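
The swap from iov_iter_copy_from_user_atomic() to copy_page_from_iter_atomic() is also why the explicit iov_iter_advance() disappears in the next hunk: the newer helper advances the iterator itself on success. Side by side:

/* old: the copy does not consume the iterator, caller advances it */
pg_copied = iov_iter_copy_from_user_atomic(page, iter, pg_offset, pg_len);
iov_iter_advance(iter, pg_copied);

/* new: copy_page_from_iter_atomic() advances iter internally */
pg_copied = copy_page_from_iter_atomic(page, pg_offset, pg_len, iter);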
@@ -1758,7 +1978,7 @@ start:
                if (iter->count)
                        closure_get(&dio->cl);
 
-               bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
+               bch2_read(c, rbio_init(bio, opts), inode_inum(inode));
        }
 
        iter->count += shorten;
@@ -1813,6 +2033,50 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 
 /* O_DIRECT writes */
 
+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum,
+                                      u64 offset, u64 size,
+                                      unsigned nr_replicas, bool compressed)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 end = offset + size;
+       u32 snapshot;
+       bool ret = true;
+       int err;
+
+       bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (err)
+               goto err;
+
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+                          SPOS(inum.inum, offset, snapshot),
+                          BTREE_ITER_SLOTS, k, err) {
+               if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0)
+                       break;
+
+               if (k.k->p.snapshot != snapshot ||
+                   nr_replicas > bch2_bkey_replicas(c, k) ||
+                   (!compressed && bch2_bkey_sectors_compressed(k))) {
+                       ret = false;
+                       break;
+               }
+       }
+
+       offset = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (err == -EINTR)
+               goto retry;
+       bch2_trans_exit(&trans);
+
+       return err ? false : ret;
+}
+
 static void bch2_dio_write_loop_async(struct bch_write_op *);
 
 static long bch2_dio_write_loop(struct dio_write *dio)
@@ -1888,9 +2152,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode));
                dio->op.end_io          = bch2_dio_write_loop_async;
                dio->op.target          = dio->op.opts.foreground_target;
-               op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
                dio->op.write_point     = writepoint_hashed((unsigned long) current);
                dio->op.nr_replicas     = dio->op.opts.data_replicas;
+               dio->op.subvol          = inode->ei_subvol;
                dio->op.pos             = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);
 
                if ((req->ki_flags & IOCB_DSYNC) &&
@@ -1901,8 +2165,8 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
                                                dio->op.opts.data_replicas, 0);
                if (unlikely(ret) &&
-                   !bch2_check_range_allocated(c, dio->op.pos,
-                               bio_sectors(bio),
+                   !bch2_check_range_allocated(c, inode_inum(inode),
+                               dio->op.pos.offset, bio_sectors(bio),
                                dio->op.opts.data_replicas,
                                dio->op.opts.compression != 0))
                        goto err;
@@ -2119,45 +2383,58 @@ unlock:
 
 /* fsync: */
 
-int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+/*
+ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an
+ * insert trigger: look up the btree inode instead
+ */
+static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum)
 {
-       struct bch_inode_info *inode = file_bch_inode(file);
-       struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       int ret, ret2;
+       struct bch_inode_unpacked inode;
+       int ret;
 
-       ret = file_write_and_wait_range(file, start, end);
+       if (c->opts.journal_flush_disabled)
+               return 0;
+
+       ret = bch2_inode_find_by_inum(c, inum, &inode);
        if (ret)
                return ret;
 
-       if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
-               goto out;
+       return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq);
+}
 
-       ret = sync_inode_metadata(&inode->v, 1);
-       if (ret)
-               return ret;
-out:
-       if (!c->opts.journal_flush_disabled)
-               ret = bch2_journal_flush_seq(&c->journal,
-                                            inode->ei_journal_seq);
-       ret2 = file_check_and_advance_wb_err(file);
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+{
+       struct bch_inode_info *inode = file_bch_inode(file);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       int ret, ret2, ret3;
 
-       return ret ?: ret2;
+       ret = file_write_and_wait_range(file, start, end);
+       ret2 = sync_inode_metadata(&inode->v, 1);
+       ret3 = bch2_flush_inode(c, inode_inum(inode));
+
+       return ret ?: ret2 ?: ret3;
 }
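
bch2_fsync() now runs all three steps unconditionally and returns the first failure, via GCC's `?:` extension: `a ?: b` evaluates to `a` if nonzero, else `b`, evaluating `a` only once. A standalone illustration:

#include <stdio.h>

/* GNU C extension: x ?: y == (x ? x : y), with x evaluated once */
static int first_error(int a, int b, int c)
{
	return a ?: b ?: c;
}

int main(void)
{
	printf("%d\n", first_error(0, -5, -9));	/* prints -5 */
	return 0;
}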
 
 /* truncate: */
 
-static inline int range_has_data(struct bch_fs *c,
-                                 struct bpos start,
-                                 struct bpos end)
+static inline int range_has_data(struct bch_fs *c, u32 subvol,
+                                struct bpos start,
+                                struct bpos end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot);
+       if (ret)
+               goto err;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
@@ -2166,9 +2443,14 @@ static inline int range_has_data(struct bch_fs *c,
                        break;
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       start = iter.pos;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
-       return bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
+       return ret;
 }
 
 static int __bch2_truncate_page(struct bch_inode_info *inode,
@@ -2181,6 +2463,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
        unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
        unsigned i;
        struct page *page;
+       s64 i_sectors_delta = 0;
        int ret = 0;
 
        /* Page boundary? Nothing to do */
@@ -2198,9 +2481,9 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
                 * XXX: we're doing two index lookups when we end up reading the
                 * page
                 */
-               ret = range_has_data(c,
-                               POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
-                               POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
+               ret = range_has_data(c, inode->ei_subvol,
+                               POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT),
+                               POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT));
                if (ret <= 0)
                        return ret;
 
@@ -2232,9 +2515,21 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
             i < round_down(end_offset, block_bytes(c)) >> 9;
             i++) {
                s->s[i].nr_replicas     = 0;
+               if (s->s[i].state == SECTOR_DIRTY)
+                       i_sectors_delta--;
                s->s[i].state           = SECTOR_UNALLOCATED;
        }
 
+       i_sectors_acct(c, inode, NULL, i_sectors_delta);
+
+       /*
+        * Caller needs to know whether this page will be written out by
+        * writeback - doing an i_size update if necessary - or whether it will
+        * be responsible for the i_size update:
+        */
+       ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT),
+                         PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY;
+
        zero_user_segment(page, start_offset, end_offset);
 
        /*
@@ -2243,8 +2538,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode,
         * XXX: because we aren't currently tracking whether the page has actual
         * data in it (vs. just 0s, or only partially written) this is wrong. ick.
         */
-       ret = bch2_get_page_disk_reservation(c, inode, page, false);
-       BUG_ON(ret);
+       BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false));
 
        /*
         * This removes any writeable userspace mappings; we need to force
@@ -2266,6 +2560,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
                                    from, round_up(from, PAGE_SIZE));
 }
 
+static int bch2_truncate_pages(struct bch_inode_info *inode,
+                              loff_t start, loff_t end)
+{
+       int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT,
+                                      start, end);
+
+       if (ret >= 0 &&
+           start >> PAGE_SHIFT != end >> PAGE_SHIFT)
+               ret = __bch2_truncate_page(inode,
+                                          end >> PAGE_SHIFT,
+                                          start, end);
+       return ret;
+}
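
__bch2_truncate_page() now returns a tri-state: negative on error, otherwise whether the last sector of the page that lies inside i_size is still dirty, which tells the caller whether writeback will handle the i_size update. The index expression above packs that into one line; a worked example of the arithmetic, assuming 4096-byte pages and 512-byte sectors (the helper name is hypothetical):

#include <stdio.h>

static unsigned last_sector_in_page(unsigned long long i_size,
				    unsigned long page_index)
{
	unsigned long long bytes = i_size - (page_index << 12);

	if (bytes > 4096)
		bytes = 4096;
	return (unsigned) ((bytes - 1) >> 9);	/* last in-bounds sector */
}

int main(void)
{
	/* i_size = 5000: page 1 covers bytes [4096, 8192), but only
	 * [4096, 5000) is inside the file, which ends in sector 1: */
	printf("%u\n", last_sector_in_page(5000, 1));
	return 0;
}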
+
 static int bch2_extend(struct user_namespace *mnt_userns,
                       struct bch_inode_info *inode,
                       struct bch_inode_unpacked *inode_u,
@@ -2332,7 +2640,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
        inode_dio_wait(&inode->v);
        bch2_pagecache_block_get(&inode->ei_pagecache_lock);
 
-       ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u);
+       ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u);
        if (ret)
                goto err;
 
@@ -2356,7 +2664,7 @@ int bch2_truncate(struct user_namespace *mnt_userns,
        iattr->ia_valid &= ~ATTR_SIZE;
 
        ret = bch2_truncate_page(inode, iattr->ia_size);
-       if (unlikely(ret))
+       if (unlikely(ret < 0))
                goto err;
 
        /*
@@ -2390,11 +2698,14 @@ int bch2_truncate(struct user_namespace *mnt_userns,
 
        truncate_setsize(&inode->v, iattr->ia_size);
 
-       ret = bch2_fpunch(c, inode->v.i_ino,
+       ret = bch2_fpunch(c, inode_inum(inode),
                        round_up(iattr->ia_size, block_bytes(c)) >> 9,
-                       U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
+                       U64_MAX, &i_sectors_delta);
        i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
+       WARN_ON(!inode->v.i_size && inode->v.i_blocks &&
+               !bch2_journal_error(&c->journal));
+
        if (unlikely(ret))
                goto err;
 
@@ -2422,49 +2733,39 @@ static int inode_update_times_fn(struct bch_inode_info *inode,
 static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
-       u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
+       u64 end         = offset + len;
+       u64 block_start = round_up(offset, block_bytes(c));
+       u64 block_end   = round_down(end, block_bytes(c));
+       bool truncated_last_page;
        int ret = 0;
 
-       inode_lock(&inode->v);
-       inode_dio_wait(&inode->v);
-       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
-       ret = __bch2_truncate_page(inode,
-                                  offset >> PAGE_SHIFT,
-                                  offset, offset + len);
-       if (unlikely(ret))
+       ret = bch2_truncate_pages(inode, offset, end);
+       if (unlikely(ret < 0))
                goto err;
 
-       if (offset >> PAGE_SHIFT !=
-           (offset + len) >> PAGE_SHIFT) {
-               ret = __bch2_truncate_page(inode,
-                                          (offset + len) >> PAGE_SHIFT,
-                                          offset, offset + len);
-               if (unlikely(ret))
-                       goto err;
-       }
+       truncated_last_page = ret;
 
-       truncate_pagecache_range(&inode->v, offset, offset + len - 1);
+       truncate_pagecache_range(&inode->v, offset, end - 1);
 
-       if (discard_start < discard_end) {
+       if (block_start < block_end) {
                s64 i_sectors_delta = 0;
 
-               ret = bch2_fpunch(c, inode->v.i_ino,
-                                 discard_start, discard_end,
-                                 &inode->ei_journal_seq,
+               ret = bch2_fpunch(c, inode_inum(inode),
+                                 block_start >> 9, block_end >> 9,
                                  &i_sectors_delta);
                i_sectors_acct(c, inode, NULL, i_sectors_delta);
        }
 
        mutex_lock(&inode->ei_update_lock);
-       ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
-                              ATTR_MTIME|ATTR_CTIME) ?: ret;
+       if (end >= inode->v.i_size && !truncated_last_page) {
+               ret = bch2_write_inode_size(c, inode, inode->v.i_size,
+                                           ATTR_MTIME|ATTR_CTIME);
+       } else {
+               ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+                                      ATTR_MTIME|ATTR_CTIME);
+       }
        mutex_unlock(&inode->ei_update_lock);
 err:
-       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
-       inode_unlock(&inode->v);
-
        return ret;
 }
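
Note the rounding directions in the rewritten bchfs_fpunch(): the btree punch is rounded inward (round_up on the start, round_down on the end) because partial blocks at either edge are zeroed through the page cache by bch2_truncate_pages() rather than deleted. A worked example, assuming 4096-byte blocks:

#include <stdio.h>

#define ROUND_UP(x, a)	 ((((x) + (a) - 1) / (a)) * (a))
#define ROUND_DOWN(x, a) (((x) / (a)) * (a))

int main(void)
{
	unsigned long long offset = 1000, len = 8000, end = offset + len;

	/* only whole blocks are deleted from the extents btree: */
	printf("punch [%llu, %llu)\n",
	       ROUND_UP(offset, 4096), ROUND_DOWN(end, 4096));	/* [4096, 8192) */
	return 0;
}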
 
@@ -2476,7 +2777,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        struct address_space *mapping = inode->v.i_mapping;
        struct bkey_buf copy;
        struct btree_trans trans;
-       struct btree_iter *src, *dst, *del;
+       struct btree_iter src, dst, del;
        loff_t shift, new_size;
        u64 src_start;
        int ret = 0;
@@ -2484,31 +2785,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;
 
-       /*
-        * We need i_mutex to keep the page cache consistent with the extents
-        * btree, and the btree consistent with i_size - we don't need outside
-        * locking for the extents btree itself, because we're using linked
-        * iterators
-        */
-       inode_lock(&inode->v);
-       inode_dio_wait(&inode->v);
-       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
-
        if (insert) {
-               ret = -EFBIG;
                if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
-                       goto err;
+                       return -EFBIG;
 
-               ret = -EINVAL;
                if (offset >= inode->v.i_size)
-                       goto err;
+                       return -EINVAL;
 
                src_start       = U64_MAX;
                shift           = len;
        } else {
-               ret = -EINVAL;
                if (offset + len >= inode->v.i_size)
-                       goto err;
+                       return -EINVAL;
 
                src_start       = offset + len;
                shift           = -len;
@@ -2518,7 +2806,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 
        ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
        if (ret)
-               goto err;
+               return ret;
 
        if (insert) {
                i_size_write(&inode->v, new_size);
@@ -2529,23 +2817,22 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        } else {
                s64 i_sectors_delta = 0;
 
-               ret = bch2_fpunch(c, inode->v.i_ino,
+               ret = bch2_fpunch(c, inode_inum(inode),
                                  offset >> 9, (offset + len) >> 9,
-                                 &inode->ei_journal_seq,
                                  &i_sectors_delta);
                i_sectors_acct(c, inode, NULL, i_sectors_delta);
 
                if (ret)
-                       goto err;
+                       return ret;
        }
 
        bch2_bkey_buf_init(&copy);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       src = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+       bch2_trans_iter_init(&trans, &src, BTREE_ID_extents,
                        POS(inode->v.i_ino, src_start >> 9),
                        BTREE_ITER_INTENT);
-       dst = bch2_trans_copy_iter(&trans, src);
-       del = bch2_trans_copy_iter(&trans, src);
+       bch2_trans_copy_iter(&dst, &src);
+       bch2_trans_copy_iter(&del, &src);
 
        while (ret == 0 || ret == -EINTR) {
                struct disk_reservation disk_res =
@@ -2556,12 +2843,24 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
                struct bpos atomic_end;
                unsigned trigger_flags = 0;
+               u32 snapshot;
+
+               bch2_trans_begin(&trans);
+
+               ret = bch2_subvolume_get_snapshot(&trans,
+                                       inode->ei_subvol, &snapshot);
+               if (ret)
+                       continue;
+
+               bch2_btree_iter_set_snapshot(&src, snapshot);
+               bch2_btree_iter_set_snapshot(&dst, snapshot);
+               bch2_btree_iter_set_snapshot(&del, snapshot);
 
                bch2_trans_begin(&trans);
 
                k = insert
-                       ? bch2_btree_iter_peek_prev(src)
-                       : bch2_btree_iter_peek(src);
+                       ? bch2_btree_iter_peek_prev(&src)
+                       : bch2_btree_iter_peek(&src);
                if ((ret = bkey_err(k)))
                        continue;
 
@@ -2579,9 +2878,9 @@ reassemble:
                        bch2_cut_front(move_pos, copy.k);
 
                copy.k->k.p.offset += shift >> 9;
-               bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
+               bch2_btree_iter_set_pos(&dst, bkey_start_pos(&copy.k->k));
 
-               ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
+               ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end);
                if (ret)
                        continue;
 
@@ -2599,7 +2898,7 @@ reassemble:
                delete.k.p = copy.k->k.p;
                delete.k.size = copy.k->k.size;
                delete.k.p.offset -= shift >> 9;
-               bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
+               bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k));
 
                next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
 
@@ -2620,36 +2919,36 @@ reassemble:
                        BUG_ON(ret);
                }
 
-               ret =   bch2_btree_iter_traverse(del) ?:
-                       bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
-                       bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
-                       bch2_trans_commit(&trans, &disk_res,
-                                         &inode->ei_journal_seq,
+               ret =   bch2_btree_iter_traverse(&del) ?:
+                       bch2_trans_update(&trans, &del, &delete, trigger_flags) ?:
+                       bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?:
+                       bch2_trans_commit(&trans, &disk_res, NULL,
                                          BTREE_INSERT_NOFAIL);
                bch2_disk_reservation_put(c, &disk_res);
 
                if (!ret)
-                       bch2_btree_iter_set_pos(src, next_pos);
+                       bch2_btree_iter_set_pos(&src, next_pos);
        }
-       bch2_trans_iter_put(&trans, del);
-       bch2_trans_iter_put(&trans, dst);
-       bch2_trans_iter_put(&trans, src);
+       bch2_trans_iter_exit(&trans, &del);
+       bch2_trans_iter_exit(&trans, &dst);
+       bch2_trans_iter_exit(&trans, &src);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&copy, c);
 
        if (ret)
-               goto err;
+               return ret;
 
+       mutex_lock(&inode->ei_update_lock);
        if (!insert) {
                i_size_write(&inode->v, new_size);
-               mutex_lock(&inode->ei_update_lock);
                ret = bch2_write_inode_size(c, inode, new_size,
                                            ATTR_MTIME|ATTR_CTIME);
-               mutex_unlock(&inode->ei_update_lock);
+       } else {
+               /* We need an inode update to update bi_journal_seq for fsync: */
+               ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
+                                      ATTR_MTIME|ATTR_CTIME);
        }
-err:
-       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
-       inode_unlock(&inode->v);
+       mutex_unlock(&inode->ei_update_lock);
        return ret;
 }
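
The collapse/insert loop rewrites extent keys at a fixed shift: each key is copied with p.offset moved by shift >> 9 sectors (shift is +len for insert, -len for collapse; note the reliance on arithmetic right shift for the negative case, as in the kernel code above), then the original key is deleted. A standalone illustration of the offset arithmetic:

#include <stdio.h>

int main(void)
{
	long long len = 1 << 16;	/* 64 KiB collapsed or inserted */
	unsigned long long key = (2ULL << 20) >> 9;	/* extent at 2 MiB, in sectors */

	printf("insert:   %llu -> %llu\n", key, key + (len >> 9));
	printf("collapse: %llu -> %llu\n", key, key + (-len >> 9));
	return 0;
}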
 
@@ -2658,41 +2957,49 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bpos end_pos = POS(inode->v.i_ino, end_sector);
        unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
        int ret = 0;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
                        POS(inode->v.i_ino, start_sector),
                        BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-       while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
+       while (!ret && bkey_cmp(iter.pos, end_pos) < 0) {
                s64 i_sectors_delta = 0;
                struct disk_reservation disk_res = { 0 };
                struct quota_res quota_res = { 0 };
                struct bkey_i_reservation reservation;
                struct bkey_s_c k;
                unsigned sectors;
+               u32 snapshot;
 
                bch2_trans_begin(&trans);
 
-               k = bch2_btree_iter_peek_slot(iter);
+               ret = bch2_subvolume_get_snapshot(&trans,
+                                       inode->ei_subvol, &snapshot);
+               if (ret)
+                       goto bkey_err;
+
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+               k = bch2_btree_iter_peek_slot(&iter);
                if ((ret = bkey_err(k)))
                        goto bkey_err;
 
                /* already reserved */
                if (k.k->type == KEY_TYPE_reservation &&
                    bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
                if (bkey_extent_is_data(k.k) &&
                    !(mode & FALLOC_FL_ZERO_RANGE)) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
@@ -2701,7 +3008,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                reservation.k.p         = k.k->p;
                reservation.k.size      = k.k->size;
 
-               bch2_cut_front(iter->pos,       &reservation.k_i);
+               bch2_cut_front(iter.pos,        &reservation.k_i);
                bch2_cut_back(end_pos,          &reservation.k_i);
 
                sectors = reservation.k.size;
@@ -2725,9 +3032,12 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
                        reservation.v.nr_replicas = disk_res.nr_replicas;
                }
 
-               ret = bch2_extent_update(&trans, iter, &reservation.k_i,
-                               &disk_res, &inode->ei_journal_seq,
+               ret = bch2_extent_update(&trans, inode_inum(inode), &iter,
+                                        &reservation.k_i,
+                               &disk_res, NULL,
                                0, &i_sectors_delta, true);
+               if (ret)
+                       goto bkey_err;
                i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
 bkey_err:
                bch2_quota_reservation_put(c, inode, &quota_res);
@@ -2735,7 +3045,21 @@ bkey_err:
                if (ret == -EINTR)
                        ret = 0;
        }
-       bch2_trans_iter_put(&trans, iter);
+
+       bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */
+       mark_pagecache_reserved(inode, start_sector, iter.pos.offset);
+
+       if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) {
+               struct quota_res quota_res = { 0 };
+               s64 i_sectors_delta = 0;
+
+               bch2_fpunch_at(&trans, &iter, inode_inum(inode),
+                              end_sector, &i_sectors_delta);
+               i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
+               bch2_quota_reservation_put(c, inode, &quota_res);
+       }
+
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -2743,77 +3067,58 @@ bkey_err:
 static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
                            loff_t offset, loff_t len)
 {
-       struct address_space *mapping = inode->v.i_mapping;
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
-       loff_t end              = offset + len;
-       loff_t block_start      = round_down(offset,    block_bytes(c));
-       loff_t block_end        = round_up(end,         block_bytes(c));
-       int ret;
-
-       inode_lock(&inode->v);
-       inode_dio_wait(&inode->v);
-       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+       u64 end         = offset + len;
+       u64 block_start = round_down(offset,    block_bytes(c));
+       u64 block_end   = round_up(end,         block_bytes(c));
+       bool truncated_last_page = false;
+       int ret, ret2 = 0;
 
        if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
                ret = inode_newsize_ok(&inode->v, end);
                if (ret)
-                       goto err;
+                       return ret;
        }
 
        if (mode & FALLOC_FL_ZERO_RANGE) {
-               ret = __bch2_truncate_page(inode,
-                                          offset >> PAGE_SHIFT,
-                                          offset, end);
-
-               if (!ret &&
-                   offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
-                       ret = __bch2_truncate_page(inode,
-                                                  end >> PAGE_SHIFT,
-                                                  offset, end);
+               ret = bch2_truncate_pages(inode, offset, end);
+               if (unlikely(ret < 0))
+                       return ret;
 
-               if (unlikely(ret))
-                       goto err;
+               truncated_last_page = ret;
 
                truncate_pagecache_range(&inode->v, offset, end - 1);
+
+               block_start     = round_up(offset,      block_bytes(c));
+               block_end       = round_down(end,       block_bytes(c));
        }
 
        ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
-       if (ret)
-               goto err;
 
        /*
-        * Do we need to extend the file?
-        *
-        * If we zeroed up to the end of the file, we dropped whatever writes
-        * were going to write out the current i_size, so we have to extend
-        * manually even if FL_KEEP_SIZE was set:
+        * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update,
+        * so that the VFS cache i_size is consistent with the btree i_size:
         */
-       if (end >= inode->v.i_size &&
-           (!(mode & FALLOC_FL_KEEP_SIZE) ||
-            (mode & FALLOC_FL_ZERO_RANGE))) {
+       if (ret &&
+           !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)))
+               return ret;
 
-               /*
-                * Sync existing appends before extending i_size,
-                * as in bch2_extend():
-                */
-               ret = filemap_write_and_wait_range(mapping,
-                                       inode->ei_inode.bi_size, S64_MAX);
-               if (ret)
-                       goto err;
+       if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size)
+               end = inode->v.i_size;
 
-               if (mode & FALLOC_FL_KEEP_SIZE)
-                       end = inode->v.i_size;
-               else
-                       i_size_write(&inode->v, end);
+       if (end >= inode->v.i_size &&
+           (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+            !(mode & FALLOC_FL_KEEP_SIZE))) {
+               spin_lock(&inode->v.i_lock);
+               i_size_write(&inode->v, end);
+               spin_unlock(&inode->v.i_lock);
 
                mutex_lock(&inode->ei_update_lock);
-               ret = bch2_write_inode_size(c, inode, end, 0);
+               ret2 = bch2_write_inode_size(c, inode, end, 0);
                mutex_unlock(&inode->ei_update_lock);
        }
-err:
-       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
-       inode_unlock(&inode->v);
-       return ret;
+
+       return ret ?: ret2;
 }
 
 long bch2_fallocate_dispatch(struct file *file, int mode,
@@ -2826,6 +3131,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
        if (!percpu_ref_tryget(&c->writes))
                return -EROFS;
 
+       inode_lock(&inode->v);
+       inode_dio_wait(&inode->v);
+       bch2_pagecache_block_get(&inode->ei_pagecache_lock);
+
        if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
                ret = bchfs_fallocate(inode, mode, offset, len);
        else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
@@ -2837,48 +3146,14 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
        else
                ret = -EOPNOTSUPP;
 
+       bch2_pagecache_block_put(&inode->ei_pagecache_lock);
+       inode_unlock(&inode->v);
        percpu_ref_put(&c->writes);
 
        return ret;
 }
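
The locking (inode lock, DIO drain, pagecache block) moves out of the individual operations into bch2_fallocate_dispatch(), so every mode now runs under the same locks. Nothing changes for userspace; e.g. the ZERO_RANGE path above is reached with a plain fallocate(2) call (sketch; the file path is hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/mnt/bcachefs/file", O_RDWR);

	/* zero bytes [1000, 9000), allocating/reserving as needed */
	if (fd < 0 || fallocate(fd, FALLOC_FL_ZERO_RANGE, 1000, 8000))
		perror("fallocate");
	return 0;
}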
 
-static void mark_range_unallocated(struct bch_inode_info *inode,
-                                  loff_t start, loff_t end)
-{
-       pgoff_t index = start >> PAGE_SHIFT;
-       pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
-       struct pagevec pvec;
-
-       pagevec_init(&pvec);
-
-       do {
-               unsigned nr_pages, i, j;
-
-               nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
-                                               &index, end_index);
-               if (nr_pages == 0)
-                       break;
-
-               for (i = 0; i < nr_pages; i++) {
-                       struct page *page = pvec.pages[i];
-                       struct bch_page_state *s;
-
-                       lock_page(page);
-                       s = bch2_page_state(page);
-
-                       if (s) {
-                               spin_lock(&s->lock);
-                               for (j = 0; j < PAGE_SECTORS; j++)
-                                       s->s[j].nr_replicas = 0;
-                               spin_unlock(&s->lock);
-                       }
-
-                       unlock_page(page);
-               }
-               pagevec_release(&pvec);
-       } while (index <= end_index);
-}
-
 loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
                             struct file *file_dst, loff_t pos_dst,
                             loff_t len, unsigned remap_flags)
@@ -2924,13 +3199,13 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
        if (ret)
                goto err;
 
-       mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+       mark_pagecache_unallocated(src, pos_src >> 9,
+                                  (pos_src + aligned_len) >> 9);
 
        ret = bch2_remap_range(c,
-                              POS(dst->v.i_ino, pos_dst >> 9),
-                              POS(src->v.i_ino, pos_src >> 9),
+                              inode_inum(dst), pos_dst >> 9,
+                              inode_inum(src), pos_src >> 9,
                               aligned_len >> 9,
-                              &dst->ei_journal_seq,
                               pos_dst + len, &i_sectors_delta);
        if (ret < 0)
                goto err;
@@ -2948,10 +3223,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
                i_size_write(&dst->v, pos_dst + ret);
        spin_unlock(&dst->v.i_lock);
 
-       if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
-            IS_SYNC(file_inode(file_dst))) &&
-           !c->opts.journal_flush_disabled)
-               ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq);
+       if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) ||
+           IS_SYNC(file_inode(file_dst)))
+               ret = bch2_flush_inode(c, inode_inum(dst));
 err:
        bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
 
@@ -3017,9 +3291,11 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
+       subvol_inum inum = inode_inum(inode);
        u64 isize, next_data = MAX_LFS_FILESIZE;
+       u32 snapshot;
        int ret;
 
        isize = i_size_read(&inode->v);
@@ -3027,9 +3303,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
                return -ENXIO;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents,
-                          POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+                          SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
                        break;
                } else if (bkey_extent_is_data(k.k)) {
@@ -3038,9 +3320,12 @@ static loff_t bch2_seek_data(struct file *file, u64 offset)
                } else if (k.k->p.offset >> 9 > isize)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
-       ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
        if (ret)
                return ret;
 
@@ -3113,9 +3398,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
        struct bch_inode_info *inode = file_bch_inode(file);
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
+       subvol_inum inum = inode_inum(inode);
        u64 isize, next_hole = MAX_LFS_FILESIZE;
+       u32 snapshot;
        int ret;
 
        isize = i_size_read(&inode->v);
@@ -3123,9 +3410,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
                return -ENXIO;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents,
-                          POS(inode->v.i_ino, offset >> 9),
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents,
+                          SPOS(inode->v.i_ino, offset >> 9, snapshot),
                           BTREE_ITER_SLOTS, k, ret) {
                if (k.k->p.inode != inode->v.i_ino) {
                        next_hole = bch2_seek_pagecache_hole(&inode->v,
@@ -3142,9 +3435,12 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset)
                        offset = max(offset, bkey_start_offset(k.k) << 9);
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
 
-       ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
        if (ret)
                return ret;
 
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index 91a0e761c8e70d5179d42a29fade38ed61666350..9f329a624c1270628aa4732b0de530bdd1b57056 100644 (file)
 #include "quota.h"
 
 #include <linux/compat.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include <linux/writeback.h>
 
 #define FS_IOC_GOINGDOWN            _IOR('X', 125, __u32)
 #define FSOP_GOING_FLAGS_DEFAULT       0x0     /* going down */
@@ -192,7 +196,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
        char *kname = NULL;
        struct qstr qstr;
        int ret = 0;
-       u64 inum;
+       subvol_inum inum;
 
        kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
        if (!kname)
@@ -205,10 +209,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
        qstr.len        = ret;
        qstr.name       = kname;
 
-       ret = -ENOENT;
-       inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
-                                 &qstr);
-       if (!inum)
+       ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum);
+       if (ret)
                goto err1;
 
        vinode = bch2_vfs_inode_get(c, inum);
@@ -294,6 +296,161 @@ err:
        return ret;
 }
 
+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
+                               struct bch_ioctl_subvolume arg)
+{
+       struct inode *dir;
+       struct bch_inode_info *inode;
+       struct user_namespace *s_user_ns;
+       struct dentry *dst_dentry;
+       struct path src_path, dst_path;
+       int how = LOOKUP_FOLLOW;
+       int error;
+       subvol_inum snapshot_src = { 0 };
+       unsigned lookup_flags = 0;
+       unsigned create_flags = BCH_CREATE_SUBVOL;
+
+       if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE|
+                         BCH_SUBVOL_SNAPSHOT_RO))
+               return -EINVAL;
+
+       if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+           (arg.src_ptr ||
+            (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)))
+               return -EINVAL;
+
+       if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+               create_flags |= BCH_CREATE_SNAPSHOT;
+
+       if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
+               create_flags |= BCH_CREATE_SNAPSHOT_RO;
+
+       /* why do we need this lock? */
+       down_read(&c->vfs_sb->s_umount);
+
+       if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+               sync_inodes_sb(c->vfs_sb);
+retry:
+       if (arg.src_ptr) {
+               error = user_path_at(arg.dirfd,
+                               (const char __user *)(unsigned long)arg.src_ptr,
+                               how, &src_path);
+               if (error)
+                       goto err1;
+
+               if (src_path.dentry->d_sb->s_fs_info != c) {
+                       path_put(&src_path);
+                       error = -EXDEV;
+                       goto err1;
+               }
+
+               snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode));
+       }
+
+       dst_dentry = user_path_create(arg.dirfd,
+                       (const char __user *)(unsigned long)arg.dst_ptr,
+                       &dst_path, lookup_flags);
+       error = PTR_ERR_OR_ZERO(dst_dentry);
+       if (error)
+               goto err2;
+
+       if (dst_dentry->d_sb->s_fs_info != c) {
+               error = -EXDEV;
+               goto err3;
+       }
+
+       if (dst_dentry->d_inode) {
+               error = -EEXIST;
+               goto err3;
+       }
+
+       dir = dst_path.dentry->d_inode;
+       if (IS_DEADDIR(dir)) {
+               error = -ENOENT;
+               goto err3;
+       }
+
+       s_user_ns = dir->i_sb->s_user_ns;
+       if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+           !kgid_has_mapping(s_user_ns, current_fsgid())) {
+               error = -EOVERFLOW;
+               goto err3;
+       }
+
+       error = inode_permission(file_mnt_user_ns(filp),
+                                dir, MAY_WRITE | MAY_EXEC);
+       if (error)
+               goto err3;
+
+       if (!IS_POSIXACL(dir))
+               arg.mode &= ~current_umask();
+
+       error = security_path_mkdir(&dst_path, dst_dentry, arg.mode);
+       if (error)
+               goto err3;
+
+       if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) &&
+           !arg.src_ptr)
+               snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol;
+
+       inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir),
+                             dst_dentry, arg.mode|S_IFDIR,
+                             0, snapshot_src, create_flags);
+       error = PTR_ERR_OR_ZERO(inode);
+       if (error)
+               goto err3;
+
+       d_instantiate(dst_dentry, &inode->v);
+       fsnotify_mkdir(dir, dst_dentry);
+err3:
+       done_path_create(&dst_path, dst_dentry);
+err2:
+       if (arg.src_ptr)
+               path_put(&src_path);
+
+       if (retry_estale(error, lookup_flags)) {
+               lookup_flags |= LOOKUP_REVAL;
+               goto retry;
+       }
+err1:
+       up_read(&c->vfs_sb->s_umount);
+
+       return error;
+}
+
+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
+                               struct bch_ioctl_subvolume arg)
+{
+       struct path path;
+       struct inode *dir;
+       int ret = 0;
+
+       if (arg.flags)
+               return -EINVAL;
+
+       ret = user_path_at(arg.dirfd,
+                       (const char __user *)(unsigned long)arg.dst_ptr,
+                       LOOKUP_FOLLOW, &path);
+       if (ret)
+               return ret;
+
+       if (path.dentry->d_sb->s_fs_info != c) {
+               path_put(&path);
+               return -EXDEV;
+       }
+
+       dir = path.dentry->d_parent->d_inode;
+
+       ret = __bch2_unlink(dir, path.dentry, true);
+       if (!ret) {
+               fsnotify_rmdir(dir, path.dentry);
+               d_delete(path.dentry);
+       }
+       path_put(&path);
+
+       return ret;
+}
+
 long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
@@ -324,6 +481,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case FS_IOC_GOINGDOWN:
                return bch2_ioc_goingdown(c, (u32 __user *) arg);
 
+       case BCH_IOCTL_SUBVOLUME_CREATE: {
+               struct bch_ioctl_subvolume i;
+
+               if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+                       return -EFAULT;
+               return bch2_ioctl_subvolume_create(c, file, i);
+       }
+
+       case BCH_IOCTL_SUBVOLUME_DESTROY: {
+               struct bch_ioctl_subvolume i;
+
+               if (copy_from_user(&i, (void __user *) arg, sizeof(i)))
+                       return -EFAULT;
+               return bch2_ioctl_subvolume_destroy(c, file, i);
+       }
+
        default:
                return bch2_fs_ioctl(c, cmd, (void __user *) arg);
        }
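
Userspace reaches the new subvolume ioctls through any open fd on the filesystem, passing paths (and, for snapshots, a source) in struct bch_ioctl_subvolume. A minimal sketch of creating a snapshot, assuming the definitions from bcachefs_ioctl.h (also updated in this commit) and hypothetical mount paths:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include "libbcachefs/bcachefs_ioctl.h"	/* struct bch_ioctl_subvolume, BCH_IOCTL_* */

int main(void)
{
	int fd = open("/mnt/bcachefs", O_RDONLY);	/* any fd on the target fs */
	struct bch_ioctl_subvolume arg;

	memset(&arg, 0, sizeof(arg));
	arg.flags   = BCH_SUBVOL_SNAPSHOT_CREATE;
	arg.dirfd   = AT_FDCWD;		/* the paths below are absolute */
	arg.mode    = 0755;
	arg.src_ptr = (unsigned long) "/mnt/bcachefs/subvol";
	arg.dst_ptr = (unsigned long) "/mnt/bcachefs/subvol.snap";

	if (fd < 0 || ioctl(fd, BCH_IOCTL_SUBVOLUME_CREATE, &arg))
		perror("BCH_IOCTL_SUBVOLUME_CREATE");
	return 0;
}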
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 631fb87b81c9ca9866eaab3413fbe916a6fd4e5a..91fa1897db98358c8d2f9f246002463d13df0822 100644 (file)
 
 static struct kmem_cache *bch2_inode_cache;
 
-static void bch2_vfs_inode_init(struct bch_fs *,
+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
                                struct bch_inode_info *,
-                               struct bch_inode_unpacked *);
-
-static void journal_seq_copy(struct bch_fs *c,
-                            struct bch_inode_info *dst,
-                            u64 journal_seq)
-{
-       /*
-        * atomic64_cmpxchg has a fallback for archs that don't support it,
-        * cmpxchg does not:
-        */
-       atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
-       u64 old, v = READ_ONCE(dst->ei_journal_seq);
-
-       do {
-               old = v;
-
-               if (old >= journal_seq)
-                       break;
-       } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
-
-       bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
-}
+                               struct bch_inode_unpacked *,
+                               struct bch_subvolume *);
 
 static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
 {
@@ -113,11 +93,19 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock)
        __pagecache_lock_get(lock, -1);
 }
 
-void bch2_inode_update_after_write(struct bch_fs *c,
+void bch2_inode_update_after_write(struct btree_trans *trans,
                                   struct bch_inode_info *inode,
                                   struct bch_inode_unpacked *bi,
                                   unsigned fields)
 {
+       struct bch_fs *c = trans->c;
+
+       BUG_ON(bi->bi_inum != inode->v.i_ino);
+
+       bch2_assert_pos_locked(trans, BTREE_ID_inodes,
+                              POS(0, bi->bi_inum),
+                              c->opts.inodes_use_key_cache);
+
        set_nlink(&inode->v, bch2_inode_nlink_get(bi));
        i_uid_write(&inode->v, bi->bi_uid);
        i_gid_write(&inode->v, bi->bi_gid);
@@ -141,7 +129,7 @@ int __must_check bch2_write_inode(struct bch_fs *c,
                                  void *p, unsigned fields)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        struct bch_inode_unpacked inode_u;
        int ret;
 
@@ -149,23 +137,20 @@ int __must_check bch2_write_inode(struct bch_fs *c,
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
-                              BTREE_ITER_INTENT);
-       ret   = PTR_ERR_OR_ZERO(iter) ?:
+       ret   = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode),
+                               BTREE_ITER_INTENT) ?:
                (set ? set(inode, &inode_u, p) : 0) ?:
-               bch2_inode_write(&trans, iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL,
-                                 &inode->ei_journal_seq,
-                                 BTREE_INSERT_NOFAIL);
+               bch2_inode_write(&trans, &iter, &inode_u) ?:
+               bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 
        /*
         * the btree node lock protects inode->ei_inode, not ei_update_lock;
         * this is important for inode updates via bchfs_write_index_update
         */
        if (!ret)
-               bch2_inode_update_after_write(c, inode, &inode_u, fields);
+               bch2_inode_update_after_write(&trans, inode, &inode_u, fields);
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        if (ret == -EINTR)
                goto retry;
@@ -209,44 +194,73 @@ int bch2_fs_quota_transfer(struct bch_fs *c,
        return ret;
 }
 
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
+static int bch2_iget5_test(struct inode *vinode, void *p)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       subvol_inum *inum = p;
+
+       return inode->ei_subvol == inum->subvol &&
+               inode->ei_inode.bi_inum == inum->inum;
+}
+
+static int bch2_iget5_set(struct inode *vinode, void *p)
+{
+       struct bch_inode_info *inode = to_bch_ei(vinode);
+       subvol_inum *inum = p;
+
+       inode->v.i_ino          = inum->inum;
+       inode->ei_subvol        = inum->subvol;
+       inode->ei_inode.bi_inum = inum->inum;
+       return 0;
+}
+
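+       /* Inode cache lookups are keyed on (subvolume, inode number): */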
+static unsigned bch2_inode_hash(subvol_inum inum)
+{
+       return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
+}
+
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
 {
        struct bch_inode_unpacked inode_u;
        struct bch_inode_info *inode;
+       struct btree_trans trans;
+       struct bch_subvolume subvol;
        int ret;
 
-       inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
+       inode = to_bch_ei(iget5_locked(c->vfs_sb,
+                                      bch2_inode_hash(inum),
+                                      bch2_iget5_test,
+                                      bch2_iget5_set,
+                                      &inum));
        if (unlikely(!inode))
                return ERR_PTR(-ENOMEM);
        if (!(inode->v.i_state & I_NEW))
                return &inode->v;
 
-       ret = bch2_inode_find_by_inum(c, inum, &inode_u);
+       bch2_trans_init(&trans, c, 8, 0);
+       ret = lockrestart_do(&trans,
+               bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?:
+               bch2_inode_find_by_inum_trans(&trans, inum, &inode_u));
+
+       if (!ret)
+               bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
+       bch2_trans_exit(&trans);
+
        if (ret) {
                iget_failed(&inode->v);
                return ERR_PTR(ret);
        }
 
-       bch2_vfs_inode_init(c, inode, &inode_u);
-
-       inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
-
        unlock_new_inode(&inode->v);
 
        return &inode->v;
 }
 
-static int inum_test(struct inode *inode, void *p)
-{
-       unsigned long *ino = p;
-
-       return *ino == inode->i_ino;
-}
-
-static struct bch_inode_info *
+struct bch_inode_info *
 __bch2_create(struct user_namespace *mnt_userns,
              struct bch_inode_info *dir, struct dentry *dentry,
-             umode_t mode, dev_t rdev, bool tmpfile)
+             umode_t mode, dev_t rdev, subvol_inum snapshot_src,
+             unsigned flags)
 {
        struct bch_fs *c = dir->v.i_sb->s_fs_info;
        struct btree_trans trans;
@@ -254,6 +268,8 @@ __bch2_create(struct user_namespace *mnt_userns,
        struct bch_inode_info *inode, *old;
        struct bch_inode_unpacked inode_u;
        struct posix_acl *default_acl = NULL, *acl = NULL;
+       subvol_inum inum;
+       struct bch_subvolume subvol;
        u64 journal_seq = 0;
        int ret;
 
@@ -274,26 +290,34 @@ __bch2_create(struct user_namespace *mnt_userns,
 
        bch2_inode_init_early(c, &inode_u);
 
-       if (!tmpfile)
+       if (!(flags & BCH_CREATE_TMPFILE))
                mutex_lock(&dir->ei_update_lock);
 
        bch2_trans_init(&trans, c, 8,
-                       2048 + (!tmpfile ? dentry->d_name.len : 0));
+                       2048 + (!(flags & BCH_CREATE_TMPFILE)
+                               ? dentry->d_name.len : 0));
 retry:
        bch2_trans_begin(&trans);
 
-       ret   = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
-                                 !tmpfile ? &dentry->d_name : NULL,
+       ret   = bch2_create_trans(&trans,
+                                 inode_inum(dir), &dir_u, &inode_u,
+                                 !(flags & BCH_CREATE_TMPFILE)
+                                 ? &dentry->d_name : NULL,
                                  from_kuid(mnt_userns, current_fsuid()),
                                  from_kgid(mnt_userns, current_fsgid()),
                                  mode, rdev,
-                                 default_acl, acl) ?:
+                                 default_acl, acl, snapshot_src, flags) ?:
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
                                KEY_TYPE_QUOTA_PREALLOC);
        if (unlikely(ret))
                goto err_before_quota;
 
-       ret   = bch2_trans_commit(&trans, NULL, &journal_seq, 0);
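+       /*
+        * A new subvolume root has its own subvolume ID; everything else
+        * inherits the parent directory's:
+        */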
+       inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol;
+       inum.inum = inode_u.bi_inum;
+
+       ret   = bch2_subvolume_get(&trans, inum.subvol, true,
+                                  BTREE_ITER_WITH_UPDATES, &subvol) ?:
+               bch2_trans_commit(&trans, NULL, &journal_seq, 0);
        if (unlikely(ret)) {
                bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
@@ -303,15 +327,14 @@ err_before_quota:
                goto err_trans;
        }
 
-       if (!tmpfile) {
-               bch2_inode_update_after_write(c, dir, &dir_u,
+       if (!(flags & BCH_CREATE_TMPFILE)) {
+               bch2_inode_update_after_write(&trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               journal_seq_copy(c, dir, journal_seq);
                mutex_unlock(&dir->ei_update_lock);
        }
 
-       bch2_vfs_inode_init(c, inode, &inode_u);
-       journal_seq_copy(c, inode, journal_seq);
+       bch2_iget5_set(&inode->v, &inum);
+       bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol);
 
        set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
        set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
@@ -323,8 +346,12 @@ err_before_quota:
         */
 
        inode->v.i_state |= I_CREATING;
-       old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
-                                     inum_test, NULL, &inode->v.i_ino));
+
+       old = to_bch_ei(inode_insert5(&inode->v,
+                                     bch2_inode_hash(inum),
+                                     bch2_iget5_test,
+                                     bch2_iget5_set,
+                                     &inum));
        BUG_ON(!old);
 
        if (unlikely(old != inode)) {
@@ -332,7 +359,6 @@ err_before_quota:
                 * We raced, another process pulled the new inode into cache
                 * before us:
                 */
-               journal_seq_copy(c, old, journal_seq);
                make_bad_inode(&inode->v);
                iput(&inode->v);
 
@@ -351,7 +377,7 @@ err:
        posix_acl_release(acl);
        return inode;
 err_trans:
-       if (!tmpfile)
+       if (!(flags & BCH_CREATE_TMPFILE))
                mutex_unlock(&dir->ei_update_lock);
 
        bch2_trans_exit(&trans);
@@ -370,12 +396,13 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
        struct bch_inode_info *dir = to_bch_ei(vdir);
        struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
        struct inode *vinode = NULL;
-       u64 inum;
+       subvol_inum inum = { .subvol = 1 };
+       int ret;
 
-       inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
-                                 &dentry->d_name);
+       ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
+                                &dentry->d_name, &inum);
 
-       if (inum)
+       if (!ret)
                vinode = bch2_vfs_inode_get(c, inum);
 
        return d_splice_alias(vinode, dentry);
@@ -386,7 +413,8 @@ static int bch2_mknod(struct user_namespace *mnt_userns,
                      umode_t mode, dev_t rdev)
 {
        struct bch_inode_info *inode =
-               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false);
+               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev,
+                             (subvol_inum) { 0 }, 0);
 
        if (IS_ERR(inode))
                return PTR_ERR(inode);
@@ -414,19 +442,16 @@ static int __bch2_link(struct bch_fs *c,
        mutex_lock(&inode->ei_update_lock);
        bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0,
+       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_link_trans(&trans,
-                                       dir->v.i_ino,
-                                       inode->v.i_ino, &dir_u, &inode_u,
+                                       inode_inum(dir),   &dir_u,
+                                       inode_inum(inode), &inode_u,
                                        &dentry->d_name));
 
        if (likely(!ret)) {
-               BUG_ON(inode_u.bi_inum != inode->v.i_ino);
-
-               journal_seq_copy(c, inode, dir->ei_journal_seq);
-               bch2_inode_update_after_write(c, dir, &dir_u,
+               bch2_inode_update_after_write(&trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
+               bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME);
        }
 
        bch2_trans_exit(&trans);
@@ -453,7 +478,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
        return 0;
 }
 
-static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
+                 bool deleting_snapshot)
 {
        struct bch_fs *c = vdir->i_sb->s_fs_info;
        struct bch_inode_info *dir = to_bch_ei(vdir);
@@ -465,19 +491,17 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
        bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
        bch2_trans_init(&trans, c, 4, 1024);
 
-       ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
+       ret = __bch2_trans_do(&trans, NULL, NULL,
                              BTREE_INSERT_NOFAIL,
                        bch2_unlink_trans(&trans,
-                                         dir->v.i_ino, &dir_u,
-                                         &inode_u, &dentry->d_name));
+                                         inode_inum(dir), &dir_u,
+                                         &inode_u, &dentry->d_name,
+                                         deleting_snapshot));
 
        if (likely(!ret)) {
-               BUG_ON(inode_u.bi_inum != inode->v.i_ino);
-
-               journal_seq_copy(c, inode, dir->ei_journal_seq);
-               bch2_inode_update_after_write(c, dir, &dir_u,
+               bch2_inode_update_after_write(&trans, dir, &dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               bch2_inode_update_after_write(c, inode, &inode_u,
+               bch2_inode_update_after_write(&trans, inode, &inode_u,
                                              ATTR_MTIME);
        }
 
@@ -487,6 +511,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
        return ret;
 }
 
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
+{
+       return __bch2_unlink(vdir, dentry, false);
+}
+
 static int bch2_symlink(struct user_namespace *mnt_userns,
                        struct inode *vdir, struct dentry *dentry,
                        const char *symname)
@@ -495,7 +524,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
        struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
        int ret;
 
-       inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
+       inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0,
+                             (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
        if (unlikely(IS_ERR(inode)))
                return PTR_ERR(inode);
 
@@ -510,8 +540,6 @@ static int bch2_symlink(struct user_namespace *mnt_userns,
        if (unlikely(ret))
                goto err;
 
-       journal_seq_copy(c, dir, inode->ei_journal_seq);
-
        ret = __bch2_link(c, inode, dir, dentry);
        if (unlikely(ret))
                goto err;
@@ -546,7 +574,6 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
                ? BCH_RENAME_EXCHANGE
                : dst_dentry->d_inode
                ? BCH_RENAME_OVERWRITE : BCH_RENAME;
-       u64 journal_seq = 0;
        int ret;
 
        if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
@@ -586,10 +613,10 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
                        goto err;
        }
 
-       ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0,
+       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        bch2_rename_trans(&trans,
-                                         src_dir->v.i_ino, &src_dir_u,
-                                         dst_dir->v.i_ino, &dst_dir_u,
+                                         inode_inum(src_dir), &src_dir_u,
+                                         inode_inum(dst_dir), &dst_dir_u,
                                          &src_inode_u,
                                          &dst_inode_u,
                                          &src_dentry->d_name,
@@ -602,25 +629,19 @@ static int bch2_rename2(struct user_namespace *mnt_userns,
        BUG_ON(dst_inode &&
               dst_inode->v.i_ino != dst_inode_u.bi_inum);
 
-       bch2_inode_update_after_write(c, src_dir, &src_dir_u,
+       bch2_inode_update_after_write(&trans, src_dir, &src_dir_u,
                                      ATTR_MTIME|ATTR_CTIME);
-       journal_seq_copy(c, src_dir, journal_seq);
 
-       if (src_dir != dst_dir) {
-               bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
+       if (src_dir != dst_dir)
+               bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u,
                                              ATTR_MTIME|ATTR_CTIME);
-               journal_seq_copy(c, dst_dir, journal_seq);
-       }
 
-       bch2_inode_update_after_write(c, src_inode, &src_inode_u,
+       bch2_inode_update_after_write(&trans, src_inode, &src_inode_u,
                                      ATTR_CTIME);
-       journal_seq_copy(c, src_inode, journal_seq);
 
-       if (dst_inode) {
-               bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
+       if (dst_inode)
+               bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u,
                                              ATTR_CTIME);
-               journal_seq_copy(c, dst_inode, journal_seq);
-       }
 err:
        bch2_trans_exit(&trans);
 
@@ -686,7 +707,7 @@ int bch2_setattr_nonsize(struct user_namespace *mnt_userns,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_qid qid;
        struct btree_trans trans;
-       struct btree_iter *inode_iter;
+       struct btree_iter inode_iter = { NULL };
        struct bch_inode_unpacked inode_u;
        struct posix_acl *acl = NULL;
        int ret;
@@ -712,33 +733,32 @@ retry:
        kfree(acl);
        acl = NULL;
 
-       inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
-                                    BTREE_ITER_INTENT);
-       ret = PTR_ERR_OR_ZERO(inode_iter);
+       ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode),
+                             BTREE_ITER_INTENT);
        if (ret)
                goto btree_err;
 
        bch2_setattr_copy(mnt_userns, inode, &inode_u, attr);
 
        if (attr->ia_valid & ATTR_MODE) {
-               ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
+               ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u,
+                                    inode_u.bi_mode, &acl);
                if (ret)
                        goto btree_err;
        }
 
-       ret =   bch2_inode_write(&trans, inode_iter, &inode_u) ?:
-               bch2_trans_commit(&trans, NULL,
-                                 &inode->ei_journal_seq,
+       ret =   bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+               bch2_trans_commit(&trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL);
 btree_err:
-       bch2_trans_iter_put(&trans, inode_iter);
+       bch2_trans_iter_exit(&trans, &inode_iter);
 
        if (ret == -EINTR)
                goto retry;
        if (unlikely(ret))
                goto err_trans;
 
-       bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);
+       bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid);
 
        if (acl)
                set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -812,7 +832,8 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns,
                        struct inode *vdir, struct dentry *dentry, umode_t mode)
 {
        struct bch_inode_info *inode =
-               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true);
+               __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0,
+                             (subvol_inum) { 0 }, BCH_CREATE_TMPFILE);
 
        if (IS_ERR(inode))
                return PTR_ERR(inode);
@@ -844,8 +865,8 @@ static int bch2_fill_extent(struct bch_fs *c,
                        else
                                offset += p.crc.offset;
 
-                       if ((offset & (c->opts.block_size - 1)) ||
-                           (k.k->size & (c->opts.block_size - 1)))
+                       if ((offset & (block_sectors(c) - 1)) ||
+                           (k.k->size & (block_sectors(c) - 1)))
                                flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
 
                        ret = fiemap_fill_next_extent(info,
@@ -881,12 +902,13 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        struct bch_fs *c = vinode->i_sb->s_fs_info;
        struct bch_inode_info *ei = to_bch_ei(vinode);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
        unsigned offset_into_extent, sectors;
        bool have_extent = false;
+       u32 snapshot;
        int ret = 0;
 
        ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
@@ -896,27 +918,33 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        if (start + len < start)
                return -EINVAL;
 
+       start >>= 9;
+
        bch2_bkey_buf_init(&cur);
        bch2_bkey_buf_init(&prev);
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  POS(ei->v.i_ino, start >> 9), 0);
 retry:
        bch2_trans_begin(&trans);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            SPOS(ei->v.i_ino, start, snapshot), 0);
+
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
               !(ret = bkey_err(k)) &&
-              bkey_cmp(iter->pos, end) < 0) {
+              bkey_cmp(iter.pos, end) < 0) {
                enum btree_id data_btree = BTREE_ID_extents;
 
                if (!bkey_extent_is_data(k.k) &&
                    k.k->type != KEY_TYPE_reservation) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
-               offset_into_extent      = iter->pos.offset -
+               offset_into_extent      = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors                 = k.k->size - offset_into_extent;
 
@@ -937,7 +965,7 @@ retry:
                                   offset_into_extent),
                               cur.k);
                bch2_key_resize(&cur.k->k, sectors);
-               cur.k->k.p = iter->pos;
+               cur.k->k.p = iter.pos;
                cur.k->k.p.offset += cur.k->k.size;
 
                if (have_extent) {
@@ -950,10 +978,12 @@ retry:
                bkey_copy(prev.k, cur.k);
                have_extent = true;
 
-               bch2_btree_iter_set_pos(iter,
-                       POS(iter->pos.inode, iter->pos.offset + sectors));
+               bch2_btree_iter_set_pos(&iter,
+                       POS(iter.pos.inode, iter.pos.offset + sectors));
        }
-
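+       /* Remember where we left off, so a transaction restart resumes here: */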
+       start = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
        if (ret == -EINTR)
                goto retry;
 
@@ -961,8 +991,7 @@ retry:
                ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);
 
-       bch2_trans_iter_put(&trans, iter);
-       ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&cur, c);
        bch2_bkey_buf_exit(&prev, c);
        return ret < 0 ? ret : 0;
@@ -998,7 +1027,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
        if (!dir_emit_dots(file, ctx))
                return 0;
 
-       return bch2_readdir(c, inode->v.i_ino, ctx);
+       return bch2_readdir(c, inode_inum(inode), ctx);
 }
 
 static const struct file_operations bch_file_operations = {
@@ -1098,51 +1127,243 @@ static const struct address_space_operations bch_address_space_operations = {
        .error_remove_page = generic_error_remove_page,
 };
 
-static struct inode *bch2_nfs_get_inode(struct super_block *sb,
-               u64 ino, u32 generation)
+struct bcachefs_fid {
+       u64             inum;
+       u32             subvol;
+       u32             gen;
+} __packed;
+
+struct bcachefs_fid_with_parent {
+       struct bcachefs_fid     fid;
+       struct bcachefs_fid     dir;
+} __packed;
+
+static int bcachefs_fid_valid(int fh_len, int fh_type)
 {
-       struct bch_fs *c = sb->s_fs_info;
-       struct inode *vinode;
+       switch (fh_type) {
+       case FILEID_BCACHEFS_WITHOUT_PARENT:
+               return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32);
+       case FILEID_BCACHEFS_WITH_PARENT:
+               return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32);
+       default:
+               return false;
+       }
+}
+
+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode)
+{
+       return (struct bcachefs_fid) {
+               .inum   = inode->ei_inode.bi_inum,
+               .subvol = inode->ei_subvol,
+               .gen    = inode->ei_inode.bi_generation,
+       };
+}
 
-       if (ino < BCACHEFS_ROOT_INO)
-               return ERR_PTR(-ESTALE);
+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len,
+                         struct inode *vdir)
+{
+       struct bch_inode_info *inode    = to_bch_ei(vinode);
+       struct bch_inode_info *dir      = to_bch_ei(vdir);
+
+       if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32))
+               return FILEID_INVALID;
+
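+       /* Include the parent when we have it, so the dentry can be reconnected: */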
+       if (!S_ISDIR(inode->v.i_mode) && dir) {
+               struct bcachefs_fid_with_parent *fid = (void *) fh;
+
+               fid->fid = bch2_inode_to_fid(inode);
+               fid->dir = bch2_inode_to_fid(dir);
+
+               *len = sizeof(*fid) / sizeof(u32);
+               return FILEID_BCACHEFS_WITH_PARENT;
+       } else {
+               struct bcachefs_fid *fid = (void *) fh;
+
+               *fid = bch2_inode_to_fid(inode);
+
+               *len = sizeof(*fid) / sizeof(u32);
+               return FILEID_BCACHEFS_WITHOUT_PARENT;
+       }
+}
 
-       vinode = bch2_vfs_inode_get(c, ino);
-       if (IS_ERR(vinode))
-               return ERR_CAST(vinode);
-       if (generation && vinode->i_generation != generation) {
-               /* we didn't find the right inode.. */
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
+                                       struct bcachefs_fid fid)
+{
+       struct bch_fs *c = sb->s_fs_info;
+       struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) {
+                                   .subvol = fid.subvol,
+                                   .inum = fid.inum,
+       });
+       if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) {
                iput(vinode);
-               return ERR_PTR(-ESTALE);
+               vinode = ERR_PTR(-ESTALE);
        }
        return vinode;
 }
 
-static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid,
                int fh_len, int fh_type)
 {
-       return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-                                   bch2_nfs_get_inode);
+       struct bcachefs_fid *fid = (void *) _fid;
+
+       if (!bcachefs_fid_valid(fh_len, fh_type))
+               return NULL;
+
+       return d_obtain_alias(bch2_nfs_get_inode(sb, *fid));
 }
 
-static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid,
                int fh_len, int fh_type)
 {
-       return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-                                   bch2_nfs_get_inode);
+       struct bcachefs_fid_with_parent *fid = (void *) _fid;
+
+       if (!bcachefs_fid_valid(fh_len, fh_type) ||
+           fh_type != FILEID_BCACHEFS_WITH_PARENT)
+               return NULL;
+
+       return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir));
+}
+
+static struct dentry *bch2_get_parent(struct dentry *child)
+{
+       struct bch_inode_info *inode = to_bch_ei(child->d_inode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
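+       /* Subvolume roots record their parent's subvolume in bi_parent_subvol: */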
+       subvol_inum parent_inum = {
+               .subvol = inode->ei_inode.bi_parent_subvol ?:
+                       inode->ei_subvol,
+               .inum = inode->ei_inode.bi_dir,
+       };
+
+       if (!parent_inum.inum)
+               return NULL;
+
+       return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum));
+}
+
+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child)
+{
+       struct bch_inode_info *inode    = to_bch_ei(child->d_inode);
+       struct bch_inode_info *dir      = to_bch_ei(parent->d_inode);
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct btree_trans trans;
+       struct btree_iter iter1;
+       struct btree_iter iter2;
+       struct bkey_s_c k;
+       struct bkey_s_c_dirent d;
+       struct bch_inode_unpacked inode_u;
+       subvol_inum target;
+       u32 snapshot;
+       unsigned name_len;
+       int ret;
+
+       if (!S_ISDIR(dir->v.i_mode))
+               return -EINVAL;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents,
+                            POS(dir->ei_inode.bi_inum, 0), 0);
+       bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents,
+                            POS(dir->ei_inode.bi_inum, 0), 0);
+retry:
+       bch2_trans_begin(&trans);
+
+       ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_btree_iter_set_snapshot(&iter1, snapshot);
+       bch2_btree_iter_set_snapshot(&iter2, snapshot);
+
+       ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u);
+       if (ret)
+               goto err;
+
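+       /* Fast path: the inode's backpointer points at this directory: */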
+       if (inode_u.bi_dir == dir->ei_inode.bi_inum) {
+               bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset));
+
+               k = bch2_btree_iter_peek_slot(&iter1);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_dirent) {
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               d = bkey_s_c_to_dirent(k);
+               ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+               if (ret > 0)
+                       ret = -ENOENT;
+               if (ret)
+                       goto err;
+
+               if (target.subvol       == inode->ei_subvol &&
+                   target.inum         == inode->ei_inode.bi_inum)
+                       goto found;
+       } else {
+               /*
+                * File with multiple hardlinks and our backref is to the wrong
+                * directory - linear search:
+                */
+               for_each_btree_key_continue_norestart(iter2, 0, k, ret) {
+                       if (k.k->p.inode > dir->ei_inode.bi_inum)
+                               break;
+
+                       if (k.k->type != KEY_TYPE_dirent)
+                               continue;
+
+                       d = bkey_s_c_to_dirent(k);
+                       ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target);
+                       if (ret < 0)
+                               break;
+                       if (ret)
+                               continue;
+
+                       if (target.subvol       == inode->ei_subvol &&
+                           target.inum         == inode->ei_inode.bi_inum)
+                               goto found;
+               }
+       }
+
+       ret = -ENOENT;
+       goto err;
+found:
+       name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX);
+
+       memcpy(name, d.v->d_name, name_len);
+       name[name_len] = '\0';
+err:
+       if (ret == -EINTR)
+               goto retry;
+
+       bch2_trans_iter_exit(&trans, &iter1);
+       bch2_trans_iter_exit(&trans, &iter2);
+       bch2_trans_exit(&trans);
+
+       return ret;
 }
 
 static const struct export_operations bch_export_ops = {
+       .encode_fh      = bch2_encode_fh,
        .fh_to_dentry   = bch2_fh_to_dentry,
        .fh_to_parent   = bch2_fh_to_parent,
-       //.get_parent   = bch2_get_parent,
+       .get_parent     = bch2_get_parent,
+       .get_name       = bch2_get_name,
 };
 
-static void bch2_vfs_inode_init(struct bch_fs *c,
+static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
                                struct bch_inode_info *inode,
-                               struct bch_inode_unpacked *bi)
+                               struct bch_inode_unpacked *bi,
+                               struct bch_subvolume *subvol)
 {
-       bch2_inode_update_after_write(c, inode, bi, ~0);
+       bch2_inode_update_after_write(trans, inode, bi, ~0);
 
        inode->v.i_blocks       = bi->bi_sectors;
        inode->v.i_ino          = bi->bi_inum;
@@ -1151,9 +1372,9 @@ static void bch2_vfs_inode_init(struct bch_fs *c,
        inode->v.i_size         = bi->bi_size;
 
        inode->ei_flags         = 0;
+       if (BCH_SUBVOLUME_SNAP(subvol))
+               set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags);
-       inode->ei_journal_seq   = 0;
        inode->ei_quota_reserved = 0;
        inode->ei_qid           = bch_qid(bi);
+       inode->ei_subvol        = inum.subvol;
 
        inode->v.i_mapping->a_ops = &bch_address_space_operations;
 
@@ -1189,7 +1410,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
        mutex_init(&inode->ei_update_lock);
        pagecache_lock_init(&inode->ei_pagecache_lock);
        mutex_init(&inode->ei_quota_lock);
-       inode->ei_journal_seq = 0;
 
        return &inode->v;
 }
@@ -1251,10 +1471,57 @@ static void bch2_evict_inode(struct inode *vinode)
                                KEY_TYPE_QUOTA_WARN);
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
-               bch2_inode_rm(c, inode->v.i_ino, true);
+               bch2_inode_rm(c, inode_inum(inode));
        }
 }
 
+void bch2_evict_subvolume_inodes(struct bch_fs *c,
+                                struct snapshot_id_list *s)
+{
+       struct super_block *sb = c->vfs_sb;
+       struct inode *inode;
+
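+       /*
+        * First pass: mark inodes in the subvolumes being deleted dontcache
+        * and drop their dentries:
+        */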
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+                   (inode->i_state & I_FREEING))
+                       continue;
+
+               d_mark_dontcache(inode);
+               d_prune_aliases(inode);
+       }
+       spin_unlock(&sb->s_inode_list_lock);
+again:
+       cond_resched();
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) ||
+                   (inode->i_state & I_FREEING))
+                       continue;
+
+               if (!(inode->i_state & I_DONTCACHE)) {
+                       d_mark_dontcache(inode);
+                       d_prune_aliases(inode);
+               }
+
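+               /*
+                * If the inode is still in use, wait for it to be evicted and
+                * rescan:
+                */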
+               spin_lock(&inode->i_lock);
+               if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) &&
+                   !(inode->i_state & I_FREEING)) {
+                       wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW);
+                       DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
+                       prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+                       spin_unlock(&inode->i_lock);
+                       spin_unlock(&sb->s_inode_list_lock);
+                       schedule();
+                       finish_wait(wq, &wait.wq_entry);
+                       goto again;
+               }
+
+               spin_unlock(&inode->i_lock);
+       }
+       spin_unlock(&sb->s_inode_list_lock);
+}
+
 static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
        struct super_block *sb = dentry->d_sb;
@@ -1413,7 +1680,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
                const struct bch_option *opt = &bch2_opt_table[i];
                u64 v = bch2_opt_get_by_id(&c->opts, i);
 
-               if (!(opt->mode & OPT_MOUNT))
+               if (!(opt->flags & OPT_MOUNT))
                        continue;
 
                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
@@ -1595,7 +1862,9 @@ got_sb:
                sb->s_flags     |= SB_POSIXACL;
 #endif
 
-       vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
+       sb->s_shrink.seeks = 0;
+
+       vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
        if (IS_ERR(vinode)) {
                bch_err(c, "error mounting: error getting root inode %i",
                        (int) PTR_ERR(vinode));
index 36cc6ba2d644f3c10b3ef3c2428bb21b6e84ed43..b2211ec7f3028600a941014057440dc3dc056c7a 100644 (file)
@@ -36,7 +36,6 @@ struct bch_inode_info {
        unsigned long           ei_flags;
 
        struct mutex            ei_update_lock;
-       u64                     ei_journal_seq;
        u64                     ei_quota_reserved;
        unsigned long           ei_last_dirtied;
 
@@ -45,16 +44,32 @@ struct bch_inode_info {
        struct mutex            ei_quota_lock;
        struct bch_qid          ei_qid;
 
+       u32                     ei_subvol;
+
        /* copy of inode in btree: */
        struct bch_inode_unpacked ei_inode;
 };
 
+static inline subvol_inum inode_inum(struct bch_inode_info *inode)
+{
+       return (subvol_inum) {
+               .subvol = inode->ei_subvol,
+               .inum   = inode->ei_inode.bi_inum,
+       };
+}
+
 /*
  * Set if we've gotten a btree error for this inode, and thus the vfs inode and
  * btree inode may be inconsistent:
  */
 #define EI_INODE_ERROR                 0
 
+/*
+ * Set if the inode is in a snapshot subvolume - we don't do quota accounting in
+ * those:
+ */
+#define EI_INODE_SNAPSHOT              1
+
 #define to_bch_ei(_inode)                                      \
        container_of_or_null(_inode, struct bch_inode_info, v)
 
@@ -135,6 +150,10 @@ struct bch_inode_unpacked;
 
 #ifndef NO_BCACHEFS_FS
 
+struct bch_inode_info *
+__bch2_create(struct user_namespace *, struct bch_inode_info *,
+             struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+
 int bch2_fs_quota_transfer(struct bch_fs *,
                           struct bch_inode_info *,
                           struct bch_qid,
@@ -154,13 +173,13 @@ static inline int bch2_set_projid(struct bch_fs *c,
                                      KEY_TYPE_QUOTA_PREALLOC);
 }
 
-struct inode *bch2_vfs_inode_get(struct bch_fs *, u64);
+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum);
 
 /* returns 0 if we want to do the update, or error is passed up */
 typedef int (*inode_set_fn)(struct bch_inode_info *,
                            struct bch_inode_unpacked *, void *);
 
-void bch2_inode_update_after_write(struct bch_fs *,
+void bch2_inode_update_after_write(struct btree_trans *,
                                   struct bch_inode_info *,
                                   struct bch_inode_unpacked *,
                                   unsigned);
@@ -170,12 +189,17 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
 int bch2_setattr_nonsize(struct user_namespace *,
                         struct bch_inode_info *,
                         struct iattr *);
+int __bch2_unlink(struct inode *, struct dentry *, bool);
+
+void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *);
 
 void bch2_vfs_exit(void);
 int bch2_vfs_init(void);
 
 #else
 
+static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
+                                              struct snapshot_id_list *s) {}
 static inline void bch2_vfs_exit(void) {}
 static inline int bch2_vfs_init(void) { return 0; }
 
index 36eba46d566e351c1b50dec637bc02f1ff2a7e17..ced4d671eb8d707e49b8600a5bfa607c4711751f 100644 (file)
@@ -9,6 +9,7 @@
 #include "fsck.h"
 #include "inode.h"
 #include "keylist.h"
+#include "subvolume.h"
 #include "super.h"
 #include "xattr.h"
 
 
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
-static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
+                                   u32 snapshot)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        u64 sectors = 0;
        int ret;
 
        for_each_btree_key(trans, iter, BTREE_ID_extents,
-                          POS(inum, 0), 0, k, ret) {
+                          SPOS(inum, 0, snapshot), 0, k, ret) {
                if (k.k->p.inode != inum)
                        break;
 
@@ -33,33 +35,138 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
                        sectors += k.k->size;
        }
 
-       bch2_trans_iter_free(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
 
        return ret ?: sectors;
 }
 
+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
+                                   u32 snapshot)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_dirent d;
+       u64 subdirs = 0;
+       int ret;
+
+       for_each_btree_key(trans, iter, BTREE_ID_dirents,
+                          SPOS(inum, 0, snapshot), 0, k, ret) {
+               if (k.k->p.inode != inum)
+                       break;
+
+               if (k.k->type != KEY_TYPE_dirent)
+                       continue;
+
+               d = bkey_s_c_to_dirent(k);
+               if (d.v->d_type == DT_DIR)
+                       subdirs++;
+       }
+
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret ?: subdirs;
+}
+
+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
+                                   u32 *subvol)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+                            POS(0, snapshot), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_snapshot) {
+               bch_err(trans->c, "snapshot %u not found", snapshot);
+               ret = -ENOENT;
+               goto err;
+       }
+
+       *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
+                          u32 *snapshot, u64 *inum)
+{
+       struct bch_subvolume s;
+       int ret;
+
+       ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+
+       *snapshot = le32_to_cpu(s.snapshot);
+       *inum = le64_to_cpu(s.inode);
+       return ret;
+}
+
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+                        u32 *snapshot, u64 *inum)
+{
+       return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
+}
+
+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
+                             struct bch_inode_unpacked *inode)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
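+       /*
+        * Walk every snapshot version of this inode number and return the
+        * first one we see:
+        */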
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                            POS(0, inode_nr),
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       k = bch2_btree_iter_peek(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) {
+               ret = -ENOENT;
+               goto err;
+       }
+
+       ret = bch2_inode_unpack(k, inode);
+err:
+       if (ret && ret != -EINTR)
+               bch_err(trans->c, "error %i fetching inode %llu",
+                       ret, inode_nr);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
 static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
                          struct bch_inode_unpacked *inode,
                          u32 *snapshot)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
-                       POS(0, inode_nr), 0);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                            SPOS(0, inode_nr, *snapshot), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
-       if (snapshot)
-               *snapshot = iter->pos.snapshot;
-       ret = k.k->type == KEY_TYPE_inode
-               ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
+       ret = bkey_is_inode(k.k)
+               ? bch2_inode_unpack(k, inode)
                : -ENOENT;
+       if (!ret)
+               *snapshot = iter.pos.snapshot;
 err:
-       bch2_trans_iter_free(trans, iter);
+       if (ret && ret != -EINTR)
+               bch_err(trans->c, "error %i fetching inode %llu:%u",
+                       ret, inode_nr, *snapshot);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -70,17 +177,41 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
        return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
 }
 
+static int __lookup_dirent(struct btree_trans *trans,
+                          struct bch_hash_info hash_info,
+                          subvol_inum dir, struct qstr *name,
+                          u64 *target, unsigned *type)
+{
+       struct btree_iter iter;
+       struct bkey_s_c_dirent d;
+       int ret;
+
+       ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
+                              &hash_info, dir, name, 0);
+       if (ret)
+               return ret;
+
+       d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter));
+       *target = le64_to_cpu(d.v->d_inum);
+       *type = d.v->d_type;
+       bch2_trans_iter_exit(trans, &iter);
+       return 0;
+}
+
 static int __write_inode(struct btree_trans *trans,
                         struct bch_inode_unpacked *inode,
                         u32 snapshot)
 {
-       struct btree_iter *inode_iter =
-               bch2_trans_get_iter(trans, BTREE_ID_inodes,
-                                   SPOS(0, inode->bi_inum, snapshot),
-                                   BTREE_ITER_INTENT);
-       int ret = bch2_btree_iter_traverse(inode_iter) ?:
-               bch2_inode_write(trans, inode_iter, inode);
-       bch2_trans_iter_put(trans, inode_iter);
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                           SPOS(0, inode->bi_inum, snapshot),
+                           BTREE_ITER_INTENT);
+
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_inode_write(trans, &iter, inode);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -97,110 +228,176 @@ static int write_inode(struct btree_trans *trans,
        return ret;
 }
 
+static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+       struct btree_iter iter = { NULL };
+       struct bkey_i_inode_generation delete;
+       struct bch_inode_unpacked inode_u;
+       struct bkey_s_c k;
+       int ret;
+
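+       /*
+        * Delete the inode's extents, dirents and xattrs in this snapshot
+        * before removing the inode itself:
+        */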
+       ret   = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+                                             SPOS(inum, 0, snapshot),
+                                             SPOS(inum, U64_MAX, snapshot),
+                                             0, NULL) ?:
+               bch2_btree_delete_range_trans(trans, BTREE_ID_dirents,
+                                             SPOS(inum, 0, snapshot),
+                                             SPOS(inum, U64_MAX, snapshot),
+                                             0, NULL) ?:
+               bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs,
+                                             SPOS(inum, 0, snapshot),
+                                             SPOS(inum, U64_MAX, snapshot),
+                                             0, NULL);
+       if (ret)
+               goto err;
+retry:
+       bch2_trans_begin(trans);
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                            SPOS(0, inum, snapshot), BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (!bkey_is_inode(k.k)) {
+               bch2_fs_inconsistent(trans->c,
+                                    "inode %llu:%u not found when deleting",
+                                    inum, snapshot);
+               ret = -EIO;
+               goto err;
+       }
+
+       bch2_inode_unpack(k, &inode_u);
+
+       /* Subvolume root? */
+       if (inode_u.bi_subvol) {
+               ret = bch2_subvolume_delete(trans, inode_u.bi_subvol);
+               if (ret)
+                       goto err;
+       }
+
+       bkey_inode_generation_init(&delete.k_i);
+       delete.k.p = iter.pos;
+       delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
+
+       ret   = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                               BTREE_INSERT_NOFAIL);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       if (ret == -EINTR)
+               goto retry;
+
+       return ret;
+}
+
 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bch_inode_unpacked dir_inode;
        struct bch_hash_info dir_hash_info;
        int ret;
 
-       ret = lookup_inode(trans, pos.inode, &dir_inode, NULL);
+       ret = lookup_first_inode(trans, pos.inode, &dir_inode);
        if (ret)
                return ret;
 
        dir_hash_info = bch2_hash_info_init(c, &dir_inode);
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
 
        ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-                                 &dir_hash_info, iter);
-       bch2_trans_iter_put(trans, iter);
-       return ret;
-}
-
-static int remove_dirent(struct btree_trans *trans, struct bpos pos)
-{
-       int ret = __bch2_trans_do(trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_LAZY_RW,
-                                 __remove_dirent(trans, pos));
-       if (ret)
-               bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
+                                 &dir_hash_info, &iter, 0);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
 /* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans,
+static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
                            struct bch_inode_unpacked *lostfound)
 {
        struct bch_fs *c = trans->c;
        struct bch_inode_unpacked root;
        struct bch_hash_info root_hash_info;
        struct qstr lostfound_str = QSTR("lost+found");
-       u64 inum;
+       subvol_inum root_inum = { .subvol = subvol };
+       u64 inum = 0;
+       unsigned d_type = 0;
        u32 snapshot;
        int ret;
 
-       ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot);
-       if (ret && ret != -ENOENT)
+       ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+       if (ret)
+               return ret;
+
+       ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+       if (ret)
                return ret;
 
        root_hash_info = bch2_hash_info_init(c, &root);
-       inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
-                                 &lostfound_str);
-       if (!inum) {
+
+       ret = __lookup_dirent(trans, root_hash_info, root_inum,
+                           &lostfound_str, &inum, &d_type);
+       if (ret == -ENOENT) {
                bch_notice(c, "creating lost+found");
                goto create_lostfound;
        }
 
-       ret = lookup_inode(trans, inum, lostfound, &snapshot);
-       if (ret && ret != -ENOENT) {
-               /*
-                * The check_dirents pass has already run, dangling dirents
-                * shouldn't exist here:
-                */
+       if (ret && ret != -EINTR)
                bch_err(c, "error looking up lost+found: %i", ret);
+       if (ret)
                return ret;
-       }
 
-       if (ret == -ENOENT) {
-create_lostfound:
-               bch2_inode_init_early(c, lostfound);
-
-               ret = __bch2_trans_do(trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|
-                                     BTREE_INSERT_LAZY_RW,
-                       bch2_create_trans(trans,
-                                         BCACHEFS_ROOT_INO, &root,
-                                         lostfound,
-                                         &lostfound_str,
-                                         0, 0, S_IFDIR|0700, 0, NULL, NULL));
-               if (ret)
-                       bch_err(c, "error creating lost+found: %i", ret);
+       if (d_type != DT_DIR) {
+               bch_err(c, "error looking up lost+found: not a directory");
+               return -ENOENT;
        }
 
-       return 0;
+       /*
+        * The check_dirents pass has already run, dangling dirents
+        * shouldn't exist here:
+        */
+       return __lookup_inode(trans, inum, lostfound, &snapshot);
+
+create_lostfound:
+       bch2_inode_init_early(c, lostfound);
+
+       ret = bch2_create_trans(trans, root_inum, &root,
+                               lostfound, &lostfound_str,
+                               0, 0, S_IFDIR|0700, 0, NULL, NULL,
+                               (subvol_inum) { }, 0);
+       if (ret && ret != -EINTR)
+               bch_err(c, "error creating lost+found: %i", ret);
+       return ret;
 }
 
-static int reattach_inode(struct btree_trans *trans,
-                         struct bch_inode_unpacked *inode)
+static int __reattach_inode(struct btree_trans *trans,
+                         struct bch_inode_unpacked *inode,
+                         u32 inode_snapshot)
 {
        struct bch_hash_info dir_hash;
        struct bch_inode_unpacked lostfound;
        char name_buf[20];
        struct qstr name;
        u64 dir_offset = 0;
+       u32 subvol;
        int ret;
 
-       ret = lookup_lostfound(trans, &lostfound);
+       ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
+       if (ret)
+               return ret;
+
+       ret = lookup_lostfound(trans, subvol, &lostfound);
        if (ret)
                return ret;
 
        if (S_ISDIR(inode->bi_mode)) {
                lostfound.bi_nlink++;
 
-               ret = write_inode(trans, &lostfound, U32_MAX);
+               ret = __write_inode(trans, &lostfound, U32_MAX);
                if (ret)
                        return ret;
        }
@@ -210,33 +407,51 @@ static int reattach_inode(struct btree_trans *trans,
        snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
        name = (struct qstr) QSTR(name_buf);
 
-       ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
-               bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash,
-                                  mode_to_type(inode->bi_mode),
-                                  &name, inode->bi_inum, &dir_offset,
-                                  BCH_HASH_SET_MUST_CREATE));
+       ret = bch2_dirent_create(trans,
+                                (subvol_inum) {
+                                       .subvol = subvol,
+                                       .inum = lostfound.bi_inum,
+                                },
+                                &dir_hash,
+                                inode_d_type(inode),
+                                &name, inode->bi_inum, &dir_offset,
+                                BCH_HASH_SET_MUST_CREATE);
+       if (ret)
+               return ret;
+
+       inode->bi_dir           = lostfound.bi_inum;
+       inode->bi_dir_offset    = dir_offset;
+
+       return __write_inode(trans, inode, inode_snapshot);
+}
+
+static int reattach_inode(struct btree_trans *trans,
+                         struct bch_inode_unpacked *inode,
+                         u32 inode_snapshot)
+{
+       int ret = __bch2_trans_do(trans, NULL, NULL,
+                                 BTREE_INSERT_LAZY_RW|
+                                 BTREE_INSERT_NOFAIL,
+                       __reattach_inode(trans, inode, inode_snapshot));
        if (ret) {
                bch_err(trans->c, "error %i reattaching inode %llu",
                        ret, inode->bi_inum);
                return ret;
        }
 
-       inode->bi_dir           = lostfound.bi_inum;
-       inode->bi_dir_offset    = dir_offset;
-
-       return write_inode(trans, inode, U32_MAX);
+       return ret;
 }
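+
+/*
+ * reattach_inode() runs __reattach_inode() inside __bch2_trans_do(), which
+ * retries on -EINTR (transaction restart), so the dirent creation and the
+ * inode backpointer update commit atomically.
+ */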
 
 static int remove_backpointer(struct btree_trans *trans,
                              struct bch_inode_unpacked *inode)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
-                                  POS(inode->bi_dir, inode->bi_dir_offset), 0);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
+                            POS(inode->bi_dir, inode->bi_dir_offset), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto out;
@@ -245,51 +460,254 @@ static int remove_backpointer(struct btree_trans *trans,
                goto out;
        }
 
-       ret = remove_dirent(trans, k.k->p);
+       ret = __remove_dirent(trans, k.k->p);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos)
+{
+       pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
+
+       if (bkey_cmp(s->pos, pos))
+               s->nr = 0;
+       s->pos = pos;
+
+       /* Might get called multiple times due to lock restarts */
+       if (s->nr && s->d[s->nr - 1] == pos.snapshot)
+               return 0;
+
+       return snapshots_seen_add(c, s, pos.snapshot);
+}
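+
+/*
+ * Note: positions are compared with the snapshot field normalized to its
+ * equivalence class, so re-seeing a key after a lock restart is a no-op,
+ * while any genuinely new position resets the list of snapshots seen.
+ */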
+
+/**
+ * key_visible_in_snapshot - returns true if @id is a descendant of @ancestor,
+ * and @ancestor hasn't been overwritten in @seen
+ *
+ * That is, returns whether a key in the @ancestor snapshot is visible in the @id snapshot
+ */
+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen,
+                                   u32 id, u32 ancestor)
+{
+       ssize_t i;
+
+       BUG_ON(id > ancestor);
+
+       id              = snapshot_t(c, id)->equiv;
+       ancestor        = snapshot_t(c, ancestor)->equiv;
+
+       /* @ancestor should be the snapshot most recently added to @seen */
+       BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor);
+       BUG_ON(seen->pos.snapshot != ancestor);
+
+       if (id == ancestor)
+               return true;
+
+       if (!bch2_snapshot_is_ancestor(c, id, ancestor))
+               return false;
+
+       for (i = seen->nr - 2;
+            i >= 0 && seen->d[i] >= id;
+            --i)
+               if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) &&
+                   bch2_snapshot_is_ancestor(c, seen->d[i], ancestor))
+                       return false;
+
+       return true;
+}
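+
+/*
+ * Worked example with hypothetical snapshot IDs (ancestors have larger IDs
+ * than their descendants, per the BUG_ON above): say snapshot 4 descends
+ * from 10, which descends from 20, and @seen holds { 10, 20 }.  Then
+ * key_visible_in_snapshot(c, seen, 4, 20) returns false: the key seen at
+ * 10, an ancestor of 4 and descendant of 20, overwrites the key at 20.
+ */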
+
+/**
+ * ref_visible - given a key with snapshot id @src that points to a key with
+ * snapshot id @dst, test whether there is some snapshot in which @dst is
+ * visible.
+ *
+ * This assumes we're visiting @src keys in natural key order.
+ *
+ * @s  - list of snapshot IDs already seen at @src
+ * @src        - snapshot ID of src key
+ * @dst        - snapshot ID of dst key
+ */
+static int ref_visible(struct bch_fs *c, struct snapshots_seen *s,
+                      u32 src, u32 dst)
+{
+       return dst <= src
+               ? key_visible_in_snapshot(c, s, dst, src)
+               : bch2_snapshot_is_ancestor(c, src, dst);
+}
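+
+/*
+ * Example with hypothetical IDs: a dirent at snapshot 8 referencing an
+ * inode key at snapshot 16 is a valid reference if 16 is an ancestor of 8.
+ * If the inode key instead sits at a descendant snapshot (dst <= src), the
+ * reference only counts where the @src key itself is still visible, via
+ * key_visible_in_snapshot() above.
+ */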
+
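+/*
+ * Iterate the inode_walker entries (sorted by snapshot ID) in which a key
+ * written at @_snapshot is visible - used below to bump per-version counts
+ * such as i_sectors and subdirectory counts.
+ */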
+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i)      \
+       for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\
+               if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot))
+
 struct inode_walker {
-       bool                    first_this_inode;
-       bool                    have_inode;
-       u64                     cur_inum;
-       u32                     snapshot;
-       struct bch_inode_unpacked inode;
+       bool                            first_this_inode;
+       u64                             cur_inum;
+
+       size_t                          nr;
+       size_t                          size;
+       struct inode_walker_entry {
+               struct bch_inode_unpacked inode;
+               u32                     snapshot;
+               u64                     count;
+       } *d;
 };
 
+static void inode_walker_exit(struct inode_walker *w)
+{
+       kfree(w->d);
+       w->d = NULL;
+}
+
 static struct inode_walker inode_walker_init(void)
 {
-       return (struct inode_walker) {
-               .cur_inum       = -1,
-               .have_inode     = false,
+       return (struct inode_walker) { 0, };
+}
+
+static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w)
+{
+       if (w->nr == w->size) {
+               size_t new_size = max_t(size_t, 8UL, w->size * 2);
+               void *d = krealloc(w->d, new_size * sizeof(w->d[0]),
+                                  GFP_KERNEL);
+               if (!d) {
+                       bch_err(c, "fsck: error allocating memory for inode_walker, size %zu",
+                               new_size);
+                       return -ENOMEM;
+               }
+
+               w->d = d;
+               w->size = new_size;
+       }
+
+       return 0;
+}
+
+static int add_inode(struct bch_fs *c, struct inode_walker *w,
+                    struct bkey_s_c inode)
+{
+       struct bch_inode_unpacked u;
+       int ret;
+
+       ret = inode_walker_realloc(c, w);
+       if (ret)
+               return ret;
+
+       BUG_ON(bch2_inode_unpack(inode, &u));
+
+       w->d[w->nr++] = (struct inode_walker_entry) {
+               .inode          = u,
+               .snapshot       = snapshot_t(c, inode.k->p.snapshot)->equiv,
        };
+
+       return 0;
 }
 
 static int __walk_inode(struct btree_trans *trans,
-                       struct inode_walker *w, u64 inum)
+                       struct inode_walker *w, struct bpos pos)
 {
-       if (inum != w->cur_inum) {
-               int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot);
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       unsigned i, ancestor_pos;
+       int ret;
 
-               if (ret && ret != -ENOENT)
-                       return ret;
+       pos.snapshot = snapshot_t(c, pos.snapshot)->equiv;
 
-               w->have_inode   = !ret;
-               w->cur_inum     = inum;
-               w->first_this_inode = true;
-       } else {
+       if (pos.inode == w->cur_inum) {
                w->first_this_inode = false;
+               goto lookup_snapshot;
        }
 
-       return 0;
+       w->nr = 0;
+
+       for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode),
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               if (k.k->p.offset != pos.inode)
+                       break;
+
+               if (bkey_is_inode(k.k)) {
+                       ret = add_inode(c, w, k);
+                       if (ret)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (ret)
+               return ret;
+
+       w->cur_inum             = pos.inode;
+       w->first_this_inode     = true;
+lookup_snapshot:
+       for (i = 0; i < w->nr; i++)
+               if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot))
+                       goto found;
+       return INT_MAX;
+found:
+       BUG_ON(pos.snapshot > w->d[i].snapshot);
+
+       if (pos.snapshot != w->d[i].snapshot) {
+               ancestor_pos = i;
+
+               while (i && w->d[i - 1].snapshot > pos.snapshot)
+                       --i;
+
+               ret = inode_walker_realloc(c, w);
+               if (ret)
+                       return ret;
+
+               array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]);
+               w->d[i].snapshot = pos.snapshot;
+               w->d[i].count   = 0;
+       }
+
+       return i;
 }
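+
+/*
+ * __walk_inode() returns an index into w->d for the inode version visible
+ * at pos.snapshot, or INT_MAX if no version of the inode is visible there;
+ * callers treat INT_MAX as "key in missing inode".  If pos.snapshot is a
+ * strict descendant of the version found, a copy of that entry is inserted
+ * for pos.snapshot with its count reset to 0.
+ */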
 
-static int walk_inode(struct btree_trans *trans,
-                     struct inode_walker *w, u64 inum)
+static int __get_visible_inodes(struct btree_trans *trans,
+                               struct inode_walker *w,
+                               struct snapshots_seen *s,
+                               u64 inum)
 {
-       return lockrestart_do(trans, __walk_inode(trans, w, inum));
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       w->nr = 0;
+
+       for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               if (k.k->p.offset != inum)
+                       break;
+
+               if (!bkey_is_inode(k.k))
+                       continue;
+
+               if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) {
+                       ret = add_inode(c, w, k);
+                       if (ret)
+                               break;
+
+                       if (k.k->p.snapshot >= s->pos.snapshot)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ret;
+}
+
+static int check_key_has_snapshot(struct btree_trans *trans,
+                                 struct btree_iter *iter,
+                                 struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       char buf[200];
+       int ret = 0;
+
+       if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c,
+                       "key in missing snapshot: %s",
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+               return bch2_btree_delete_at(trans, iter,
+                                           BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1;
+fsck_err:
+       return ret;
 }
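+
+/*
+ * check_key_has_snapshot() returns 1 - not an error - when it deleted a key
+ * in a missing snapshot; callers use a positive return to skip further
+ * checks on that key.
+ */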
 
 static int hash_redo_key(struct btree_trans *trans,
@@ -297,6 +715,9 @@ static int hash_redo_key(struct btree_trans *trans,
                         struct bch_hash_info *hash_info,
                         struct btree_iter *k_iter, struct bkey_s_c k)
 {
+       bch_err(trans->c, "hash_redo_key() not implemented yet");
+       return -EINVAL;
+#if 0
        struct bkey_i *delete;
        struct bkey_i *tmp;
 
@@ -315,26 +736,7 @@ static int hash_redo_key(struct btree_trans *trans,
        return  bch2_btree_iter_traverse(k_iter) ?:
                bch2_trans_update(trans, k_iter, delete, 0) ?:
                bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
-}
-
-static int fsck_hash_delete_at(struct btree_trans *trans,
-                              const struct bch_hash_desc desc,
-                              struct bch_hash_info *info,
-                              struct btree_iter *iter)
-{
-       int ret;
-retry:
-       ret   = bch2_hash_delete_at(trans, desc, info, iter) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                                 BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_LAZY_RW);
-       if (ret == -EINTR) {
-               ret = bch2_btree_iter_traverse(iter);
-               if (!ret)
-                       goto retry;
-       }
-
-       return ret;
+#endif
 }
 
 static int hash_check_key(struct btree_trans *trans,
@@ -343,7 +745,7 @@ static int hash_check_key(struct btree_trans *trans,
                          struct btree_iter *k_iter, struct bkey_s_c hash_k)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter = { NULL };
        char buf[200];
        struct bkey_s_c k;
        u64 hash;
@@ -370,20 +772,17 @@ static int hash_check_key(struct btree_trans *trans,
                                "duplicate hash table keys:\n%s",
                                (bch2_bkey_val_to_text(&PBUF(buf), c,
                                                       hash_k), buf))) {
-                       ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
-                       if (ret)
-                               return ret;
-                       ret = 1;
+                       ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
                        break;
                }
 
                if (bkey_deleted(k.k)) {
-                       bch2_trans_iter_free(trans, iter);
+                       bch2_trans_iter_exit(trans, &iter);
                        goto bad_hash;
                }
 
        }
-       bch2_trans_iter_free(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 bad_hash:
        if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
@@ -392,9 +791,7 @@ bad_hash:
                     (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
                return 0;
 
-       ret = __bch2_trans_do(trans, NULL, NULL,
-                             BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
-               hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
+       ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
        if (ret) {
                bch_err(c, "hash_redo_key err %i", ret);
                return ret;
@@ -406,30 +803,64 @@ fsck_err:
 
 static int check_inode(struct btree_trans *trans,
                       struct btree_iter *iter,
-                      struct bkey_s_c_inode inode)
+                      struct bch_inode_unpacked *prev,
+                      bool full)
 {
        struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
        struct bch_inode_unpacked u;
        bool do_update = false;
-       int ret = 0;
+       int ret;
 
-       ret = bch2_inode_unpack(inode, &u);
+       k = bch2_btree_iter_peek(iter);
+       if (!k.k)
+               return 0;
 
-       if (bch2_fs_inconsistent_on(ret, c,
-                        "error unpacking inode %llu in fsck",
-                        inode.k->p.inode))
+       ret = bkey_err(k);
+       if (ret)
                return ret;
 
+       ret = check_key_has_snapshot(trans, iter, k);
+       if (ret)
+               return ret < 0 ? ret : 0;
+
+       /*
+        * If the snapshot ID isn't a leaf node, skip it: deletion in
+        * particular is not atomic, so on internal snapshot nodes we can
+        * see inodes marked for deletion even after a clean shutdown.
+        */
+       if (bch2_snapshot_internal_node(c, k.k->p.snapshot))
+               return 0;
+
+       if (!bkey_is_inode(k.k))
+               return 0;
+
+       BUG_ON(bch2_inode_unpack(k, &u));
+
+       if (!full &&
+           !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|
+                           BCH_INODE_I_SECTORS_DIRTY|
+                           BCH_INODE_UNLINKED)))
+               return 0;
+
+       if (prev->bi_inum != u.bi_inum)
+               *prev = u;
+
+       if (fsck_err_on(prev->bi_hash_seed      != u.bi_hash_seed ||
+                       inode_d_type(prev)      != inode_d_type(&u), c,
+                       "inodes in different snapshots don't match")) {
+               bch_err(c, "repair not implemented yet");
+               return -EINVAL;
+       }
+
        if (u.bi_flags & BCH_INODE_UNLINKED &&
            (!c->sb.clean ||
             fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
                      u.bi_inum))) {
-               bch_verbose(c, "deleting inode %llu", u.bi_inum);
-
                bch2_trans_unlock(trans);
                bch2_fs_lazy_rw(c);
 
-               ret = bch2_inode_rm(c, u.bi_inum, false);
+               ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot);
                if (ret)
                        bch_err(c, "error in fsck: error %i while deleting inode", ret);
                return ret;
@@ -449,9 +880,10 @@ static int check_inode(struct btree_trans *trans,
                 * just switch units to bytes and that issue goes away
                 */
                ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-                               POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9),
+                               SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
+                                    iter->pos.snapshot),
                                POS(u.bi_inum, U64_MAX),
-                               NULL);
+                               0, NULL);
                if (ret) {
                        bch_err(c, "error in fsck: error %i truncating inode", ret);
                        return ret;
@@ -476,7 +908,7 @@ static int check_inode(struct btree_trans *trans,
                bch_verbose(c, "recounting sectors for inode %llu",
                            u.bi_inum);
 
-               sectors = bch2_count_inode_sectors(trans, u.bi_inum);
+               sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
                if (sectors < 0) {
                        bch_err(c, "error in fsck: error %i recounting inode sectors",
                                (int) sectors);
@@ -496,11 +928,7 @@ static int check_inode(struct btree_trans *trans,
        }
 
        if (do_update) {
-               ret = __bch2_trans_do(trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|
-                                     BTREE_INSERT_LAZY_RW,
-                               bch2_btree_iter_traverse(iter) ?:
-                               bch2_inode_write(trans, iter, &u));
+               ret = write_inode(trans, &u, iter->pos.snapshot);
                if (ret)
                        bch_err(c, "error in fsck: error %i "
                                "updating inode", ret);
@@ -513,41 +941,99 @@ noinline_for_stack
 static int check_inodes(struct bch_fs *c, bool full)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bkey_s_c k;
-       struct bkey_s_c_inode inode;
+       struct btree_iter iter;
+       struct bch_inode_unpacked prev = { 0 };
        int ret;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
-                          BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
-               if (k.k->type != KEY_TYPE_inode)
-                       continue;
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
-               inode = bkey_s_c_to_inode(k);
+       do {
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_LAZY_RW|
+                                     BTREE_INSERT_NOFAIL,
+                       check_inode(&trans, &iter, &prev, full));
+               if (ret)
+                       break;
+       } while (bch2_btree_iter_advance(&iter));
+       bch2_trans_iter_exit(&trans, &iter);
 
-               if (full ||
-                   (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY|
-                                         BCH_INODE_I_SECTORS_DIRTY|
-                                         BCH_INODE_UNLINKED))) {
-                       ret = check_inode(&trans, iter, inode);
-                       if (ret)
-                               break;
-               }
+       bch2_trans_exit(&trans);
+       return ret;
+}
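+
+/*
+ * check_inodes() above and the other check_* passes share one shape: peek a
+ * single key, do the checks and any repairs inside __bch2_trans_do() so the
+ * repair commits (retrying on -EINTR), then advance the iterator.
+ */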
+
+static int check_subvol(struct btree_trans *trans,
+                       struct btree_iter *iter)
+{
+       struct bkey_s_c k;
+       struct bkey_s_c_subvolume subvol;
+       int ret;
+
+       k = bch2_btree_iter_peek(iter);
+       if (!k.k)
+               return 0;
+
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       if (k.k->type != KEY_TYPE_subvolume)
+               return 0;
+
+       subvol = bkey_s_c_to_subvolume(k);
+
+       if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
+               ret = bch2_subvolume_delete(trans, iter->pos.offset);
+               if (ret && ret != -EINTR)
+                       bch_err(trans->c, "error deleting subvolume %llu: %i",
+                               iter->pos.offset, ret);
+               if (ret)
+                       return ret;
        }
-       bch2_trans_iter_put(&trans, iter);
 
-       BUG_ON(ret == -EINTR);
+       return 0;
+}
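+
+/*
+ * A subvolume flagged BCH_SUBVOLUME_UNLINKED was unlinked but not yet
+ * removed; check_subvols() finishes the deletion here, before later passes
+ * encounter the subvolume's leftover keys.
+ */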
 
-       return bch2_trans_exit(&trans) ?: ret;
+noinline_for_stack
+static int check_subvols(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       int ret;
+
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes,
+                            POS_MIN,
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH);
+
+       do {
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_LAZY_RW|
+                                     BTREE_INSERT_NOFAIL,
+                                     check_subvol(&trans, &iter));
+               if (ret)
+                       break;
+       } while (bch2_btree_iter_advance(&iter));
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
 }
 
+/*
+ * Checking for overlapping extents needs to be reimplemented
+ */
+#if 0
 static int fix_overlapping_extent(struct btree_trans *trans,
                                       struct bkey_s_c k, struct bpos cut_at)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_i *u;
        int ret;
 
@@ -567,46 +1053,208 @@ static int fix_overlapping_extent(struct btree_trans *trans,
         * assume things about extent overwrites - we should be running the
         * triggers manually here
         */
-       iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p,
-                                  BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p,
+                            BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
 
-       BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
-       ret   = bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?:
+       BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS);
+       ret   = bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_LAZY_RW);
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
+#endif
 
-static int inode_backpointer_exists(struct btree_trans *trans,
-                                   struct bch_inode_unpacked *inode)
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+                                               struct btree_iter *iter,
+                                               struct bpos pos)
 {
-       struct btree_iter *iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_dirents,
-                                  POS(inode->bi_dir, inode->bi_dir_offset), 0);
+       bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
+       if (!ret && k.k->type != KEY_TYPE_dirent)
+               ret = -ENOENT;
+       if (ret) {
+               bch2_trans_iter_exit(trans, iter);
+               return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
+       }
+
+       return bkey_s_c_to_dirent(k);
+}
+
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+                                  struct bkey_s_c_dirent d)
+{
+       return  inode->bi_dir           == d.k->p.inode &&
+               inode->bi_dir_offset    == d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+                                  struct bch_inode_unpacked *inode)
+{
+       return d.v->d_type == DT_SUBVOL
+               ? le32_to_cpu(d.v->d_child_subvol)      == inode->bi_subvol
+               : le64_to_cpu(d.v->d_inum)              == inode->bi_inum;
+}
+
+static int inode_backpointer_exists(struct btree_trans *trans,
+                                   struct bch_inode_unpacked *inode,
+                                   u32 snapshot)
+{
+       struct btree_iter iter;
+       struct bkey_s_c_dirent d;
+       int ret;
+
+       d = dirent_get_by_pos(trans, &iter,
+                       SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
+       ret = bkey_err(d.s_c);
        if (ret)
-               goto out;
-       if (k.k->type != KEY_TYPE_dirent)
-               goto out;
+               return ret == -ENOENT ? 0 : ret;
 
-       ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
-out:
-       bch2_trans_iter_free(trans, iter);
+       ret = dirent_points_to_inode(d, inode);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
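+
+/*
+ * Returns 1 if the dirent at bi_dir:bi_dir_offset points back at the inode,
+ * 0 if there is no such dirent or it points elsewhere, negative on other
+ * errors.
+ */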
 
-static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
-                                     struct bch_inode_unpacked *inode)
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+       struct bch_fs *c = trans->c;
+       struct inode_walker_entry *i;
+       int ret = 0, ret2 = 0;
+       s64 count2;
+
+       for (i = w->d; i < w->d + w->nr; i++) {
+               if (i->inode.bi_sectors == i->count)
+                       continue;
+
+               count2 = lockrestart_do(trans,
+                       bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot));
+
+               if (i->count != count2) {
+                       bch_err(c, "fsck counted i_sectors wrong: got %llu, should be %llu",
+                               i->count, count2);
+                       i->count = count2;
+                       if (i->inode.bi_sectors == i->count)
+                               continue;
+               }
+
+               if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c,
+                           "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu",
+                           w->cur_inum, i->snapshot,
+                           i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE)
+                       continue;
+
+               i->inode.bi_sectors = i->count;
+               ret = write_inode(trans, &i->inode, i->snapshot);
+               if (ret)
+                       break;
+               ret2 = -EINTR;
+       }
+fsck_err:
+       return ret ?: ret2;
+}
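+
+/*
+ * Returning -EINTR (via ret2) after rewriting any inode is deliberate: it
+ * makes the caller restart its transaction so subsequent checks see the
+ * corrected i_sectors values.
+ */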
+
+static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
+                       struct inode_walker *inode,
+                       struct snapshots_seen *s)
 {
-       return d.k->p.inode == inode->bi_dir &&
-               d.k->p.offset == inode->bi_dir_offset;
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+       struct inode_walker_entry *i;
+       char buf[200];
+       int ret = 0;
+
+       k = bch2_btree_iter_peek(iter);
+       if (!k.k)
+               return 0;
+
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       ret = check_key_has_snapshot(trans, iter, k);
+       if (ret)
+               return ret < 0 ? ret : 0;
+
+       ret = snapshots_seen_update(c, s, k.k->p);
+       if (ret)
+               return ret;
+
+       if (k.k->type == KEY_TYPE_whiteout)
+               return 0;
+
+       if (inode->cur_inum != k.k->p.inode) {
+               ret = check_i_sectors(trans, inode);
+               if (ret)
+                       return ret;
+       }
+#if 0
+       if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+               char buf1[200];
+               char buf2[200];
+
+               bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
+               bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+
+               if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
+                       return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR;
+       }
+#endif
+       ret = __walk_inode(trans, inode, k.k->p);
+       if (ret < 0)
+               return ret;
+
+       if (fsck_err_on(ret == INT_MAX, c,
+                       "extent in missing inode:\n  %s",
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+               return bch2_btree_delete_at(trans, iter,
+                                           BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+
+       if (ret == INT_MAX)
+               return 0;
+
+       i = inode->d + ret;
+       ret = 0;
+
+       if (fsck_err_on(!S_ISREG(i->inode.bi_mode) &&
+                       !S_ISLNK(i->inode.bi_mode), c,
+                       "extent in non-regular inode mode %o:\n  %s",
+                       i->inode.bi_mode,
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+               return bch2_btree_delete_at(trans, iter,
+                                           BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+
+       if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) {
+               for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) {
+                       if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+                                       k.k->type != KEY_TYPE_reservation &&
+                                       k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c,
+                                       "extent type %u offset %llu past end of inode %llu, i_size %llu",
+                                       k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) {
+                               bch2_fs_lazy_rw(c);
+                               return bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+                                               SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9,
+                                                    k.k->p.snapshot),
+                                               POS(k.k->p.inode, U64_MAX),
+                                               0, NULL) ?: -EINTR;
+                       }
+               }
+       }
+
+       if (bkey_extent_is_allocation(k.k))
+               for_each_visible_inode(c, s, inode, k.k->p.snapshot, i)
+                       i->count += k.k->size;
+#if 0
+       bch2_bkey_buf_reassemble(&prev, c, k);
+#endif
+
+fsck_err:
+       return ret;
 }
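+
+/*
+ * Sector accounting above is per snapshot version: every inode version that
+ * can see an allocated extent gets its ->count bumped, and check_i_sectors()
+ * compares those totals against each version's bi_sectors.
+ */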
 
 /*
@@ -617,111 +1265,208 @@ noinline_for_stack
 static int check_extents(struct bch_fs *c)
 {
        struct inode_walker w = inode_walker_init();
+       struct snapshots_seen s;
        struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bkey_s_c k;
-       struct bkey_buf prev;
-       u64 i_sectors = 0;
+       struct btree_iter iter;
        int ret = 0;
 
+#if 0
+       struct bkey_buf prev;
        bch2_bkey_buf_init(&prev);
        prev.k->k = KEY(0, 0, 0);
+#endif
+       snapshots_seen_init(&s);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        bch_verbose(c, "checking extents");
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  POS(BCACHEFS_ROOT_INO, 0),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_PREFETCH);
-retry:
-       while ((k = bch2_btree_iter_peek(iter)).k &&
-              !(ret = bkey_err(k))) {
-               if (w.have_inode &&
-                   w.cur_inum != k.k->p.inode &&
-                   !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
-                   fsck_err_on(w.inode.bi_sectors != i_sectors, c,
-                               "inode %llu has incorrect i_sectors: got %llu, should be %llu",
-                               w.inode.bi_inum,
-                               w.inode.bi_sectors, i_sectors)) {
-                       w.inode.bi_sectors = i_sectors;
-
-                       ret = write_inode(&trans, &w.inode, w.snapshot);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            POS(BCACHEFS_ROOT_INO, 0),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+
+       do {
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_LAZY_RW|
+                                     BTREE_INSERT_NOFAIL,
+                       check_extent(&trans, &iter, &w, &s));
+               if (ret)
+                       break;
+       } while (bch2_btree_iter_advance(&iter));
+       bch2_trans_iter_exit(&trans, &iter);
+#if 0
+       bch2_bkey_buf_exit(&prev, c);
+#endif
+       inode_walker_exit(&w);
+       bch2_trans_exit(&trans);
+       snapshots_seen_exit(&s);
+
+       return ret;
+}
+
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+       struct bch_fs *c = trans->c;
+       struct inode_walker_entry *i;
+       int ret = 0, ret2 = 0;
+       s64 count2;
+
+       for (i = w->d; i < w->d + w->nr; i++) {
+               if (i->inode.bi_nlink == i->count)
+                       continue;
+
+               count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot);
+               if (count2 < 0)
+                       return count2;
+
+               if (i->count != count2) {
+                       bch_err(c, "fsck counted subdirectories wrong: got %llu, should be %llu",
+                               i->count, count2);
+                       i->count = count2;
+                       if (i->inode.bi_nlink == i->count)
+                               continue;
+               }
+
+               if (fsck_err_on(i->inode.bi_nlink != i->count, c,
+                               "directory %llu:%u with wrong i_nlink: got %u, should be %llu",
+                               w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) {
+                       i->inode.bi_nlink = i->count;
+                       ret = write_inode(trans, &i->inode, i->snapshot);
                        if (ret)
                                break;
+                       ret2 = -EINTR;
                }
+       }
+fsck_err:
+       return ret ?: ret2;
+}
 
-               if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
-                       char buf1[200];
-                       char buf2[200];
+static int check_dirent_target(struct btree_trans *trans,
+                              struct btree_iter *iter,
+                              struct bkey_s_c_dirent d,
+                              struct bch_inode_unpacked *target,
+                              u32 target_snapshot)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_i_dirent *n;
+       bool backpointer_exists = true;
+       char buf[200];
+       int ret = 0;
+
+       if (!target->bi_dir &&
+           !target->bi_dir_offset) {
+               target->bi_dir          = d.k->p.inode;
+               target->bi_dir_offset   = d.k->p.offset;
+
+               ret = __write_inode(trans, target, target_snapshot);
+               if (ret)
+                       goto err;
+       }
 
-                       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
-                       bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+       if (!inode_points_to_dirent(target, d)) {
+               ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
+               if (ret < 0)
+                       goto err;
 
-                       if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2))
-                               return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
+               backpointer_exists = ret;
+               ret = 0;
+
+               if (fsck_err_on(S_ISDIR(target->bi_mode) &&
+                               backpointer_exists, c,
+                               "directory %llu with multiple links",
+                               target->bi_inum)) {
+                       ret = __remove_dirent(trans, d.k->p);
+                       if (ret)
+                               goto err;
+                       return 0;
                }
 
-               ret = walk_inode(&trans, &w, k.k->p.inode);
-               if (ret)
-                       break;
+               if (fsck_err_on(backpointer_exists &&
+                               !target->bi_nlink, c,
+                               "inode %llu has multiple links but i_nlink 0",
+                               target->bi_inum)) {
+                       target->bi_nlink++;
+                       target->bi_flags &= ~BCH_INODE_UNLINKED;
 
-               if (w.first_this_inode)
-                       i_sectors = 0;
-
-               if (fsck_err_on(!w.have_inode, c,
-                               "extent type %u for missing inode %llu",
-                               k.k->type, k.k->p.inode) ||
-                   fsck_err_on(w.have_inode &&
-                               !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
-                               "extent type %u for non regular file, inode %llu mode %o",
-                               k.k->type, k.k->p.inode, w.inode.bi_mode)) {
-                       bch2_fs_lazy_rw(c);
-                       return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
-                                                      POS(k.k->p.inode, 0),
-                                                      POS(k.k->p.inode, U64_MAX),
-                                                      NULL) ?: -EINTR;
+                       ret = __write_inode(trans, target, target_snapshot);
+                       if (ret)
+                               goto err;
                }
 
-               if (fsck_err_on(w.have_inode &&
-                               !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-                               k.k->type != KEY_TYPE_reservation &&
-                               k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
-                               "extent type %u offset %llu past end of inode %llu, i_size %llu",
-                               k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
-                       bch2_fs_lazy_rw(c);
-                       return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
-                                       POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9),
-                                       POS(k.k->p.inode, U64_MAX),
-                                       NULL) ?: -EINTR;
+               if (fsck_err_on(!backpointer_exists, c,
+                               "inode %llu:%u has wrong backpointer:\n"
+                               "got       %llu:%llu\n"
+                               "should be %llu:%llu",
+                               target->bi_inum, target_snapshot,
+                               target->bi_dir,
+                               target->bi_dir_offset,
+                               d.k->p.inode,
+                               d.k->p.offset)) {
+                       target->bi_dir          = d.k->p.inode;
+                       target->bi_dir_offset   = d.k->p.offset;
+
+                       ret = __write_inode(trans, target, target_snapshot);
+                       if (ret)
+                               goto err;
                }
+       }
 
-               if (bkey_extent_is_allocation(k.k))
-                       i_sectors += k.k->size;
-               bch2_bkey_buf_reassemble(&prev, c, k);
+       if (fsck_err_on(d.v->d_type != inode_d_type(target), c,
+                       "incorrect d_type: got %s, should be %s:\n%s",
+                       bch2_d_type_str(d.v->d_type),
+                       bch2_d_type_str(inode_d_type(target)),
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) {
+               n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+               ret = PTR_ERR_OR_ZERO(n);
+               if (ret)
+                       return ret;
+
+               bkey_reassemble(&n->k_i, d.s_c);
+               n->v.d_type = inode_d_type(target);
+
+               ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+               if (ret)
+                       return ret;
 
-               bch2_btree_iter_advance(iter);
+               d = dirent_i_to_s_c(n);
        }
+
+       if (d.v->d_type == DT_SUBVOL &&
+           target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
+           (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
+            fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
+                     le32_to_cpu(d.v->d_parent_subvol),
+                     target->bi_parent_subvol))) {
+               n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
+               ret = PTR_ERR_OR_ZERO(n);
+               if (ret)
+                       return ret;
+
+               bkey_reassemble(&n->k_i, d.s_c);
+               n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+
+               ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+               if (ret)
+                       return ret;
+
+               d = dirent_i_to_s_c(n);
+       }
+err:
 fsck_err:
-       if (ret == -EINTR)
-               goto retry;
-       bch2_trans_iter_put(&trans, iter);
-       bch2_bkey_buf_exit(&prev, c);
-       return bch2_trans_exit(&trans) ?: ret;
+       return ret;
 }
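+
+/*
+ * check_dirent_target() repairs in a fixed order: seed a missing
+ * backpointer, drop extra links to directories, resurrect wrongly-unlinked
+ * inodes, correct stale backpointers, then fix the dirent's d_type and, for
+ * subvolume dirents, d_parent_subvol.
+ */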
 
 static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                        struct bch_hash_info *hash_info,
-                       struct inode_walker *w, unsigned *nr_subdirs)
+                       struct inode_walker *dir,
+                       struct inode_walker *target,
+                       struct snapshots_seen *s)
 {
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
        struct bkey_s_c_dirent d;
-       struct bch_inode_unpacked target;
-       u32 target_snapshot;
-       bool have_target;
-       bool backpointer_exists = true;
-       u64 d_inum;
+       struct inode_walker_entry *i;
        char buf[200];
        int ret;
 
@@ -733,38 +1478,47 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
        if (ret)
                return ret;
 
-       if (w->have_inode &&
-           w->cur_inum != k.k->p.inode &&
-           fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c,
-                       "directory %llu with wrong i_nlink: got %u, should be %u",
-                       w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) {
-               w->inode.bi_nlink = *nr_subdirs;
-               ret = write_inode(trans, &w->inode, w->snapshot);
-               return ret ?: -EINTR;
-       }
+       ret = check_key_has_snapshot(trans, iter, k);
+       if (ret)
+               return ret < 0 ? ret : 0;
 
-       ret = __walk_inode(trans, w, k.k->p.inode);
+       ret = snapshots_seen_update(c, s, k.k->p);
        if (ret)
                return ret;
 
-       if (w->first_this_inode)
-               *nr_subdirs = 0;
+       if (k.k->type == KEY_TYPE_whiteout)
+               return 0;
+
+       if (dir->cur_inum != k.k->p.inode) {
+               ret = check_subdir_count(trans, dir);
+               if (ret)
+                       return ret;
+       }
+
+       ret = __walk_inode(trans, dir, k.k->p);
+       if (ret < 0)
+               return ret;
 
-       if (fsck_err_on(!w->have_inode, c,
+       if (fsck_err_on(ret == INT_MAX, c,
                        "dirent in nonexisting directory:\n%s",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) ||
-           fsck_err_on(!S_ISDIR(w->inode.bi_mode), c,
-                       "dirent in non directory inode type %u:\n%s",
-                       mode_to_type(w->inode.bi_mode),
                        (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
-               return __bch2_trans_do(trans, NULL, NULL, 0,
-                               bch2_btree_delete_at(trans, iter, 0));
+               return bch2_btree_delete_at(trans, iter,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 
-       if (!w->have_inode)
+       if (ret == INT_MAX)
                return 0;
 
-       if (w->first_this_inode)
-               *hash_info = bch2_hash_info_init(c, &w->inode);
+       i = dir->d + ret;
+       ret = 0;
+
+       if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c,
+                       "dirent in non-directory inode type %s:\n%s",
+                       bch2_d_type_str(inode_d_type(&i->inode)),
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+               return bch2_btree_delete_at(trans, iter, 0);
+
+       if (dir->first_this_inode)
+               *hash_info = bch2_hash_info_init(c, &dir->d[0].inode);
 
        ret = hash_check_key(trans, bch2_dirent_hash_desc,
                             hash_info, iter, k);
@@ -777,105 +1531,76 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
                return 0;
 
        d = bkey_s_c_to_dirent(k);
-       d_inum = le64_to_cpu(d.v->d_inum);
 
-       ret = __lookup_inode(trans, d_inum, &target, &target_snapshot);
-       if (ret && ret != -ENOENT)
-               return ret;
+       if (d.v->d_type == DT_SUBVOL) {
+               struct bch_inode_unpacked subvol_root;
+               u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+               u32 target_snapshot;
+               u64 target_inum;
 
-       have_target = !ret;
-       ret = 0;
+               ret = __subvol_lookup(trans, target_subvol,
+                                     &target_snapshot, &target_inum);
+               if (ret && ret != -ENOENT)
+                       return ret;
 
-       if (fsck_err_on(!have_target, c,
-                       "dirent points to missing inode:\n%s",
-                       (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                              k), buf)))
-               return remove_dirent(trans, d.k->p);
+               if (fsck_err_on(ret, c,
+                               "dirent points to missing subvolume %u",
+                               target_subvol))
+                       return __remove_dirent(trans, d.k->p);
 
-       if (!have_target)
-               return 0;
+               ret = __lookup_inode(trans, target_inum,
+                                  &subvol_root, &target_snapshot);
+               if (ret && ret != -ENOENT)
+                       return ret;
+
+               if (fsck_err_on(ret, c,
+                               "subvolume %u points to missing subvolume root %llu",
+                               target_subvol,
+                               target_inum)) {
+                       bch_err(c, "repair not implemented yet");
+                       return -EINVAL;
+               }
 
-       if (!target.bi_dir &&
-           !target.bi_dir_offset) {
-               target.bi_dir           = k.k->p.inode;
-               target.bi_dir_offset    = k.k->p.offset;
+               if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c,
+                               "subvol root %llu has wrong bi_subvol field: got %u, should be %u",
+                               target_inum,
+                               subvol_root.bi_subvol, target_subvol)) {
+                       subvol_root.bi_subvol = target_subvol;
+                       ret = __write_inode(trans, &subvol_root, target_snapshot);
+                       if (ret)
+                               return ret;
+               }
 
-               ret = __write_inode(trans, &target, target_snapshot) ?:
-                       bch2_trans_commit(trans, NULL, NULL,
-                                         BTREE_INSERT_NOFAIL|
-                                         BTREE_INSERT_LAZY_RW);
+               ret = check_dirent_target(trans, iter, d, &subvol_root,
+                                         target_snapshot);
                if (ret)
                        return ret;
-               return -EINTR;
-       }
-
-       if (!inode_backpointer_matches(d, &target)) {
-               ret = inode_backpointer_exists(trans, &target);
-               if (ret < 0)
+       } else {
+               ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
+               if (ret)
                        return ret;
 
-               backpointer_exists = ret;
-               ret = 0;
-
-               if (fsck_err_on(S_ISDIR(target.bi_mode) &&
-                               backpointer_exists, c,
-                               "directory %llu with multiple links",
-                               target.bi_inum))
-                       return remove_dirent(trans, d.k->p);
-
-               if (fsck_err_on(backpointer_exists &&
-                               !target.bi_nlink, c,
-                               "inode %llu has multiple links but i_nlink 0",
-                               d_inum)) {
-                       target.bi_nlink++;
-                       target.bi_flags &= ~BCH_INODE_UNLINKED;
-
-                       ret = write_inode(trans, &target, target_snapshot);
-                       return ret ?: -EINTR;
+               if (fsck_err_on(!target->nr, c,
+                               "dirent points to missing inode:\n%s",
+                               (bch2_bkey_val_to_text(&PBUF(buf), c,
+                                                      k), buf))) {
+                       ret = __remove_dirent(trans, d.k->p);
+                       if (ret)
+                               return ret;
                }
 
-               if (fsck_err_on(!backpointer_exists, c,
-                               "inode %llu has wrong backpointer:\n"
-                               "got       %llu:%llu\n"
-                               "should be %llu:%llu",
-                               d_inum,
-                               target.bi_dir,
-                               target.bi_dir_offset,
-                               k.k->p.inode,
-                               k.k->p.offset)) {
-                       target.bi_dir           = k.k->p.inode;
-                       target.bi_dir_offset    = k.k->p.offset;
-
-                       ret = write_inode(trans, &target, target_snapshot);
-                       return ret ?: -EINTR;
+               for (i = target->d; i < target->d + target->nr; i++) {
+                       ret = check_dirent_target(trans, iter, d,
+                                                 &i->inode, i->snapshot);
+                       if (ret)
+                               return ret;
                }
        }
 
-       if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
-                       "incorrect d_type: should be %u:\n%s",
-                       mode_to_type(target.bi_mode),
-                       (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                              k), buf))) {
-               struct bkey_i_dirent *n;
-
-               n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
-               if (!n)
-                       return -ENOMEM;
-
-               bkey_reassemble(&n->k_i, d.s_c);
-               n->v.d_type = mode_to_type(target.bi_mode);
-
-               ret = __bch2_trans_do(trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|
-                                     BTREE_INSERT_LAZY_RW,
-                       bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(trans, iter, &n->k_i, 0));
-               kfree(n);
-               return ret ?: -EINTR;
-       }
+       if (d.v->d_type == DT_DIR)
+               for_each_visible_inode(c, s, dir, d.k->p.snapshot, i)
+                       i->count++;
 
-       *nr_subdirs += d.v->d_type == DT_DIR;
-       return 0;
 fsck_err:
        return ret;
 }
@@ -887,31 +1612,83 @@ fsck_err:
 noinline_for_stack
 static int check_dirents(struct bch_fs *c)
 {
-       struct inode_walker w = inode_walker_init();
+       struct inode_walker dir = inode_walker_init();
+       struct inode_walker target = inode_walker_init();
+       struct snapshots_seen s;
        struct bch_hash_info hash_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
-       unsigned nr_subdirs = 0;
+       struct btree_iter iter;
        int ret = 0;
 
        bch_verbose(c, "checking dirents");
 
+       snapshots_seen_init(&s);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
-                                  POS(BCACHEFS_ROOT_INO, 0),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents,
+                            POS(BCACHEFS_ROOT_INO, 0),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
        do {
-               ret = lockrestart_do(&trans,
-                               check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs));
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_LAZY_RW|
+                                     BTREE_INSERT_NOFAIL,
+                       check_dirent(&trans, &iter, &hash_info,
+                                    &dir, &target, &s));
                if (ret)
                        break;
-       } while (bch2_btree_iter_advance(iter));
-       bch2_trans_iter_put(&trans, iter);
+       } while (bch2_btree_iter_advance(&iter));
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       snapshots_seen_exit(&s);
+       inode_walker_exit(&dir);
+       inode_walker_exit(&target);
+       return ret;
+}
+
+static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
+                      struct bch_hash_info *hash_info,
+                      struct inode_walker *inode)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+       int ret;
+
+       k = bch2_btree_iter_peek(iter);
+       if (!k.k)
+               return 0;
+
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       ret = check_key_has_snapshot(trans, iter, k);
+       if (ret)
+               return ret < 0 ? ret : 0;
+
+       ret = __walk_inode(trans, inode, k.k->p);
+       if (ret < 0)
+               return ret;
+
+       if (fsck_err_on(ret == INT_MAX, c,
+                       "xattr for missing inode %llu",
+                       k.k->p.inode))
+               return bch2_btree_delete_at(trans, iter, 0);
+
+       if (ret == INT_MAX)
+               return 0;
+
+       ret = 0;
+
+       if (inode->first_this_inode)
+               *hash_info = bch2_hash_info_init(c, &inode->d[0].inode);
 
-       return bch2_trans_exit(&trans) ?: ret;
+       ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+fsck_err:
+       return ret;
 }
 
 /*
@@ -920,90 +1697,101 @@ static int check_dirents(struct bch_fs *c)
 noinline_for_stack
 static int check_xattrs(struct bch_fs *c)
 {
-       struct inode_walker w = inode_walker_init();
+       struct inode_walker inode = inode_walker_init();
        struct bch_hash_info hash_info;
        struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bkey_s_c k;
+       struct btree_iter iter;
        int ret = 0;
 
        bch_verbose(c, "checking xattrs");
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
-                                  POS(BCACHEFS_ROOT_INO, 0),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_PREFETCH);
-retry:
-       while ((k = bch2_btree_iter_peek(iter)).k &&
-              !(ret = bkey_err(k))) {
-               ret = walk_inode(&trans, &w, k.k->p.inode);
-               if (ret)
-                       break;
-
-               if (fsck_err_on(!w.have_inode, c,
-                               "xattr for missing inode %llu",
-                               k.k->p.inode)) {
-                       ret = bch2_btree_delete_at(&trans, iter, 0);
-                       if (ret)
-                               break;
-                       continue;
-               }
-
-               if (w.first_this_inode && w.have_inode)
-                       hash_info = bch2_hash_info_init(c, &w.inode);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+                            POS(BCACHEFS_ROOT_INO, 0),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
-               ret = hash_check_key(&trans, bch2_xattr_hash_desc,
-                                    &hash_info, iter, k);
+       do {
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_LAZY_RW|
+                                     BTREE_INSERT_NOFAIL,
+                                     check_xattr(&trans, &iter, &hash_info,
+                                                 &inode));
                if (ret)
                        break;
+       } while (bch2_btree_iter_advance(&iter));
+       bch2_trans_iter_exit(&trans, &iter);
 
-               bch2_btree_iter_advance(iter);
-       }
-fsck_err:
-       if (ret == -EINTR)
-               goto retry;
-
-       bch2_trans_iter_put(&trans, iter);
-       return bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
+       return ret;
 }
 
-/* Get root directory, create if it doesn't exist: */
-static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
+static int check_root_trans(struct btree_trans *trans)
 {
-       struct bkey_inode_buf packed;
+       struct bch_fs *c = trans->c;
+       struct bch_inode_unpacked root_inode;
        u32 snapshot;
+       u64 inum;
        int ret;
 
-       bch_verbose(c, "checking root directory");
-
-       ret = bch2_trans_do(c, NULL, NULL, 0,
-               lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
+       ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
        if (ret && ret != -ENOENT)
                return ret;
 
-       if (fsck_err_on(ret, c, "root directory missing"))
-               goto create_root;
+       if (mustfix_fsck_err_on(ret, c, "root subvol missing")) {
+               struct bkey_i_subvolume root_subvol;
 
-       if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
-                       "root inode not a directory"))
-               goto create_root;
+               snapshot        = U32_MAX;
+               inum            = BCACHEFS_ROOT_INO;
 
-       return 0;
+               bkey_subvolume_init(&root_subvol.k_i);
+               root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+               root_subvol.v.flags     = 0;
+               root_subvol.v.snapshot  = cpu_to_le32(snapshot);
+               root_subvol.v.inode     = cpu_to_le64(inum);
+               ret = __bch2_trans_do(trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL|
+                                     BTREE_INSERT_LAZY_RW,
+                       __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i));
+               if (ret) {
+                       bch_err(c, "error writing root subvol: %i", ret);
+                       goto err;
+               }
+       }
+
+       ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+       if (ret && ret != -ENOENT)
+               return ret;
+
+       if (mustfix_fsck_err_on(ret, c, "root directory missing") ||
+           mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c,
+                               "root inode not a directory")) {
+               bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
+                               0, NULL);
+               root_inode.bi_inum = inum;
+
+               ret = __write_inode(trans, &root_inode, snapshot);
+               if (ret)
+                       bch_err(c, "error writing root inode: %i", ret);
+       }
+err:
 fsck_err:
        return ret;
-create_root:
-       bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
-                       0, NULL);
-       root_inode->bi_inum = BCACHEFS_ROOT_INO;
+}
 
-       bch2_inode_pack(c, &packed, root_inode);
+/* Get root directory, create if it doesn't exist: */
+noinline_for_stack
+static int check_root(struct bch_fs *c)
+{
+       bch_verbose(c, "checking root directory");
 
-       return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
-                                NULL, NULL,
-                                BTREE_INSERT_NOFAIL|
-                                BTREE_INSERT_LAZY_RW);
+       return bch2_trans_do(c, NULL, NULL,
+                            BTREE_INSERT_NOFAIL|
+                            BTREE_INSERT_LAZY_RW,
+               check_root_trans(&trans));
 }
 
 struct pathbuf {
@@ -1012,10 +1800,24 @@ struct pathbuf {
 
        struct pathbuf_entry {
                u64     inum;
+               u32     snapshot;
        }               *entries;
 };
 
-static int path_down(struct pathbuf *p, u64 inum)
+static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot)
+{
+       struct pathbuf_entry *i;
+
+       for (i = p->entries; i < p->entries + p->nr; i++)
+               if (i->inum     == inum &&
+                   i->snapshot == snapshot)
+                       return true;
+
+       return false;
+}
+
+static int path_down(struct bch_fs *c, struct pathbuf *p,
+                    u64 inum, u32 snapshot)
 {
        if (p->nr == p->size) {
                size_t new_size = max_t(size_t, 256UL, p->size * 2);
@@ -1023,6 +1825,8 @@ static int path_down(struct pathbuf *p, u64 inum)
                                   new_size * sizeof(p->entries[0]),
                                   GFP_KERNEL);
                if (!n) {
+                       bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
+                               new_size);
                        return -ENOMEM;
                }
 
@@ -1031,73 +1835,109 @@ static int path_down(struct pathbuf *p, u64 inum)
        };
 
        p->entries[p->nr++] = (struct pathbuf_entry) {
-               .inum = inum,
+               .inum           = inum,
+               .snapshot       = snapshot,
        };
        return 0;
 }
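
path_is_dup() and path_down() above form a small visited set of (inum, snapshot) pairs: a linear-scanned, grow-by-doubling array. A self-contained userspace analogue, with realloc() standing in for krealloc() and hypothetical names:

#include <stdbool.h>
#include <stdlib.h>

struct pb_entry {
	unsigned long long	inum;
	unsigned		snapshot;
};

struct pb {
	size_t			nr, size;
	struct pb_entry		*entries;
};

static bool pb_is_dup(struct pb *p, unsigned long long inum, unsigned snapshot)
{
	size_t i;

	for (i = 0; i < p->nr; i++)
		if (p->entries[i].inum == inum &&
		    p->entries[i].snapshot == snapshot)
			return true;
	return false;
}

static int pb_push(struct pb *p, unsigned long long inum, unsigned snapshot)
{
	if (p->nr == p->size) {
		/* Grow by doubling, with a floor, as path_down() does: */
		size_t new_size = p->size < 256 ? 256 : p->size * 2;
		void *n = realloc(p->entries, new_size * sizeof(p->entries[0]));

		if (!n)
			return -12;		/* stands in for -ENOMEM */
		p->entries	= n;
		p->size		= new_size;
	}

	p->entries[p->nr++] = (struct pb_entry) {
		.inum		= inum,
		.snapshot	= snapshot,
	};
	return 0;
}

int main(void)
{
	struct pb p = { 0, 0, NULL };

	pb_push(&p, 4096, 1);
	return pb_is_dup(&p, 4096, 1) ? 0 : 1;	/* dup detected: exit 0 */
}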
 
+/*
+ * Check that a given inode is reachable from the root:
+ *
+ * XXX: we should also be verifying that inodes are in the right subvolumes
+ */
 static int check_path(struct btree_trans *trans,
                      struct pathbuf *p,
-                     struct bch_inode_unpacked *inode)
+                     struct bch_inode_unpacked *inode,
+                     u32 snapshot)
 {
        struct bch_fs *c = trans->c;
-       u32 snapshot;
-       size_t i;
        int ret = 0;
 
+       snapshot = snapshot_t(c, snapshot)->equiv;
        p->nr = 0;
 
-       while (inode->bi_inum != BCACHEFS_ROOT_INO) {
+       while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
+                inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+               struct btree_iter dirent_iter;
+               struct bkey_s_c_dirent d;
+               u32 parent_snapshot = snapshot;
+
+               if (inode->bi_subvol) {
+                       u64 inum;
+
+                       ret = subvol_lookup(trans, inode->bi_parent_subvol,
+                                           &parent_snapshot, &inum);
+                       if (ret)
+                               break;
+               }
+
                ret = lockrestart_do(trans,
-                       inode_backpointer_exists(trans, inode));
-               if (ret < 0)
+                       PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
+                                         SPOS(inode->bi_dir, inode->bi_dir_offset,
+                                              parent_snapshot))).k));
+               if (ret && ret != -ENOENT)
                        break;
 
-               if (!ret) {
-                       if (fsck_err(c,  "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
-                                    inode->bi_inum,
-                                    mode_to_type(inode->bi_mode),
+               if (!ret && !dirent_points_to_inode(d, inode)) {
+                       bch2_trans_iter_exit(trans, &dirent_iter);
+                       ret = -ENOENT;
+               }
+
+               if (ret == -ENOENT) {
+                       if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
+                                    inode->bi_inum, snapshot,
+                                    bch2_d_type_str(inode_d_type(inode)),
                                     inode->bi_nlink,
                                     inode->bi_dir,
                                     inode->bi_dir_offset))
-                               ret = reattach_inode(trans, inode);
+                               ret = reattach_inode(trans, inode, snapshot);
                        break;
                }
-               ret = 0;
+
+               bch2_trans_iter_exit(trans, &dirent_iter);
 
                if (!S_ISDIR(inode->bi_mode))
                        break;
 
-               ret = path_down(p, inode->bi_inum);
+               ret = path_down(c, p, inode->bi_inum, snapshot);
                if (ret) {
                        bch_err(c, "memory allocation failure");
                        return ret;
                }
 
-               for (i = 0; i < p->nr; i++) {
-                       if (inode->bi_dir != p->entries[i].inum)
-                               continue;
+               snapshot = parent_snapshot;
+
+               ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+               if (ret) {
+                       /* Should have been caught in dirents pass */
+                       bch_err(c, "error looking up parent directory: %i", ret);
+                       break;
+               }
+
+               if (path_is_dup(p, inode->bi_inum, snapshot)) {
+                       struct pathbuf_entry *i;
 
                        /* XXX print path */
+                       bch_err(c, "directory structure loop");
+
+                       for (i = p->entries; i < p->entries + p->nr; i++)
+                               pr_err("%llu:%u", i->inum, i->snapshot);
+                       pr_err("%llu:%u", inode->bi_inum, snapshot);
+
                        if (!fsck_err(c, "directory structure loop"))
                                return 0;
 
-                       ret = lockrestart_do(trans,
-                                        remove_backpointer(trans, inode));
+                       ret = __bch2_trans_do(trans, NULL, NULL,
+                                             BTREE_INSERT_NOFAIL|
+                                             BTREE_INSERT_LAZY_RW,
+                                       remove_backpointer(trans, inode));
                        if (ret) {
                                bch_err(c, "error removing dirent: %i", ret);
                                break;
                        }
 
-                       ret = reattach_inode(trans, inode);
-                       break;
-               }
-
-               ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
-               if (ret) {
-                       /* Should have been caught in dirents pass */
-                       bch_err(c, "error looking up parent directory: %i", ret);
-                       break;
+                       ret = reattach_inode(trans, inode, snapshot);
                }
        }
 fsck_err:
@@ -1111,10 +1951,11 @@ fsck_err:
  * After check_dirents(), if an inode backpointer doesn't exist that means it's
  * unreachable:
  */
+noinline_for_stack
 static int check_directory_structure(struct bch_fs *c)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_inode_unpacked u;
        struct pathbuf path = { 0, 0, NULL };
@@ -1124,28 +1965,33 @@ static int check_directory_structure(struct bch_fs *c)
 
        for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
-               if (k.k->type != KEY_TYPE_inode)
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               if (!bkey_is_inode(k.k))
                        continue;
 
-               ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+               ret = bch2_inode_unpack(k, &u);
                if (ret) {
                        /* Should have been caught earlier in fsck: */
                        bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
                        break;
                }
 
-               ret = check_path(&trans, &path, &u);
+               if (u.bi_flags & BCH_INODE_UNLINKED)
+                       continue;
+
+               ret = check_path(&trans, &path, &u, iter.pos.snapshot);
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(ret == -EINTR);
 
        kfree(path.entries);
 
-       return bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
+       return ret;
 }
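
check_path() walks each inode's bi_dir backpointer up toward the root, reattaching anything whose backpointer dangles and using the pathbuf to detect directory loops. The same classification logic in a toy model, under the simplifying assumption that an inode stores a single parent number:

#include <stdbool.h>
#include <stdio.h>

#define NR_INODES	6
#define ROOT		0

/* parent[i] is inode i's parent; -1 means "no valid backpointer". */
static const int parent[NR_INODES] = { 0, 0, 1, 2, 4, -1 };

static const char *classify(int ino)
{
	bool seen[NR_INODES] = { false };

	while (ino != ROOT) {
		if (ino < 0)
			return "unreachable";	/* dangling backpointer */
		if (seen[ino])
			return "loop";		/* directory structure loop */
		seen[ino] = true;
		ino = parent[ino];
	}
	return "reachable";
}

int main(void)
{
	int i;

	for (i = 0; i < NR_INODES; i++)
		printf("inode %d: %s\n", i, classify(i));
	return 0;
}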
 
 struct nlink_table {
@@ -1159,12 +2005,15 @@ struct nlink_table {
        }               *d;
 };
 
-static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot)
+static int add_nlink(struct bch_fs *c, struct nlink_table *t,
+                    u64 inum, u32 snapshot)
 {
        if (t->nr == t->size) {
                size_t new_size = max_t(size_t, 128UL, t->size * 2);
                void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
                if (!d) {
+                       bch_err(c, "fsck: error allocating memory for nlink_table, size %zu",
+                               new_size);
                        return -ENOMEM;
                }
 
@@ -1193,8 +2042,9 @@ static int nlink_cmp(const void *_l, const void *_r)
        return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
 }
 
-static void inc_link(struct bch_fs *c, struct nlink_table *links,
-                    u64 range_start, u64 range_end, u64 inum)
+static void inc_link(struct bch_fs *c, struct snapshots_seen *s,
+                    struct nlink_table *links,
+                    u64 range_start, u64 range_end, u64 inum, u32 snapshot)
 {
        struct nlink *link, key = {
                .inum = inum, .snapshot = U32_MAX,
@@ -1205,8 +2055,18 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links,
 
        link = __inline_bsearch(&key, links->d, links->nr,
                                sizeof(links->d[0]), nlink_cmp);
-       if (link)
-               link->count++;
+       if (!link)
+               return;
+
+       while (link > links->d && link[0].inum == link[-1].inum)
+               --link;
+
+       for (; link < links->d + links->nr && link->inum == inum; link++)
+               if (ref_visible(c, s, snapshot, link->snapshot)) {
+                       link->count++;
+                       if (link->snapshot >= snapshot)
+                               break;
+               }
 }
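
The rewind loop in inc_link() matters because a binary search may land on an arbitrary element of a run of equal keys: before walking all links for an inum, it steps back to the leftmost entry with that inum. The idiom in isolation, as a standalone sketch:

#include <stdio.h>
#include <stdlib.h>

static int cmp_ints(const void *a, const void *b)
{
	int l = *(const int *) a, r = *(const int *) b;

	return (l > r) - (l < r);
}

static int *find_first(int *d, size_t nr, int key)
{
	int *p = bsearch(&key, d, nr, sizeof(*d), cmp_ints);

	if (!p)
		return NULL;

	/* bsearch() may land anywhere in a run of equal keys: rewind. */
	while (p > d && p[0] == p[-1])
		--p;
	return p;
}

int main(void)
{
	int d[] = { 1, 3, 3, 3, 7 };
	int *p = find_first(d, 5, 3);

	if (p)
		printf("first 3 at index %td\n", p - d);
	return 0;
}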
 
 noinline_for_stack
@@ -1215,9 +2075,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                                       u64 start, u64 *end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
-       struct bkey_s_c_inode inode;
        struct bch_inode_unpacked u;
        int ret = 0;
 
@@ -1226,26 +2085,25 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
        for_each_btree_key(&trans, iter, BTREE_ID_inodes,
                           POS(0, start),
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
-               if (k.k->type != KEY_TYPE_inode)
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               if (!bkey_is_inode(k.k))
                        continue;
 
-               inode = bkey_s_c_to_inode(k);
+               /* Should never fail, checked by bch2_inode_invalid: */
+               BUG_ON(bch2_inode_unpack(k, &u));
 
                /*
                 * Backpointer and directory structure checks are sufficient for
                 * directories, since they can't have hardlinks:
                 */
-               if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
+               if (S_ISDIR(le16_to_cpu(u.bi_mode)))
                        continue;
 
-               /* Should never fail, checked by bch2_inode_invalid: */
-               BUG_ON(bch2_inode_unpack(inode, &u));
-
                if (!u.bi_nlink)
                        continue;
 
-               ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot);
+               ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
                if (ret) {
                        *end = k.k->p.offset;
                        ret = 0;
@@ -1253,7 +2111,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
                }
 
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
        if (ret)
@@ -1267,34 +2125,43 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links
                                     u64 range_start, u64 range_end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct snapshots_seen s;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_s_c_dirent d;
        int ret;
 
+       snapshots_seen_init(&s);
+
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN,
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               ret = snapshots_seen_update(c, &s, k.k->p);
+               if (ret)
+                       break;
+
                switch (k.k->type) {
                case KEY_TYPE_dirent:
                        d = bkey_s_c_to_dirent(k);
 
-                       if (d.v->d_type != DT_DIR)
-                               inc_link(c, links, range_start, range_end,
-                                        le64_to_cpu(d.v->d_inum));
+                       if (d.v->d_type != DT_DIR &&
+                           d.v->d_type != DT_SUBVOL)
+                               inc_link(c, &s, links, range_start, range_end,
+                                        le64_to_cpu(d.v->d_inum),
+                                        d.k->p.snapshot);
                        break;
                }
-
-               bch2_trans_cond_resched(&trans);
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
-       ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
                bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
 
+       bch2_trans_exit(&trans);
+       snapshots_seen_exit(&s);
        return ret;
 }
 
@@ -1304,9 +2171,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                               u64 range_start, u64 range_end)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
-       struct bkey_s_c_inode inode;
        struct bch_inode_unpacked u;
        struct nlink *link = links->d;
        int ret = 0;
@@ -1316,23 +2182,24 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
        for_each_btree_key(&trans, iter, BTREE_ID_inodes,
                           POS(0, range_start),
                           BTREE_ITER_INTENT|
-                          BTREE_ITER_PREFETCH, k, ret) {
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
                if (k.k->p.offset >= range_end)
                        break;
 
-               if (k.k->type != KEY_TYPE_inode)
+               if (!bkey_is_inode(k.k))
                        continue;
 
-               inode = bkey_s_c_to_inode(k);
-               if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
-                       continue;
+               BUG_ON(bch2_inode_unpack(k, &u));
 
-               BUG_ON(bch2_inode_unpack(inode, &u));
+               if (S_ISDIR(le16_to_cpu(u.bi_mode)))
+                       continue;
 
                if (!u.bi_nlink)
                        continue;
 
-               while (link->inum < k.k->p.offset) {
+               while ((cmp_int(link->inum, k.k->p.offset) ?:
+                       cmp_int(link->snapshot, k.k->p.snapshot)) < 0) {
                        link++;
                        BUG_ON(link >= links->d + links->nr);
                }
@@ -1343,17 +2210,13 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
                                bch2_inode_nlink_get(&u), link->count)) {
                        bch2_inode_nlink_set(&u, link->count);
 
-                       ret = __bch2_trans_do(&trans, NULL, NULL,
-                                             BTREE_INSERT_NOFAIL|
-                                             BTREE_INSERT_LAZY_RW,
-                                             bch2_btree_iter_traverse(iter) ?:
-                                       bch2_inode_write(&trans, iter, &u));
+                       ret = write_inode(&trans, &u, k.k->p.snapshot);
                        if (ret)
                                bch_err(c, "error in fsck: error %i updating inode", ret);
                }
        }
 fsck_err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
        if (ret)
@@ -1399,21 +2262,91 @@ static int check_nlinks(struct bch_fs *c)
        return ret;
 }
 
+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter)
+{
+       struct bkey_s_c k;
+       struct bkey_s_c_reflink_p p;
+       struct bkey_i_reflink_p *u;
+       int ret;
+
+       k = bch2_btree_iter_peek(iter);
+       if (!k.k)
+               return 0;
+
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       if (k.k->type != KEY_TYPE_reflink_p)
+               return 0;
+
+       p = bkey_s_c_to_reflink_p(k);
+
+       if (!p.v->front_pad && !p.v->back_pad)
+               return 0;
+
+       u = bch2_trans_kmalloc(trans, sizeof(*u));
+       ret = PTR_ERR_OR_ZERO(u);
+       if (ret)
+               return ret;
+
+       bkey_reassemble(&u->k_i, k);
+       u->v.front_pad  = 0;
+       u->v.back_pad   = 0;
+
+       return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN);
+}
+
+noinline_for_stack
+static int fix_reflink_p(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
+               return 0;
+
+       bch_verbose(c, "fixing reflink_p keys");
+
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
+                          BTREE_ITER_INTENT|
+                          BTREE_ITER_PREFETCH|
+                          BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+               if (k.k->type == KEY_TYPE_reflink_p) {
+                       ret = __bch2_trans_do(&trans, NULL, NULL,
+                                             BTREE_INSERT_NOFAIL|
+                                             BTREE_INSERT_LAZY_RW,
+                                             fix_reflink_p_key(&trans, &iter));
+                       if (ret)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
 /*
  * Checks for inconsistencies that shouldn't happen, unless we have a bug.
  * Doesn't fix them yet, mainly because they haven't yet been observed:
  */
 int bch2_fsck_full(struct bch_fs *c)
 {
-       struct bch_inode_unpacked root_inode;
-
-       return  check_inodes(c, true) ?:
+       return  bch2_fs_snapshots_check(c) ?:
+               check_inodes(c, true) ?:
+               check_subvols(c) ?:
                check_extents(c) ?:
                check_dirents(c) ?:
                check_xattrs(c) ?:
-               check_root(c, &root_inode) ?:
+               check_root(c) ?:
                check_directory_structure(c) ?:
-               check_nlinks(c);
+               check_nlinks(c) ?:
+               fix_reflink_p(c);
 }
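
bch2_fsck_full() sequences its passes with the GNU C "a ?: b" extension (supported by gcc and clang): the right operand is evaluated only when the left is zero, so the chain stops at the first pass that returns an error. A standalone sketch with hypothetical pass functions:

#include <stdio.h>

static int pass_a(void) { return 0; }	/* succeeds */
static int pass_b(void) { return -5; }	/* stands in for -EIO */
static int pass_c(void) { return 0; }	/* not run: pass_b already failed */

static int run_passes(void)
{
	return	pass_a() ?:
		pass_b() ?:
		pass_c();
}

int main(void)
{
	printf("run_passes() = %d\n", run_passes());	/* prints -5 */
	return 0;
}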
 
 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
index 3b671082cd1e31f65bcf34be04f9afacd444d578..78e2db6c938b8791aa1c3b52144a156c8973f616 100644 (file)
@@ -4,10 +4,13 @@
 #include "btree_key_cache.h"
 #include "bkey_methods.h"
 #include "btree_update.h"
+#include "buckets.h"
 #include "error.h"
 #include "extents.h"
+#include "extent_update.h"
 #include "inode.h"
 #include "str_hash.h"
+#include "subvolume.h"
 #include "varint.h"
 
 #include <linux/random.h>
@@ -22,39 +25,6 @@ const char * const bch2_inode_opts[] = {
 };
 
 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
-static const u8 bits_table[8] = {
-       1  * 8 - 1,
-       2  * 8 - 2,
-       3  * 8 - 3,
-       4  * 8 - 4,
-       6  * 8 - 5,
-       8  * 8 - 6,
-       10 * 8 - 7,
-       13 * 8 - 8,
-};
-
-static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
-{
-       __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
-       unsigned shift, bytes, bits = likely(!hi)
-               ? fls64(lo)
-               : fls64(hi) + 64;
-
-       for (shift = 1; shift <= 8; shift++)
-               if (bits < bits_table[shift - 1])
-                       goto got_shift;
-
-       BUG();
-got_shift:
-       bytes = byte_table[shift - 1];
-
-       BUG_ON(out + bytes > end);
-
-       memcpy(out, (u8 *) in + 16 - bytes, bytes);
-       *out |= (1 << 8) >> shift;
-
-       return bytes;
-}
 
 static int inode_decode_field(const u8 *in, const u8 *end,
                              u64 out[2], unsigned *out_bits)
@@ -90,42 +60,11 @@ static int inode_decode_field(const u8 *in, const u8 *end,
        return bytes;
 }
 
-static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
-                                       const struct bch_inode_unpacked *inode)
-{
-       struct bkey_i_inode *k = &packed->inode;
-       u8 *out = k->v.fields;
-       u8 *end = (void *) &packed[1];
-       u8 *last_nonzero_field = out;
-       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
-       unsigned bytes;
-
-#define x(_name, _bits)                                                        \
-       out += inode_encode_field(out, end, 0, inode->_name);           \
-       nr_fields++;                                                    \
-                                                                       \
-       if (inode->_name) {                                             \
-               last_nonzero_field = out;                               \
-               last_nonzero_fieldnr = nr_fields;                       \
-       }
-
-       BCH_INODE_FIELDS()
-#undef  x
-
-       out = last_nonzero_field;
-       nr_fields = last_nonzero_fieldnr;
-
-       bytes = out - (u8 *) &packed->inode.v;
-       set_bkey_val_bytes(&packed->inode.k, bytes);
-       memset_u64s_tail(&packed->inode.v, 0, bytes);
-
-       SET_INODE_NR_FIELDS(&k->v, nr_fields);
-}
-
-static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
-                              const struct bch_inode_unpacked *inode)
+void bch2_inode_pack(struct bch_fs *c,
+                    struct bkey_inode_buf *packed,
+                    const struct bch_inode_unpacked *inode)
 {
-       struct bkey_i_inode *k = &packed->inode;
+       struct bkey_i_inode_v2 *k = &packed->inode;
        u8 *out = k->v.fields;
        u8 *end = (void *) &packed[1];
        u8 *last_nonzero_field = out;
@@ -133,6 +72,14 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
        unsigned bytes;
        int ret;
 
+       bkey_inode_v2_init(&packed->inode.k_i);
+       packed->inode.k.p.offset        = inode->bi_inum;
+       packed->inode.v.bi_journal_seq  = cpu_to_le64(inode->bi_journal_seq);
+       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
+       packed->inode.v.bi_flags        = cpu_to_le64(inode->bi_flags);
+       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
+
 #define x(_name, _bits)                                                        \
        nr_fields++;                                                    \
                                                                        \
@@ -163,30 +110,12 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
        set_bkey_val_bytes(&packed->inode.k, bytes);
        memset_u64s_tail(&packed->inode.v, 0, bytes);
 
-       SET_INODE_NR_FIELDS(&k->v, nr_fields);
-}
-
-void bch2_inode_pack(struct bch_fs *c,
-                    struct bkey_inode_buf *packed,
-                    const struct bch_inode_unpacked *inode)
-{
-       bkey_inode_init(&packed->inode.k_i);
-       packed->inode.k.p.offset        = inode->bi_inum;
-       packed->inode.v.bi_hash_seed    = inode->bi_hash_seed;
-       packed->inode.v.bi_flags        = cpu_to_le32(inode->bi_flags);
-       packed->inode.v.bi_mode         = cpu_to_le16(inode->bi_mode);
-
-       if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
-               SET_INODE_NEW_VARINT(&packed->inode.v, true);
-               bch2_inode_pack_v2(packed, inode);
-       } else {
-               bch2_inode_pack_v1(packed, inode);
-       }
+       SET_INODEv2_NR_FIELDS(&k->v, nr_fields);
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
                struct bch_inode_unpacked unpacked;
 
-               int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
+               int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i),
                                           &unpacked);
                BUG_ON(ret);
                BUG_ON(unpacked.bi_inum         != inode->bi_inum);
@@ -235,17 +164,16 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
        return 0;
 }
 
-static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
-                               struct bch_inode_unpacked *unpacked)
+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked,
+                               const u8 *in, const u8 *end,
+                               unsigned nr_fields)
 {
-       const u8 *in = inode.v->fields;
-       const u8 *end = bkey_val_end(inode);
        unsigned fieldnr = 0;
        int ret;
        u64 v[2];
 
 #define x(_name, _bits)                                                        \
-       if (fieldnr < INODE_NR_FIELDS(inode.v)) {                       \
+       if (fieldnr < nr_fields) {                                      \
                ret = bch2_varint_decode_fast(in, end, &v[0]);          \
                if (ret < 0)                                            \
                        return ret;                                     \
@@ -275,52 +203,79 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
        return 0;
 }
 
-int bch2_inode_unpack(struct bkey_s_c_inode inode,
+int bch2_inode_unpack(struct bkey_s_c k,
                      struct bch_inode_unpacked *unpacked)
 {
-       unpacked->bi_inum       = inode.k->p.offset;
-       unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
-       unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
-       unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
+       switch (k.k->type) {
+       case KEY_TYPE_inode: {
+               struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
 
-       if (INODE_NEW_VARINT(inode.v)) {
-               return bch2_inode_unpack_v2(inode, unpacked);
-       } else {
-               return bch2_inode_unpack_v1(inode, unpacked);
+               unpacked->bi_inum       = inode.k->p.offset;
+               unpacked->bi_journal_seq = 0;
+               unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
+               unpacked->bi_flags      = le32_to_cpu(inode.v->bi_flags);
+               unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
+
+               if (INODE_NEW_VARINT(inode.v)) {
+                       return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+                                                   bkey_val_end(inode),
+                                                   INODE_NR_FIELDS(inode.v));
+               } else {
+                       return bch2_inode_unpack_v1(inode, unpacked);
+               }
+               break;
+       }
+       case KEY_TYPE_inode_v2: {
+               struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+
+               unpacked->bi_inum       = inode.k->p.offset;
+               unpacked->bi_journal_seq = le64_to_cpu(inode.v->bi_journal_seq);
+               unpacked->bi_hash_seed  = inode.v->bi_hash_seed;
+               unpacked->bi_flags      = le64_to_cpu(inode.v->bi_flags);
+               unpacked->bi_mode       = le16_to_cpu(inode.v->bi_mode);
+
+               return bch2_inode_unpack_v2(unpacked, inode.v->fields,
+                                           bkey_val_end(inode),
+                                           INODEv2_NR_FIELDS(inode.v));
+       }
+       default:
+               BUG();
        }
-
-       return 0;
 }
 
-struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
-                                  struct bch_inode_unpacked *inode,
-                                  u64 inum, unsigned flags)
+int bch2_inode_peek(struct btree_trans *trans,
+                   struct btree_iter *iter,
+                   struct bch_inode_unpacked *inode,
+                   subvol_inum inum, unsigned flags)
 {
-       struct btree_iter *iter;
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
-       if (trans->c->opts.inodes_use_key_cache)
-               flags |= BTREE_ITER_CACHED;
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags);
+       bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
+                            SPOS(0, inum.inum, snapshot),
+                            flags|BTREE_ITER_CACHED);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
 
-       ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT;
+       ret = bkey_is_inode(k.k) ? 0 : -ENOENT;
        if (ret)
                goto err;
 
-       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+       ret = bch2_inode_unpack(k, inode);
        if (ret)
                goto err;
 
-       return iter;
+       return 0;
 err:
-       bch2_trans_iter_put(trans, iter);
-       return ERR_PTR(ret);
+       bch2_trans_iter_exit(trans, iter);
+       return ret;
 }
 
 int bch2_inode_write(struct btree_trans *trans,
@@ -340,8 +295,8 @@ int bch2_inode_write(struct btree_trans *trans,
 
 const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-               struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-               struct bch_inode_unpacked unpacked;
+       struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+       struct bch_inode_unpacked unpacked;
 
        if (k.k->p.inode)
                return "nonzero k.p.inode";
@@ -355,7 +310,7 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
        if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
                return "invalid str hash type";
 
-       if (bch2_inode_unpack(inode, &unpacked))
+       if (bch2_inode_unpack(k, &unpacked))
                return "invalid variable length fields";
 
        if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
@@ -368,15 +323,56 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
            unpacked.bi_nlink != 0)
                return "flagged as unlinked but bi_nlink != 0";
 
+       if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+               return "subvolume root but not a directory";
+
+       return NULL;
+}
+
+const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+       struct bch_inode_unpacked unpacked;
+
+       if (k.k->p.inode)
+               return "nonzero k.p.inode";
+
+       if (bkey_val_bytes(k.k) < sizeof(struct bch_inode_v2))
+               return "incorrect value size";
+
+       if (k.k->p.offset < BLOCKDEV_INODE_MAX)
+               return "fs inode in blockdev range";
+
+       if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
+               return "invalid str hash type";
+
+       if (bch2_inode_unpack(k, &unpacked))
+               return "invalid variable length fields";
+
+       if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+               return "invalid data checksum type";
+
+       if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+               return "invalid data checksum type";
+
+       if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+           unpacked.bi_nlink != 0)
+               return "flagged as unlinked but bi_nlink != 0";
+
+       if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode))
+               return "subvolume root but not a directory";
+
        return NULL;
 }
 
 static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
 {
-       pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags);
+       pr_buf(out, "mode %o flags %x journal_seq %llu",
+              inode->bi_mode, inode->bi_flags,
+              inode->bi_journal_seq);
 
 #define x(_name, _bits)                                                \
-       pr_buf(out, #_name " %llu ", (u64) inode->_name);
+       pr_buf(out, " "#_name " %llu", (u64) inode->_name);
        BCH_INODE_FIELDS()
 #undef  x
 }
@@ -390,15 +386,14 @@ void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked
 void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
                       struct bkey_s_c k)
 {
-       struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
-       struct bch_inode_unpacked unpacked;
+       struct bch_inode_unpacked inode;
 
-       if (bch2_inode_unpack(inode, &unpacked)) {
+       if (bch2_inode_unpack(k, &inode)) {
                pr_buf(out, "(unpack error)");
                return;
        }
 
-       __bch2_inode_unpacked_to_text(out, &unpacked);
+       __bch2_inode_unpacked_to_text(out, &inode);
 }
 
 const char *bch2_inode_generation_invalid(const struct bch_fs *c,
@@ -474,6 +469,7 @@ static inline u32 bkey_generation(struct bkey_s_c k)
 {
        switch (k.k->type) {
        case KEY_TYPE_inode:
+       case KEY_TYPE_inode_v2:
                BUG();
        case KEY_TYPE_inode_generation:
                return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
@@ -482,12 +478,15 @@ static inline u32 bkey_generation(struct bkey_s_c k)
        }
 }
 
-struct btree_iter *bch2_inode_create(struct btree_trans *trans,
-                                    struct bch_inode_unpacked *inode_u,
-                                    u32 snapshot, u64 cpu)
+/*
+ * This just finds an empty slot:
+ */
+int bch2_inode_create(struct btree_trans *trans,
+                     struct btree_iter *iter,
+                     struct bch_inode_unpacked *inode_u,
+                     u32 snapshot, u64 cpu)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter = NULL;
        struct bkey_s_c k;
        u64 min, max, start, pos, *hint;
        int ret = 0;
@@ -513,9 +512,9 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans,
                start = min;
 
        pos = start;
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos),
-                                  BTREE_ITER_ALL_SNAPSHOTS|
-                                  BTREE_ITER_INTENT);
+       bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos),
+                            BTREE_ITER_ALL_SNAPSHOTS|
+                            BTREE_ITER_INTENT);
 again:
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
@@ -528,7 +527,7 @@ again:
                }
 
                if (k.k->p.snapshot == snapshot &&
-                   k.k->type != KEY_TYPE_inode &&
+                   !bkey_is_inode(k.k) &&
                    !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
                        bch2_btree_iter_advance(iter);
                        continue;
@@ -553,8 +552,8 @@ again:
                ret = -ENOSPC;
 
        if (ret) {
-               bch2_trans_iter_put(trans, iter);
-               return ERR_PTR(ret);
+               bch2_trans_iter_exit(trans, iter);
+               return ret;
        }
 
        /* Retry from start */
@@ -566,36 +565,80 @@ found_slot:
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret) {
-               bch2_trans_iter_put(trans, iter);
-               return ERR_PTR(ret);
+               bch2_trans_iter_exit(trans, iter);
+               return ret;
        }
 
        /* We may have raced while the iterator wasn't pointing at pos: */
-       if (k.k->type == KEY_TYPE_inode ||
+       if (bkey_is_inode(k.k) ||
            bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
                goto again;
 
        *hint                   = k.k->p.offset;
        inode_u->bi_inum        = k.k->p.offset;
        inode_u->bi_generation  = bkey_generation(k);
-       return iter;
+       return 0;
 }
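
bch2_inode_create() scans forward from a remembered hint for a free inode number and, per the "Retry from start" path above, falls back to the bottom of the range before returning -ENOSPC. A toy allocator with the same scan-from-hint-then-wrap shape, a bitmap standing in for the btree and all names hypothetical:

#include <stdbool.h>
#include <stdio.h>

#define MIN_SLOT	4
#define MAX_SLOT	16

static bool used[MAX_SLOT];
static unsigned hint = MIN_SLOT;

static int alloc_slot(void)
{
	unsigned start = (hint >= MIN_SLOT && hint < MAX_SLOT) ? hint : MIN_SLOT;
	unsigned pos = start;
	bool wrapped = false;

	while (used[pos]) {
		pos++;
		if (pos == MAX_SLOT) {
			if (wrapped || start == MIN_SLOT)
				return -28;	/* stands in for -ENOSPC */
			/* Retry from the start of the range: */
			pos = MIN_SLOT;
			wrapped = true;
		}
		if (wrapped && pos == start)
			return -28;		/* scanned the whole range */
	}

	used[pos] = true;
	hint = pos + 1;				/* next search starts here */
	return (int) pos;
}

int main(void)
{
	used[4] = used[5] = true;
	printf("got slot %d\n", alloc_slot());	/* 6 */
	printf("got slot %d\n", alloc_slot());	/* 7 */
	return 0;
}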
 
-int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
+static int bch2_inode_delete_keys(struct btree_trans *trans,
+                                 subvol_inum inum, enum btree_id id)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i delete;
+       u32 snapshot;
+       int ret = 0;
+
+       /*
+        * We're never going to be deleting extents, no need to use an extent
+        * iterator:
+        */
+       bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0),
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_INTENT);
+
+       while (1) {
+               bch2_trans_begin(trans);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+               if (ret)
+                       goto err;
+
+               bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+               k = bch2_btree_iter_peek(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (!k.k || iter.pos.inode != inum.inum)
+                       break;
+
+               bkey_init(&delete.k);
+               delete.k.p = iter.pos;
+
+               ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
+                     bch2_trans_commit(trans, NULL, NULL,
+                                       BTREE_INSERT_NOFAIL);
+err:
+               if (ret && ret != -EINTR)
+                       break;
+       }
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
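
bch2_inode_delete_keys() commits one deletion per transaction inside a loop that treats -EINTR as "the transaction was restarted, begin again", breaking only on other errors. The control flow in isolation, with trans_begin() and delete_one() as hypothetical stand-ins:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int attempts;

static void trans_begin(void)
{
	/* Reset per-transaction state before each attempt. */
}

static int delete_one(bool *done)
{
	if (attempts++ < 2)
		return -EINTR;		/* simulated lock restart */
	*done = true;			/* nothing left to delete */
	return 0;
}

static int delete_all(void)
{
	bool done = false;
	int ret = 0;

	while (!done) {
		trans_begin();
		ret = delete_one(&done);
		if (ret && ret != -EINTR)
			break;		/* hard error: give up */
	}
	return ret;
}

int main(void)
{
	printf("delete_all() = %d after %d attempts\n", delete_all(), attempts);
	return 0;
}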
+
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 {
        struct btree_trans trans;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter = { NULL };
        struct bkey_i_inode_generation delete;
-       struct bpos start = POS(inode_nr, 0);
-       struct bpos end = POS(inode_nr + 1, 0);
        struct bch_inode_unpacked inode_u;
        struct bkey_s_c k;
-       unsigned iter_flags = BTREE_ITER_INTENT;
+       u32 snapshot;
        int ret;
 
-       if (cached && c->opts.inodes_use_key_cache)
-               iter_flags |= BTREE_ITER_CACHED;
-
        bch2_trans_init(&trans, c, 0, 1024);
 
        /*
@@ -606,44 +649,49 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
         * XXX: the dirent could ideally delete whiteouts when they're no
         * longer needed
         */
-       ret   = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
-                                             start, end, NULL) ?:
-               bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
-                                             start, end, NULL) ?:
-               bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
-                                             start, end, NULL);
+       ret   = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?:
+               bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?:
+               bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents);
        if (ret)
                goto err;
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes,
-                                  POS(0, inode_nr), iter_flags);
-       k = bch2_btree_iter_peek_slot(iter);
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
+                            SPOS(0, inum.inum, snapshot),
+                            BTREE_ITER_INTENT|BTREE_ITER_CACHED);
+       k = bch2_btree_iter_peek_slot(&iter);
 
        ret = bkey_err(k);
        if (ret)
                goto err;
 
-       if (k.k->type != KEY_TYPE_inode) {
+       if (!bkey_is_inode(k.k)) {
                bch2_fs_inconsistent(trans.c,
                                     "inode %llu not found when deleting",
-                                    inode_nr);
+                                    inum.inum);
                ret = -EIO;
                goto err;
        }
 
-       bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
+       bch2_inode_unpack(k, &inode_u);
+
+       /* Subvolume root? */
+       BUG_ON(inode_u.bi_subvol);
 
        bkey_inode_generation_init(&delete.k_i);
-       delete.k.p = iter->pos;
+       delete.k.p = iter.pos;
        delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
 
-       ret   = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?:
+       ret   = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?:
                bch2_trans_commit(&trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL);
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        if (ret == -EINTR)
                goto retry;
 
@@ -651,21 +699,22 @@ err:
        return ret;
 }
 
-static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
-                                        struct bch_inode_unpacked *inode)
+int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
+                                 subvol_inum inum,
+                                 struct bch_inode_unpacked *inode)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_inode_peek(trans, inode, inode_nr, 0);
-       ret = PTR_ERR_OR_ZERO(iter);
-       bch2_trans_iter_put(trans, iter);
+       ret = bch2_inode_peek(trans, &iter, inode, inum, 0);
+       if (!ret)
+               bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
-int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
                            struct bch_inode_unpacked *inode)
 {
        return bch2_trans_do(c, NULL, NULL, 0,
-               bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
+               bch2_inode_find_by_inum_trans(&trans, inum, inode));
 }
index d67af4f56f05b1f7e366e5f4bb036efd54751ffe..77957cc7f9dda3eac49a9bd435969c72184c6545 100644 (file)
@@ -7,6 +7,7 @@
 extern const char * const bch2_inode_opts[];
 
 const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_inode (struct bkey_ops) {                \
@@ -14,6 +15,17 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
        .val_to_text    = bch2_inode_to_text,           \
 }
 
+#define bch2_bkey_ops_inode_v2 (struct bkey_ops) {     \
+       .key_invalid    = bch2_inode_v2_invalid,        \
+       .val_to_text    = bch2_inode_to_text,           \
+}
+
+static inline bool bkey_is_inode(const struct bkey *k)
+{
+       return  k->type == KEY_TYPE_inode ||
+               k->type == KEY_TYPE_inode_v2;
+}
+
 const char *bch2_inode_generation_invalid(const struct bch_fs *,
                                          struct bkey_s_c);
 void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
@@ -34,6 +46,7 @@ typedef u64 u96;
 
 struct bch_inode_unpacked {
        u64                     bi_inum;
+       u64                     bi_journal_seq;
        __le64                  bi_hash_seed;
        u32                     bi_flags;
        u16                     bi_mode;
@@ -44,7 +57,7 @@ struct bch_inode_unpacked {
 };
 
 struct bkey_inode_buf {
-       struct bkey_i_inode     inode;
+       struct bkey_i_inode_v2  inode;
 
 #define x(_name, _bits)                + 8 + _bits / 8
        u8              _pad[0 + BCH_INODE_FIELDS()];
@@ -53,12 +66,12 @@ struct bkey_inode_buf {
 
 void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
                     const struct bch_inode_unpacked *);
-int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *);
 
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
-struct btree_iter *bch2_inode_peek(struct btree_trans *,
-                       struct bch_inode_unpacked *, u64, unsigned);
+int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+                   struct bch_inode_unpacked *, subvol_inum, unsigned);
 int bch2_inode_write(struct btree_trans *, struct btree_iter *,
                     struct bch_inode_unpacked *);
 
@@ -71,12 +84,15 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
                     uid_t, gid_t, umode_t, dev_t,
                     struct bch_inode_unpacked *);
 
-struct btree_iter *bch2_inode_create(struct btree_trans *,
-                                    struct bch_inode_unpacked *, u32, u64);
+int bch2_inode_create(struct btree_trans *, struct btree_iter *,
+                     struct bch_inode_unpacked *, u32, u64);
 
-int bch2_inode_rm(struct bch_fs *, u64, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
 
-int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
+                                 struct bch_inode_unpacked *);
+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum,
+                           struct bch_inode_unpacked *);
 
 static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
 {
@@ -133,6 +149,11 @@ static inline u8 mode_to_type(umode_t mode)
        return (mode >> 12) & 15;
 }
 
+static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
+{
+       return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
+}
+
 /* i_nlink: */
 
 static inline unsigned nlink_bias(umode_t mode)
index 4585a4036f1b9948aff47f5d83f2266a98d6c977..10f8b3aedc3cf4f580de2a89cc314ade08047539 100644 (file)
@@ -27,6 +27,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "rebalance.h"
+#include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
 
@@ -186,26 +187,24 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 int bch2_sum_sector_overwrites(struct btree_trans *trans,
                               struct btree_iter *extent_iter,
                               struct bkey_i *new,
-                              bool *maybe_extending,
                               bool *usage_increasing,
                               s64 *i_sectors_delta,
                               s64 *disk_sectors_delta)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c old;
        unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
        bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
        int ret = 0;
 
-       *maybe_extending        = true;
        *usage_increasing       = false;
        *i_sectors_delta        = 0;
        *disk_sectors_delta     = 0;
 
-       iter = bch2_trans_copy_iter(trans, extent_iter);
+       bch2_trans_copy_iter(&iter, extent_iter);
 
-       for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
+       for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
                s64 sectors = min(new->k.p.offset, old.k->p.offset) -
                        max(bkey_start_offset(&new->k),
                            bkey_start_offset(old.k));
@@ -220,42 +219,21 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                        : 0;
 
                if (!*usage_increasing &&
-                   (new_replicas > bch2_bkey_replicas(c, old) ||
+                   (new->k.p.snapshot != old.k->p.snapshot ||
+                    new_replicas > bch2_bkey_replicas(c, old) ||
                     (!new_compressed && bch2_bkey_sectors_compressed(old))))
                        *usage_increasing = true;
 
-               if (bkey_cmp(old.k->p, new->k.p) >= 0) {
-                       /*
-                        * Check if there's already data above where we're
-                        * going to be writing to - this means we're definitely
-                        * not extending the file:
-                        *
-                        * Note that it's not sufficient to check if there's
-                        * data up to the sector offset we're going to be
-                        * writing to, because i_size could be up to one block
-                        * less:
-                        */
-                       if (!bkey_cmp(old.k->p, new->k.p)) {
-                               old = bch2_btree_iter_next(iter);
-                               ret = bkey_err(old);
-                               if (ret)
-                                       break;
-                       }
-
-                       if (old.k && !bkey_err(old) &&
-                           old.k->p.inode == extent_iter->pos.inode &&
-                           bkey_extent_is_data(old.k))
-                               *maybe_extending = false;
-
+               if (bkey_cmp(old.k->p, new->k.p) >= 0)
                        break;
-               }
        }
 
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
 int bch2_extent_update(struct btree_trans *trans,
+                      subvol_inum inum,
                       struct btree_iter *iter,
                       struct bkey_i *k,
                       struct disk_reservation *disk_res,
@@ -264,213 +242,208 @@ int bch2_extent_update(struct btree_trans *trans,
                       s64 *i_sectors_delta_total,
                       bool check_enospc)
 {
-       /* this must live until after bch2_trans_commit(): */
-       struct bkey_inode_buf inode_p;
-       bool extending = false, usage_increasing;
+       struct btree_iter inode_iter;
+       struct bch_inode_unpacked inode_u;
+       struct bpos next_pos;
+       bool usage_increasing;
        s64 i_sectors_delta = 0, disk_sectors_delta = 0;
        int ret;
 
-       ret = bch2_extent_trim_atomic(k, iter);
+       /*
+        * This traverses the iterator without changing iter->path->pos to
+        * search_key() (which is pos + 1 for extents): we want a path already
+        * traversed at iter->pos, because bch2_trans_extent_update() will use
+        * it to attempt extent merging
+        */
+       ret = __bch2_btree_iter_traverse(iter);
        if (ret)
                return ret;
 
+       ret = bch2_extent_trim_atomic(trans, iter, k);
+       if (ret)
+               return ret;
+
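+       /* i_size may only be extended up to the end of the extent being written: */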
+       new_i_size = min(k->k.p.offset << 9, new_i_size);
+       next_pos = k->k.p;
+
        ret = bch2_sum_sector_overwrites(trans, iter, k,
-                       &extending,
                        &usage_increasing,
                        &i_sectors_delta,
                        &disk_sectors_delta);
        if (ret)
                return ret;
 
-       if (!usage_increasing)
-               check_enospc = false;
-
        if (disk_res &&
            disk_sectors_delta > (s64) disk_res->sectors) {
                ret = bch2_disk_reservation_add(trans->c, disk_res,
                                        disk_sectors_delta - disk_res->sectors,
-                                       !check_enospc
+                                       !check_enospc || !usage_increasing
                                        ? BCH_DISK_RESERVATION_NOFAIL : 0);
                if (ret)
                        return ret;
        }
 
-       new_i_size = extending
-               ? min(k->k.p.offset << 9, new_i_size)
-               : 0;
-
-       if (i_sectors_delta || new_i_size) {
-               struct btree_iter *inode_iter;
-               struct bch_inode_unpacked inode_u;
-
-               inode_iter = bch2_inode_peek(trans, &inode_u,
-                               k->k.p.inode, BTREE_ITER_INTENT);
-               ret = PTR_ERR_OR_ZERO(inode_iter);
-               if (ret)
-                       return ret;
-
-               /*
-                * XXX:
-                * writeback can race a bit with truncate, because truncate
-                * first updates the inode then truncates the pagecache. This is
-                * ugly, but lets us preserve the invariant that the in memory
-                * i_size is always >= the on disk i_size.
-                *
-               BUG_ON(new_i_size > inode_u.bi_size &&
-                      (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
-                */
-               BUG_ON(new_i_size > inode_u.bi_size && !extending);
-
-               if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
-                   new_i_size > inode_u.bi_size)
-                       inode_u.bi_size = new_i_size;
-               else
-                       new_i_size = 0;
-
-               inode_u.bi_sectors += i_sectors_delta;
-
-               if (i_sectors_delta || new_i_size) {
-                       bch2_inode_pack(trans->c, &inode_p, &inode_u);
-
-                       inode_p.inode.k.p.snapshot = iter->snapshot;
-
-                       ret = bch2_trans_update(trans, inode_iter,
-                                         &inode_p.inode.k_i, 0);
-               }
+       ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
+                             BTREE_ITER_INTENT);
+       if (ret)
+               return ret;
 
-               bch2_trans_iter_put(trans, inode_iter);
+       if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+           new_i_size > inode_u.bi_size)
+               inode_u.bi_size = new_i_size;
 
-               if (ret)
-                       return ret;
-       }
+       inode_u.bi_sectors += i_sectors_delta;
 
        ret =   bch2_trans_update(trans, iter, k, 0) ?:
+               bch2_inode_write(trans, &inode_iter, &inode_u) ?:
                bch2_trans_commit(trans, disk_res, journal_seq,
                                BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL);
-       BUG_ON(ret == -ENOSPC);
+       bch2_trans_iter_exit(trans, &inode_iter);
+
        if (ret)
                return ret;
 
        if (i_sectors_delta_total)
                *i_sectors_delta_total += i_sectors_delta;
+       bch2_btree_iter_set_pos(iter, next_pos);
+
        return 0;
 }
 
+/*
+ * Returns -EINTR if we had to drop locks:
+ */
 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-                  struct bpos end, u64 *journal_seq,
+                  subvol_inum inum, u64 end,
                   s64 *i_sectors_delta)
 {
        struct bch_fs *c        = trans->c;
        unsigned max_sectors    = KEY_SIZE_MAX & (~0 << c->block_bits);
+       struct bpos end_pos = POS(inum.inum, end);
        struct bkey_s_c k;
        int ret = 0, ret2 = 0;
+       u32 snapshot;
 
-       while ((bch2_trans_begin(trans),
-               (k = bch2_btree_iter_peek(iter)).k) &&
-              bkey_cmp(iter->pos, end) < 0) {
+       while (!ret || ret == -EINTR) {
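+               /* on -EINTR we dropped locks: retry, but remember it in ret2 */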
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(c, 0);
                struct bkey_i delete;
 
+               if (ret)
+                       ret2 = ret;
+
+               bch2_trans_begin(trans);
+
+               ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+               if (ret)
+                       continue;
+
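+               /* fpunch in the snapshot the subvolume currently points to: */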
+               bch2_btree_iter_set_snapshot(iter, snapshot);
+
+               k = bch2_btree_iter_peek(iter);
+               if (bkey_cmp(iter->pos, end_pos) >= 0) {
+                       bch2_btree_iter_set_pos(iter, end_pos);
+                       break;
+               }
+
                ret = bkey_err(k);
                if (ret)
-                       goto btree_err;
+                       continue;
 
                bkey_init(&delete.k);
                delete.k.p = iter->pos;
 
                /* create the biggest key we can */
                bch2_key_resize(&delete.k, max_sectors);
-               bch2_cut_back(end, &delete);
+               bch2_cut_back(end_pos, &delete);
 
-               ret = bch2_extent_update(trans, iter, &delete,
-                               &disk_res, journal_seq,
+               ret = bch2_extent_update(trans, inum, iter, &delete,
+                               &disk_res, NULL,
                                0, i_sectors_delta, false);
                bch2_disk_reservation_put(c, &disk_res);
-btree_err:
-               if (ret == -EINTR) {
-                       ret2 = ret;
-                       ret = 0;
-               }
-               if (ret)
-                       break;
-       }
-
-       if (bkey_cmp(iter->pos, end) > 0) {
-               bch2_btree_iter_set_pos(iter, end);
-               ret = bch2_btree_iter_traverse(iter);
        }
 
        return ret ?: ret2;
 }
 
-int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
-               u64 *journal_seq, s64 *i_sectors_delta)
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
+               s64 *i_sectors_delta)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
-       int ret = 0;
+       struct btree_iter iter;
+       int ret;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  POS(inum, start),
-                                  BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            POS(inum.inum, start),
+                            BTREE_ITER_INTENT);
 
-       ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
-                            journal_seq, i_sectors_delta);
+       ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta);
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
 
-       if (ret == -EINTR)
-               ret = 0;
-
-       return ret;
+       return ret == -EINTR ? 0 : ret;
 }
 
 int bch2_write_index_default(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
        struct bkey_buf sk;
+       struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *k = bch2_keylist_front(keys);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
+       subvol_inum inum = {
+               .subvol = op->subvol,
+               .inum   = k->k.p.inode,
+       };
        int ret;
 
+       BUG_ON(!inum.subvol);
+
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  bkey_start_pos(&k->k),
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
        do {
                bch2_trans_begin(&trans);
 
                k = bch2_keylist_front(keys);
+               bch2_bkey_buf_copy(&sk, c, k);
 
-               k->k.p.snapshot = iter->snapshot;
+               ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
+                                                 &sk.k->k.p.snapshot);
+               if (ret == -EINTR)
+                       continue;
+               if (ret)
+                       break;
 
-               bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
-               bkey_copy(sk.k, k);
-               bch2_cut_front(iter->pos, sk.k);
+               bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                                    bkey_start_pos(&sk.k->k),
+                                    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-               ret = bch2_extent_update(&trans, iter, sk.k,
+               ret = bch2_extent_update(&trans, inum, &iter, sk.k,
                                         &op->res, op_journal_seq(op),
                                         op->new_i_size, &op->i_sectors_delta,
                                         op->flags & BCH_WRITE_CHECK_ENOSPC);
+               bch2_trans_iter_exit(&trans, &iter);
+
                if (ret == -EINTR)
                        continue;
                if (ret)
                        break;
 
-               if (bkey_cmp(iter->pos, k->k.p) >= 0)
-                       bch2_keylist_pop_front(keys);
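+               /* if we wrote to an erasure coded bucket, record a backpointer: */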
+               if (ec_ob)
+                       bch2_ob_add_backpointer(c, ec_ob, &sk.k->k);
+
+               if (bkey_cmp(iter.pos, k->k.p) >= 0)
+                       bch2_keylist_pop_front(&op->insert_keys);
+               else
+                       bch2_cut_front(iter.pos, k);
        } while (!bch2_keylist_empty(keys));
 
-       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
@@ -692,11 +665,7 @@ static void init_append_extent(struct bch_write_op *op,
 {
        struct bch_fs *c = op->c;
        struct bkey_i_extent *e;
-       struct open_bucket *ob;
-       unsigned i;
 
-       BUG_ON(crc.compressed_size > wp->sectors_free);
-       wp->sectors_free -= crc.compressed_size;
        op->pos.offset += crc.uncompressed_size;
 
        e = bkey_extent_init(op->insert_keys.top);
@@ -709,22 +678,8 @@ static void init_append_extent(struct bch_write_op *op,
            crc.nonce)
                bch2_extent_crc_append(&e->k_i, crc);
 
-       open_bucket_for_each(c, &wp->ptrs, ob, i) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
-               union bch_extent_entry *end =
-                       bkey_val_end(bkey_i_to_s(&e->k_i));
-
-               end->ptr = ob->ptr;
-               end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-               end->ptr.cached = !ca->mi.durability ||
-                       (op->flags & BCH_WRITE_CACHED) != 0;
-               end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
-
-               e->k.u64s++;
-
-               BUG_ON(crc.compressed_size > ob->sectors_free);
-               ob->sectors_free -= crc.compressed_size;
-       }
+       bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size,
+                                      op->flags & BCH_WRITE_CACHED);
 
        bch2_keylist_push(&op->insert_keys);
 }
@@ -744,6 +699,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
                                       ? ((unsigned long) buf & (PAGE_SIZE - 1))
                                       : 0), PAGE_SIZE);
 
+       pages = min(pages, BIO_MAX_VECS);
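+       /* a single bio can hold at most BIO_MAX_VECS pages: */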
+
        bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
        wbio                    = wbio_init(bio);
        wbio->put_bio           = true;
@@ -763,7 +720,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
         */
        bch2_bio_alloc_pages_pool(c, bio,
                                  min_t(unsigned, output_available,
-                                       c->sb.encoded_extent_max << 9));
+                                       c->opts.encoded_extent_max));
 
        if (bio->bi_iter.bi_size < output_available)
                *page_alloc_failed =
@@ -909,7 +866,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
        struct bio *src = &op->wbio.bio, *dst = src;
        struct bvec_iter saved_iter;
        void *ec_buf;
-       struct bpos ec_pos = op->pos;
        unsigned total_output = 0, total_input = 0;
        bool bounce = false;
        bool page_alloc_failed = false;
@@ -926,7 +882,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                ret = -EIO;
                goto err;
        case PREP_ENCODED_CHECKSUM_ERR:
-               BUG();
                goto csum_err;
        case PREP_ENCODED_DO_WRITE:
                /* XXX look for bug here */
@@ -962,8 +917,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                size_t dst_len, src_len;
 
                if (page_alloc_failed &&
-                   bio_sectors(dst) < wp->sectors_free &&
-                   bio_sectors(dst) < c->sb.encoded_extent_max)
+                   dst->bi_iter.bi_size  < (wp->sectors_free << 9) &&
+                   dst->bi_iter.bi_size < c->opts.encoded_extent_max)
                        break;
 
                BUG_ON(op->compression_type &&
@@ -983,7 +938,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 
                        if (op->csum_type)
                                dst_len = min_t(unsigned, dst_len,
-                                               c->sb.encoded_extent_max << 9);
+                                               c->opts.encoded_extent_max);
 
                        if (bounce) {
                                swap(dst->bi_iter.bi_size, dst_len);
@@ -1080,9 +1035,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
 
        dst->bi_iter.bi_size = total_output;
 do_write:
-       /* might have done a realloc... */
-       bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
-
        *_dst = dst;
        return more;
 csum_err:
@@ -1141,7 +1093,7 @@ again:
                 */
                wp = bch2_alloc_sectors_start(c,
                        op->target,
-                       op->opts.erasure_code,
+                       op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
                        op->write_point,
                        &op->devs_have,
                        op->nr_replicas,
@@ -1319,7 +1271,7 @@ void bch2_write(struct closure *cl)
        bch2_keylist_init(&op->insert_keys, op->inline_keys);
        wbio_init(bio)->put_bio = false;
 
-       if (bio_sectors(bio) & (c->opts.block_size - 1)) {
+       if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) {
                bch_err_inum_ratelimited(c, op->pos.inode,
                                         "misaligned write");
                op->error = -EIO;
@@ -1631,12 +1583,12 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
 }
 
 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
-                                    struct bvec_iter bvec_iter, u64 inode,
+                                    struct bvec_iter bvec_iter,
                                     struct bch_io_failures *failed,
                                     unsigned flags)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_buf sk;
        struct bkey_s_c k;
        int ret;
@@ -1647,12 +1599,12 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, rbio->data_btree,
-                                  rbio->read_pos, BTREE_ITER_SLOTS);
+       bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
+                            rbio->read_pos, BTREE_ITER_SLOTS);
 retry:
        rbio->bio.bi_status = 0;
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        if (bkey_err(k))
                goto err;
 
@@ -1679,7 +1631,7 @@ retry:
                goto err;
 out:
        bch2_rbio_done(rbio);
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
        return;
@@ -1695,7 +1647,10 @@ static void bch2_rbio_retry(struct work_struct *work)
        struct bch_fs *c        = rbio->c;
        struct bvec_iter iter   = rbio->bvec_iter;
        unsigned flags          = rbio->flags;
-       u64 inode               = rbio->read_pos.inode;
+       subvol_inum inum = {
+               .subvol = rbio->subvol,
+               .inum   = rbio->read_pos.inode,
+       };
        struct bch_io_failures failed = { .nr = 0 };
 
        trace_read_retry(&rbio->bio);
@@ -1711,12 +1666,12 @@ static void bch2_rbio_retry(struct work_struct *work)
        flags &= ~BCH_READ_MAY_PROMOTE;
 
        if (flags & BCH_READ_NODECODE) {
-               bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
+               bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
        } else {
                flags &= ~BCH_READ_LAST_FRAGMENT;
                flags |= BCH_READ_MUST_CLONE;
 
-               __bch2_read(c, rbio, iter, inode, &failed, flags);
+               __bch2_read(c, rbio, iter, inum, &failed, flags);
        }
 }
 
@@ -1745,7 +1700,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        struct bch_fs *c = rbio->c;
        u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
        struct bch_extent_crc_unpacked new_crc;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter;
        struct bkey_i *new;
        struct bkey_s_c k;
        int ret = 0;
@@ -1753,9 +1708,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if (crc_is_compressed(rbio->pick.crc))
                return 0;
 
-       iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
        if ((ret = bkey_err(k)))
                goto out;
 
@@ -1790,9 +1745,10 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if (!bch2_bkey_narrow_crcs(new, new_crc))
                goto out;
 
-       ret = bch2_trans_update(trans, iter, new, 0);
+       ret = bch2_trans_update(trans, &iter, new,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 out:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -1937,9 +1893,8 @@ static void bch2_read_endio(struct bio *bio)
                return;
        }
 
-       if (rbio->pick.ptr.cached &&
-           (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
-            ptr_stale(ca, &rbio->pick.ptr))) {
+       if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+           ptr_stale(ca, &rbio->pick.ptr)) {
                atomic_long_inc(&c->read_realloc_races);
 
                if (rbio->flags & BCH_READ_RETRY_IF_STALE)
@@ -1963,7 +1918,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
                                unsigned *offset_into_extent,
                                struct bkey_buf *orig_k)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        u64 reflink_offset;
        int ret;
@@ -1971,10 +1926,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
        reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
                *offset_into_extent;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
-                                  POS(0, reflink_offset),
-                                  BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
+                            POS(0, reflink_offset),
+                            BTREE_ITER_SLOTS);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -1991,13 +1946,40 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
                goto err;
        }
 
-       *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+       *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
        bch2_bkey_buf_reassemble(orig_k, trans->c, k);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+                                                  struct bkey_s_c k,
+                                                  struct bch_extent_ptr ptr)
+{
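+       /*
+        * A stale dirty pointer indicates an inconsistency: report it, then
+        * dump the alloc key for the bucket being pointed into:
+        */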
+       struct bch_fs *c = trans->c;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+       struct btree_iter iter;
+       char buf[200];
+       int ret;
+
+       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+       bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf);
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)),
+                            BTREE_ITER_CACHED);
+
+       ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+       if (ret)
+               return;
+
+       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+       bch_err(c, "%s", buf);
+       bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+       bch2_trans_iter_exit(trans, &iter);
+}
+
 int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                       struct bvec_iter iter, struct bpos read_pos,
                       enum btree_id data_btree, struct bkey_s_c k,
@@ -2007,7 +1989,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
        struct bch_fs *c = trans->c;
        struct extent_ptr_decoded pick;
        struct bch_read_bio *rbio = NULL;
-       struct bch_dev *ca;
+       struct bch_dev *ca = NULL;
        struct promote_op *promote = NULL;
        bool bounce = false, read_full = false, narrow_crcs = false;
        struct bpos data_pos = bkey_start_pos(k.k);
@@ -2024,7 +2006,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                zero_fill_bio_iter(&orig->bio, iter);
                goto out_read_done;
        }
-
+retry_pick:
        pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
 
        /* hole or reservation - just zero fill: */
@@ -2037,8 +2019,20 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
                goto err;
        }
 
-       if (pick_ret > 0)
-               ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
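+       /*
+        * Stale dirty pointers are treated as IO errors, so the read can be
+        * retried from another replica:
+        */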
+       if (!pick.ptr.cached &&
+           unlikely(ptr_stale(ca, &pick.ptr))) {
+               read_from_stale_dirty_pointer(trans, k, pick.ptr);
+               bch2_mark_io_failure(failed, &pick);
+               goto retry_pick;
+       }
+
+       /*
+        * Unlock the iterator while the btree node's lock is still in
+        * cache, before doing the IO:
+        */
+       bch2_trans_unlock(trans);
 
        if (flags & BCH_READ_NODECODE) {
                /*
@@ -2065,7 +2059,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
        EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
 
        if (crc_is_compressed(pick.crc) ||
-           (pick.crc.csum_type != BCH_CSUM_NONE &&
+           (pick.crc.csum_type != BCH_CSUM_none &&
             (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
              (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
               (flags & BCH_READ_USER_MAPPED)) ||
@@ -2158,6 +2152,7 @@ get_bio:
        /* XXX: only initialize this if needed */
        rbio->devs_have         = bch2_bkey_devs(k);
        rbio->pick              = pick;
+       rbio->subvol            = orig->subvol;
        rbio->read_pos          = read_pos;
        rbio->data_btree        = data_btree;
        rbio->data_pos          = data_pos;
@@ -2260,26 +2255,31 @@ out_read_done:
 }
 
 void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                struct bvec_iter bvec_iter, u64 inode,
+                struct bvec_iter bvec_iter, subvol_inum inum,
                 struct bch_io_failures *failed, unsigned flags)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_buf sk;
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
        BUG_ON(flags & BCH_READ_NODECODE);
 
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                  POS(inode, bvec_iter.bi_sector),
-                                  BTREE_ITER_SLOTS);
 retry:
        bch2_trans_begin(&trans);
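+       /* zero the iter so the err path can exit it even if never initialized: */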
+       iter = (struct btree_iter) { NULL };
 
+       ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+                            BTREE_ITER_SLOTS);
        while (1) {
                unsigned bytes, sectors, offset_into_extent;
                enum btree_id data_btree = BTREE_ID_extents;
@@ -2293,15 +2293,15 @@ retry:
                        break;
                }
 
-               bch2_btree_iter_set_pos(iter,
-                               POS(inode, bvec_iter.bi_sector));
+               bch2_btree_iter_set_pos(&iter,
+                               POS(inum.inum, bvec_iter.bi_sector));
 
-               k = bch2_btree_iter_peek_slot(iter);
+               k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
                        break;
 
-               offset_into_extent = iter->pos.offset -
+               offset_into_extent = iter.pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
@@ -2320,19 +2320,13 @@ retry:
                 */
                sectors = min(sectors, k.k->size - offset_into_extent);
 
-               /*
-                * Unlock the iterator while the btree node's lock is still in
-                * cache, before doing the IO:
-                */
-               bch2_trans_unlock(&trans);
-
                bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
                swap(bvec_iter.bi_size, bytes);
 
                if (bvec_iter.bi_size == bytes)
                        flags |= BCH_READ_LAST_FRAGMENT;
 
-               ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
+               ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
                                         data_btree, k,
                                         offset_into_extent, failed, flags);
                if (ret)
@@ -2343,17 +2337,22 @@ retry:
 
                swap(bvec_iter.bi_size, bytes);
                bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
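+               /* if the transaction has accumulated too many iterators, restart: */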
+               ret = btree_trans_too_many_iters(&trans);
+               if (ret)
+                       break;
        }
+err:
+       bch2_trans_iter_exit(&trans, &iter);
 
        if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
                goto retry;
 
-       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
        if (ret) {
-               bch_err_inum_ratelimited(c, inode,
+               bch_err_inum_ratelimited(c, inum.inum,
                                         "read error %i from btree lookup", ret);
                rbio->bio.bi_status = BLK_STS_IOERR;
                bch2_rbio_done(rbio);
@@ -2381,8 +2380,8 @@ int bch2_fs_io_init(struct bch_fs *c)
            mempool_init_page_pool(&c->bio_bounce_pages,
                                   max_t(unsigned,
                                         c->opts.btree_node_size,
-                                        c->sb.encoded_extent_max) /
-                                  PAGE_SECTORS, 0) ||
+                                        c->opts.encoded_extent_max) /
+                                  PAGE_SIZE, 0) ||
            rhashtable_init(&c->promote_table, &bch_promote_params))
                return -ENOMEM;
 
index bc0a0bd6f849438a82474c7e3ce2b331f6950be7..1aa422dccef7de794d3b65155d8d7cc28fac8bd7 100644 (file)
@@ -48,12 +48,6 @@ static inline u64 *op_journal_seq(struct bch_write_op *op)
                ? op->journal_seq_p : &op->journal_seq;
 }
 
-static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
-{
-       op->journal_seq_p = journal_seq;
-       op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
-}
-
 static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 {
        return op->alloc_reserve == RESERVE_MOVINGGC
@@ -62,13 +56,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 }
 
 int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
-                              struct bkey_i *, bool *, bool *, s64 *, s64 *);
-int bch2_extent_update(struct btree_trans *, struct btree_iter *,
-                      struct bkey_i *, struct disk_reservation *,
-                      u64 *, u64, s64 *, bool);
+                              struct bkey_i *, bool *, s64 *, s64 *);
+int bch2_extent_update(struct btree_trans *, subvol_inum,
+                      struct btree_iter *, struct bkey_i *,
+                      struct disk_reservation *, u64 *, u64, s64 *, bool);
+
 int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
-                  struct bpos, u64 *, s64 *);
-int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
+                  subvol_inum, u64, s64 *);
+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
 
 int bch2_write_index_default(struct bch_write_op *);
 
@@ -90,6 +85,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
        op->devs_have.nr        = 0;
        op->target              = 0;
        op->opts                = opts;
+       op->subvol              = 0;
        op->pos                 = POS_MAX;
        op->version             = ZERO_VERSION;
        op->write_point         = (struct write_point_specifier) { 0 };
@@ -157,10 +153,10 @@ static inline void bch2_read_extent(struct btree_trans *trans,
 }
 
 void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
-                u64, struct bch_io_failures *, unsigned flags);
+                subvol_inum, struct bch_io_failures *, unsigned flags);
 
 static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                            u64 inode)
+                            subvol_inum inum)
 {
        struct bch_io_failures failed = { .nr = 0 };
 
@@ -168,8 +164,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
 
        rbio->c = c;
        rbio->start_time = local_clock();
+       rbio->subvol = inum.subvol;
 
-       __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
+       __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed,
                    BCH_READ_RETRY_IF_STALE|
                    BCH_READ_MAY_PROMOTE|
                    BCH_READ_USER_MAPPED);
index 0aab77951c4c37022899d8a8f40aa4601cc5d7ea..78bff13d36f27cb46c6a28c5bcc9dd65cb0ecac6 100644 (file)
@@ -62,6 +62,7 @@ struct bch_read_bio {
        /*
         * pos we read from - different from data_pos for indirect extents:
         */
+       u32                     subvol;
        struct bpos             read_pos;
 
        /*
@@ -122,6 +123,7 @@ struct bch_write_op {
        u16                     nonce;
        struct bch_io_opts      opts;
 
+       u32                     subvol;
        struct bpos             pos;
        struct bversion         version;
 
index ac4071fc4e80f05d9ec73f0ab85761dcfb98ee6d..158df42e5e10487caca016cf52478ab5377e5152 100644 (file)
@@ -88,8 +88,6 @@ static void bch2_journal_buf_init(struct journal *j)
        buf->must_flush = false;
        buf->separate_flush = false;
 
-       memset(buf->has_inode, 0, sizeof(buf->has_inode));
-
        memset(buf->data, 0, sizeof(*buf->data));
        buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
        buf->data->u64s = 0;
@@ -109,7 +107,12 @@ void bch2_journal_halt(struct journal *j)
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
-       j->err_seq = journal_cur_seq(j);
+       /*
+        * XXX: we're not using j->lock here because this can be called from
+        * interrupt context; this can race with journal_write_done()
+        */
+       if (!j->err_seq)
+               j->err_seq = journal_cur_seq(j);
        journal_wake(j);
        closure_wake_up(&journal_cur_buf(j)->wait);
 }
@@ -308,7 +311,7 @@ static int journal_entry_open(struct journal *j)
 
        mod_delayed_work(c->io_complete_wq,
                         &j->write_work,
-                        msecs_to_jiffies(j->write_delay_ms));
+                        msecs_to_jiffies(c->opts.journal_flush_delay));
        journal_wake(j);
        return 0;
 }
@@ -335,55 +338,6 @@ static void journal_write_work(struct work_struct *work)
        journal_entry_close(j);
 }
 
-/*
- * Given an inode number, if that inode number has data in the journal that
- * hasn't yet been flushed, return the journal sequence number that needs to be
- * flushed:
- */
-u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
-{
-       size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
-       union journal_res_state s;
-       unsigned i;
-       u64 seq;
-
-
-       spin_lock(&j->lock);
-       seq = journal_cur_seq(j);
-       s = READ_ONCE(j->reservations);
-       i = s.idx;
-
-       while (1) {
-               if (test_bit(h, j->buf[i].has_inode))
-                       goto out;
-
-               if (i == s.unwritten_idx)
-                       break;
-
-               i = (i - 1) & JOURNAL_BUF_MASK;
-               seq--;
-       }
-
-       seq = 0;
-out:
-       spin_unlock(&j->lock);
-
-       return seq;
-}
-
-void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq)
-{
-       size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
-       struct journal_buf *buf;
-
-       spin_lock(&j->lock);
-
-       if ((buf = journal_seq_to_buf(j, seq)))
-               set_bit(h, buf->has_inode);
-
-       spin_unlock(&j->lock);
-}
-
 static int __journal_res_get(struct journal *j, struct journal_res *res,
                             unsigned flags)
 {
@@ -602,7 +556,10 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 
        spin_lock(&j->lock);
 
-       BUG_ON(seq > journal_cur_seq(j));
+       if (WARN_ONCE(seq > journal_cur_seq(j),
+                     "requested to flush journal seq %llu, but currently at %llu",
+                     seq, journal_cur_seq(j)))
+               goto out;
 
        /* Recheck under lock: */
        if (j->err_seq && seq >= j->err_seq) {
@@ -669,6 +626,12 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
        u64 start_time = local_clock();
        int ret, ret2;
 
+       /*
+        * Don't update time_stats when @seq is already flushed:
+        */
+       if (seq <= j->flushed_seq_ondisk)
+               return 0;
+
        ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
 
        if (!ret)
@@ -679,6 +642,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
 
 int bch2_journal_meta(struct journal *j)
 {
+       struct journal_buf *buf;
        struct journal_res res;
        int ret;
 
@@ -688,6 +652,10 @@ int bch2_journal_meta(struct journal *j)
        if (ret)
                return ret;
 
+       buf = j->buf + (res.seq & JOURNAL_BUF_MASK);
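+       /* the entry we just reserved must be written out as a flush write: */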
+       buf->must_flush = true;
+       set_bit(JOURNAL_NEED_WRITE, &j->flags);
+
        bch2_journal_res_put(j, &res);
 
        return bch2_journal_flush_seq(j, res.seq);
@@ -737,6 +705,44 @@ int bch2_journal_flush(struct journal *j)
        return bch2_journal_flush_seq(j, seq);
 }
 
+/*
+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before
+ * @seq; returns true if all unwritten journal entries before @seq were marked
+ * noflush
+ */
+bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       u64 unwritten_seq;
+       bool ret = false;
+
+       if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush)))
+               return false;
+
+       if (seq <= c->journal.flushed_seq_ondisk)
+               return false;
+
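+       /* recheck under the journal lock: */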
+       spin_lock(&j->lock);
+       if (seq <= c->journal.flushed_seq_ondisk)
+               goto out;
+
+       for (unwritten_seq = last_unwritten_seq(j);
+            unwritten_seq < seq;
+            unwritten_seq++) {
+               struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
+
+               /* journal write is already in flight, and was a flush write: */
+               if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush)
+                       goto out;
+
+               buf->noflush = true;
+       }
+
+       ret = true;
+out:
+       spin_unlock(&j->lock);
+       return ret;
+}
+
 /* block/unlock the journal: */
 
 void bch2_journal_unblock(struct journal *j)
@@ -807,11 +813,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                long b;
 
                if (new_fs) {
-                       if (c)
-                               percpu_down_read(&c->mark_lock);
                        b = bch2_bucket_alloc_new_fs(ca);
                        if (b < 0) {
-                               percpu_up_read(&c->mark_lock);
                                ret = -ENOSPC;
                                goto err;
                        }
@@ -825,7 +828,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                goto err;
                        }
 
-                       b = sector_to_bucket(ca, ob->ptr.offset);
+                       b = ob->bucket;
                }
 
                if (c)
@@ -859,14 +862,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                if (c)
                        spin_unlock(&c->journal.lock);
 
-               if (new_fs) {
-                       bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
-                                                 ca->mi.bucket_size,
-                                                 gc_phase(GC_PHASE_SB),
-                                                 0);
-                       if (c)
-                               percpu_up_read(&c->mark_lock);
-               } else {
+               if (!new_fs) {
                        ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
                                bch2_trans_mark_metadata_bucket(&trans, ca,
                                                b, BCH_DATA_journal,
@@ -1032,10 +1028,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        j->replay_journal_seq   = last_seq;
        j->replay_journal_seq_end = cur_seq;
        j->last_seq_ondisk      = last_seq;
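+       /* on startup, everything up to cur_seq - 1 is considered flushed: */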
+       j->flushed_seq_ondisk   = cur_seq - 1;
        j->pin.front            = last_seq;
        j->pin.back             = cur_seq;
        atomic64_set(&j->seq, cur_seq - 1);
 
+       if (list_empty(journal_entries))
+               j->last_empty_seq = cur_seq - 1;
+
        fifo_for_each_entry_ptr(p, &j->pin, seq)
                journal_pin_list_init(p, 1);
 
@@ -1048,6 +1048,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
                if (seq < last_seq)
                        continue;
 
+               if (journal_entry_empty(&i->j))
+                       j->last_empty_seq = le64_to_cpu(i->j.seq);
+
                p = journal_seq_pin(j, seq);
 
                p->devs.nr = 0;
@@ -1055,6 +1058,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
                        bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
        }
 
+       if (list_empty(journal_entries))
+               j->last_empty_seq = cur_seq;
+
        spin_lock(&j->lock);
 
        set_bit(JOURNAL_STARTED, &j->flags);
@@ -1144,9 +1150,6 @@ int bch2_fs_journal_init(struct journal *j)
 
        lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
 
-       j->write_delay_ms       = 1000;
-       j->reclaim_delay_ms     = 100;
-
        atomic64_set(&j->reservations.counter,
                ((union journal_res_state)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
@@ -1178,44 +1181,29 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        union journal_res_state s;
        struct bch_dev *ca;
+       unsigned long now = jiffies;
        unsigned i;
 
        rcu_read_lock();
        s = READ_ONCE(j->reservations);
 
-       pr_buf(out,
-              "active journal entries:\t%llu\n"
-              "seq:\t\t\t%llu\n"
-              "last_seq:\t\t%llu\n"
-              "last_seq_ondisk:\t%llu\n"
-              "flushed_seq_ondisk:\t%llu\n"
-              "prereserved:\t\t%u/%u\n"
-              "each entry reserved:\t%u\n"
-              "nr flush writes:\t%llu\n"
-              "nr noflush writes:\t%llu\n"
-              "nr direct reclaim:\t%llu\n"
-              "nr background reclaim:\t%llu\n"
-              "reclaim kicked:\t\t%u\n"
-              "reclaim runs in:\t%u ms\n"
-              "current entry sectors:\t%u\n"
-              "current entry error:\t%u\n"
-              "current entry:\t\t",
-              fifo_used(&j->pin),
-              journal_cur_seq(j),
-              journal_last_seq(j),
-              j->last_seq_ondisk,
-              j->flushed_seq_ondisk,
-              j->prereserved.reserved,
-              j->prereserved.remaining,
-              j->entry_u64s_reserved,
-              j->nr_flush_writes,
-              j->nr_noflush_writes,
-              j->nr_direct_reclaim,
-              j->nr_background_reclaim,
-              j->reclaim_kicked,
-              jiffies_to_msecs(j->next_reclaim - jiffies),
-              j->cur_entry_sectors,
-              j->cur_entry_error);
+       pr_buf(out, "active journal entries:\t%llu\n",  fifo_used(&j->pin));
+       pr_buf(out, "seq:\t\t\t%llu\n",                 journal_cur_seq(j));
+       pr_buf(out, "last_seq:\t\t%llu\n",              journal_last_seq(j));
+       pr_buf(out, "last_seq_ondisk:\t%llu\n",         j->last_seq_ondisk);
+       pr_buf(out, "flushed_seq_ondisk:\t%llu\n",      j->flushed_seq_ondisk);
+       pr_buf(out, "prereserved:\t\t%u/%u\n",          j->prereserved.reserved, j->prereserved.remaining);
+       pr_buf(out, "each entry reserved:\t%u\n",       j->entry_u64s_reserved);
+       pr_buf(out, "nr flush writes:\t%llu\n",         j->nr_flush_writes);
+       pr_buf(out, "nr noflush writes:\t%llu\n",       j->nr_noflush_writes);
+       pr_buf(out, "nr direct reclaim:\t%llu\n",       j->nr_direct_reclaim);
+       pr_buf(out, "nr background reclaim:\t%llu\n",   j->nr_background_reclaim);
+       pr_buf(out, "reclaim kicked:\t\t%u\n",          j->reclaim_kicked);
+       pr_buf(out, "reclaim runs in:\t%u ms\n",        time_after(j->next_reclaim, now)
+              ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+       pr_buf(out, "current entry sectors:\t%u\n",     j->cur_entry_sectors);
+       pr_buf(out, "current entry error:\t%u\n",       j->cur_entry_error);
+       pr_buf(out, "current entry:\t\t");
 
        switch (s.cur_entry_offset) {
        case JOURNAL_ENTRY_ERROR_VAL:
@@ -1225,15 +1213,11 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
                pr_buf(out, "closed\n");
                break;
        default:
-               pr_buf(out, "%u/%u\n",
-                      s.cur_entry_offset,
-                      j->cur_entry_u64s);
+               pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s);
                break;
        }
 
-       pr_buf(out,
-              "current entry:\t\tidx %u refcount %u\n",
-              s.idx, journal_state_count(s, s.idx));
+       pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx));
 
        i = s.idx;
        while (i != s.unwritten_idx) {
@@ -1273,22 +1257,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
                if (!ja->nr)
                        continue;
 
-               pr_buf(out,
-                      "dev %u:\n"
-                      "\tnr\t\t%u\n"
-                      "\tbucket size\t%u\n"
-                      "\tavailable\t%u:%u\n"
-                      "\tdiscard_idx\t%u\n"
-                      "\tdirty_ondisk\t%u (seq %llu)\n"
-                      "\tdirty_idx\t%u (seq %llu)\n"
-                      "\tcur_idx\t\t%u (seq %llu)\n",
-                      i, ja->nr, ca->mi.bucket_size,
-                      bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
-                      ja->sectors_free,
-                      ja->discard_idx,
-                      ja->dirty_idx_ondisk,    ja->bucket_seq[ja->dirty_idx_ondisk],
-                      ja->dirty_idx,           ja->bucket_seq[ja->dirty_idx],
-                      ja->cur_idx,             ja->bucket_seq[ja->cur_idx]);
+               pr_buf(out, "dev %u:\n",                i);
+               pr_buf(out, "\tnr\t\t%u\n",             ja->nr);
+               pr_buf(out, "\tbucket size\t%u\n",      ca->mi.bucket_size);
+               pr_buf(out, "\tavailable\t%u:%u\n",     bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
+               pr_buf(out, "\tdiscard_idx\t%u\n",      ja->discard_idx);
+               pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk,    ja->bucket_seq[ja->dirty_idx_ondisk]);
+               pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx,              ja->bucket_seq[ja->dirty_idx]);
+               pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx,                ja->bucket_seq[ja->cur_idx]);
        }
 
        rcu_read_unlock();
index 1d556790b38ee09d3eb2450efbc9904b9dd1dcb0..b298873212d2e598dff056b4328f4ce5a3a8e0f4 100644 (file)
@@ -141,7 +141,6 @@ static inline u64 journal_cur_seq(struct journal *j)
        return j->pin.back - 1;
 }
 
-u64 bch2_inode_journal_seq(struct journal *, u64);
 void bch2_journal_set_has_inum(struct journal *, u64, u64);
 
 static inline int journal_state_count(union journal_res_state s, int idx)
@@ -163,18 +162,6 @@ static inline void journal_state_inc(union journal_res_state *s)
        s->buf3_count += s->idx == 3;
 }
 
-static inline void bch2_journal_set_has_inode(struct journal *j,
-                                             struct journal_res *res,
-                                             u64 inum)
-{
-       struct journal_buf *buf = &j->buf[res->idx];
-       unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
-
-       /* avoid atomic op if possible */
-       if (unlikely(!test_bit(bit, buf->has_inode)))
-               set_bit(bit, buf->has_inode);
-}
-
 /*
  * Amount of space that will be taken up by some keys in the journal (i.e.
  * including the jset header)
@@ -446,6 +433,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j,
                ret = 0;
 
                if ((flags & JOURNAL_RES_GET_RESERVED) ||
+                   test_bit(JOURNAL_NOCHANGES, &j->flags) ||
                    new.reserved + d < new.remaining) {
                        new.reserved += d;
                        ret = 1;
@@ -489,6 +477,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *);
 
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
+bool bch2_journal_noflush_seq(struct journal *, u64);
 int bch2_journal_meta(struct journal *);
 
 void bch2_journal_halt(struct journal *);
index 66a0e267b3f4b29adadbe89396f765dd08307130..b5c204e7c5690616a69171522f6b71e25cad3210 100644 (file)
@@ -274,7 +274,7 @@ fsck_err:
        return ret;
 }
 
-static int journal_entry_validate_btree_keys(struct bch_fs *c,
+static int journal_entry_btree_keys_validate(struct bch_fs *c,
                                             const char *where,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
@@ -295,7 +295,24 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c,
        return 0;
 }
 
-static int journal_entry_validate_btree_root(struct bch_fs *c,
+static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
+                                            struct jset_entry *entry)
+{
+       struct bkey_i *k;
+       bool first = true;
+
+       vstruct_for_each(entry, k) {
+               if (!first) {
+                       printbuf_newline(out);
+                       pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+               }
+               pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+               bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
+               first = false;
+       }
+}
+
+static int journal_entry_btree_root_validate(struct bch_fs *c,
                                             const char *where,
                                             struct jset_entry *entry,
                                             unsigned version, int big_endian, int write)
@@ -323,7 +340,13 @@ fsck_err:
        return ret;
 }
 
-static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
+static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
+                                            struct jset_entry *entry)
+{
+       journal_entry_btree_keys_to_text(out, c, entry);
+}
+
+static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
@@ -332,7 +355,12 @@ static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
        return 0;
 }
 
-static int journal_entry_validate_blacklist(struct bch_fs *c,
+static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+                                           struct jset_entry *entry)
+{
+}
+
+static int journal_entry_blacklist_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
@@ -347,7 +375,16 @@ fsck_err:
        return ret;
 }
 
-static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
+static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
+                                           struct jset_entry *entry)
+{
+       struct jset_entry_blacklist *bl =
+               container_of(entry, struct jset_entry_blacklist, entry);
+
+       pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
+}
+
+static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
                                               const char *where,
                                               struct jset_entry *entry,
                                               unsigned version, int big_endian, int write)
@@ -373,7 +410,18 @@ fsck_err:
        return ret;
 }
 
-static int journal_entry_validate_usage(struct bch_fs *c,
+static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
+                                              struct jset_entry *entry)
+{
+       struct jset_entry_blacklist_v2 *bl =
+               container_of(entry, struct jset_entry_blacklist_v2, entry);
+
+       pr_buf(out, "start=%llu end=%llu",
+              le64_to_cpu(bl->start),
+              le64_to_cpu(bl->end));
+}
+
+static int journal_entry_usage_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
@@ -394,7 +442,18 @@ fsck_err:
        return ret;
 }
 
-static int journal_entry_validate_data_usage(struct bch_fs *c,
+static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
+                                       struct jset_entry *entry)
+{
+       struct jset_entry_usage *u =
+               container_of(entry, struct jset_entry_usage, entry);
+
+       pr_buf(out, "type=%s v=%llu",
+              bch2_fs_usage_types[u->entry.btree_id],
+              le64_to_cpu(u->v));
+}
+
+static int journal_entry_data_usage_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
@@ -416,7 +475,17 @@ fsck_err:
        return ret;
 }
 
-static int journal_entry_validate_clock(struct bch_fs *c,
+static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
+                                            struct jset_entry *entry)
+{
+       struct jset_entry_data_usage *u =
+               container_of(entry, struct jset_entry_data_usage, entry);
+
+       bch2_replicas_entry_to_text(out, &u->r);
+       pr_buf(out, "=%llu", le64_to_cpu(u->v));
+}
+
+static int journal_entry_clock_validate(struct bch_fs *c,
                                        const char *where,
                                        struct jset_entry *entry,
                                        unsigned version, int big_endian, int write)
@@ -442,7 +511,16 @@ fsck_err:
        return ret;
 }
 
-static int journal_entry_validate_dev_usage(struct bch_fs *c,
+static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
+                                       struct jset_entry *entry)
+{
+       struct jset_entry_clock *clock =
+               container_of(entry, struct jset_entry_clock, entry);
+
+       pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+}
+
+static int journal_entry_dev_usage_validate(struct bch_fs *c,
                                            const char *where,
                                            struct jset_entry *entry,
                                            unsigned version, int big_endian, int write)
@@ -479,15 +557,59 @@ fsck_err:
        return ret;
 }
 
+static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
+                                           struct jset_entry *entry)
+{
+       struct jset_entry_dev_usage *u =
+               container_of(entry, struct jset_entry_dev_usage, entry);
+       unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+
+       pr_buf(out, "dev=%u", le32_to_cpu(u->dev));
+
+       for (i = 0; i < nr_types; i++) {
+               if (i < BCH_DATA_NR)
+                       pr_buf(out, " %s", bch2_data_types[i]);
+               else
+                       pr_buf(out, " (unknown data type %u)", i);
+               pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+                      le64_to_cpu(u->d[i].buckets),
+                      le64_to_cpu(u->d[i].sectors),
+                      le64_to_cpu(u->d[i].fragmented));
+       }
+
+       pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu",
+              le64_to_cpu(u->buckets_ec),
+              le64_to_cpu(u->buckets_unavailable));
+}
+
+static int journal_entry_log_validate(struct bch_fs *c,
+                                     const char *where,
+                                     struct jset_entry *entry,
+                                     unsigned version, int big_endian, int write)
+{
+       return 0;
+}
+
+static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
+                                     struct jset_entry *entry)
+{
+       struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
+       unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
+
+       bch_scnmemcpy(out, l->d, strnlen(l->d, bytes));
+}
+
 struct jset_entry_ops {
        int (*validate)(struct bch_fs *, const char *,
                        struct jset_entry *, unsigned, int, int);
+       void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
 };
 
 static const struct jset_entry_ops bch2_jset_entry_ops[] = {
 #define x(f, nr)                                               \
        [BCH_JSET_ENTRY_##f]    = (struct jset_entry_ops) {     \
-               .validate       = journal_entry_validate_##f,   \
+               .validate       = journal_entry_##f##_validate, \
+               .to_text        = journal_entry_##f##_to_text,  \
        },
        BCH_JSET_ENTRY_TYPES()
 #undef x
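
The renames above (journal_entry_validate_##f becoming journal_entry_##f##_validate) exist so this x-macro can paste more than one handler per entry type: the type name now sits in the middle of the identifier and the suffix selects the operation. A minimal standalone sketch of the pattern (the entry names here are hypothetical; the real list is BCH_JSET_ENTRY_TYPES()):

    #include <stdio.h>

    #define ENTRY_TYPES()  x(usage, 0) x(clock, 1)

    /* generate one f##_to_text() per entry type: */
    #define x(f, nr) static void f##_to_text(void) { puts(#f); }
    ENTRY_TYPES()
    #undef x

    struct entry_ops { void (*to_text)(void); };

    /* generate the dispatch table, indexed by type number: */
    static const struct entry_ops ops[] = {
    #define x(f, nr) [nr] = { .to_text = f##_to_text },
        ENTRY_TYPES()
    #undef x
    };

    int main(void)
    {
        ops[0].to_text();  /* prints "usage" */
        ops[1].to_text();  /* prints "clock" */
        return 0;
    }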
@@ -503,6 +625,17 @@ int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
                : 0;
 }
 
+void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
+                               struct jset_entry *entry)
+{
+       if (entry->type < BCH_JSET_ENTRY_NR) {
+               pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+               bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
+       } else {
+               pr_buf(out, "(unknown type %u)", entry->type);
+       }
+}
+
 static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
                                 int write)
 {
@@ -710,7 +843,7 @@ reread:
                case JOURNAL_ENTRY_NONE:
                        if (!saw_bad)
                                return 0;
-                       sectors = c->opts.block_size;
+                       sectors = block_sectors(c);
                        goto next_block;
                case JOURNAL_ENTRY_BAD:
                        saw_bad = true;
@@ -719,7 +852,7 @@ reread:
                         * field of the journal entry we read, so try reading
                         * again at next block boundary:
                         */
-                       sectors = c->opts.block_size;
+                       sectors = block_sectors(c);
                        break;
                default:
                        return ret;
@@ -766,12 +899,13 @@ static void bch2_journal_read_device(struct closure *cl)
        struct journal_device *ja =
                container_of(cl, struct journal_device, read);
        struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
+       struct bch_fs *c = ca->fs;
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
        struct journal_read_buf buf = { NULL, 0 };
        u64 min_seq = U64_MAX;
        unsigned i;
-       int ret;
+       int ret = 0;
 
        if (!ja->nr)
                goto out;
@@ -817,6 +951,7 @@ static void bch2_journal_read_device(struct closure *cl)
        ja->discard_idx = ja->dirty_idx_ondisk =
                ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
 out:
+       bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
        kvpfree(buf.data, buf.size);
        percpu_ref_put(&ca->io_ref);
        closure_return(cl);
@@ -1238,7 +1373,9 @@ static void journal_write_done(struct closure *cl)
        u64 v, seq;
        int err = 0;
 
-       bch2_time_stats_update(j->write_time, j->write_start_time);
+       bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
+                              ? j->flush_write_time
+                              : j->noflush_write_time, j->write_start_time);
 
        if (!w->devs_written.nr) {
                bch_err(c, "unable to write journal to sufficient devices");
@@ -1259,14 +1396,15 @@ static void journal_write_done(struct closure *cl)
        if (seq >= j->pin.front)
                journal_seq_pin(j, seq)->devs = w->devs_written;
 
-       j->seq_ondisk           = seq;
-       if (err && (!j->err_seq || seq < j->err_seq))
-               j->err_seq      = seq;
+       if (!err) {
+               j->seq_ondisk           = seq;
 
-       if (!JSET_NO_FLUSH(w->data)) {
-               j->flushed_seq_ondisk = seq;
-               j->last_seq_ondisk = w->last_seq;
-       }
+               if (!JSET_NO_FLUSH(w->data)) {
+                       j->flushed_seq_ondisk = seq;
+                       j->last_seq_ondisk = w->last_seq;
+               }
+       } else if (!j->err_seq || seq < j->err_seq)
+               j->err_seq      = seq;
 
        /*
         * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
@@ -1396,9 +1534,10 @@ void bch2_journal_write(struct closure *cl)
 
        spin_lock(&j->lock);
        if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
-           !w->must_flush &&
-           (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
-           test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
+           (w->noflush ||
+            (!w->must_flush &&
+             (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+             test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
                w->noflush = true;
                SET_JSET_NO_FLUSH(jset, true);
                jset->last_seq  = 0;
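
The skip-flush condition gains two things here: a write already marked noflush stays noflush if this function is re-entered, and the delay tunable moves from struct journal to the filesystem options (c->opts.journal_flush_delay). The decision restated as a hedged standalone predicate (plain types; all names are stand-ins):

    #include <stdbool.h>
    #include <stdint.h>

    /* Sketch of the no-flush decision in bch2_journal_write(); times in ms. */
    static bool can_skip_flush(bool feature_no_flush, bool already_noflush,
                               bool must_flush, bool may_skip_flush,
                               uint64_t now, uint64_t last_flush,
                               uint64_t flush_delay)
    {
        return feature_no_flush &&
               (already_noflush ||
                (!must_flush &&
                 now - last_flush < flush_delay &&
                 may_skip_flush));
    }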
@@ -1445,7 +1584,7 @@ void bch2_journal_write(struct closure *cl)
        SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
        SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
-       if (journal_entry_empty(jset))
+       if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
                j->last_empty_seq = le64_to_cpu(jset->seq);
 
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
@@ -1515,7 +1654,7 @@ retry_alloc:
 
        w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
 
-       if (c->opts.nochanges)
+       if (test_bit(JOURNAL_NOCHANGES, &j->flags))
                goto no_io;
 
        for_each_rw_member(ca, c, i)
@@ -1538,16 +1677,12 @@ retry_alloc:
                }
        }
 
-       bch2_bucket_seq_cleanup(c);
-
        continue_at(cl, do_journal_write, c->io_complete_wq);
        return;
 no_io:
-       bch2_bucket_seq_cleanup(c);
-
        continue_at(cl, journal_write_done, c->io_complete_wq);
        return;
 err:
-       bch2_inconsistent_error(c);
+       bch2_fatal_error(c);
        continue_at(cl, journal_write_done, c->io_complete_wq);
 }
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index f34281a28f12bc64f06dc62383c16af1f3389129..d8425fe0d67b6826c2de50196d3af23d95f16d55 100644
@@ -40,8 +40,10 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
        for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
                vstruct_for_each_safe(entry, k, _n)
 
-int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *,
-                               unsigned, int, int);
+int bch2_journal_entry_validate(struct bch_fs *, const char *,
+                               struct jset_entry *, unsigned, int, int);
+void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *,
+                               struct jset_entry *);
 
 int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
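
With bch2_journal_entry_to_text() exported alongside the validate hook, callers can render any journal entry for debugging. A hypothetical usage fragment (not standalone; assumes the PBUF() printbuf helper from this tree's util.h and a struct bch_fs *c and struct jset_entry *entry in scope):

    char buf[256];
    struct printbuf out = PBUF(buf);

    bch2_journal_entry_to_text(&out, c, entry);
    bch_verbose(c, "journal entry: %s", buf);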
 
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index 7a0ae5d3431c0aa29e7e075e6628b64365a4916f..52a3935cff530748ce5d5adb55ed55411b1d80cb 100644
@@ -34,8 +34,10 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
                                            struct journal_device *ja,
                                            enum journal_space_from from)
 {
-       unsigned available = (journal_space_from(ja, from) -
-                             ja->cur_idx - 1 + ja->nr) % ja->nr;
+       unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags)
+               ? ((journal_space_from(ja, from) -
+                   ja->cur_idx - 1 + ja->nr) % ja->nr)
+               : ja->nr;
 
        /*
         * Don't use the last bucket unless writing the new last_seq
@@ -487,9 +489,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
        u64 seq;
        int err;
 
-       if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
-               return 0;
-
        lockdep_assert_held(&j->reclaim_lock);
 
        while (1) {
@@ -635,7 +634,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                 * make sure to flush at least one journal pin:
                 */
                if (time_after(jiffies, j->last_flushed +
-                              msecs_to_jiffies(j->reclaim_delay_ms)))
+                              msecs_to_jiffies(c->opts.journal_reclaim_delay)))
                        min_nr = 1;
 
                if (j->prereserved.reserved * 4 > j->prereserved.remaining)
@@ -644,6 +643,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                if (fifo_free(&j->pin) <= 32)
                        min_nr = 1;
 
+               if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used)
+                       min_nr = 1;
+
                trace_journal_reclaim_start(c,
                                min_nr,
                                j->prereserved.reserved,
@@ -653,7 +655,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                                atomic_long_read(&c->btree_key_cache.nr_dirty),
                                atomic_long_read(&c->btree_key_cache.nr_keys));
 
-               min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL);
+               min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128);
 
                nr_flushed = journal_flush_pins(j, seq_to_flush,
                                                min_nr, min_key_cache);
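
This adds a fourth pressure signal to reclaim: if more than half the btree node cache is dirty, flush at least one journal pin so nodes can be written back and evicted. The min_nr computation, restated as a hedged standalone helper with the inputs abstracted to plain numbers:

    #include <stdbool.h>
    #include <stddef.h>

    /* Sketch of the "how hard to push" heuristic in __bch2_journal_reclaim(). */
    static size_t reclaim_min_nr(bool past_reclaim_delay,
                                 size_t prereserved, size_t prereserved_remaining,
                                 size_t pin_fifo_free,
                                 size_t btree_cache_dirty, size_t btree_cache_used)
    {
        size_t min_nr = 0;

        if (past_reclaim_delay)                         /* journal_reclaim_delay elapsed */
            min_nr = 1;
        if (prereserved * 4 > prereserved_remaining)    /* prereservation pressure */
            min_nr = 1;
        if (pin_fifo_free <= 32)                        /* pin FIFO nearly full */
            min_nr = 1;
        if (btree_cache_dirty * 2 > btree_cache_used)   /* new in this hunk */
            min_nr = 1;

        return min_nr;
    }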
@@ -681,13 +683,12 @@ int bch2_journal_reclaim(struct journal *j)
 static int bch2_journal_reclaim_thread(void *arg)
 {
        struct journal *j = arg;
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
        unsigned long delay, now;
        int ret = 0;
 
        set_freezable();
 
-       kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
-
        j->last_flushed = jiffies;
 
        while (!ret && !kthread_should_stop()) {
@@ -698,7 +699,7 @@ static int bch2_journal_reclaim_thread(void *arg)
                mutex_unlock(&j->reclaim_lock);
 
                now = jiffies;
-               delay = msecs_to_jiffies(j->reclaim_delay_ms);
+               delay = msecs_to_jiffies(c->opts.journal_reclaim_delay);
                j->next_reclaim = j->last_flushed + delay;
 
                if (!time_in_range(j->next_reclaim, now, now + delay))
diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c
index f2060f903cbcf90489de1712511c2925b69d7198..3cc63fc202ab4cbc83017cb6cad4412720e03797 100644
@@ -66,6 +66,12 @@ blacklist_entry_try_merge(struct bch_fs *c,
        return bl;
 }
 
+static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
+                                       u64 start, u64 end)
+{
+       return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
+}
+
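
bl_entry_contig_or_overlaps() treats blacklist entries as closed intervals, so ranges that merely touch (end == start) also match; the caller below then widens the existing entry to the union and re-merges neighbours. A small standalone demonstration of the predicate:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct range { uint64_t start, end; };

    /* same test as bl_entry_contig_or_overlaps(), on plain integers */
    static bool contig_or_overlaps(struct range e, uint64_t start, uint64_t end)
    {
        return !(end < e.start || e.end < start);
    }

    int main(void)
    {
        struct range e = { .start = 10, .end = 20 };

        assert(contig_or_overlaps(e, 15, 25));   /* overlap */
        assert(contig_or_overlaps(e, 20, 30));   /* touching counts too */
        assert(!contig_or_overlaps(e, 21, 30));  /* disjoint */

        /* widen to the union, as bch2_journal_seq_blacklist_add() now does */
        e.start = e.start < 15 ? e.start : 15;
        e.end   = e.end   > 25 ? e.end   : 25;
        assert(e.start == 10 && e.end == 25);
        return 0;
    }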
 int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
 {
        struct bch_sb_field_journal_seq_blacklist *bl;
@@ -76,28 +82,21 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
        bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
        nr = blacklist_nr_entries(bl);
 
-       if (bl) {
-               for (i = 0; i < nr; i++) {
-                       struct journal_seq_blacklist_entry *e =
-                               bl->start + i;
-
-                       if (start == le64_to_cpu(e->start) &&
-                           end   == le64_to_cpu(e->end))
-                               goto out;
-
-                       if (start <= le64_to_cpu(e->start) &&
-                           end   >= le64_to_cpu(e->end)) {
-                               e->start = cpu_to_le64(start);
-                               e->end  = cpu_to_le64(end);
-
-                               if (i + 1 < nr)
-                                       bl = blacklist_entry_try_merge(c,
-                                                               bl, i);
-                               if (i)
-                                       bl = blacklist_entry_try_merge(c,
-                                                               bl, i - 1);
-                               goto out_write_sb;
-                       }
+       for (i = 0; i < nr; i++) {
+               struct journal_seq_blacklist_entry *e =
+                       bl->start + i;
+
+               if (bl_entry_contig_or_overlaps(e, start, end)) {
+                       e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
+                       e->end  = cpu_to_le64(max(end, le64_to_cpu(e->end)));
+
+                       if (i + 1 < nr)
+                               bl = blacklist_entry_try_merge(c,
+                                                       bl, i);
+                       if (i)
+                               bl = blacklist_entry_try_merge(c,
+                                                       bl, i - 1);
+                       goto out_write_sb;
                }
        }
 
@@ -189,27 +188,34 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
        return 0;
 }
 
-static const char *
-bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
-                                      struct bch_sb_field *f)
+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
+                                                 struct bch_sb_field *f,
+                                                 struct printbuf *err)
 {
        struct bch_sb_field_journal_seq_blacklist *bl =
                field_to_type(f, journal_seq_blacklist);
-       struct journal_seq_blacklist_entry *i;
-       unsigned nr = blacklist_nr_entries(bl);
+       unsigned i, nr = blacklist_nr_entries(bl);
 
-       for (i = bl->start; i < bl->start + nr; i++) {
-               if (le64_to_cpu(i->start) >=
-                   le64_to_cpu(i->end))
-                       return "entry start >= end";
-
-               if (i + 1 < bl->start + nr &&
-                   le64_to_cpu(i[0].end) >
-                   le64_to_cpu(i[1].start))
-                       return "entries out of order";
+       for (i = 0; i < nr; i++) {
+               struct journal_seq_blacklist_entry *e = bl->start + i;
+
+               if (le64_to_cpu(e->start) >=
+                   le64_to_cpu(e->end)) {
+                       pr_buf(err, "entry %u start >= end (%llu >= %llu)",
+                              i, le64_to_cpu(e->start), le64_to_cpu(e->end));
+                       return -EINVAL;
+               }
+
+               if (i + 1 < nr &&
+                   le64_to_cpu(e[0].end) >
+                   le64_to_cpu(e[1].start)) {
+                       pr_buf(err, "entry %u out of order with next entry (%llu > %llu)",
+                              i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start));
+                       return -EINVAL;
+               }
        }
 
-       return NULL;
+       return 0;
 }
 
 static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
@@ -250,19 +256,28 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
        bch2_trans_init(&trans, c, 0, 0);
 
        for (i = 0; i < BTREE_ID_NR; i++) {
-               struct btree_iter *iter;
+               struct btree_iter iter;
                struct btree *b;
 
-               for_each_btree_node(&trans, iter, i, POS_MIN,
-                                   BTREE_ITER_PREFETCH, b)
-                       if (test_bit(BCH_FS_STOPPING, &c->flags)) {
-                               bch2_trans_exit(&trans);
-                               return;
-                       }
-               bch2_trans_iter_free(&trans, iter);
+               bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN,
+                                         0, 0, BTREE_ITER_PREFETCH);
+retry:
+               bch2_trans_begin(&trans);
+
+               b = bch2_btree_iter_peek_node(&iter);
+
+               while (!(ret = PTR_ERR_OR_ZERO(b)) &&
+                      b &&
+                      !test_bit(BCH_FS_STOPPING, &c->flags))
+                       b = bch2_btree_iter_next_node(&iter);
+
+               if (ret == -EINTR)
+                       goto retry;
+
+               bch2_trans_iter_exit(&trans, &iter);
        }
 
-       ret = bch2_trans_exit(&trans);
+       bch2_trans_exit(&trans);
        if (ret)
                return;
 
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index 61674ae1ab5fee1e3adc5696b8e193968552a4d6..d6d7512141167a8f338b19ac64ec9703e9a085f2 100644
@@ -34,8 +34,6 @@ struct journal_buf {
        bool                    noflush;        /* write has already been kicked off, and was noflush */
        bool                    must_flush;     /* something wants a flush */
        bool                    separate_flush;
-       /* bloom filter: */
-       unsigned long           has_inode[1024 / sizeof(unsigned long)];
 };
 
 /*
@@ -150,10 +148,10 @@ enum journal_space_from {
 enum {
        JOURNAL_REPLAY_DONE,
        JOURNAL_STARTED,
-       JOURNAL_RECLAIM_STARTED,
        JOURNAL_NEED_WRITE,
        JOURNAL_MAY_GET_UNRESERVED,
        JOURNAL_MAY_SKIP_FLUSH,
+       JOURNAL_NOCHANGES,
 };
 
 /* Embedded in struct bch_fs */
@@ -263,8 +261,6 @@ struct journal {
        struct mutex            discard_lock;
        bool                    can_discard;
 
-       unsigned                write_delay_ms;
-       unsigned                reclaim_delay_ms;
        unsigned long           last_flush_write;
 
        u64                     res_get_blocked_start;
@@ -274,8 +270,8 @@ struct journal {
        u64                     nr_flush_writes;
        u64                     nr_noflush_writes;
 
-       struct time_stats       *write_time;
-       struct time_stats       *delay_time;
+       struct time_stats       *flush_write_time;
+       struct time_stats       *noflush_write_time;
        struct time_stats       *blocked_time;
        struct time_stats       *flush_seq_time;
 
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 1f65eca48c6ef48d20c033d119a04f0215f7a607..6defc33322b3b24bd5f9a076165accce6f204aa0 100644
@@ -39,7 +39,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                                   enum btree_id btree_id)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf sk;
        int ret = 0;
@@ -47,13 +47,15 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-       iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
-                                  BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
-       while ((k = bch2_btree_iter_peek(iter)).k &&
+       while ((bch2_trans_begin(&trans),
+               (k = bch2_btree_iter_peek(&iter)).k) &&
               !(ret = bkey_err(k))) {
                if (!bch2_bkey_has_device(k, dev_idx)) {
-                       bch2_btree_iter_advance(iter);
+                       bch2_btree_iter_advance(&iter);
                        continue;
                }
 
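
Note the loop header introduced above: the comma operator re-runs bch2_trans_begin() before every peek, so each pass of the loop, including one re-entered after a restart, starts from a fresh transaction. The idiom in isolation:

    #include <stdio.h>

    /* the left operand of the comma runs before every test of the condition */
    static void begin(void)  { puts("begin"); }
    static int  peek(int *k) { static int n = 3; *k = n; return n--; }

    int main(void)
    {
        int k;

        while ((begin(), peek(&k)) && k > 1)
            printf("got %d\n", k);
        return 0;
    }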
@@ -71,10 +73,18 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                 */
                bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
-               bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
+               /*
+                * Since we're not inserting through an extent iterator
+                * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+                * we aren't using the extent overwrite path to delete, we're
+                * just using the normal key deletion path:
+                */
+               if (bkey_deleted(&sk.k->k))
+                       sk.k->k.size = 0;
 
-               ret   = bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(&trans, iter, sk.k, 0) ?:
+               ret   = bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(&trans, &iter, sk.k,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_NOFAIL);
 
@@ -88,9 +98,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
-       ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
        BUG_ON(ret == -EINTR);
@@ -107,7 +117,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct closure cl;
        struct btree *b;
        struct bkey_buf k;
@@ -123,12 +133,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        closure_init_stack(&cl);
 
        for (id = 0; id < BTREE_ID_NR; id++) {
-               for_each_btree_node(&trans, iter, id, POS_MIN,
-                                   BTREE_ITER_PREFETCH, b) {
+               bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+                                         BTREE_ITER_PREFETCH);
 retry:
+               ret = 0;
+               while (bch2_trans_begin(&trans),
+                      (b = bch2_btree_iter_peek_node(&iter)) &&
+                      !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
                                                  dev_idx))
-                               continue;
+                               goto next;
 
                        bch2_bkey_buf_copy(&k, c, &b->key);
 
@@ -139,18 +153,23 @@ retry:
                                break;
                        }
 
-                       ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false);
+                       ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false);
                        if (ret == -EINTR) {
-                               b = bch2_btree_iter_peek_node(iter);
                                ret = 0;
-                               goto retry;
+                               continue;
                        }
+
                        if (ret) {
                                bch_err(c, "Error updating btree node key: %i", ret);
                                break;
                        }
+next:
+                       bch2_btree_iter_next_node(&iter);
                }
-               bch2_trans_iter_free(&trans, iter);
+               if (ret == -EINTR)
+                       goto retry;
+
+               bch2_trans_iter_exit(&trans, &iter);
 
                if (ret)
                        goto err;
@@ -162,7 +181,7 @@ retry:
 
        ret = 0;
 err:
-       ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&k, c);
 
        BUG_ON(ret == -EINTR);
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index ee0f155fda6c85628c800516506b28e1d375c6ff..7ca7ce394135cef68ed81420a1d064a2148b1321 100644
@@ -8,11 +8,13 @@
 #include "btree_update_interior.h"
 #include "buckets.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "inode.h"
 #include "io.h"
 #include "journal_reclaim.h"
 #include "move.h"
 #include "replicas.h"
+#include "subvolume.h"
 #include "super-io.h"
 #include "keylist.h"
 
@@ -53,13 +55,89 @@ struct moving_context {
        wait_queue_head_t       wait;
 };
 
+static int insert_snapshot_whiteouts(struct btree_trans *trans,
+                                    enum btree_id id,
+                                    struct bpos old_pos,
+                                    struct bpos new_pos)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter, update_iter;
+       struct bkey_s_c k;
+       struct snapshots_seen s;
+       int ret;
+
+       if (!btree_type_has_snapshots(id))
+               return 0;
+
+       snapshots_seen_init(&s);
+
+       if (!bkey_cmp(old_pos, new_pos))
+               return 0;
+
+       if (!snapshot_t(c, old_pos.snapshot)->children[0])
+               return 0;
+
+       bch2_trans_iter_init(trans, &iter, id, old_pos,
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       while (1) {
+next:
+               k = bch2_btree_iter_prev(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       break;
+
+               if (bkey_cmp(old_pos, k.k->p))
+                       break;
+
+               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) {
+                       struct bkey_i *update;
+                       size_t i;
+
+                       for (i = 0; i < s.nr; i++)
+                               if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i]))
+                                       goto next;
+
+                       update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+
+                       ret = PTR_ERR_OR_ZERO(update);
+                       if (ret)
+                               break;
+
+                       bkey_init(&update->k);
+                       update->k.p = new_pos;
+                       update->k.p.snapshot = k.k->p.snapshot;
+
+                       bch2_trans_iter_init(trans, &update_iter, id, update->k.p,
+                                            BTREE_ITER_NOT_EXTENTS|
+                                            BTREE_ITER_ALL_SNAPSHOTS|
+                                            BTREE_ITER_INTENT);
+                       ret   = bch2_btree_iter_traverse(&update_iter) ?:
+                               bch2_trans_update(trans, &update_iter, update,
+                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+                       bch2_trans_iter_exit(trans, &update_iter);
+                       if (ret)
+                               break;
+
+                       ret = snapshots_seen_add(c, &s, k.k->p.snapshot);
+                       if (ret)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(trans, &iter);
+       kfree(s.d);
+
+       return ret;
+}
+
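
insert_snapshot_whiteouts() handles a subtlety of moving extents in snapshot btrees: when migration rewrites a key at a new position, snapshots descended from the key's snapshot that held their own version of the data must not suddenly see the moved key at new_pos, so a deletion key (whiteout) is inserted at new_pos in each affected snapshot, with the snapshots_seen list suppressing redundant whiteouts. A toy model of the ancestor test driving this (layout hypothetical; unlike bcachefs, parents here simply have smaller ids, and the real check is bch2_snapshot_is_ancestor()):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* toy snapshot tree: parent[] maps a snapshot id to its parent */
    #define NR_SNAPSHOTS 5
    static const uint32_t parent[NR_SNAPSHOTS] = { 0, 0, 1, 1, 3 };

    static bool is_ancestor(uint32_t id, uint32_t ancestor)
    {
        while (id > ancestor)
            id = parent[id];
        return id == ancestor;
    }

    int main(void)
    {
        uint32_t moved = 1;   /* snapshot the moved key belongs to */

        for (uint32_t id = moved + 1; id < NR_SNAPSHOTS; id++)
            if (is_ancestor(id, moved))
                printf("whiteout needed in snapshot %u\n", id);
        return 0;
    }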
 static int bch2_migrate_index_update(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct migrate_write *m =
                container_of(op, struct migrate_write, op);
+       struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets);
        struct keylist *keys = &op->insert_keys;
        struct bkey_buf _new, _insert;
        int ret = 0;
@@ -70,9 +148,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
-       iter = bch2_trans_get_iter(&trans, m->btree_id,
-                                  bkey_start_pos(&bch2_keylist_front(keys)->k),
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, m->btree_id,
+                            bkey_start_pos(&bch2_keylist_front(keys)->k),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        while (1) {
                struct bkey_s_c k;
@@ -80,13 +158,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                struct bkey_i_extent *new;
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
+               struct bpos next_pos;
                bool did_work = false;
-               bool extending = false, should_check_enospc;
+               bool should_check_enospc;
                s64 i_sectors_delta = 0, disk_sectors_delta = 0;
 
                bch2_trans_begin(&trans);
 
-               k = bch2_btree_iter_peek_slot(iter);
+               k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
                        goto err;
@@ -102,9 +181,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 
                bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
                new = bkey_i_to_extent(_new.k);
-               bch2_cut_front(iter->pos, &new->k_i);
+               bch2_cut_front(iter.pos, &new->k_i);
 
-               bch2_cut_front(iter->pos,       insert);
+               bch2_cut_front(iter.pos,        insert);
                bch2_cut_back(new->k.p,         insert);
                bch2_cut_back(insert->k.p,      &new->k_i);
 
@@ -119,7 +198,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                extent_for_each_ptr(extent_i_to_s(new), new_ptr)
                                        new_ptr->cached = true;
 
-                       bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
+                       __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
                }
 
                extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
@@ -146,8 +225,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                               op->opts.background_target,
                                               op->opts.data_replicas);
 
-               ret = bch2_sum_sector_overwrites(&trans, iter, insert,
-                                                &extending,
+               ret = bch2_sum_sector_overwrites(&trans, &iter, insert,
                                                 &should_check_enospc,
                                                 &i_sectors_delta,
                                                 &disk_sectors_delta);
@@ -163,20 +241,29 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                goto out;
                }
 
-               ret   = bch2_trans_update(&trans, iter, insert, 0) ?:
+               next_pos = insert->k.p;
+
+               ret   = insert_snapshot_whiteouts(&trans, m->btree_id,
+                                                 k.k->p, insert->k.p) ?:
+                       bch2_trans_update(&trans, &iter, insert,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(&trans, &op->res,
                                op_journal_seq(op),
                                BTREE_INSERT_NOFAIL|
                                m->data_opts.btree_insert_flags);
-err:
-               if (!ret)
+               if (!ret) {
+                       bch2_btree_iter_set_pos(&iter, next_pos);
                        atomic_long_inc(&c->extent_migrate_done);
+                       if (ec_ob)
+                               bch2_ob_add_backpointer(c, ec_ob, &insert->k);
+               }
+err:
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
                        break;
 next:
-               while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) {
+               while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
                        bch2_keylist_pop_front(keys);
                        if (bch2_keylist_empty(keys))
                                goto out;
@@ -184,18 +271,18 @@ next:
                continue;
 nomatch:
                if (m->ctxt) {
-                       BUG_ON(k.k->p.offset <= iter->pos.offset);
+                       BUG_ON(k.k->p.offset <= iter.pos.offset);
                        atomic64_inc(&m->ctxt->stats->keys_raced);
-                       atomic64_add(k.k->p.offset - iter->pos.offset,
+                       atomic64_add(k.k->p.offset - iter.pos.offset,
                                     &m->ctxt->stats->sectors_raced);
                }
                atomic_long_inc(&c->extent_migrate_raced);
                trace_move_race(&new->k);
-               bch2_btree_iter_advance(iter);
+               bch2_btree_iter_advance(&iter);
                goto next;
        }
 out:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&_insert, c);
        bch2_bkey_buf_exit(&_new, c);
@@ -216,11 +303,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
        m->op.crc       = rbio->pick.crc;
        m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
 
-       if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
-               m->op.nonce     = m->op.crc.nonce + m->op.crc.offset;
-               m->op.csum_type = m->op.crc.csum_type;
-       }
-
        if (m->data_cmd == DATA_REWRITE)
                bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
 }
@@ -235,6 +317,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
+       struct bch_extent_crc_unpacked crc;
        struct extent_ptr_decoded p;
        int ret;
 
@@ -255,6 +338,18 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
        m->op.target    = data_opts.target,
        m->op.write_point = wp;
 
+       /*
+        * op->csum_type is normally initialized from the fs/file's current
+        * options - but if an extent is encrypted, we require that it stays
+        * encrypted:
+        */
+       bkey_for_each_crc(k.k, ptrs, crc, entry)
+               if (bch2_csum_type_is_encryption(crc.csum_type)) {
+                       m->op.nonce     = crc.nonce + crc.offset;
+                       m->op.csum_type = crc.csum_type;
+                       break;
+               }
+
        if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
                m->op.alloc_reserve = RESERVE_MOVINGGC;
                m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
@@ -299,10 +394,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
                unsigned compressed_sectors = 0;
 
                bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-                       if (p.ptr.dev == data_opts.rewrite_dev &&
-                           !p.ptr.cached &&
-                           crc_is_compressed(p.crc))
-                               compressed_sectors += p.crc.compressed_size;
+                       if (p.ptr.dev == data_opts.rewrite_dev) {
+                               if (p.ptr.cached)
+                                       m->op.flags |= BCH_WRITE_CACHED;
+
+                               if (!p.ptr.cached &&
+                                   crc_is_compressed(p.crc))
+                                       compressed_sectors += p.crc.compressed_size;
+                       }
 
                if (compressed_sectors) {
                        ret = bch2_disk_reservation_add(c, &m->op.res,
@@ -388,19 +487,22 @@ static void move_read_endio(struct bio *bio)
        closure_put(&ctxt->cl);
 }
 
-static void do_pending_writes(struct moving_context *ctxt)
+static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans)
 {
        struct moving_io *io;
 
+       if (trans)
+               bch2_trans_unlock(trans);
+
        while ((io = next_pending_write(ctxt))) {
                list_del(&io->list);
                closure_call(&io->cl, move_write, NULL, &ctxt->cl);
        }
 }
 
-#define move_ctxt_wait_event(_ctxt, _cond)                     \
+#define move_ctxt_wait_event(_ctxt, _trans, _cond)             \
 do {                                                           \
-       do_pending_writes(_ctxt);                               \
+       do_pending_writes(_ctxt, _trans);                       \
                                                                \
        if (_cond)                                              \
                break;                                          \
@@ -408,11 +510,12 @@ do {                                                              \
                     next_pending_write(_ctxt) || (_cond));     \
 } while (1)
 
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
+                                      struct btree_trans *trans)
 {
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
 
-       move_ctxt_wait_event(ctxt,
+       move_ctxt_wait_event(ctxt, trans,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
 }
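
do_pending_writes() now takes the btree_trans and unlocks it before kicking off queued writes: a thread about to block in move_ctxt_wait_event() must not sit on btree locks that the write path it is waiting on might need. The general drop-locks-before-blocking shape, as a hedged userspace sketch:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t io_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  io_done = PTHREAD_COND_INITIALIZER;
    static bool done;

    /* never block on a condition while holding a lock the waker needs;
     * mirrors passing @trans in so bch2_trans_unlock() can run first */
    static void wait_for_io(pthread_mutex_t *held)
    {
        if (held)
            pthread_mutex_unlock(held);   /* the bch2_trans_unlock() step */

        pthread_mutex_lock(&io_lock);
        while (!done)
            pthread_cond_wait(&io_done, &io_lock);
        pthread_mutex_unlock(&io_lock);
    }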
@@ -434,14 +537,6 @@ static int bch2_move_extent(struct btree_trans *trans,
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;
 
-       move_ctxt_wait_event(ctxt,
-               atomic_read(&ctxt->write_sectors) <
-               SECTORS_IN_FLIGHT_PER_DEVICE);
-
-       move_ctxt_wait_event(ctxt,
-               atomic_read(&ctxt->read_sectors) <
-               SECTORS_IN_FLIGHT_PER_DEVICE);
-
        /* write path might have to decompress data: */
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
                sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
@@ -511,13 +606,13 @@ err:
 static int lookup_inode(struct btree_trans *trans, struct bpos pos,
                        struct bch_inode_unpacked *inode)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
-                                  BTREE_ITER_ALL_SNAPSHOTS);
-       k = bch2_btree_iter_peek(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       k = bch2_btree_iter_peek(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -527,15 +622,15 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos,
                goto err;
        }
 
-       ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
+       ret = bkey_is_inode(k.k) ? 0 : -EIO;
        if (ret)
                goto err;
 
-       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+       ret = bch2_inode_unpack(k, inode);
        if (ret)
                goto err;
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -553,7 +648,7 @@ static int __bch2_move_data(struct bch_fs *c,
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct bkey_buf sk;
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct data_opts data_opts;
        enum data_cmd data_cmd;
@@ -567,8 +662,9 @@ static int __bch2_move_data(struct bch_fs *c,
        stats->btree_id = btree_id;
        stats->pos      = start;
 
-       iter = bch2_trans_get_iter(&trans, btree_id, start,
-                                  BTREE_ITER_PREFETCH);
+       bch2_trans_iter_init(&trans, &iter, btree_id, start,
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
        if (rate)
                bch2_ratelimit_reset(rate);
@@ -591,26 +687,36 @@ static int __bch2_move_data(struct bch_fs *c,
                                schedule_timeout(delay);
 
                        if (unlikely(freezing(current))) {
-                               bch2_trans_unlock(&trans);
-                               move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+                               move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads));
                                try_to_freeze();
                        }
                } while (delay);
 
-               bch2_trans_begin(&trans);
+               move_ctxt_wait_event(ctxt, &trans,
+                       atomic_read(&ctxt->write_sectors) <
+                       SECTORS_IN_FLIGHT_PER_DEVICE);
 
-               k = bch2_btree_iter_peek(iter);
+               move_ctxt_wait_event(ctxt, &trans,
+                       atomic_read(&ctxt->read_sectors) <
+                       SECTORS_IN_FLIGHT_PER_DEVICE);
 
-               stats->pos = iter->pos;
+               bch2_trans_begin(&trans);
 
+               k = bch2_btree_iter_peek(&iter);
                if (!k.k)
                        break;
+
                ret = bkey_err(k);
+               if (ret == -EINTR)
+                       continue;
                if (ret)
                        break;
+
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
+               stats->pos = iter.pos;
+
                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;
 
@@ -645,22 +751,22 @@ static int __bch2_move_data(struct bch_fs *c,
                        BUG();
                }
 
-               /* unlock before doing IO: */
+               /*
+                * The iterator gets unlocked by __bch2_read_extent - need to
+                * save a copy of @k elsewhere:
+                */
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);
-               bch2_trans_unlock(&trans);
 
                ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
                                        data_cmd, data_opts);
                if (ret2) {
-                       if (ret2 == -EINTR) {
-                               bch2_trans_begin(&trans);
+                       if (ret2 == -EINTR)
                                continue;
-                       }
 
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt);
+                               bch2_move_ctxt_wait_for_io(ctxt, &trans);
                                continue;
                        }
 
@@ -671,21 +777,43 @@ static int __bch2_move_data(struct bch_fs *c,
                if (rate)
                        bch2_ratelimit_increment(rate, k.k->size);
 next:
-               atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
-                            &stats->sectors_seen);
+               atomic64_add(k.k->size, &stats->sectors_seen);
 next_nondata:
-               bch2_btree_iter_advance(iter);
-               bch2_trans_cond_resched(&trans);
+               bch2_btree_iter_advance(&iter);
        }
 out:
 
-       bch2_trans_iter_put(&trans, iter);
-       ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
 
+inline void bch_move_stats_init(struct bch_move_stats *stats, char *name)
+{
+       memset(stats, 0, sizeof(*stats));
+
+       scnprintf(stats->name, sizeof(stats->name),
+                       "%s", name);
+}
+
+static inline void progress_list_add(struct bch_fs *c,
+                                    struct bch_move_stats *stats)
+{
+       mutex_lock(&c->data_progress_lock);
+       list_add(&stats->list, &c->data_progress_list);
+       mutex_unlock(&c->data_progress_lock);
+}
+
+static inline void progress_list_del(struct bch_fs *c,
+                                    struct bch_move_stats *stats)
+{
+       mutex_lock(&c->data_progress_lock);
+       list_del(&stats->list);
+       mutex_unlock(&c->data_progress_lock);
+}
+
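
bch_move_stats gains a name and is registered on c->data_progress_list for the duration of a move, so in-flight data jobs can be enumerated while they run. The add/del bracketing in miniature (hypothetical names, singly linked for brevity):

    #include <pthread.h>
    #include <stdio.h>

    struct stats {
        char          name[32];
        struct stats  *next;
    };

    static pthread_mutex_t progress_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct stats *progress_list;

    /* visible on the global list only while its job is running */
    static void progress_add(struct stats *s, const char *name)
    {
        snprintf(s->name, sizeof(s->name), "%s", name);
        pthread_mutex_lock(&progress_lock);
        s->next = progress_list;
        progress_list = s;
        pthread_mutex_unlock(&progress_lock);
    }

    static void progress_del(struct stats *s)
    {
        pthread_mutex_lock(&progress_lock);
        for (struct stats **p = &progress_list; *p; p = &(*p)->next)
            if (*p == s) {
                *p = s->next;
                break;
            }
        pthread_mutex_unlock(&progress_lock);
    }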
 int bch2_move_data(struct bch_fs *c,
                   enum btree_id start_btree_id, struct bpos start_pos,
                   enum btree_id end_btree_id,   struct bpos end_pos,
@@ -698,6 +826,7 @@ int bch2_move_data(struct bch_fs *c,
        enum btree_id id;
        int ret;
 
+       progress_list_add(c, stats);
        closure_init_stack(&ctxt.cl);
        INIT_LIST_HEAD(&ctxt.reads);
        init_waitqueue_head(&ctxt.wait);
@@ -722,7 +851,7 @@ int bch2_move_data(struct bch_fs *c,
        }
 
 
-       move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+       move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads));
        closure_sync(&ctxt.cl);
 
        EBUG_ON(atomic_read(&ctxt.write_sectors));
@@ -731,6 +860,7 @@ int bch2_move_data(struct bch_fs *c,
                        atomic64_read(&stats->sectors_moved),
                        atomic64_read(&stats->keys_moved));
 
+       progress_list_del(c, stats);
        return ret;
 }
 
@@ -747,7 +877,7 @@ static int bch2_move_btree(struct bch_fs *c,
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct btree *b;
        enum btree_id id;
        struct data_opts data_opts;
@@ -755,6 +885,7 @@ static int bch2_move_btree(struct bch_fs *c,
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
+       progress_list_add(c, stats);
 
        stats->data_type = BCH_DATA_btree;
 
@@ -763,9 +894,13 @@ static int bch2_move_btree(struct bch_fs *c,
             id++) {
                stats->btree_id = id;
 
-               for_each_btree_node(&trans, iter, id,
-                                   id == start_btree_id ? start_pos : POS_MIN,
-                                   BTREE_ITER_PREFETCH, b) {
+               bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0,
+                                         BTREE_ITER_PREFETCH);
+retry:
+               ret = 0;
+               while (bch2_trans_begin(&trans),
+                      (b = bch2_btree_iter_peek_node(&iter)) &&
+                      !(ret = PTR_ERR_OR_ZERO(b))) {
                        if (kthread && kthread_should_stop())
                                break;
 
@@ -773,7 +908,7 @@ static int bch2_move_btree(struct bch_fs *c,
                             bpos_cmp(b->key.k.p, end_pos)) > 0)
                                break;
 
-                       stats->pos = iter->pos;
+                       stats->pos = iter.pos;
 
                        switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
                        case DATA_SKIP:
@@ -787,13 +922,19 @@ static int bch2_move_btree(struct bch_fs *c,
                                BUG();
                        }
 
-                       ret = bch2_btree_node_rewrite(&trans, iter,
-                                       b->data->keys.seq, 0) ?: ret;
+                       ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret;
+                       if (ret == -EINTR)
+                               continue;
+                       if (ret)
+                               break;
 next:
-                       bch2_trans_cond_resched(&trans);
+                       bch2_btree_iter_next_node(&iter);
                }
+               if (ret == -EINTR)
+                       goto retry;
+
+               bch2_trans_iter_exit(&trans, &iter);
 
-               ret = bch2_trans_iter_free(&trans, iter) ?: ret;
                if (kthread && kthread_should_stop())
                        break;
        }
@@ -803,6 +944,11 @@ next:
        if (ret)
                bch_err(c, "error %i in bch2_move_btree", ret);
 
+       /* flush relevant btree updates */
+       closure_wait_event(&c->btree_interior_update_wait,
+                          !bch2_btree_interior_updates_nr_pending(c));
+
+       progress_list_del(c, stats);
        return ret;
 }
 
@@ -822,16 +968,9 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
                                      struct data_opts *data_opts)
 {
        unsigned nr_good = bch2_bkey_durability(c, k);
-       unsigned replicas = 0;
-
-       switch (k.k->type) {
-       case KEY_TYPE_btree_ptr:
-               replicas = c->opts.metadata_replicas;
-               break;
-       case KEY_TYPE_extent:
-               replicas = io_opts->data_replicas;
-               break;
-       }
+       unsigned replicas = bkey_is_btree_ptr(k.k)
+               ? c->opts.metadata_replicas
+               : io_opts->data_replicas;
 
        if (!nr_good || nr_good >= replicas)
                return DATA_SKIP;
@@ -944,6 +1083,7 @@ int bch2_data_job(struct bch_fs *c,
 
        switch (op.op) {
        case BCH_DATA_OP_REREPLICATE:
+               bch_move_stats_init(stats, "rereplicate");
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
 
@@ -951,10 +1091,6 @@ int bch2_data_job(struct bch_fs *c,
                                      op.start_btree,   op.start_pos,
                                      op.end_btree,     op.end_pos,
                                      rereplicate_btree_pred, c, stats) ?: ret;
-
-               closure_wait_event(&c->btree_interior_update_wait,
-                                  !bch2_btree_interior_updates_nr_pending(c));
-
                ret = bch2_replicas_gc2(c) ?: ret;
 
                ret = bch2_move_data(c,
@@ -968,6 +1104,7 @@ int bch2_data_job(struct bch_fs *c,
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;
 
+               bch_move_stats_init(stats, "migrate");
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
 
@@ -985,6 +1122,7 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_replicas_gc2(c) ?: ret;
                break;
        case BCH_DATA_OP_REWRITE_OLD_NODES:
+               bch_move_stats_init(stats, "rewrite_old_nodes");
                ret = bch2_scan_old_btree_nodes(c, stats);
                break;
        default:
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 5076153689d18bd3a55049eff957df4f376a2a19..2a789a1158ca22e4e7efb1cd4298b4115da3216d 100644
@@ -66,4 +66,8 @@ int bch2_data_job(struct bch_fs *,
                  struct bch_move_stats *,
                  struct bch_ioctl_data);
 
+inline void bch_move_stats_init(struct bch_move_stats *stats,
+                               char *name);
+
+
 #endif /* _BCACHEFS_MOVE_H */
diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h
index fc0de165af9fe354b246e24ca223904c87e7dc0b..9df6d18137a5e02655d6c34f10730b896f9d48d5 100644
@@ -6,6 +6,8 @@ struct bch_move_stats {
        enum bch_data_type      data_type;
        enum btree_id           btree_id;
        struct bpos             pos;
+       struct list_head        list;
+       char                    name[32];
 
        atomic64_t              keys_moved;
        atomic64_t              keys_raced;
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 2acca0ddb6fd64a140fffe7735a3239661425b6f..c82ecff3efe2b198eb541616e2f13fd4e4f4564e 100644
@@ -6,6 +6,7 @@
  */
 
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "btree_iter.h"
 #include "btree_update.h"
@@ -69,10 +70,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
                        .dev    = p.ptr.dev,
                        .offset = p.ptr.offset,
                };
+               ssize_t i;
 
-               ssize_t i = eytzinger0_find_le(h->data, h->used,
-                                              sizeof(h->data[0]),
-                                              bucket_offset_cmp, &search);
+               if (p.ptr.cached)
+                       continue;
+
+               i = eytzinger0_find_le(h->data, h->used,
+                                      sizeof(h->data[0]),
+                                      bucket_offset_cmp, &search);
 #if 0
                /* eytzinger search verify code: */
                ssize_t j = -1, k;
@@ -85,6 +90,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
                BUG_ON(i != j);
 #endif
                if (i >= 0 &&
+                   p.ptr.dev == h->data[i].dev &&
                    p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
                    p.ptr.gen == h->data[i].gen) {
                        /*
@@ -132,21 +138,110 @@ static inline int fragmentation_cmp(copygc_heap *heap,
        return cmp_int(l.fragmentation, r.fragmentation);
 }
 
+static int walk_buckets_to_copygc(struct bch_fs *c)
+{
+       copygc_heap *h = &c->copygc_heap;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked u;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
+               struct copygc_heap_entry e;
+
+               u = bch2_alloc_unpack(k);
+
+               if (u.data_type != BCH_DATA_user ||
+                   u.dirty_sectors >= ca->mi.bucket_size ||
+                   bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
+                       continue;
+
+               e = (struct copygc_heap_entry) {
+                       .dev            = iter.pos.inode,
+                       .gen            = u.gen,
+                       .replicas       = 1 + u.stripe_redundancy,
+                       .fragmentation  = u.dirty_sectors * (1U << 15)
+                               / ca->mi.bucket_size,
+                       .sectors        = u.dirty_sectors,
+                       .offset         = bucket_to_sector(ca, iter.pos.offset),
+               };
+               heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
+
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
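
Instead of scanning the in-memory bucket array under bucket_lock, copygc now derives its heap from the persistent alloc btree. The fragmentation score is a fixed-point fraction with 15 fractional bits, so the heap orders buckets by relative fullness independent of bucket size:

    #include <stdint.h>
    #include <stdio.h>

    /* fragmentation = dirty_sectors / bucket_size as a 15-bit fixed-point
     * fraction, exactly as computed in walk_buckets_to_copygc() */
    static uint32_t fragmentation(uint64_t dirty_sectors, uint64_t bucket_size)
    {
        return dirty_sectors * (1U << 15) / bucket_size;
    }

    int main(void)
    {
        /* a bucket one quarter full scores 8192 out of 32768 */
        printf("%u\n", fragmentation(128, 512));
        return 0;
    }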
+static int bucket_inorder_cmp(const void *_l, const void *_r)
+{
+       const struct copygc_heap_entry *l = _l;
+       const struct copygc_heap_entry *r = _r;
+
+       return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
+}
+
+static int check_copygc_was_done(struct bch_fs *c,
+                                u64 *sectors_not_moved,
+                                u64 *buckets_not_moved)
+{
+       copygc_heap *h = &c->copygc_heap;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked u;
+       struct copygc_heap_entry *i;
+       int ret = 0;
+
+       sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);
+
+       bch2_trans_init(&trans, c, 0, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);
+
+       for (i = h->data; i < h->data + h->used; i++) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+
+               bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));
+
+               ret = lockrestart_do(&trans,
+                               bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+               if (ret)
+                       break;
+
+               u = bch2_alloc_unpack(k);
+
+               if (u.gen == i->gen && u.dirty_sectors) {
+                       *sectors_not_moved += u.dirty_sectors;
+                       *buckets_not_moved += 1;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
 static int bch2_copygc(struct bch_fs *c)
 {
        copygc_heap *h = &c->copygc_heap;
        struct copygc_heap_entry e, *i;
-       struct bucket_array *buckets;
        struct bch_move_stats move_stats;
-       u64 sectors_to_move = 0, sectors_not_moved = 0;
+       u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
        u64 sectors_reserved = 0;
        u64 buckets_to_move, buckets_not_moved = 0;
        struct bch_dev *ca;
        unsigned dev_idx;
-       size_t b, heap_size = 0;
+       size_t heap_size = 0;
        int ret;
 
-       memset(&move_stats, 0, sizeof(move_stats));
+       bch_move_stats_init(&move_stats, "copygc");
+
        /*
         * Find buckets with lowest sector counts, skipping completely
         * empty buckets, by building a maxheap sorted by sector count,
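
The .fragmentation field filled in by walk_buckets_to_copygc() is bucket occupancy scaled to a 2^15 fixed-point fraction, so buckets of different sizes compare on equal terms; ordering the heap by -fragmentation_cmp then keeps the fullest (most expensive to move) candidate on top, ready to be evicted first. A standalone sketch of the metric, with invented values:

    #include <stdio.h>

    /* Occupancy as a 0..32768 fixed-point fraction of the bucket. */
    static unsigned fragmentation(unsigned dirty_sectors, unsigned bucket_size)
    {
        return dirty_sectors * (1U << 15) / bucket_size;
    }

    int main(void)
    {
        printf("%u\n", fragmentation(256, 1024));  /* quarter full:  8192 */
        printf("%u\n", fragmentation(256,  512));  /* half full:    16384 */
        printf("%u\n", fragmentation(512, 1024));  /* half full:    16384 */
        return 0;
    }
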
@@ -172,59 +267,45 @@ static int bch2_copygc(struct bch_fs *c)
                spin_lock(&ca->fs->freelist_lock);
                sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
                spin_unlock(&ca->fs->freelist_lock);
+       }
 
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
-                       struct bucket *g = buckets->b + b;
-                       struct bucket_mark m = READ_ONCE(g->mark);
-                       struct copygc_heap_entry e;
-
-                       if (m.owned_by_allocator ||
-                           m.data_type != BCH_DATA_user ||
-                           !bucket_sectors_used(m) ||
-                           bucket_sectors_used(m) >= ca->mi.bucket_size)
-                               continue;
-
-                       WARN_ON(m.stripe && !g->stripe_redundancy);
-
-                       e = (struct copygc_heap_entry) {
-                               .dev            = dev_idx,
-                               .gen            = m.gen,
-                               .replicas       = 1 + g->stripe_redundancy,
-                               .fragmentation  = bucket_sectors_used(m) * (1U << 15)
-                                       / ca->mi.bucket_size,
-                               .sectors        = bucket_sectors_used(m),
-                               .offset         = bucket_to_sector(ca, b),
-                       };
-                       heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
-               }
-               up_read(&ca->bucket_lock);
+       ret = walk_buckets_to_copygc(c);
+       if (ret) {
+               bch2_fs_fatal_error(c, "error walking buckets to copygc!");
+               return ret;
        }
 
-       if (!sectors_reserved) {
-               bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
-               return -1;
+       if (!h->used) {
+               bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!");
+               return 0;
        }
 
        /*
         * Our btree node allocations also come out of RESERVE_MOVINGGC:
         */
-       sectors_to_move = (sectors_to_move * 3) / 4;
+       sectors_reserved = (sectors_reserved * 3) / 4;
+       if (!sectors_reserved) {
+               bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
+               return -1;
+       }
 
-       for (i = h->data; i < h->data + h->used; i++)
-               sectors_to_move += i->sectors * i->replicas;
+       for (i = h->data; i < h->data + h->used; i++) {
+               sectors_to_move += i->sectors;
+               sectors_to_write += i->sectors * i->replicas;
+       }
 
-       while (sectors_to_move > sectors_reserved) {
+       while (sectors_to_write > sectors_reserved) {
                BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
-               sectors_to_move -= e.sectors * e.replicas;
+               sectors_to_write -= e.sectors * e.replicas;
        }
 
        buckets_to_move = h->used;
 
-       if (!buckets_to_move)
+       if (!buckets_to_move) {
+               bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!",
+                                   sectors_reserved);
                return 0;
+       }
 
        eytzinger0_sort(h->data, h->used,
                        sizeof(h->data[0]),
@@ -237,30 +318,18 @@ static int bch2_copygc(struct bch_fs *c)
                             writepoint_ptr(&c->copygc_write_point),
                             copygc_pred, NULL,
                             &move_stats);
+       if (ret) {
+               bch_err(c, "error %i from bch2_move_data() in copygc", ret);
+               return ret;
+       }
 
-       for_each_rw_member(ca, c, dev_idx) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-               for (i = h->data; i < h->data + h->used; i++) {
-                       struct bucket_mark m;
-                       size_t b;
-
-                       if (i->dev != dev_idx)
-                               continue;
-
-                       b = sector_to_bucket(ca, i->offset);
-                       m = READ_ONCE(buckets->b[b].mark);
-
-                       if (i->gen == m.gen &&
-                           bucket_sectors_used(m)) {
-                               sectors_not_moved += bucket_sectors_used(m);
-                               buckets_not_moved++;
-                       }
-               }
-               up_read(&ca->bucket_lock);
+       ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
+       if (ret) {
+               bch_err(c, "error %i from check_copygc_was_done()", ret);
+               return ret;
        }
 
-       if (sectors_not_moved && !ret)
+       if (sectors_not_moved)
                bch_warn_ratelimited(c,
                        "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
                         sectors_not_moved, sectors_to_move,
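
The rewritten accounting distinguishes sectors_to_move (live data) from sectors_to_write (live data times replicas), and pops candidates off the heap until the projected writes fit within 3/4 of the movinggc reserve, since btree node allocations draw from the same reserve. A condensed sketch of that arithmetic over a plain array (the real code pops a heap; names here are invented):

    #include <stddef.h>
    #include <stdint.h>

    struct candidate { uint64_t sectors; unsigned replicas; };

    /*
     * Assuming the array is sorted emptiest-first, drop the fullest
     * candidates from the tail until the projected write volume fits;
     * returns how many buckets survive.
     */
    static size_t trim_to_reserve(struct candidate *c, size_t nr,
                                  uint64_t sectors_reserved)
    {
        uint64_t budget = sectors_reserved * 3 / 4; /* btree nodes share it */
        uint64_t to_write = 0;
        size_t i;

        for (i = 0; i < nr; i++)
            to_write += c[i].sectors * c[i].replicas;

        while (nr && to_write > budget) {
            nr--;
            to_write -= c[nr].sectors * c[nr].replicas;
        }
        return nr;
    }
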
index 5de296078219fc42e749193624d4c7fca4a3f25f..71bf26eb13d5dbcc767cced43f14f55a4cfa9b5c 100644 (file)
@@ -31,17 +31,32 @@ const char * const bch2_btree_ids[] = {
        NULL
 };
 
+const char * const bch2_csum_types[] = {
+       BCH_CSUM_TYPES()
+       NULL
+};
+
 const char * const bch2_csum_opts[] = {
        BCH_CSUM_OPTS()
        NULL
 };
 
+const char * const bch2_compression_types[] = {
+       BCH_COMPRESSION_TYPES()
+       NULL
+};
+
 const char * const bch2_compression_opts[] = {
        BCH_COMPRESSION_OPTS()
        NULL
 };
 
 const char * const bch2_str_hash_types[] = {
+       BCH_STR_HASH_TYPES()
+       NULL
+};
+
+const char * const bch2_str_hash_opts[] = {
        BCH_STR_HASH_OPTS()
        NULL
 };
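
The new bch2_csum_types and bch2_compression_types tables follow the same x-macro convention as the rest of this file (note the #undef x further down): each BCH_*_TYPES()/BCH_*_OPTS() list expands x() once per entry, and a local definition of x() stringifies the names into a NULL-terminated array for match_string(). Roughly, with an abbreviated, illustrative entry list:

    #define BCH_CSUM_OPTS()         \
        x(none,   0)                \
        x(crc32c, 1)                \
        x(crc64,  2)

    #define x(t, n) [n] = #t,
    const char * const bch2_csum_opts[] = {
        BCH_CSUM_OPTS()
        NULL    /* sentinel, so match_string(..., -1, ...) can stop */
    };
    #undef x
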
@@ -51,19 +66,24 @@ const char * const bch2_data_types[] = {
        NULL
 };
 
-const char * const bch2_cache_replacement_policies[] = {
-       BCH_CACHE_REPLACEMENT_POLICIES()
+const char * const bch2_member_states[] = {
+       BCH_MEMBER_STATES()
        NULL
 };
 
-const char * const bch2_member_states[] = {
-       BCH_MEMBER_STATES()
+const char * const bch2_jset_entry_types[] = {
+       BCH_JSET_ENTRY_TYPES()
+       NULL
+};
+
+const char * const bch2_fs_usage_types[] = {
+       BCH_FS_USAGE_TYPES()
        NULL
 };
 
 #undef x
 
-const char * const bch2_d_types[DT_MAX] = {
+const char * const bch2_d_types[BCH_DT_MAX] = {
        [DT_UNKNOWN]    = "unknown",
        [DT_FIFO]       = "fifo",
        [DT_CHR]        = "chr",
@@ -73,6 +93,7 @@ const char * const bch2_d_types[DT_MAX] = {
        [DT_LNK]        = "lnk",
        [DT_SOCK]       = "sock",
        [DT_WHT]        = "whiteout",
+       [DT_SUBVOL]     = "subvol",
 };
 
 void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
@@ -125,41 +146,27 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
        }
 }
 
-/*
- * Initial options from superblock - here we don't want any options undefined,
- * any options the superblock doesn't specify are set to 0:
- */
-struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
-{
-       struct bch_opts opts = bch2_opts_empty();
-
-#define x(_name, _bits, _mode, _type, _sb_opt, ...)                    \
-       if (_sb_opt != NO_SB_OPT)                                       \
-               opt_set(opts, _name, _sb_opt(sb));
-       BCH_OPTS()
-#undef x
-
-       return opts;
-}
-
 const struct bch_option bch2_opt_table[] = {
-#define OPT_BOOL()             .type = BCH_OPT_BOOL
-#define OPT_UINT(_min, _max)   .type = BCH_OPT_UINT, .min = _min, .max = _max
-#define OPT_SECTORS(_min, _max)        .type = BCH_OPT_SECTORS, .min = _min, .max = _max
-#define OPT_STR(_choices)      .type = BCH_OPT_STR, .choices = _choices
+#define OPT_BOOL()             .type = BCH_OPT_BOOL, .min = 0, .max = 2
+#define OPT_UINT(_min, _max)   .type = BCH_OPT_UINT,                   \
+                               .min = _min, .max = _max
+#define OPT_STR(_choices)      .type = BCH_OPT_STR,                    \
+                               .min = 0, .max = ARRAY_SIZE(_choices),\
+                               .choices = _choices
 #define OPT_FN(_fn)            .type = BCH_OPT_FN,                     \
                                .parse = _fn##_parse,                   \
                                .to_text = _fn##_to_text
 
-#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \
+#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help)        \
        [Opt_##_name] = {                                               \
                .attr   = {                                             \
                        .name   = #_name,                               \
-                       .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444,    \
+                       .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444,   \
                },                                                      \
-               .mode   = _mode,                                        \
+               .flags  = _flags,                                       \
                .hint   = _hint,                                        \
                .help   = _help,                                        \
+               .get_sb = _sb_opt,                                      \
                .set_sb = SET_##_sb_opt,                                \
                _type                                                   \
        },
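
For a single option, the rewritten table macro expands roughly as follows (hand-expanded for the errors option declared in opts.h, whitespace simplified; note how OPT_STR() now supplies .min and .max so bch2_opt_validate() can range-check string options too):

    [Opt_errors] = {
        .attr    = {
            .name = "errors",
            .mode = 0644,           /* OPT_RUNTIME is set */
        },
        .flags   = OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,
        .hint    = NULL,
        .help    = "Action to take on filesystem error",
        .get_sb  = BCH_SB_ERROR_ACTION,
        .set_sb  = SET_BCH_SB_ERROR_ACTION,
        .type    = BCH_OPT_STR,
        .min     = 0,
        .max     = ARRAY_SIZE(bch2_error_actions),
        .choices = bch2_error_actions,
    },
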
@@ -202,7 +209,41 @@ static int bch2_mount_opt_lookup(const char *name)
        return bch2_opt_lookup(name);
 }
 
-int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
+static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v)
+{
+       if (v < opt->min) {
+               if (msg)
+                       pr_err("invalid %s%s: too small (min %llu)",
+                              msg, opt->attr.name, opt->min);
+               return -ERANGE;
+       }
+
+       if (opt->max && v >= opt->max) {
+               if (msg)
+                       pr_err("invalid %s%s: too big (max %llu)",
+                              msg, opt->attr.name, opt->max);
+               return -ERANGE;
+       }
+
+       if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) {
+               if (msg)
+                       pr_err("invalid %s %s: not a multiple of 512",
+                              msg, opt->attr.name);
+               return -EINVAL;
+       }
+
+       if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
+               if (msg)
+                       pr_err("invalid %s%s: must be a power of two",
+                              msg, opt->attr.name);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+int bch2_opt_parse(struct bch_fs *c, const char *msg,
+                  const struct bch_option *opt,
                   const char *val, u64 *res)
 {
        ssize_t ret;
@@ -212,30 +253,13 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
                ret = kstrtou64(val, 10, res);
                if (ret < 0)
                        return ret;
-
-               if (*res > 1)
-                       return -ERANGE;
                break;
        case BCH_OPT_UINT:
-               ret = kstrtou64(val, 10, res);
-               if (ret < 0)
-                       return ret;
-
-               if (*res < opt->min || *res >= opt->max)
-                       return -ERANGE;
-               break;
-       case BCH_OPT_SECTORS:
-               ret = bch2_strtou64_h(val, res);
+               ret = opt->flags & OPT_HUMAN_READABLE
+                       ? bch2_strtou64_h(val, res)
+                       : kstrtou64(val, 10, res);
                if (ret < 0)
                        return ret;
-
-               if (*res & 511)
-                       return -EINVAL;
-
-               *res >>= 9;
-
-               if (*res < opt->min || *res >= opt->max)
-                       return -ERANGE;
                break;
        case BCH_OPT_STR:
                ret = match_string(opt->choices, -1, val);
@@ -248,10 +272,12 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
                if (!c)
                        return 0;
 
-               return opt->parse(c, val, res);
+               ret = opt->parse(c, val, res);
+               if (ret < 0)
+                       return ret;
        }
 
-       return 0;
+       return bch2_opt_validate(opt, msg, *res);
 }
 
 void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
@@ -272,10 +298,10 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
        switch (opt->type) {
        case BCH_OPT_BOOL:
        case BCH_OPT_UINT:
-               pr_buf(out, "%lli", v);
-               break;
-       case BCH_OPT_SECTORS:
-               bch2_hprint(out, v);
+               if (opt->flags & OPT_HUMAN_READABLE)
+                       bch2_hprint(out, v);
+               else
+                       pr_buf(out, "%lli", v);
                break;
        case BCH_OPT_STR:
                if (flags & OPT_SHOW_FULL_LIST)
@@ -349,7 +375,8 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
                        if (id < 0)
                                goto bad_opt;
 
-                       ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v);
+                       ret = bch2_opt_parse(c, "mount option ",
+                                            &bch2_opt_table[id], val, &v);
                        if (ret < 0)
                                goto bad_val;
                } else {
@@ -369,7 +396,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
                                goto no_val;
                }
 
-               if (!(bch2_opt_table[id].mode & OPT_MOUNT))
+               if (!(bch2_opt_table[id].flags & OPT_MOUNT))
                        goto bad_opt;
 
                if (id == Opt_acl &&
@@ -404,6 +431,65 @@ out:
        return ret;
 }
 
+/*
+ * Initial options from superblock - here we don't want any options undefined,
+ * any options the superblock doesn't specify are set to 0:
+ */
+int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb)
+{
+       unsigned id;
+       int ret;
+
+       for (id = 0; id < bch2_opts_nr; id++) {
+               const struct bch_option *opt = bch2_opt_table + id;
+               u64 v;
+
+               if (opt->get_sb == NO_SB_OPT)
+                       continue;
+
+               v = opt->get_sb(sb);
+
+               if (opt->flags & OPT_SB_FIELD_ILOG2)
+                       v = 1ULL << v;
+
+               if (opt->flags & OPT_SB_FIELD_SECTORS)
+                       v <<= 9;
+
+               ret = bch2_opt_validate(opt, "superblock option ", v);
+               if (ret)
+                       return ret;
+
+               bch2_opt_set_by_id(opts, id, v);
+       }
+
+       return 0;
+}
+
+void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v)
+{
+       if (opt->set_sb == SET_NO_SB_OPT)
+               return;
+
+       if (opt->flags & OPT_SB_FIELD_SECTORS)
+               v >>= 9;
+
+       if (opt->flags & OPT_SB_FIELD_ILOG2)
+               v = ilog2(v);
+
+       opt->set_sb(sb, v);
+}
+
+void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v)
+{
+       if (opt->set_sb == SET_NO_SB_OPT)
+               return;
+
+       mutex_lock(&c->sb_lock);
+       __bch2_opt_set_sb(c->disk_sb.sb, opt, v);
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+}
+
 /* io opts: */
 
 struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
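
The msg argument threaded through bch2_opt_parse() and bch2_opt_validate() exists only to prefix error messages, so callers can say where a bad value came from. A usage sketch built from the functions in this file (apply_mount_opt() itself is invented, error handling trimmed):

    static int apply_mount_opt(struct bch_fs *c, struct bch_opts *opts,
                               const char *name, const char *val)
    {
        u64 v;
        int id = bch2_opt_lookup(name);
        int ret;

        if (id < 0 || !(bch2_opt_table[id].flags & OPT_MOUNT))
            return -EINVAL;

        /* "mount option " becomes the prefix of any pr_err() output: */
        ret = bch2_opt_parse(c, "mount option ", &bch2_opt_table[id], val, &v);
        if (ret < 0)
            return ret;

        bch2_opt_set_by_id(opts, id, v);
        return 0;
    }
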
index 003c00f2503730d0e3b792084ad5994689fa740e..affe9233d708094c0ab032b170bda23f3deba763 100644 (file)
@@ -12,14 +12,23 @@ extern const char * const bch2_error_actions[];
 extern const char * const bch2_sb_features[];
 extern const char * const bch2_sb_compat[];
 extern const char * const bch2_btree_ids[];
+extern const char * const bch2_csum_types[];
 extern const char * const bch2_csum_opts[];
+extern const char * const bch2_compression_types[];
 extern const char * const bch2_compression_opts[];
 extern const char * const bch2_str_hash_types[];
+extern const char * const bch2_str_hash_opts[];
 extern const char * const bch2_data_types[];
-extern const char * const bch2_cache_replacement_policies[];
 extern const char * const bch2_member_states[];
+extern const char * const bch2_jset_entry_types[];
+extern const char * const bch2_fs_usage_types[];
 extern const char * const bch2_d_types[];
 
+static inline const char *bch2_d_type_str(unsigned d_type)
+{
+       return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
+}
+
 /*
  * Mount options; we also store defaults in the superblock.
  *
@@ -36,18 +45,22 @@ extern const char * const bch2_d_types[];
 LE64_BITMASK(NO_SB_OPT,                struct bch_sb, flags[0], 0, 0);
 
 /* When can be set: */
-enum opt_mode {
-       OPT_FORMAT      = (1 << 0),
-       OPT_MOUNT       = (1 << 1),
-       OPT_RUNTIME     = (1 << 2),
-       OPT_INODE       = (1 << 3),
-       OPT_DEVICE      = (1 << 4),
+enum opt_flags {
+       OPT_FS          = (1 << 0),     /* Filesystem option */
+       OPT_DEVICE      = (1 << 1),     /* Device option */
+       OPT_INODE       = (1 << 2),     /* Inode option */
+       OPT_FORMAT      = (1 << 3),     /* May be specified at format time */
+       OPT_MOUNT       = (1 << 4),     /* May be specified at mount time */
+       OPT_RUNTIME     = (1 << 5),     /* May be specified at runtime */
+       OPT_HUMAN_READABLE = (1 << 6),
+       OPT_MUST_BE_POW_2 = (1 << 7),   /* Must be power of 2 */
+       OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */
+       OPT_SB_FIELD_ILOG2 = (1 << 9),  /* Superblock field is ilog2 of actual value */
 };
 
 enum opt_type {
        BCH_OPT_BOOL,
        BCH_OPT_UINT,
-       BCH_OPT_SECTORS,
        BCH_OPT_STR,
        BCH_OPT_FN,
 };
@@ -72,223 +85,242 @@ enum opt_type {
  */
 
 #ifdef __KERNEL__
-#define RATELIMIT_ERRORS true
+#define RATELIMIT_ERRORS_DEFAULT true
 #else
-#define RATELIMIT_ERRORS false
+#define RATELIMIT_ERRORS_DEFAULT false
 #endif
 
 #define BCH_OPTS()                                                     \
        x(block_size,                   u16,                            \
-         OPT_FORMAT,                                                   \
-         OPT_SECTORS(1, 128),                                          \
+         OPT_FS|OPT_FORMAT|                                            \
+         OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS,    \
+         OPT_UINT(512, 1U << 16),                                      \
          BCH_SB_BLOCK_SIZE,            8,                              \
          "size",       NULL)                                           \
-       x(btree_node_size,              u16,                            \
-         OPT_FORMAT,                                                   \
-         OPT_SECTORS(1, 512),                                          \
+       x(btree_node_size,              u32,                            \
+         OPT_FS|OPT_FORMAT|                                            \
+         OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS,    \
+         OPT_UINT(512, 1U << 20),                                      \
          BCH_SB_BTREE_NODE_SIZE,       512,                            \
          "size",       "Btree node size, default 256k")                \
        x(errors,                       u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
          OPT_STR(bch2_error_actions),                                  \
          BCH_SB_ERROR_ACTION,          BCH_ON_ERROR_ro,                \
          NULL,         "Action to take on filesystem error")           \
        x(metadata_replicas,            u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
          OPT_UINT(1, BCH_REPLICAS_MAX),                                \
          BCH_SB_META_REPLICAS_WANT,    1,                              \
          "#",          "Number of metadata replicas")                  \
        x(data_replicas,                u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_UINT(1, BCH_REPLICAS_MAX),                                \
          BCH_SB_DATA_REPLICAS_WANT,    1,                              \
          "#",          "Number of data replicas")                      \
        x(metadata_replicas_required, u8,                               \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
          OPT_UINT(1, BCH_REPLICAS_MAX),                                \
          BCH_SB_META_REPLICAS_REQ,     1,                              \
          "#",          NULL)                                           \
        x(data_replicas_required,       u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
          OPT_UINT(1, BCH_REPLICAS_MAX),                                \
          BCH_SB_DATA_REPLICAS_REQ,     1,                              \
          "#",          NULL)                                           \
+       x(encoded_extent_max,           u32,                            \
+         OPT_FS|OPT_FORMAT|                                            \
+         OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\
+         OPT_UINT(4096, 2U << 20),                                     \
+         BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10,                     \
+         "size",       "Maximum size of checksummed/compressed extents")\
        x(metadata_checksum,            u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
          OPT_STR(bch2_csum_opts),                                      \
          BCH_SB_META_CSUM_TYPE,        BCH_CSUM_OPT_crc32c,            \
          NULL,         NULL)                                           \
        x(data_checksum,                u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_STR(bch2_csum_opts),                                      \
          BCH_SB_DATA_CSUM_TYPE,        BCH_CSUM_OPT_crc32c,            \
          NULL,         NULL)                                           \
        x(compression,                  u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_STR(bch2_compression_opts),                               \
          BCH_SB_COMPRESSION_TYPE,      BCH_COMPRESSION_OPT_none,       \
          NULL,         NULL)                                           \
        x(background_compression,       u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_STR(bch2_compression_opts),                               \
          BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none,  \
          NULL,         NULL)                                           \
        x(str_hash,                     u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
-         OPT_STR(bch2_str_hash_types),                                 \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
+         OPT_STR(bch2_str_hash_opts),                                  \
          BCH_SB_STR_HASH_TYPE,         BCH_STR_HASH_OPT_siphash,       \
          NULL,         "Hash function for directory entries and xattrs")\
        x(metadata_target,              u16,                            \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_target),                                      \
          BCH_SB_METADATA_TARGET,       0,                              \
          "(target)",   "Device or disk group for metadata writes")     \
        x(foreground_target,            u16,                            \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_target),                                      \
          BCH_SB_FOREGROUND_TARGET,     0,                              \
          "(target)",   "Device or disk group for foreground writes")   \
        x(background_target,            u16,                            \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_target),                                      \
          BCH_SB_BACKGROUND_TARGET,     0,                              \
          "(target)",   "Device or disk group to move data to in the background")\
        x(promote_target,               u16,                            \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_target),                                      \
          BCH_SB_PROMOTE_TARGET,        0,                              \
          "(target)",   "Device or disk group to promote data to on read")\
        x(erasure_code,                 u16,                            \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_BOOL(),                                                   \
          BCH_SB_ERASURE_CODE,          false,                          \
          NULL,         "Enable erasure coding (DO NOT USE YET)")       \
        x(inodes_32bit,                 u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
          OPT_BOOL(),                                                   \
          BCH_SB_INODE_32BIT,           true,                           \
          NULL,         "Constrain inode numbers to 32 bits")           \
        x(shard_inode_numbers,          u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
          OPT_BOOL(),                                                   \
-         BCH_SB_SHARD_INUMS,           false,                          \
+         BCH_SB_SHARD_INUMS,           true,                           \
          NULL,         "Shard new inode numbers by CPU id")            \
        x(inodes_use_key_cache, u8,                                     \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
          OPT_BOOL(),                                                   \
          BCH_SB_INODES_USE_KEY_CACHE,  true,                           \
          NULL,         "Use the btree key cache for the inodes btree") \
        x(btree_node_mem_ptr_optimization, u8,                          \
-         OPT_MOUNT|OPT_RUNTIME,                                        \
+         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    true,                           \
          NULL,         "Stash pointer to in memory btree node in btree ptr")\
        x(gc_reserve_percent,           u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
          OPT_UINT(5, 21),                                              \
          BCH_SB_GC_RESERVE,            8,                              \
          "%",          "Percentage of disk space to reserve for copygc")\
        x(gc_reserve_bytes,             u64,                            \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
-         OPT_SECTORS(0, U64_MAX),                                      \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|                      \
+         OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS,                      \
+         OPT_UINT(0, U64_MAX),                                         \
          BCH_SB_GC_RESERVE_BYTES,      0,                              \
          "%",          "Amount of disk space to reserve for copygc\n"  \
                        "Takes precedence over gc_reserve_percent if set")\
        x(root_reserve_percent,         u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
          OPT_UINT(0, 100),                                             \
          BCH_SB_ROOT_RESERVE,          0,                              \
          "%",          "Percentage of disk space to reserve for superuser")\
        x(wide_macs,                    u8,                             \
-         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
          OPT_BOOL(),                                                   \
          BCH_SB_128_BIT_MACS,          false,                          \
          NULL,         "Store full 128 bits of cryptographic MACs, instead of 80")\
        x(inline_data,                  u8,                             \
-         OPT_MOUNT|OPT_RUNTIME,                                        \
+         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    true,                           \
          NULL,         "Enable inline data extents")                   \
        x(acl,                          u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
          OPT_BOOL(),                                                   \
          BCH_SB_POSIX_ACL,             true,                           \
          NULL,         "Enable POSIX acls")                            \
        x(usrquota,                     u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
          OPT_BOOL(),                                                   \
          BCH_SB_USRQUOTA,              false,                          \
          NULL,         "Enable user quotas")                           \
        x(grpquota,                     u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
          OPT_BOOL(),                                                   \
          BCH_SB_GRPQUOTA,              false,                          \
          NULL,         "Enable group quotas")                          \
        x(prjquota,                     u8,                             \
-         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT,                                  \
          OPT_BOOL(),                                                   \
          BCH_SB_PRJQUOTA,              false,                          \
          NULL,         "Enable project quotas")                        \
        x(degraded,                     u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Allow mounting in degraded mode")              \
        x(very_degraded,                u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Allow mounting in when data will be missing")  \
        x(discard,                      u8,                             \
-         OPT_MOUNT|OPT_DEVICE,                                         \
+         OPT_FS|OPT_MOUNT|OPT_DEVICE,                                  \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Enable discard/TRIM support")                  \
        x(verbose,                      u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Extra debugging information during mount/recovery")\
+       x(journal_flush_delay,          u32,                            \
+         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
+         OPT_UINT(0, U32_MAX),                                         \
+         BCH_SB_JOURNAL_FLUSH_DELAY,   1000,                           \
+         NULL,         "Delay in milliseconds before automatic journal commits")\
        x(journal_flush_disabled,       u8,                             \
-         OPT_MOUNT|OPT_RUNTIME,                                        \
+         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    false,                          \
+         BCH_SB_JOURNAL_FLUSH_DISABLED,false,                          \
          NULL,         "Disable journal flush on sync/fsync\n"         \
                        "If enabled, writes can be lost, but only since the\n"\
                        "last journal write (default 1 second)")        \
+       x(journal_reclaim_delay,        u32,                            \
+         OPT_FS|OPT_MOUNT|OPT_RUNTIME,                                 \
+         OPT_UINT(0, U32_MAX),                                         \
+         BCH_SB_JOURNAL_RECLAIM_DELAY, 100,                            \
+         NULL,         "Delay in milliseconds before automatic journal reclaim")\
        x(fsck,                         u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Run fsck on mount")                            \
        x(fix_errors,                   u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Fix errors during fsck without asking")        \
        x(ratelimit_errors,             u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
-         NO_SB_OPT,                    RATELIMIT_ERRORS,               \
+         NO_SB_OPT,                    RATELIMIT_ERRORS_DEFAULT,       \
          NULL,         "Ratelimit error messages during fsck")         \
        x(nochanges,                    u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Super read only mode - no writes at all will be issued,\n"\
                        "even if we have to replay the journal")        \
        x(norecovery,                   u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Don't replay the journal")                     \
        x(rebuild_replicas,             u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Rebuild the superblock replicas section")      \
        x(keep_journal,                 u8,                             \
-         OPT_MOUNT,                                                    \
+         0,                                                            \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Don't free journal entries/keys after startup")\
@@ -297,8 +329,13 @@ enum opt_type {
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Read all journal entries, not just dirty ones")\
+       x(journal_transaction_names,    u8,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
+         OPT_BOOL(),                                                   \
+         BCH_SB_JOURNAL_TRANSACTION_NAMES, true,                       \
+         NULL,         "Log transaction function names in journal")    \
        x(noexcl,                       u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Don't open device in exclusive mode")          \
@@ -308,7 +345,7 @@ enum opt_type {
          NO_SB_OPT,                    BCH_SB_SECTOR,                  \
          "offset",     "Sector offset of superblock")                  \
        x(read_only,                    u8,                             \
-         0,                                                            \
+         OPT_FS,                                                       \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         NULL)                                           \
@@ -318,16 +355,21 @@ enum opt_type {
          NO_SB_OPT,                    false,                          \
          NULL,         "Don\'t start filesystem, only open devices")   \
        x(reconstruct_alloc,            u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Reconstruct alloc btree")                      \
        x(version_upgrade,              u8,                             \
-         OPT_MOUNT,                                                    \
+         OPT_FS|OPT_MOUNT,                                             \
          OPT_BOOL(),                                                   \
          NO_SB_OPT,                    false,                          \
          NULL,         "Set superblock to latest version,\n"           \
                        "allowing any new features to be used")         \
+       x(buckets_nouse,                u8,                             \
+         0,                                                            \
+         OPT_BOOL(),                                                   \
+         NO_SB_OPT,                    false,                          \
+         NULL,         "Allocate the buckets_nouse bitmap")            \
        x(project,                      u8,                             \
          OPT_INODE,                                                    \
          OPT_BOOL(),                                                   \
@@ -335,12 +377,12 @@ enum opt_type {
          NULL,         NULL)                                           \
        x(fs_size,                      u64,                            \
          OPT_DEVICE,                                                   \
-         OPT_SECTORS(0, S64_MAX),                                      \
+         OPT_UINT(0, S64_MAX),                                         \
          NO_SB_OPT,                    0,                              \
          "size",       "Size of filesystem on device")                 \
        x(bucket,                       u32,                            \
          OPT_DEVICE,                                                   \
-         OPT_SECTORS(0, S64_MAX),                                      \
+         OPT_UINT(0, S64_MAX),                                         \
          NO_SB_OPT,                    0,                              \
          "size",       "Size of filesystem on device")                 \
        x(durability,                   u8,                             \
@@ -399,13 +441,14 @@ struct printbuf;
 
 struct bch_option {
        struct attribute        attr;
+       u64                     (*get_sb)(const struct bch_sb *);
        void                    (*set_sb)(struct bch_sb *, u64);
-       enum opt_mode           mode;
        enum opt_type           type;
+       enum opt_flags          flags;
+       u64                     min, max;
 
        union {
        struct {
-               u64             min, max;
        };
        struct {
                const char * const *choices;
@@ -427,10 +470,13 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
 u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
 void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
 
-struct bch_opts bch2_opts_from_sb(struct bch_sb *);
+int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *);
+void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64);
+void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64);
 
 int bch2_opt_lookup(const char *);
-int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
+int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *,
+                  const char *, u64 *);
 
 #define OPT_SHOW_FULL_LIST     (1 << 0)
 #define OPT_SHOW_MOUNT_STYLE   (1 << 1)
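
OPT_SB_FIELD_SECTORS and OPT_SB_FIELD_ILOG2 describe how a value is packed into its superblock field; bch2_opts_from_sb() and __bch2_opt_set_sb() in the opts.c hunks above apply them in opposite orders. A standalone sketch of the round trip (flag values copied from the enum above):

    #include <stdint.h>

    #define OPT_SB_FIELD_SECTORS (1 << 8)  /* field stores value >> 9 */
    #define OPT_SB_FIELD_ILOG2   (1 << 9)  /* field stores ilog2(value) */

    static uint64_t sb_field_to_opt(uint64_t field, unsigned flags)
    {
        uint64_t v = field;

        if (flags & OPT_SB_FIELD_ILOG2)
            v = 1ULL << v;
        if (flags & OPT_SB_FIELD_SECTORS)
            v <<= 9;
        return v;
    }

    static uint64_t opt_to_sb_field(uint64_t v, unsigned flags)
    {
        if (flags & OPT_SB_FIELD_SECTORS)
            v >>= 9;
        if (flags & OPT_SB_FIELD_ILOG2)
            v = 63 - __builtin_clzll(v);   /* ilog2(); v must be nonzero */
        return v;
    }

    /*
     * encoded_extent_max uses both flags: its 64 << 10 byte default is
     * stored as ilog2(65536 >> 9) = 7, and (1 << 7) << 9 decodes back
     * to 64KiB.
     */
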
index 7861781a4a7fea4de99f209070be22f61ec71b84..6fb8224f565e3a00a2960a5dde41f2182a0dbe5a 100644 (file)
@@ -3,17 +3,21 @@
 #include "btree_update.h"
 #include "inode.h"
 #include "quota.h"
+#include "subvolume.h"
 #include "super-io.h"
 
-static const char *bch2_sb_validate_quota(struct bch_sb *sb,
-                                         struct bch_sb_field *f)
+static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f,
+                                 struct printbuf *err)
 {
        struct bch_sb_field_quota *q = field_to_type(f, quota);
 
-       if (vstruct_bytes(&q->field) != sizeof(*q))
-               return "invalid field quota: wrong size";
+       if (vstruct_bytes(&q->field) < sizeof(*q)) {
+               pr_buf(err, "wrong size (got %llu should be %zu)",
+                      vstruct_bytes(&q->field), sizeof(*q));
+       }
 
-       return NULL;
+       return 0;
 }
 
 const struct bch_sb_field_ops bch_sb_field_ops_quota = {
@@ -357,7 +360,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
 static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
@@ -372,9 +375,10 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
-       return bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
+       return ret;
 }
 
 void bch2_fs_quota_exit(struct bch_fs *c)
@@ -414,14 +418,55 @@ static void bch2_sb_quota_read(struct bch_fs *c)
        }
 }
 
+static int bch2_fs_quota_read_inode(struct btree_trans *trans,
+                                   struct btree_iter *iter)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_inode_unpacked u;
+       struct bch_subvolume subvolume;
+       struct bkey_s_c k;
+       int ret;
+
+       k = bch2_btree_iter_peek(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       if (!k.k)
+               return 1;
+
+       ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume);
+       if (ret)
+               return ret;
+
+       /*
+        * We don't do quota accounting in snapshots:
+        */
+       if (BCH_SUBVOLUME_SNAP(&subvolume))
+               goto advance;
+
+       if (!bkey_is_inode(k.k))
+               goto advance;
+
+       ret = bch2_inode_unpack(k, &u);
+       if (ret)
+               return ret;
+
+       bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
+                       KEY_TYPE_QUOTA_NOCHECK);
+       bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+                       KEY_TYPE_QUOTA_NOCHECK);
+advance:
+       bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1));
+       return 0;
+}
+
 int bch2_fs_quota_read(struct bch_fs *c)
 {
        unsigned i, qtypes = enabled_qtypes(c);
        struct bch_memquota_type *q;
        struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bch_inode_unpacked u;
-       struct bkey_s_c k;
+       struct btree_iter iter;
        int ret;
 
        mutex_lock(&c->sb_lock);
@@ -436,23 +481,18 @@ int bch2_fs_quota_read(struct bch_fs *c)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
-                          BTREE_ITER_PREFETCH, k, ret) {
-               switch (k.k->type) {
-               case KEY_TYPE_inode:
-                       ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
-                       if (ret)
-                               return ret;
-
-                       bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
-                                       KEY_TYPE_QUOTA_NOCHECK);
-                       bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
-                                       KEY_TYPE_QUOTA_NOCHECK);
-               }
-       }
-       bch2_trans_iter_put(&trans, iter);
-
-       return bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN,
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       do {
+               ret = lockrestart_do(&trans,
+                                    bch2_fs_quota_read_inode(&trans, &iter));
+       } while (!ret);
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret < 0 ? ret : 0;
 }
 
 /* Enable/disable/delete quotas for an entire filesystem: */
@@ -532,7 +572,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
                                              POS(QTYP_USR, 0),
                                              POS(QTYP_USR + 1, 0),
-                                             NULL);
+                                             0, NULL);
                if (ret)
                        return ret;
        }
@@ -544,7 +584,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
                                              POS(QTYP_GRP, 0),
                                              POS(QTYP_GRP + 1, 0),
-                                             NULL);
+                                             0, NULL);
                if (ret)
                        return ret;
        }
@@ -556,7 +596,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
                ret = bch2_btree_delete_range(c, BTREE_ID_quotas,
                                              POS(QTYP_PRJ, 0),
                                              POS(QTYP_PRJ + 1, 0),
-                                             NULL);
+                                             0, NULL);
                if (ret)
                        return ret;
        }
@@ -717,13 +757,13 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
                                struct bkey_i_quota *new_quota,
                                struct qc_dqblk *qdq)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p,
-                                  BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p,
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
 
        ret = bkey_err(k);
        if (unlikely(ret))
@@ -742,8 +782,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans,
        if (qdq->d_fieldmask & QC_INO_HARD)
                new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
 
-       ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0);
-       bch2_trans_iter_put(trans, iter);
+       ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
index a0dbf41d1d3763c432c0c0ba90f9ecc498623f74..a573fede05b11fba7a5ada92b9bbfae322608612 100644 (file)
@@ -166,6 +166,7 @@ static int bch2_rebalance_thread(void *arg)
        struct bch_fs_rebalance *r = &c->rebalance;
        struct io_clock *clock = &c->io_clock[WRITE];
        struct rebalance_work w, p;
+       struct bch_move_stats move_stats;
        unsigned long start, prev_start;
        unsigned long prev_run_time, prev_run_cputime;
        unsigned long cputime, prev_cputime;
@@ -179,6 +180,7 @@ static int bch2_rebalance_thread(void *arg)
        prev_start      = jiffies;
        prev_cputime    = curr_cputime();
 
+       bch_move_stats_init(&move_stats, "rebalance");
        while (!kthread_wait_freezable(r->enabled)) {
                cond_resched();
 
@@ -235,7 +237,7 @@ static int bch2_rebalance_thread(void *arg)
                prev_cputime    = cputime;
 
                r->state = REBALANCE_RUNNING;
-               memset(&r->move_stats, 0, sizeof(r->move_stats));
+               memset(&move_stats, 0, sizeof(move_stats));
                rebalance_work_reset(c);
 
                bch2_move_data(c,
@@ -245,7 +247,7 @@ static int bch2_rebalance_thread(void *arg)
                               NULL, /*  &r->pd.rate, */
                               writepoint_ptr(&c->rebalance_write_point),
                               rebalance_pred, NULL,
-                              &r->move_stats);
+                              &move_stats);
        }
 
        return 0;
@@ -281,10 +283,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
                       h1);
                break;
        case REBALANCE_RUNNING:
-               pr_buf(out, "running\n"
-                      "pos ");
-               bch2_bpos_to_text(out, r->move_stats.pos);
-               pr_buf(out, "\n");
+               pr_buf(out, "running\n");
                break;
        }
 }
index 2f62a643c39fbb0c08f024fbf58a7f3325755875..7462a92e95985d91cdc454485d045659240dd0fc 100644 (file)
@@ -19,7 +19,6 @@ struct bch_fs_rebalance {
        enum rebalance_state    state;
        u64                     throttled_until_iotime;
        unsigned long           throttled_until_cputime;
-       struct bch_move_stats   move_stats;
 
        unsigned                enabled:1;
 };
index afb72648fe5416a0cba73f59548f1933ad78fc86..543db58ff4d6a087b57aae732da116ccd993998f 100644 (file)
@@ -20,6 +20,7 @@
 #include "quota.h"
 #include "recovery.h"
 #include "replicas.h"
+#include "subvolume.h"
 #include "super-io.h"
 
 #include <linux/sort.h>
@@ -58,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
 static int __journal_key_cmp(enum btree_id     l_btree_id,
                             unsigned           l_level,
                             struct bpos        l_pos,
-                            struct journal_key *r)
+                            const struct journal_key *r)
 {
        return (cmp_int(l_btree_id,     r->btree_id) ?:
                cmp_int(l_level,        r->level) ?:
                bpos_cmp(l_pos, r->k->k.p));
 }
 
-static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
 {
-       return (cmp_int(l->btree_id,    r->btree_id) ?:
-               cmp_int(l->level,       r->level) ?:
-               bpos_cmp(l->k->k.p,     r->k->k.p));
+       return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
 }
 
-static size_t journal_key_search(struct journal_keys *journal_keys,
-                                enum btree_id id, unsigned level,
-                                struct bpos pos)
+size_t bch2_journal_key_search(struct journal_keys *journal_keys,
+                              enum btree_id id, unsigned level,
+                              struct bpos pos)
 {
        size_t l = 0, r = journal_keys->nr, m;
 
@@ -108,18 +107,25 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign
                iter->idx++;
 }
 
-int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
-                           unsigned level, struct bkey_i *k)
+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
+                                unsigned level, struct bkey_i *k)
 {
        struct journal_key n = {
                .btree_id       = id,
                .level          = level,
                .k              = k,
-               .allocated      = true
+               .allocated      = true,
+               /*
+                * Ensure these keys are done last by journal replay, to unblock
+                * journal reclaim:
+                */
+               .journal_seq    = U32_MAX,
        };
        struct journal_keys *keys = &c->journal_keys;
        struct journal_iter *iter;
-       unsigned idx = journal_key_search(keys, id, level, k->k.p);
+       size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
+
+       BUG_ON(test_bit(BCH_FS_RW, &c->flags));
 
        if (idx < keys->nr &&
            journal_key_cmp(&n, &keys->d[idx]) == 0) {
@@ -156,38 +162,66 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
        return 0;
 }
 
-int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
-                           unsigned level, struct bpos pos)
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've gone RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bkey_i *k)
 {
-       struct bkey_i *whiteout =
-               kmalloc(sizeof(struct bkey), GFP_KERNEL);
+       struct bkey_i *n;
        int ret;
 
-       if (!whiteout) {
-               bch_err(c, "%s: error allocating new key", __func__);
+       n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL);
+       if (!n)
                return -ENOMEM;
-       }
-
-       bkey_init(&whiteout->k);
-       whiteout->k.p = pos;
 
-       ret = bch2_journal_key_insert(c, id, level, whiteout);
+       bkey_copy(n, k);
+       ret = bch2_journal_key_insert_take(c, id, level, n);
        if (ret)
-               kfree(whiteout);
+               kfree(n);
        return ret;
 }
 
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bpos pos)
+{
+       struct bkey_i whiteout;
+
+       bkey_init(&whiteout.k);
+       whiteout.k.p = pos;
+
+       return bch2_journal_key_insert(c, id, level, &whiteout);
+}
+
+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
+                                 unsigned level, struct bpos pos)
+{
+       struct journal_keys *keys = &c->journal_keys;
+       size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+       if (idx < keys->nr &&
+           keys->d[idx].btree_id       == btree &&
+           keys->d[idx].level          == level &&
+           !bpos_cmp(keys->d[idx].k->k.p, pos))
+               keys->d[idx].overwritten = true;
+}
+
 static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
 {
-       struct journal_key *k = iter->idx - iter->keys->nr
-               ? iter->keys->d + iter->idx : NULL;
+       struct journal_key *k = iter->keys->d + iter->idx;
+
+       while (k < iter->keys->d + iter->keys->nr &&
+              k->btree_id      == iter->btree_id &&
+              k->level         == iter->level) {
+               if (!k->overwritten)
+                       return k->k;
 
-       if (k &&
-           k->btree_id == iter->btree_id &&
-           k->level    == iter->level)
-               return k->k;
+               iter->idx++;
+               k = iter->keys->d + iter->idx;
+       }
 
-       iter->idx = iter->keys->nr;
        return NULL;
 }
 
@@ -210,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c,
        iter->btree_id  = id;
        iter->level     = level;
        iter->keys      = &c->journal_keys;
-       iter->idx       = journal_key_search(&c->journal_keys, id, level, pos);
-       list_add(&iter->list, &c->journal_iters);
+       iter->idx       = bch2_journal_key_search(&c->journal_keys, id, level, pos);
 }
 
 static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -297,104 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
        bch2_journal_iter_exit(&iter->journal);
 }
 
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                               struct bch_fs *c,
-                                               struct btree *b)
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                                 struct bch_fs *c,
+                                                 struct btree *b,
+                                                 struct btree_node_iter node_iter,
+                                                 struct bpos pos)
 {
        memset(iter, 0, sizeof(*iter));
 
        iter->b = b;
-       bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
-       bch2_journal_iter_init(c, &iter->journal,
-                              b->c.btree_id, b->c.level, b->data->min_key);
-}
-
-/* Walk btree, overlaying keys from the journal: */
-
-static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
-                                          struct btree_and_journal_iter iter)
-{
-       unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
-       struct bkey_s_c k;
-       struct bkey_buf tmp;
-
-       BUG_ON(!b->c.level);
-
-       bch2_bkey_buf_init(&tmp);
-
-       while (i < nr &&
-              (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               bch2_bkey_buf_reassemble(&tmp, c, k);
-
-               bch2_btree_node_prefetch(c, NULL, tmp.k,
-                                       b->c.btree_id, b->c.level - 1);
-
-               bch2_btree_and_journal_iter_advance(&iter);
-               i++;
-       }
-
-       bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b,
-                               enum btree_id btree_id,
-                               btree_walk_key_fn key_fn)
-{
-       struct btree_and_journal_iter iter;
-       struct bkey_s_c k;
-       struct bkey_buf tmp;
-       struct btree *child;
-       int ret = 0;
-
-       bch2_bkey_buf_init(&tmp);
-       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-
-       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
-               if (b->c.level) {
-                       bch2_bkey_buf_reassemble(&tmp, c, k);
-
-                       child = bch2_btree_node_get_noiter(c, tmp.k,
-                                               b->c.btree_id, b->c.level - 1,
-                                               false);
-
-                       ret = PTR_ERR_OR_ZERO(child);
-                       if (ret)
-                               break;
-
-                       btree_and_journal_iter_prefetch(c, b, iter);
-
-                       ret = bch2_btree_and_journal_walk_recurse(c, child,
-                                       btree_id, key_fn);
-                       six_unlock_read(&child->c.lock);
-               } else {
-                       ret = key_fn(c, k);
-               }
-
-               if (ret)
-                       break;
-
-               bch2_btree_and_journal_iter_advance(&iter);
-       }
-
-       bch2_btree_and_journal_iter_exit(&iter);
-       bch2_bkey_buf_exit(&tmp, c);
-       return ret;
+       iter->node_iter = node_iter;
+       bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+       INIT_LIST_HEAD(&iter->journal.list);
 }
 
-int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id,
-                               btree_walk_key_fn key_fn)
+/*
+ * This version is used by btree_gc before the filesystem has gone RW and
+ * multithreaded, so it uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                               struct bch_fs *c,
+                                               struct btree *b)
 {
-       struct btree *b = c->btree_roots[btree_id].b;
-       int ret = 0;
-
-       if (btree_node_fake(b))
-               return 0;
-
-       six_lock_read(&b->c.lock, NULL, NULL);
-       ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn);
-       six_unlock_read(&b->c.lock);
+       struct btree_node_iter node_iter;
 
-       return ret;
+       bch2_btree_node_iter_init_from_start(&node_iter, b);
+       __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+       list_add(&iter->journal.list, &c->journal_iters);
 }
 
 /* sort and dedup all keys in the journal: */
@@ -419,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;
 
-       return  cmp_int(l->btree_id,    r->btree_id) ?:
-               cmp_int(l->level,       r->level) ?:
-               bpos_cmp(l->k->k.p, r->k->k.p) ?:
+       return  journal_key_cmp(l, r) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->journal_offset, r->journal_offset);
 }
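
journal_key_cmp() and journal_sort_key_cmp() are now both layered on __journal_key_cmp(), so the (btree_id, level, pos) ordering is defined in exactly one place. They lean on the cmp_int()/?: idiom: GCC's binary a ?: b yields a when a is nonzero, so each comparison only runs to break a tie in the one before it. A self-contained sketch of the idiom; struct point is illustrative, and cmp_int mirrors the definition in the bcachefs tree:

#define cmp_int(l, r)   (((l) > (r)) - ((l) < (r)))     /* -1, 0 or 1 */

struct point { int x, y; };

static int point_cmp(const struct point *l, const struct point *r)
{
        return  cmp_int(l->x, r->x) ?:  /* primary key */
                cmp_int(l->y, r->y);    /* evaluated only on a tie */
}
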
@@ -514,140 +474,84 @@ static void replay_now_at(struct journal *j, u64 seq)
                bch2_journal_pin_put(j, j->replay_journal_seq++);
 }
 
-static int __bch2_journal_replay_key(struct btree_trans *trans,
-                                    enum btree_id id, unsigned level,
-                                    struct bkey_i *k)
+static int bch2_journal_replay_key(struct btree_trans *trans,
+                                  struct journal_key *k)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
+       unsigned iter_flags =
+               BTREE_ITER_INTENT|
+               BTREE_ITER_NOT_EXTENTS;
        int ret;
 
-       iter = bch2_trans_get_node_iter(trans, id, k->k.p,
-                                       BTREE_MAX_DEPTH, level,
-                                       BTREE_ITER_INTENT|
-                                       BTREE_ITER_NOT_EXTENTS);
-       ret   = bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
-       bch2_trans_iter_put(trans, iter);
-       return ret;
-}
-
-static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
-{
-       unsigned commit_flags = BTREE_INSERT_NOFAIL|
-               BTREE_INSERT_LAZY_RW;
-
-       if (!k->allocated)
-               commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
+       if (!k->level && k->btree_id == BTREE_ID_alloc)
+               iter_flags |= BTREE_ITER_CACHED;
 
-       return bch2_trans_do(c, NULL, NULL, commit_flags,
-                            __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
-}
+       bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+                                 BTREE_MAX_DEPTH, k->level,
+                                 iter_flags);
+       ret = bch2_btree_iter_traverse(&iter);
+       if (ret)
+               goto out;
 
-static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
-{
-       struct btree_iter *iter;
-       int ret;
+       /* Must be checked with btree locked: */
+       if (k->overwritten)
+               goto out;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p,
-                                  BTREE_ITER_CACHED|
-                                  BTREE_ITER_CACHED_NOFILL|
-                                  BTREE_ITER_INTENT);
-       ret   = bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
-       bch2_trans_iter_put(trans, iter);
+       ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN);
+out:
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
-static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
-{
-       return bch2_trans_do(c, NULL, NULL,
-                            BTREE_INSERT_NOFAIL|
-                            BTREE_INSERT_USE_RESERVE|
-                            BTREE_INSERT_LAZY_RW|
-                            BTREE_INSERT_JOURNAL_REPLAY,
-                       __bch2_alloc_replay_key(&trans, k));
-}
-
 static int journal_sort_seq_cmp(const void *_l, const void *_r)
 {
-       const struct journal_key *l = _l;
-       const struct journal_key *r = _r;
+       const struct journal_key *l = *((const struct journal_key **)_l);
+       const struct journal_key *r = *((const struct journal_key **)_r);
 
-       return  cmp_int(r->level,       l->level) ?:
-               cmp_int(l->journal_seq, r->journal_seq) ?:
-               cmp_int(l->btree_id,    r->btree_id) ?:
-               bpos_cmp(l->k->k.p,     r->k->k.p);
+       return cmp_int(l->journal_seq, r->journal_seq);
 }
 
-static int bch2_journal_replay(struct bch_fs *c,
-                              struct journal_keys keys)
+static int bch2_journal_replay(struct bch_fs *c)
 {
+       struct journal_keys *keys = &c->journal_keys;
+       struct journal_key **keys_sorted, *k;
        struct journal *j = &c->journal;
-       struct journal_key *i;
-       u64 seq;
+       size_t i;
        int ret;
 
-       sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
+       keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
+       if (!keys_sorted)
+               return -ENOMEM;
 
-       if (keys.nr)
-               replay_now_at(j, keys.journal_seq_base);
+       for (i = 0; i < keys->nr; i++)
+               keys_sorted[i] = &keys->d[i];
 
-       seq = j->replay_journal_seq;
+       sort(keys_sorted, keys->nr,
+            sizeof(keys_sorted[0]),
+            journal_sort_seq_cmp, NULL);
 
-       /*
-        * First replay updates to the alloc btree - these will only update the
-        * btree key cache:
-        */
-       for_each_journal_key(keys, i) {
-               cond_resched();
+       if (keys->nr)
+               replay_now_at(j, keys->journal_seq_base);
 
-               if (!i->level && i->btree_id == BTREE_ID_alloc) {
-                       j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
-                       ret = bch2_alloc_replay_key(c, i->k);
-                       if (ret)
-                               goto err;
-               }
-       }
+       for (i = 0; i < keys->nr; i++) {
+               k = keys_sorted[i];
 
-       /*
-        * Next replay updates to interior btree nodes:
-        */
-       for_each_journal_key(keys, i) {
                cond_resched();
 
-               if (i->level) {
-                       j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
-                       ret = bch2_journal_replay_key(c, i);
-                       if (ret)
-                               goto err;
-               }
-       }
-
-       /*
-        * Now that the btree is in a consistent state, we can start journal
-        * reclaim (which will be flushing entries from the btree key cache back
-        * to the btree:
-        */
-       set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
-       set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
-       journal_reclaim_kick(j);
-
-       j->replay_journal_seq = seq;
-
-       /*
-        * Now replay leaf node updates:
-        */
-       for_each_journal_key(keys, i) {
-               cond_resched();
+               if (!k->allocated)
+                       replay_now_at(j, keys->journal_seq_base + k->journal_seq);
 
-               if (i->level || i->btree_id == BTREE_ID_alloc)
-                       continue;
-
-               replay_now_at(j, keys.journal_seq_base + i->journal_seq);
-
-               ret = bch2_journal_replay_key(c, i);
-               if (ret)
+               ret = bch2_trans_do(c, NULL, NULL,
+                                   BTREE_INSERT_LAZY_RW|
+                                   BTREE_INSERT_NOFAIL|
+                                   BTREE_INSERT_JOURNAL_RESERVED|
+                                   (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0),
+                            bch2_journal_replay_key(&trans, k));
+               if (ret) {
+                       bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+                               ret, bch2_btree_ids[k->btree_id], k->level);
                        goto err;
+               }
        }
 
        replay_now_at(j, j->replay_journal_seq_end);
@@ -655,10 +559,9 @@ static int bch2_journal_replay(struct bch_fs *c,
 
        bch2_journal_set_replay_done(j);
        bch2_journal_flush_all_pins(j);
-       return bch2_journal_error(j);
+       ret = bch2_journal_error(j);
 err:
-       bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
-               ret, bch2_btree_ids[i->btree_id], i->level);
+       kvfree(keys_sorted);
        return ret;
 }
 
@@ -696,15 +599,15 @@ static int journal_replay_entry_early(struct bch_fs *c,
                        container_of(entry, struct jset_entry_usage, entry);
 
                switch (entry->btree_id) {
-               case FS_USAGE_RESERVED:
+               case BCH_FS_USAGE_reserved:
                        if (entry->level < BCH_REPLICAS_MAX)
                                c->usage_base->persistent_reserved[entry->level] =
                                        le64_to_cpu(u->v);
                        break;
-               case FS_USAGE_INODES:
+               case BCH_FS_USAGE_inodes:
                        c->usage_base->nr_inodes = le64_to_cpu(u->v);
                        break;
-               case FS_USAGE_KEY_VERSION:
+               case BCH_FS_USAGE_key_version:
                        atomic64_set(&c->key_version,
                                     le64_to_cpu(u->v));
                        break;
@@ -724,10 +627,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
                struct jset_entry_dev_usage *u =
                        container_of(entry, struct jset_entry_dev_usage, entry);
                struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
-               unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
-               unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
-                       sizeof(struct jset_entry_dev_usage_type);
-               unsigned i;
+               unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
 
                ca->usage_base->buckets_ec              = le64_to_cpu(u->buckets_ec);
                ca->usage_base->buckets_unavailable     = le64_to_cpu(u->buckets_unavailable);
@@ -961,6 +861,73 @@ fsck_err:
        return ret;
 }
 
+static int bch2_fs_initialize_subvolumes(struct bch_fs *c)
+{
+       struct bkey_i_snapshot  root_snapshot;
+       struct bkey_i_subvolume root_volume;
+       int ret;
+
+       bkey_snapshot_init(&root_snapshot.k_i);
+       root_snapshot.k.p.offset = U32_MAX;
+       root_snapshot.v.flags   = 0;
+       root_snapshot.v.parent  = 0;
+       root_snapshot.v.subvol  = BCACHEFS_ROOT_SUBVOL;
+       root_snapshot.v.pad     = 0;
+       SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+       ret = bch2_btree_insert(c, BTREE_ID_snapshots,
+                               &root_snapshot.k_i,
+                               NULL, NULL, 0);
+       if (ret)
+               return ret;
+
+
+       bkey_subvolume_init(&root_volume.k_i);
+       root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+       root_volume.v.flags     = 0;
+       root_volume.v.snapshot  = cpu_to_le32(U32_MAX);
+       root_volume.v.inode     = cpu_to_le64(BCACHEFS_ROOT_INO);
+
+       ret = bch2_btree_insert(c, BTREE_ID_subvolumes,
+                               &root_volume.k_i,
+                               NULL, NULL, 0);
+       if (ret)
+               return ret;
+
+       return 0;
+}
+
+static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_inode_unpacked inode;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
+                            SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (!bkey_is_inode(k.k)) {
+               bch_err(trans->c, "root inode not found");
+               ret = -ENOENT;
+               goto err;
+       }
+
+       ret = bch2_inode_unpack(k, &inode);
+       BUG_ON(ret);
+
+       inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+       ret = bch2_inode_write(trans, &iter, &inode);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
 int bch2_fs_recovery(struct bch_fs *c)
 {
        const char *err = "cannot allocate memory";
@@ -979,6 +946,8 @@ int bch2_fs_recovery(struct bch_fs *c)
        if (c->sb.clean)
                bch_info(c, "recovering from clean shutdown, journal seq %llu",
                         le64_to_cpu(clean->journal_seq));
+       else
+               bch_info(c, "recovering from unclean shutdown");
 
        if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
                bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
@@ -997,7 +966,6 @@ int bch2_fs_recovery(struct bch_fs *c)
                bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
                ret = -EINVAL;
                goto err;
-
        }
 
        if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
@@ -1012,16 +980,20 @@ int bch2_fs_recovery(struct bch_fs *c)
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }
 
-       if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
-               bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
-               c->opts.version_upgrade = true;
-               c->opts.fsck            = true;
-               c->opts.fix_errors      = FSCK_OPT_YES;
-       }
-
-       if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) {
-               bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required");
-               c->opts.version_upgrade = true;
+       if (!c->opts.nochanges) {
+               if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
+                       bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
+                       c->opts.version_upgrade = true;
+                       c->opts.fsck            = true;
+                       c->opts.fix_errors      = FSCK_OPT_YES;
+               } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) {
+                       bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
+                       c->opts.version_upgrade = true;
+                       c->opts.fsck            = true;
+               } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
+                       bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
+                       c->opts.version_upgrade = true;
+               }
        }
 
        ret = bch2_blacklist_table_initialize(c);
@@ -1033,6 +1005,7 @@ int bch2_fs_recovery(struct bch_fs *c)
        if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
                struct journal_replay *i;
 
+               bch_verbose(c, "starting journal read");
                ret = bch2_journal_read(c, &c->journal_entries,
                                        &blacklist_seq, &journal_seq);
                if (ret)
@@ -1122,7 +1095,11 @@ use_clean:
 
        bch_verbose(c, "starting alloc read");
        err = "error reading allocation information";
-       ret = bch2_alloc_read(c);
+
+       down_read(&c->gc_lock);
+       ret = bch2_alloc_read(c, false, false);
+       up_read(&c->gc_lock);
+
        if (ret)
                goto err;
        bch_verbose(c, "alloc read done");
@@ -1136,18 +1113,25 @@ use_clean:
 
        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 
+       /*
+        * If we're not running fsck, this ensures bch2_fsck_err() calls are
+        * instead interpreted as bch2_inconsistent_err() calls:
+        */
+       if (!c->opts.fsck)
+               set_bit(BCH_FS_FSCK_DONE, &c->flags);
+
        if (c->opts.fsck ||
            !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
            !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
            test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
                bool metadata_only = c->opts.norecovery;
 
-               bch_info(c, "starting mark and sweep");
+               bch_info(c, "checking allocations");
                err = "error in mark and sweep";
                ret = bch2_gc(c, true, metadata_only);
                if (ret)
                        goto err;
-               bch_verbose(c, "mark and sweep done");
+               bch_verbose(c, "done checking allocations");
        }
 
        bch2_stripes_heap_start(c);
@@ -1165,29 +1149,37 @@ use_clean:
        if (c->opts.norecovery)
                goto out;
 
-       bch_verbose(c, "starting journal replay");
+       bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr);
        err = "journal replay failed";
-       ret = bch2_journal_replay(c, c->journal_keys);
+       ret = bch2_journal_replay(c);
        if (ret)
                goto err;
-       bch_verbose(c, "journal replay done");
+       if (c->opts.verbose || !c->sb.clean)
+               bch_info(c, "journal replay done");
 
-       if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
-           !c->opts.nochanges) {
-               /*
-                * note that even when filesystem was clean there might be work
-                * to do here, if we ran gc (because of fsck) which recalculated
-                * oldest_gen:
-                */
-               bch_verbose(c, "writing allocation info");
-               err = "error writing out alloc info";
-               ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?:
-                       bch2_alloc_write(c, BTREE_INSERT_LAZY_RW);
-               if (ret) {
-                       bch_err(c, "error writing alloc info");
+       if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
+               bch2_fs_lazy_rw(c);
+
+               err = "error creating root snapshot node";
+               ret = bch2_fs_initialize_subvolumes(c);
+               if (ret)
+                       goto err;
+       }
+
+       bch_verbose(c, "reading snapshots table");
+       err = "error reading snapshots table";
+       ret = bch2_fs_snapshots_start(c);
+       if (ret)
+               goto err;
+       bch_verbose(c, "reading snapshots done");
+
+       if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
+               /* set bi_subvol on root inode */
+               err = "error upgrade root inode for subvolumes";
+               ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+                                   bch2_fs_upgrade_for_subvolumes(&trans));
+               if (ret)
                        goto err;
-               }
-               bch_verbose(c, "alloc write done");
        }
 
        if (c->opts.fsck) {
@@ -1214,21 +1206,6 @@ use_clean:
                bch_verbose(c, "quotas done");
        }
 
-       if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
-           !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
-               struct bch_move_stats stats = { 0 };
-
-               bch_info(c, "scanning for old btree nodes");
-               ret = bch2_fs_read_write(c);
-               if (ret)
-                       goto err;
-
-               ret = bch2_scan_old_btree_nodes(c, &stats);
-               if (ret)
-                       goto err;
-               bch_info(c, "scanning for old btree nodes done");
-       }
-
        mutex_lock(&c->sb_lock);
        if (c->opts.version_upgrade) {
                c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
@@ -1253,6 +1230,24 @@ use_clean:
                bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
+       if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+           !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
+           le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) {
+               struct bch_move_stats stats;
+
+               bch_move_stats_init(&stats, "recovery");
+
+               bch_info(c, "scanning for old btree nodes");
+               ret = bch2_fs_read_write(c);
+               if (ret)
+                       goto err;
+
+               ret = bch2_scan_old_btree_nodes(c, &stats);
+               if (ret)
+                       goto err;
+               bch_info(c, "scanning for old btree nodes done");
+       }
+
        if (c->journal_seq_blacklist_table &&
            c->journal_seq_blacklist_table->nr > 128)
                queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
@@ -1300,20 +1295,15 @@ int bch2_fs_initialize(struct bch_fs *c)
                c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
                bch2_write_super(c);
        }
-
-       for_each_online_member(ca, c, i)
-               bch2_mark_dev_superblock(c, ca, 0);
        mutex_unlock(&c->sb_lock);
 
        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+       set_bit(BCH_FS_FSCK_DONE, &c->flags);
 
        for (i = 0; i < BTREE_ID_NR; i++)
                bch2_btree_root_alloc(c, i);
 
-       set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
-       set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
-
        err = "unable to allocate journal buckets";
        for_each_online_member(ca, c, i) {
                ret = bch2_dev_journal_alloc(ca);
@@ -1346,11 +1336,26 @@ int bch2_fs_initialize(struct bch_fs *c)
                        percpu_ref_put(&ca->ref);
                        goto err;
                }
+
+               ca->new_fs_bucket_idx = 0;
        }
 
+       err = "error creating root snapshot node";
+       ret = bch2_fs_initialize_subvolumes(c);
+       if (ret)
+               goto err;
+
+       bch_verbose(c, "reading snapshots table");
+       err = "error reading snapshots table";
+       ret = bch2_fs_snapshots_start(c);
+       if (ret)
+               goto err;
+       bch_verbose(c, "reading snapshots done");
+
        bch2_inode_init(c, &root_inode, 0, 0,
                        S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
-       root_inode.bi_inum = BCACHEFS_ROOT_INO;
+       root_inode.bi_inum      = BCACHEFS_ROOT_INO;
+       root_inode.bi_subvol    = BCACHEFS_ROOT_SUBVOL;
        bch2_inode_pack(c, &packed_inode, &root_inode);
        packed_inode.inode.k.p.snapshot = U32_MAX;
 
@@ -1365,11 +1370,12 @@ int bch2_fs_initialize(struct bch_fs *c)
 
        err = "error creating lost+found";
        ret = bch2_trans_do(c, NULL, NULL, 0,
-               bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
+               bch2_create_trans(&trans,
+                                 BCACHEFS_ROOT_SUBVOL_INUM,
                                  &root_inode, &lostfound_inode,
                                  &lostfound,
                                  0, 0, S_IFDIR|0700, 0,
-                                 NULL, NULL));
+                                 NULL, NULL, (subvol_inum) { 0 }, 0));
        if (ret) {
                bch_err(c, "error creating lost+found");
                goto err;
@@ -1382,7 +1388,7 @@ int bch2_fs_initialize(struct bch_fs *c)
        }
 
        err = "error writing first journal entry";
-       ret = bch2_journal_meta(&c->journal);
+       ret = bch2_journal_flush(&c->journal);
        if (ret)
                goto err;
 
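The rewritten bch2_journal_replay() above collapses the old three passes (alloc keys, interior nodes, then leaves) into one: it builds an array of pointers into journal_keys, sorts the pointers by journal_seq alone (keys allocated during recovery were stamped journal_seq = U32_MAX earlier in this diff, so they replay last), and chooses iterator flags per key. A sketch of the sort-an-index-of-pointers technique in isolation; the record names are illustrative:

struct record { u64 seq; /* plus payload */ };

static int record_seq_cmp(const void *_l, const void *_r)
{
        const struct record *l = *((const struct record **) _l);
        const struct record *r = *((const struct record **) _r);

        return (l->seq > r->seq) - (l->seq < r->seq);
}

/* Build a kvmalloc()ed index of pointers into d[], sorted by seq; the
 * records themselves never move. The caller kvfree()s the result. */
static struct record **records_by_seq(struct record *d, size_t nr)
{
        struct record **sorted = kvmalloc_array(nr, sizeof(*sorted), GFP_KERNEL);
        size_t i;

        if (!sorted)
                return NULL;

        for (i = 0; i < nr; i++)
                sorted[i] = &d[i];

        sort(sorted, nr, sizeof(sorted[0]), record_seq_cmp, NULL);
        return sorted;
}
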
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
index e5565e4f335a535d27e960ab35571b21ca07171d..21bdad9db2493668b0f5c2a6bcf6f2913315b8db 100644 (file)
@@ -31,24 +31,30 @@ struct btree_and_journal_iter {
        }                       last;
 };
 
+size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
+                              unsigned, struct bpos);
+
+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
+                                unsigned, struct bkey_i *);
 int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
                            unsigned, struct bkey_i *);
 int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
                            unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
+                                 unsigned, struct bpos);
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
 
 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+                               struct bch_fs *, struct btree *,
+                               struct btree_node_iter, struct bpos);
 void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
                                                struct bch_fs *,
                                                struct btree *);
 
-typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k);
-
-int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn);
-
 void bch2_journal_keys_free(struct journal_keys *);
 void bch2_journal_entries_free(struct list_head *);
 
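With bch2_journal_key_search() now exported here, callers can locate a key in the sorted journal_keys array directly. A hedged usage sketch that mirrors the bounds and identity checks bch2_journal_key_overwritten() performs in recovery.c; example_find() is illustrative:

static struct journal_key *example_find(struct journal_keys *keys,
                                        enum btree_id id, unsigned level,
                                        struct bpos pos)
{
        size_t idx = bch2_journal_key_search(keys, id, level, pos);

        /* the search returns an insert position: verify an exact match */
        if (idx < keys->nr &&
            keys->d[idx].btree_id       == id &&
            keys->d[idx].level          == level &&
            !bpos_cmp(keys->d[idx].k->k.p, pos))
                return &keys->d[idx];

        return NULL;
}
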
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 3d9c5c5b0eba75a7e548e6b1e48f539bddbf4365..c8d6d73681e010c0ec00221a7a473070c4680b0a 100644 (file)
@@ -7,6 +7,7 @@
 #include "inode.h"
 #include "io.h"
 #include "reflink.h"
+#include "subvolume.h"
 
 #include <linux/sched/signal.h>
 
@@ -31,6 +32,10 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
        if (bkey_val_bytes(p.k) != sizeof(*p.v))
                return "incorrect value size";
 
+       if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
+           le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad))
+               return "idx < front_pad";
+
        return NULL;
 }
 
@@ -39,7 +44,10 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
 
-       pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx));
+       pr_buf(out, "idx %llu front_pad %u back_pad %u",
+              le64_to_cpu(p.v->idx),
+              le32_to_cpu(p.v->front_pad),
+              le32_to_cpu(p.v->back_pad));
 }
 
 bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r)
@@ -116,7 +124,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
                                     struct bkey_i *orig)
 {
        struct bch_fs *c = trans->c;
-       struct btree_iter *reflink_iter;
+       struct btree_iter reflink_iter = { NULL };
        struct bkey_s_c k;
        struct bkey_i *r_v;
        struct bkey_i_reflink_p *r_p;
@@ -126,11 +134,11 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
        if (orig->k.type == KEY_TYPE_inline_data)
                bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
 
-       for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink,
+       for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink,
                           POS(0, c->reflink_hint),
                           BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
-               if (reflink_iter->pos.inode) {
-                       bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
+               if (reflink_iter.pos.inode) {
+                       bch2_btree_iter_set_pos(&reflink_iter, POS_MIN);
                        continue;
                }
 
@@ -142,7 +150,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
                goto err;
 
        /* rewind iter to start of hole, if necessary: */
-       bch2_btree_iter_set_pos_to_extent_start(reflink_iter);
+       bch2_btree_iter_set_pos_to_extent_start(&reflink_iter);
 
        r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k));
        ret = PTR_ERR_OR_ZERO(r_v);
@@ -151,7 +159,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 
        bkey_init(&r_v->k);
        r_v->k.type     = bkey_type_to_indirect(&orig->k);
-       r_v->k.p        = reflink_iter->pos;
+       r_v->k.p        = reflink_iter.pos;
        bch2_key_resize(&r_v->k, orig->k.size);
        r_v->k.version  = orig->k.version;
 
@@ -161,20 +169,26 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
        *refcount       = 0;
        memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
 
-       ret = bch2_trans_update(trans, reflink_iter, r_v, 0);
+       ret = bch2_trans_update(trans, &reflink_iter, r_v, 0);
        if (ret)
                goto err;
 
+       /*
+        * orig is in a bkey_buf, which statically allocates five u64s for the
+        * val, so we know it will be big enough:
+        */
        orig->k.type = KEY_TYPE_reflink_p;
        r_p = bkey_i_to_reflink_p(orig);
        set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
+       memset(&r_p->v, 0, sizeof(r_p->v));
+
        r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
 
-       ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
+       ret = bch2_trans_update(trans, extent_iter, &r_p->k_i,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 err:
-       if (!IS_ERR(reflink_iter))
-               c->reflink_hint = reflink_iter->pos.offset;
-       bch2_trans_iter_put(trans, reflink_iter);
+       c->reflink_hint = reflink_iter.pos.offset;
+       bch2_trans_iter_exit(trans, &reflink_iter);
 
        return ret;
 }
@@ -184,7 +198,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
        struct bkey_s_c k;
        int ret;
 
-       for_each_btree_key_continue(iter, 0, k, ret) {
+       for_each_btree_key_continue_norestart(*iter, 0, k, ret) {
                if (bkey_cmp(iter->pos, end) >= 0)
                        break;
 
@@ -198,17 +212,21 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
 }
 
 s64 bch2_remap_range(struct bch_fs *c,
-                    struct bpos dst_start, struct bpos src_start,
-                    u64 remap_sectors, u64 *journal_seq,
+                    subvol_inum dst_inum, u64 dst_offset,
+                    subvol_inum src_inum, u64 src_offset,
+                    u64 remap_sectors,
                     u64 new_i_size, s64 *i_sectors_delta)
 {
        struct btree_trans trans;
-       struct btree_iter *dst_iter, *src_iter;
+       struct btree_iter dst_iter, src_iter;
        struct bkey_s_c src_k;
        struct bkey_buf new_dst, new_src;
+       struct bpos dst_start = POS(dst_inum.inum, dst_offset);
+       struct bpos src_start = POS(src_inum.inum, src_offset);
        struct bpos dst_end = dst_start, src_end = src_start;
        struct bpos src_want;
        u64 dst_done;
+       u32 dst_snapshot, src_snapshot;
        int ret = 0, ret2 = 0;
 
        if (!percpu_ref_tryget(&c->writes))
@@ -223,13 +241,13 @@ s64 bch2_remap_range(struct bch_fs *c,
        bch2_bkey_buf_init(&new_src);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
 
-       src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start,
-                                      BTREE_ITER_INTENT);
-       dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
-                                      BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start,
+                            BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start,
+                            BTREE_ITER_INTENT);
 
        while ((ret == 0 || ret == -EINTR) &&
-              bkey_cmp(dst_iter->pos, dst_end) < 0) {
+              bkey_cmp(dst_iter.pos, dst_end) < 0) {
                struct disk_reservation disk_res = { 0 };
 
                bch2_trans_begin(&trans);
@@ -239,31 +257,45 @@ s64 bch2_remap_range(struct bch_fs *c,
                        break;
                }
 
-               dst_done = dst_iter->pos.offset - dst_start.offset;
+               ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol,
+                                                 &src_snapshot);
+               if (ret)
+                       continue;
+
+               bch2_btree_iter_set_snapshot(&src_iter, src_snapshot);
+
+               ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol,
+                                                 &dst_snapshot);
+               if (ret)
+                       continue;
+
+               bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+
+               dst_done = dst_iter.pos.offset - dst_start.offset;
                src_want = POS(src_start.inode, src_start.offset + dst_done);
-               bch2_btree_iter_set_pos(src_iter, src_want);
+               bch2_btree_iter_set_pos(&src_iter, src_want);
 
-               src_k = get_next_src(src_iter, src_end);
+               src_k = get_next_src(&src_iter, src_end);
                ret = bkey_err(src_k);
                if (ret)
                        continue;
 
-               if (bkey_cmp(src_want, src_iter->pos) < 0) {
-                       ret = bch2_fpunch_at(&trans, dst_iter,
-                                       bpos_min(dst_end,
-                                                POS(dst_iter->pos.inode, dst_iter->pos.offset +
-                                                    src_iter->pos.offset - src_want.offset)),
-                                                journal_seq, i_sectors_delta);
+               if (bkey_cmp(src_want, src_iter.pos) < 0) {
+                       ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum,
+                                       min(dst_end.offset,
+                                           dst_iter.pos.offset +
+                                           src_iter.pos.offset - src_want.offset),
+                                       i_sectors_delta);
                        continue;
                }
 
                if (src_k.k->type != KEY_TYPE_reflink_p) {
-                       bch2_btree_iter_set_pos_to_extent_start(src_iter);
+                       bch2_btree_iter_set_pos_to_extent_start(&src_iter);
 
                        bch2_bkey_buf_reassemble(&new_src, c, src_k);
                        src_k = bkey_i_to_s_c(new_src.k);
 
-                       ret = bch2_make_extent_indirect(&trans, src_iter,
+                       ret = bch2_make_extent_indirect(&trans, &src_iter,
                                                new_src.k);
                        if (ret)
                                continue;
@@ -286,46 +318,47 @@ s64 bch2_remap_range(struct bch_fs *c,
                        BUG();
                }
 
-               new_dst.k->k.p = dst_iter->pos;
+               new_dst.k->k.p = dst_iter.pos;
                bch2_key_resize(&new_dst.k->k,
                                min(src_k.k->p.offset - src_want.offset,
-                                   dst_end.offset - dst_iter->pos.offset));
-               ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
-                                        &disk_res, journal_seq,
+                                   dst_end.offset - dst_iter.pos.offset));
+
+               ret = bch2_extent_update(&trans, dst_inum, &dst_iter,
+                                        new_dst.k, &disk_res, NULL,
                                         new_i_size, i_sectors_delta,
                                         true);
                bch2_disk_reservation_put(c, &disk_res);
        }
-       bch2_trans_iter_put(&trans, dst_iter);
-       bch2_trans_iter_put(&trans, src_iter);
+       bch2_trans_iter_exit(&trans, &dst_iter);
+       bch2_trans_iter_exit(&trans, &src_iter);
 
-       BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end));
-       BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
+       BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end));
+       BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0);
 
-       dst_done = dst_iter->pos.offset - dst_start.offset;
-       new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
+       dst_done = dst_iter.pos.offset - dst_start.offset;
+       new_i_size = min(dst_iter.pos.offset << 9, new_i_size);
 
        do {
                struct bch_inode_unpacked inode_u;
-               struct btree_iter *inode_iter;
+               struct btree_iter inode_iter = { NULL };
 
                bch2_trans_begin(&trans);
 
-               inode_iter = bch2_inode_peek(&trans, &inode_u,
-                               dst_start.inode, BTREE_ITER_INTENT);
-               ret2 = PTR_ERR_OR_ZERO(inode_iter);
+               ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u,
+                                      dst_inum, BTREE_ITER_INTENT);
 
                if (!ret2 &&
                    inode_u.bi_size < new_i_size) {
                        inode_u.bi_size = new_i_size;
-                       ret2  = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
-                               bch2_trans_commit(&trans, NULL, journal_seq, 0);
+                       ret2  = bch2_inode_write(&trans, &inode_iter, &inode_u) ?:
+                               bch2_trans_commit(&trans, NULL, NULL,
+                                                 BTREE_INSERT_NOFAIL);
                }
 
-               bch2_trans_iter_put(&trans, inode_iter);
+               bch2_trans_iter_exit(&trans, &inode_iter);
        } while (ret2 == -EINTR);
 
-       ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&new_src, c);
        bch2_bkey_buf_exit(&new_dst, c);
 
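One detail worth calling out in the remap loop above: the snapshot IDs are re-resolved on every iteration, right after bch2_trans_begin(), because a transaction restart (-EINTR) may observe a subvolume whose snapshot has changed underneath it. A minimal sketch of that restart-safe step; example_resnapshot() is illustrative:

static int example_resnapshot(struct btree_trans *trans, subvol_inum inum,
                              struct btree_iter *iter)
{
        u32 snapshot;
        int ret;

        /* must be redone after every bch2_trans_begin() / -EINTR restart */
        ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                return ret;

        bch2_btree_iter_set_snapshot(iter, snapshot);
        return 0;
}
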
diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h
index 68c5cb5a2780ddd1552d41d03229e145f1d14d3c..3745873fd88d90947f610de256931cecec4d9181 100644 (file)
@@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k)
        }
 }
 
-s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
-                    u64, u64 *, u64, s64 *);
+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64,
+                    subvol_inum, u64, u64, u64, s64 *);
 
 #endif /* _BCACHEFS_REFLINK_H */
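
The new prototype addresses both ends of the remap by subvol_inum (subvolume plus inode number) with explicit sector offsets, and drops the old journal_seq out-parameter. A hedged call sketch; every numeric value is illustrative:

static s64 example_remap(struct bch_fs *c)
{
        subvol_inum dst = { .subvol = 1, .inum = 4096 };
        subvol_inum src = { .subvol = 1, .inum = 4097 };
        s64 i_sectors_delta = 0;

        return bch2_remap_range(c,
                                dst, 0,         /* dst inum, sector offset */
                                src, 0,         /* src inum, sector offset */
                                128,            /* sectors to remap */
                                128 << 9,       /* new_i_size, in bytes */
                                &i_sectors_delta);
}
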
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index dbbbcc6dcec6eadee454c9f1e0f24d07b4676425..96994b7a75a555699fc0aa2c2745f8353cdeeaba 100644 (file)
@@ -41,18 +41,19 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
 {
        unsigned i;
 
-       pr_buf(out, "%s: %u/%u [",
-              bch2_data_types[e->data_type],
-              e->nr_required,
-              e->nr_devs);
+       if (e->data_type < BCH_DATA_NR)
+               pr_buf(out, "%s", bch2_data_types[e->data_type]);
+       else
+               pr_buf(out, "(invalid data type %u)", e->data_type);
 
+       pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs);
        for (i = 0; i < e->nr_devs; i++)
                pr_buf(out, i ? " %u" : "%u", e->devs[i]);
        pr_buf(out, "]");
 }
 
 void bch2_cpu_replicas_to_text(struct printbuf *out,
-                             struct bch_replicas_cpu *r)
+                              struct bch_replicas_cpu *r)
 {
        struct bch_replicas_entry *e;
        bool first = true;
@@ -413,75 +414,14 @@ err:
        goto out;
 }
 
-static int __bch2_mark_replicas(struct bch_fs *c,
-                               struct bch_replicas_entry *r,
-                               bool check)
-{
-       return likely(bch2_replicas_marked(c, r))       ? 0
-               : check                                 ? -1
-               : bch2_mark_replicas_slowpath(c, r);
-}
-
 int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
 {
-       return __bch2_mark_replicas(c, r, false);
-}
-
-static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
-                                    bool check)
-{
-       struct bch_replicas_padded search;
-       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-       unsigned i;
-       int ret;
-
-       memset(&search, 0, sizeof(search));
-
-       for (i = 0; i < cached.nr; i++) {
-               bch2_replicas_entry_cached(&search.e, cached.devs[i]);
-
-               ret = __bch2_mark_replicas(c, &search.e, check);
-               if (ret)
-                       return ret;
-       }
-
-       bch2_bkey_to_replicas(&search.e, k);
-
-       ret = __bch2_mark_replicas(c, &search.e, check);
-       if (ret)
-               return ret;
-
-       if (search.e.data_type == BCH_DATA_parity) {
-               search.e.data_type = BCH_DATA_cached;
-               ret = __bch2_mark_replicas(c, &search.e, check);
-               if (ret)
-                       return ret;
-
-               search.e.data_type = BCH_DATA_user;
-               ret = __bch2_mark_replicas(c, &search.e, check);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
+       return likely(bch2_replicas_marked(c, r))
+               ? 0 : bch2_mark_replicas_slowpath(c, r);
 }
 
 /* replicas delta list: */
 
-bool bch2_replicas_delta_list_marked(struct bch_fs *c,
-                                    struct replicas_delta_list *r)
-{
-       struct replicas_delta *d = r->d;
-       struct replicas_delta *top = (void *) r->d + r->used;
-
-       percpu_rwsem_assert_held(&c->mark_lock);
-
-       for (d = r->d; d != top; d = replicas_delta_next(d))
-               if (bch2_replicas_entry_idx(c, &d->r) < 0)
-                       return false;
-       return true;
-}
-
 int bch2_replicas_delta_list_mark(struct bch_fs *c,
                                  struct replicas_delta_list *r)
 {
@@ -494,19 +434,6 @@ int bch2_replicas_delta_list_mark(struct bch_fs *c,
        return ret;
 }
 
-/* bkey replicas: */
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
-                              struct bkey_s_c k)
-{
-       return __bch2_mark_bkey_replicas(c, k, true) == 0;
-}
-
-int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
-{
-       return __bch2_mark_bkey_replicas(c, k, false);
-}
-
 /*
  * Old replicas_gc mechanism: only used for journal replicas entries now, should
  * die at some point:
@@ -874,67 +801,78 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
        return 0;
 }
 
-static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
+static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
+                                     struct bch_sb *sb,
+                                     struct printbuf *err)
 {
-       unsigned i;
+       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+       unsigned i, j;
 
        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
                      memcmp, NULL);
 
-       for (i = 0; i + 1 < cpu_r->nr; i++) {
-               struct bch_replicas_entry *l =
+       for (i = 0; i < cpu_r->nr; i++) {
+               struct bch_replicas_entry *e =
                        cpu_replicas_entry(cpu_r, i);
-               struct bch_replicas_entry *r =
-                       cpu_replicas_entry(cpu_r, i + 1);
-
-               BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
 
-               if (!memcmp(l, r, cpu_r->entry_size))
-                       return "duplicate replicas entry";
-       }
+               if (e->data_type >= BCH_DATA_NR) {
+                       pr_buf(err, "invalid data type in entry ");
+                       bch2_replicas_entry_to_text(err, e);
+                       return -EINVAL;
+               }
 
-       return NULL;
-}
+               if (!e->nr_devs) {
+                       pr_buf(err, "no devices in entry ");
+                       bch2_replicas_entry_to_text(err, e);
+                       return -EINVAL;
+               }
 
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
-{
-       struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
-       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
-       struct bch_replicas_cpu cpu_r = { .entries = NULL };
-       struct bch_replicas_entry *e;
-       const char *err;
-       unsigned i;
+               if (e->nr_required > 1 &&
+                   e->nr_required >= e->nr_devs) {
+                       pr_buf(err, "bad nr_required in entry ");
+                       bch2_replicas_entry_to_text(err, e);
+                       return -EINVAL;
+               }
 
-       for_each_replicas_entry(sb_r, e) {
-               err = "invalid replicas entry: invalid data type";
-               if (e->data_type >= BCH_DATA_NR)
-                       goto err;
+               for (j = 0; j < e->nr_devs; j++)
+                       if (!bch2_dev_exists(sb, mi, e->devs[j])) {
+                               pr_buf(err, "invalid device %u in entry ", e->devs[j]);
+                               bch2_replicas_entry_to_text(err, e);
+                               return -EINVAL;
+                       }
 
-               err = "invalid replicas entry: no devices";
-               if (!e->nr_devs)
-                       goto err;
+               if (i + 1 < cpu_r->nr) {
+                       struct bch_replicas_entry *n =
+                               cpu_replicas_entry(cpu_r, i + 1);
 
-               err = "invalid replicas entry: bad nr_required";
-               if (e->nr_required > 1 &&
-                   e->nr_required >= e->nr_devs)
-                       goto err;
+                       BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
 
-               err = "invalid replicas entry: invalid device";
-               for (i = 0; i < e->nr_devs; i++)
-                       if (!bch2_dev_exists(sb, mi, e->devs[i]))
-                               goto err;
+                       if (!memcmp(e, n, cpu_r->entry_size)) {
+                               pr_buf(err, "duplicate replicas entry ");
+                               bch2_replicas_entry_to_text(err, e);
+                               return -EINVAL;
+                       }
+               }
        }
 
-       err = "cannot allocate memory";
+       return 0;
+}
+
+static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f,
+                                    struct printbuf *err)
+{
+       struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+       struct bch_replicas_cpu cpu_r;
+       int ret;
+
        if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
-               goto err;
+               return -ENOMEM;
 
-       err = check_dup_replicas_entries(&cpu_r);
-err:
+       ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
        kfree(cpu_r.entries);
-       return err;
+       return ret;
 }
 
 static void bch2_sb_replicas_to_text(struct printbuf *out,
@@ -959,38 +897,19 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
        .to_text        = bch2_sb_replicas_to_text,
 };
 
-static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f)
+static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f,
+                                       struct printbuf *err)
 {
        struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
-       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
-       struct bch_replicas_cpu cpu_r = { .entries = NULL };
-       struct bch_replicas_entry_v0 *e;
-       const char *err;
-       unsigned i;
-
-       for_each_replicas_entry_v0(sb_r, e) {
-               err = "invalid replicas entry: invalid data type";
-               if (e->data_type >= BCH_DATA_NR)
-                       goto err;
-
-               err = "invalid replicas entry: no devices";
-               if (!e->nr_devs)
-                       goto err;
-
-               err = "invalid replicas entry: invalid device";
-               for (i = 0; i < e->nr_devs; i++)
-                       if (!bch2_dev_exists(sb, mi, e->devs[i]))
-                               goto err;
-       }
+       struct bch_replicas_cpu cpu_r;
+       int ret;
 
-       err = "cannot allocate memory";
        if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
-               goto err;
+               return -ENOMEM;
 
-       err = check_dup_replicas_entries(&cpu_r);
-err:
+       ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
        kfree(cpu_r.entries);
-       return err;
+       return ret;
 }
 
 const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
@@ -1010,6 +929,9 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
                unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
                bool metadata = e->data_type < BCH_DATA_user;
 
+               if (e->data_type == BCH_DATA_cached)
+                       continue;
+
                for (i = 0; i < e->nr_devs; i++) {
                        struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
 
index 72ac544f16d8d43cf0d456fba1716e79c3929347..d237d7c51ccb9b9faa771e72ba123fb505914c16 100644 (file)
@@ -48,12 +48,9 @@ replicas_delta_next(struct replicas_delta *d)
        return (void *) d + replicas_entry_bytes(&d->r) + 8;
 }
 
-bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *);
 int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
 
 void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
-bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
-int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 
 static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
                                              unsigned dev)
index 23602349419161d7e7bd422f931a81dad44ba4e4..57d636740d2f81ffa8a19adf9dd3d3f036746a65 100644 (file)
@@ -8,6 +8,7 @@
 #include "error.h"
 #include "inode.h"
 #include "siphash.h"
+#include "subvolume.h"
 #include "super.h"
 
 #include <linux/crc32c.h>
@@ -19,13 +20,13 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
 {
        switch (opt) {
        case BCH_STR_HASH_OPT_crc32c:
-               return BCH_STR_HASH_CRC32C;
+               return BCH_STR_HASH_crc32c;
        case BCH_STR_HASH_OPT_crc64:
-               return BCH_STR_HASH_CRC64;
+               return BCH_STR_HASH_crc64;
        case BCH_STR_HASH_OPT_siphash:
                return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
-                       ? BCH_STR_HASH_SIPHASH
-                       : BCH_STR_HASH_SIPHASH_OLD;
+                       ? BCH_STR_HASH_siphash
+                       : BCH_STR_HASH_siphash_old;
        default:
                BUG();
        }
@@ -50,7 +51,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
                .siphash_key = { .k0 = bi->bi_hash_seed }
        };
 
-       if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) {
+       if (unlikely(info.type == BCH_STR_HASH_siphash_old)) {
                SHASH_DESC_ON_STACK(desc, c->sha256);
                u8 digest[SHA256_DIGEST_SIZE];
 
@@ -76,16 +77,16 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
                                     const struct bch_hash_info *info)
 {
        switch (info->type) {
-       case BCH_STR_HASH_CRC32C:
+       case BCH_STR_HASH_crc32c:
                ctx->crc32c = crc32c(~0, &info->siphash_key.k0,
                                     sizeof(info->siphash_key.k0));
                break;
-       case BCH_STR_HASH_CRC64:
+       case BCH_STR_HASH_crc64:
                ctx->crc64 = crc64_be(~0, &info->siphash_key.k0,
                                      sizeof(info->siphash_key.k0));
                break;
-       case BCH_STR_HASH_SIPHASH_OLD:
-       case BCH_STR_HASH_SIPHASH:
+       case BCH_STR_HASH_siphash_old:
+       case BCH_STR_HASH_siphash:
                SipHash24_Init(&ctx->siphash, &info->siphash_key);
                break;
        default:
@@ -98,14 +99,14 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
                                       const void *data, size_t len)
 {
        switch (info->type) {
-       case BCH_STR_HASH_CRC32C:
+       case BCH_STR_HASH_crc32c:
                ctx->crc32c = crc32c(ctx->crc32c, data, len);
                break;
-       case BCH_STR_HASH_CRC64:
+       case BCH_STR_HASH_crc64:
                ctx->crc64 = crc64_be(ctx->crc64, data, len);
                break;
-       case BCH_STR_HASH_SIPHASH_OLD:
-       case BCH_STR_HASH_SIPHASH:
+       case BCH_STR_HASH_siphash_old:
+       case BCH_STR_HASH_siphash:
                SipHash24_Update(&ctx->siphash, data, len);
                break;
        default:
@@ -117,12 +118,12 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
                                   const struct bch_hash_info *info)
 {
        switch (info->type) {
-       case BCH_STR_HASH_CRC32C:
+       case BCH_STR_HASH_crc32c:
                return ctx->crc32c;
-       case BCH_STR_HASH_CRC64:
+       case BCH_STR_HASH_crc64:
                return ctx->crc64 >> 1;
-       case BCH_STR_HASH_SIPHASH_OLD:
-       case BCH_STR_HASH_SIPHASH:
+       case BCH_STR_HASH_siphash_old:
+       case BCH_STR_HASH_siphash:
                return SipHash24_End(&ctx->siphash) >> 1;
        default:
                BUG();
@@ -137,28 +138,40 @@ struct bch_hash_desc {
        u64             (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
        bool            (*cmp_key)(struct bkey_s_c, const void *);
        bool            (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
+       bool            (*is_visible)(subvol_inum inum, struct bkey_s_c);
 };
 
-static __always_inline struct btree_iter *
+static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
+{
+       return k.k->type == desc.key_type &&
+               (!desc.is_visible || desc.is_visible(inum, k));
+}
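+
+/* Editor's note: the new is_visible member is an optional predicate; a NULL
+ * hook means every key of the descriptor's type is visible. A self-contained
+ * sketch of this optional-callback pattern (types are stand-ins, not the
+ * real bcachefs ones):
+ *
+ *    #include <stdbool.h>
+ *
+ *    struct key {
+ *            int     type;
+ *    };
+ *
+ *    struct hash_desc {
+ *            int     key_type;
+ *            bool    (*is_visible)(unsigned subvol, const struct key *k);
+ *    };
+ *
+ *    static bool key_visible(const struct hash_desc *desc, unsigned subvol,
+ *                            const struct key *k)
+ *    {
+ *            // NULL hook == unconditionally visible, as in the code above
+ *            return k->type == desc->key_type &&
+ *                    (!desc->is_visible || desc->is_visible(subvol, k));
+ *    }
+ */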
+
+static __always_inline int
 bch2_hash_lookup(struct btree_trans *trans,
+                struct btree_iter *iter,
                 const struct bch_hash_desc desc,
                 const struct bch_hash_info *info,
-                u64 inode, const void *key,
+                subvol_inum inum, const void *key,
                 unsigned flags)
 {
-       struct btree_iter *iter;
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
-       for_each_btree_key(trans, iter, desc.btree_id,
-                          POS(inode, desc.hash_key(info, key)),
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
+
+       for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+                          SPOS(inum.inum, desc.hash_key(info, key), snapshot),
                           BTREE_ITER_SLOTS|flags, k, ret) {
-               if (iter->pos.inode != inode)
+               if (iter->pos.inode != inum.inum)
                        break;
 
-               if (k.k->type == desc.key_type) {
+               if (is_visible_key(desc, inum, k)) {
                        if (!desc.cmp_key(k, key))
-                               return iter;
+                               return 0;
                } else if (k.k->type == KEY_TYPE_hash_whiteout) {
                        ;
                } else {
@@ -166,35 +179,38 @@ bch2_hash_lookup(struct btree_trans *trans,
                        break;
                }
        }
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, iter);
 
-       return ERR_PTR(ret ?: -ENOENT);
+       return ret ?: -ENOENT;
 }
 
-static __always_inline struct btree_iter *
+static __always_inline int
 bch2_hash_hole(struct btree_trans *trans,
+              struct btree_iter *iter,
               const struct bch_hash_desc desc,
               const struct bch_hash_info *info,
-              u64 inode, const void *key)
+              subvol_inum inum, const void *key)
 {
-       struct btree_iter *iter;
        struct bkey_s_c k;
+       u32 snapshot;
        int ret;
 
-       for_each_btree_key(trans, iter, desc.btree_id,
-                          POS(inode, desc.hash_key(info, key)),
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
+
+       for_each_btree_key_norestart(trans, *iter, desc.btree_id,
+                          SPOS(inum.inum, desc.hash_key(info, key), snapshot),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               if (iter->pos.inode != inode)
+               if (iter->pos.inode != inum.inum)
                        break;
 
-               if (k.k->type != desc.key_type)
-                       return iter;
+               if (!is_visible_key(desc, inum, k))
+                       return 0;
        }
+       bch2_trans_iter_exit(trans, iter);
 
-       iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
-       bch2_trans_iter_put(trans, iter);
-
-       return ERR_PTR(ret ?: -ENOSPC);
+       return ret ?: -ENOSPC;
 }
 
 static __always_inline
@@ -203,28 +219,27 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
                             const struct bch_hash_info *info,
                             struct btree_iter *start)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_copy_iter(trans, start);
+       bch2_trans_copy_iter(&iter, start);
 
-       bch2_btree_iter_advance(iter);
+       bch2_btree_iter_advance(&iter);
 
-       for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
+       for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) {
                if (k.k->type != desc.key_type &&
                    k.k->type != KEY_TYPE_hash_whiteout)
                        break;
 
                if (k.k->type == desc.key_type &&
                    desc.hash_bkey(info, k) <= start->pos.offset) {
-                       iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
                        ret = 1;
                        break;
                }
        }
 
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -232,20 +247,28 @@ static __always_inline
 int bch2_hash_set(struct btree_trans *trans,
                  const struct bch_hash_desc desc,
                  const struct bch_hash_info *info,
-                 u64 inode, struct bkey_i *insert, int flags)
+                 subvol_inum inum,
+                 struct bkey_i *insert, int flags)
 {
-       struct btree_iter *iter, *slot = NULL;
+       struct btree_iter iter, slot = { NULL };
        struct bkey_s_c k;
        bool found = false;
+       u32 snapshot;
        int ret;
 
-       for_each_btree_key(trans, iter, desc.btree_id,
-                          POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
+       ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+       if (ret)
+               return ret;
+
+       for_each_btree_key_norestart(trans, iter, desc.btree_id,
+                          SPOS(inum.inum,
+                               desc.hash_bkey(info, bkey_i_to_s_c(insert)),
+                               snapshot),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               if (iter->pos.inode != inode)
+               if (iter.pos.inode != inum.inum)
                        break;
 
-               if (k.k->type == desc.key_type) {
+               if (is_visible_key(desc, inum, k)) {
                        if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
                                goto found;
 
@@ -253,9 +276,9 @@ int bch2_hash_set(struct btree_trans *trans,
                        continue;
                }
 
-               if (!slot &&
+               if (!slot.path &&
                    !(flags & BCH_HASH_SET_MUST_REPLACE))
-                       slot = bch2_trans_copy_iter(trans, iter);
+                       bch2_trans_copy_iter(&slot, &iter);
 
                if (k.k->type != KEY_TYPE_hash_whiteout)
                        goto not_found;
@@ -264,8 +287,8 @@ int bch2_hash_set(struct btree_trans *trans,
        if (!ret)
                ret = -ENOSPC;
 out:
-       bch2_trans_iter_put(trans, slot);
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &slot);
+       bch2_trans_iter_exit(trans, &iter);
 
        return ret;
 found:
@@ -277,11 +300,11 @@ not_found:
        } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
                ret = -EEXIST;
        } else {
-               if (!found && slot)
+               if (!found && slot.path)
                        swap(iter, slot);
 
-               insert->k.p = iter->pos;
-               ret = bch2_trans_update(trans, iter, insert, 0);
+               insert->k.p = iter.pos;
+               ret = bch2_trans_update(trans, &iter, insert, 0);
        }
 
        goto out;
@@ -291,7 +314,8 @@ static __always_inline
 int bch2_hash_delete_at(struct btree_trans *trans,
                        const struct bch_hash_desc desc,
                        const struct bch_hash_info *info,
-                       struct btree_iter *iter)
+                       struct btree_iter *iter,
+                       unsigned update_flags)
 {
        struct bkey_i *delete;
        int ret;
@@ -309,25 +333,25 @@ int bch2_hash_delete_at(struct btree_trans *trans,
        delete->k.p = iter->pos;
        delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
 
-       return bch2_trans_update(trans, iter, delete, 0);
+       return bch2_trans_update(trans, iter, delete, update_flags);
 }
 
 static __always_inline
 int bch2_hash_delete(struct btree_trans *trans,
                     const struct bch_hash_desc desc,
                     const struct bch_hash_info *info,
-                    u64 inode, const void *key)
+                    subvol_inum inum, const void *key)
 {
-       struct btree_iter *iter;
+       struct btree_iter iter;
        int ret;
 
-       iter = bch2_hash_lookup(trans, desc, info, inode, key,
+       ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
                                BTREE_ITER_INTENT);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
+       if (ret)
+               return ret;
 
-       ret = bch2_hash_delete_at(trans, desc, info, iter);
-       bch2_trans_iter_put(trans, iter);
+       ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
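
Editor's note: the lookup/hole/delete helpers above all switch calling convention: instead of returning a transaction-owned iterator through ERR_PTR(), they fill in a caller-stack iterator and return a plain errno, with the caller responsible for bch2_trans_iter_exit(). A toy sketch of the new shape (nothing here is real bcachefs API):

    #include <stdio.h>

    struct iter {
            long    pos;
            int     live;
    };

    /* new shape: caller owns the iterator, return value is 0 or -errno */
    static int lookup(struct iter *it, long key)
    {
            it->live = 1;
            it->pos  = key;
            return key >= 0 ? 0 : -2;       /* -ENOENT */
    }

    static void iter_exit(struct iter *it)
    {
            it->live = 0;
    }

    int main(void)
    {
            struct iter it;
            int ret = lookup(&it, 42);

            if (!ret)
                    printf("found at %ld\n", it.pos);
            iter_exit(&it);                 /* always paired, even on error */
            return 0;
    }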
 
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
new file mode 100644 (file)
index 0000000..6960332
--- /dev/null
@@ -0,0 +1,1089 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_key_cache.h"
+#include "btree_update.h"
+#include "error.h"
+#include "fs.h"
+#include "subvolume.h"
+
+/* Snapshot tree: */
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *);
+static void bch2_delete_dead_snapshots(struct bch_fs *);
+
+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
+                          struct bkey_s_c k)
+{
+       struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
+
+       pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u",
+              BCH_SNAPSHOT_SUBVOL(s.v),
+              BCH_SNAPSHOT_DELETED(s.v),
+              le32_to_cpu(s.v->parent),
+              le32_to_cpu(s.v->children[0]),
+              le32_to_cpu(s.v->children[1]),
+              le32_to_cpu(s.v->subvol));
+}
+
+const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_snapshot s;
+       u32 i, id;
+
+       if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 ||
+           bkey_cmp(k.k->p, POS(0, 1)) < 0)
+               return "bad pos";
+
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot))
+               return "bad val size";
+
+       s = bkey_s_c_to_snapshot(k);
+
+       id = le32_to_cpu(s.v->parent);
+       if (id && id <= k.k->p.offset)
+               return "bad parent node";
+
+       if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]))
+               return "children not normalized";
+
+       if (s.v->children[0] &&
+           s.v->children[0] == s.v->children[1])
+               return "duplicate child nodes";
+
+       for (i = 0; i < 2; i++) {
+               id = le32_to_cpu(s.v->children[i]);
+
+               if (id >= k.k->p.offset)
+                       return "bad child node";
+       }
+
+       return NULL;
+}
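+
+/* Editor's note: the invariants bch2_snapshot_invalid() enforces can be
+ * restated compactly: snapshot ids are allocated downward, so a parent
+ * always has a strictly greater id than its children, and the two child
+ * slots are kept sorted descending with no duplicates. A hedged standalone
+ * restatement (illustrative struct, not the on-disk format):
+ *
+ *    #include <stdbool.h>
+ *
+ *    struct snap_node {
+ *            unsigned        id;             // key offset, 1..U32_MAX
+ *            unsigned        parent;         // 0, or an id above ours
+ *            unsigned        children[2];    // below ours, sorted descending
+ *    };
+ *
+ *    static bool snap_node_valid(const struct snap_node *s)
+ *    {
+ *            if (s->parent && s->parent <= s->id)
+ *                    return false;           // "bad parent node"
+ *            if (s->children[0] < s->children[1])
+ *                    return false;           // "children not normalized"
+ *            if (s->children[0] && s->children[0] == s->children[1])
+ *                    return false;           // "duplicate child nodes"
+ *            for (int i = 0; i < 2; i++)
+ *                    if (s->children[i] >= s->id)
+ *                            return false;   // "bad child node"
+ *            return true;
+ *    }
+ */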
+
+int bch2_mark_snapshot(struct btree_trans *trans,
+                      struct bkey_s_c old, struct bkey_s_c new,
+                      unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct snapshot_t *t;
+
+       t = genradix_ptr_alloc(&c->snapshots,
+                              U32_MAX - new.k->p.offset,
+                              GFP_KERNEL);
+       if (!t)
+               return -ENOMEM;
+
+       if (new.k->type == KEY_TYPE_snapshot) {
+               struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
+
+               t->parent       = le32_to_cpu(s.v->parent);
+               t->children[0]  = le32_to_cpu(s.v->children[0]);
+               t->children[1]  = le32_to_cpu(s.v->children[1]);
+               t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
+       } else {
+               t->parent       = 0;
+               t->children[0]  = 0;
+               t->children[1]  = 0;
+               t->subvol       = 0;
+       }
+
+       return 0;
+}
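+
+/* Editor's note: the U32_MAX - new.k->p.offset index into the genradix is
+ * deliberate; snapshot ids are handed out from U32_MAX downward, so
+ * reversing the index keeps the in-memory table dense from slot 0:
+ *
+ *    #include <limits.h>
+ *
+ *    static inline unsigned snapshot_slot(unsigned id)
+ *    {
+ *            return UINT_MAX - id;   // id U32_MAX -> slot 0, and so on down
+ *    }
+ */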
+
+static int snapshot_lookup(struct btree_trans *trans, u32 id,
+                          struct bch_snapshot *s)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
+
+       if (!ret)
+               *s = *bkey_s_c_to_snapshot(k).v;
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int snapshot_live(struct btree_trans *trans, u32 id)
+{
+       struct bch_snapshot v;
+       int ret;
+
+       if (!id)
+               return 0;
+
+       ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+       if (ret == -ENOENT)
+               bch_err(trans->c, "snapshot node %u not found", id);
+       if (ret)
+               return ret;
+
+       return !BCH_SNAPSHOT_DELETED(&v);
+}
+
+static int bch2_snapshots_set_equiv(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_snapshot snap;
+       unsigned i;
+       int ret;
+
+       for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               u32 id = k.k->p.offset, child[2];
+               unsigned nr_live = 0, live_idx;
+
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               snap = bkey_s_c_to_snapshot(k);
+               child[0] = le32_to_cpu(snap.v->children[0]);
+               child[1] = le32_to_cpu(snap.v->children[1]);
+
+               for (i = 0; i < 2; i++) {
+                       ret = snapshot_live(trans, child[i]);
+                       if (ret < 0)
+                               break;
+
+                       if (ret)
+                               live_idx = i;
+                       nr_live += ret;
+               }
+
+               snapshot_t(c, id)->equiv = nr_live == 1
+                       ? snapshot_t(c, child[live_idx])->equiv
+                       : id;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       if (ret)
+               bch_err(c, "error walking snapshots: %i", ret);
+
+       return ret;
+}
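+
+/* Editor's note: the equivalence computation above encodes one rule: an
+ * interior snapshot node with exactly one live child carries no distinct
+ * data and is interchangeable with that child, so it inherits the child's
+ * class; anything else is its own class. A small array-backed sketch,
+ * assuming children (which have lower ids) are processed before their
+ * parents, as the ascending btree walk guarantees:
+ *
+ *    struct node {
+ *            unsigned        child[2];
+ *            int             live;
+ *            unsigned        equiv;
+ *    };
+ *
+ *    static void set_equiv(struct node *t, unsigned id)
+ *    {
+ *            unsigned nr_live = 0, live_idx = 0;
+ *
+ *            for (unsigned i = 0; i < 2; i++) {
+ *                    unsigned c = t[id].child[i];
+ *
+ *                    if (c && t[c].live) {
+ *                            nr_live++;
+ *                            live_idx = i;
+ *                    }
+ *            }
+ *
+ *            // one live child: node is redundant, reuse the child's class
+ *            t[id].equiv = nr_live == 1
+ *                    ? t[t[id].child[live_idx]].equiv
+ *                    : id;
+ *    }
+ */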
+
+/* fsck: */
+static int bch2_snapshot_check(struct btree_trans *trans,
+                              struct bkey_s_c_snapshot s)
+{
+       struct bch_subvolume subvol;
+       struct bch_snapshot v;
+       u32 i, id;
+       int ret;
+
+       id = le32_to_cpu(s.v->subvol);
+       ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol));
+       if (ret == -ENOENT)
+               bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u",
+                       s.k->p.offset, id);
+       if (ret)
+               return ret;
+
+       if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) {
+               bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
+                       s.k->p.offset);
+               return -EINVAL;
+       }
+
+       id = le32_to_cpu(s.v->parent);
+       if (id) {
+               ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+               if (ret == -ENOENT)
+                       bch_err(trans->c, "snapshot node %llu has nonexistent parent %u",
+                               s.k->p.offset, id);
+               if (ret)
+                       return ret;
+
+               if (le32_to_cpu(v.children[0]) != s.k->p.offset &&
+                   le32_to_cpu(v.children[1]) != s.k->p.offset) {
+                       bch_err(trans->c, "snapshot parent %u missing pointer to child %llu",
+                               id, s.k->p.offset);
+                       return -EINVAL;
+               }
+       }
+
+       for (i = 0; i < 2 && s.v->children[i]; i++) {
+               id = le32_to_cpu(s.v->children[i]);
+
+               ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v));
+               if (ret == -ENOENT)
+                       bch_err(trans->c, "snapshot node %llu has nonexistent child %u",
+                               s.k->p.offset, id);
+               if (ret)
+                       return ret;
+
+               if (le32_to_cpu(v.parent) != s.k->p.offset) {
+                       bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)",
+                               id, le32_to_cpu(v.parent), s.k->p.offset);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_snapshot s;
+       unsigned id;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k));
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret) {
+               bch_err(c, "error %i checking snapshots", ret);
+               goto err;
+       }
+
+       for_each_btree_key(&trans, iter, BTREE_ID_subvolumes,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_subvolume)
+                       continue;
+again_2:
+               id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot);
+               ret = snapshot_lookup(&trans, id, &s);
+
+               if (ret == -EINTR) {
+                       k = bch2_btree_iter_peek(&iter);
+                       goto again_2;
+               } else if (ret == -ENOENT)
+                       bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
+                               k.k->p.offset, id);
+               else if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+void bch2_fs_snapshots_exit(struct bch_fs *c)
+{
+       genradix_free(&c->snapshots);
+}
+
+int bch2_fs_snapshots_start(struct bch_fs *c)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       bool have_deleted = false;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0)
+                       break;
+
+               if (k.k->type != KEY_TYPE_snapshot) {
+                       bch_err(c, "found wrong key type %u in snapshot node table",
+                               k.k->type);
+                       continue;
+               }
+
+               if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+                       have_deleted = true;
+
+               ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0);
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret)
+               goto err;
+
+       ret = bch2_snapshots_set_equiv(&trans);
+       if (ret)
+               goto err;
+err:
+       bch2_trans_exit(&trans);
+
+       if (!ret && have_deleted) {
+               bch_info(c, "restarting deletion of dead snapshots");
+               if (c->opts.fsck) {
+                       bch2_delete_dead_snapshots_work(&c->snapshot_delete_work);
+               } else {
+                       bch2_delete_dead_snapshots(c);
+               }
+       }
+
+       return ret;
+}
+
+/*
+ * Mark a snapshot as deleted, for future cleanup:
+ */
+static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_snapshot *s;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_snapshot) {
+               bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+               ret = -ENOENT;
+               goto err;
+       }
+
+       /* already deleted? */
+       if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v))
+               goto err;
+
+       s = bch2_trans_kmalloc(trans, sizeof(*s));
+       ret = PTR_ERR_OR_ZERO(s);
+       if (ret)
+               goto err;
+
+       bkey_reassemble(&s->k_i, k);
+
+       SET_BCH_SNAPSHOT_DELETED(&s->v, true);
+       ret = bch2_trans_update(trans, &iter, &s->k_i, 0);
+       if (ret)
+               goto err;
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
+{
+       struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
+       struct bkey_s_c k;
+       struct bkey_s_c_snapshot s;
+       struct bkey_i_snapshot *parent;
+       u32 parent_id;
+       unsigned i;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_snapshot) {
+               bch2_fs_inconsistent(trans->c, "missing snapshot %u", id);
+               ret = -ENOENT;
+               goto err;
+       }
+
+       s = bkey_s_c_to_snapshot(k);
+
+       BUG_ON(!BCH_SNAPSHOT_DELETED(s.v));
+       parent_id = le32_to_cpu(s.v->parent);
+
+       if (parent_id) {
+               bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots,
+                                    POS(0, parent_id),
+                                    BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek_slot(&p_iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_snapshot) {
+                       bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id);
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               parent = bch2_trans_kmalloc(trans, sizeof(*parent));
+               ret = PTR_ERR_OR_ZERO(parent);
+               if (ret)
+                       goto err;
+
+               bkey_reassemble(&parent->k_i, k);
+
+               for (i = 0; i < 2; i++)
+                       if (le32_to_cpu(parent->v.children[i]) == id)
+                               break;
+
+               if (i == 2)
+                       bch_err(trans->c, "snapshot %u missing child pointer to %u",
+                               parent_id, id);
+               else
+                       parent->v.children[i] = 0;
+
+               if (le32_to_cpu(parent->v.children[0]) <
+                   le32_to_cpu(parent->v.children[1]))
+                       swap(parent->v.children[0],
+                            parent->v.children[1]);
+
+               ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0);
+               if (ret)
+                       goto err;
+       }
+
+       ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+       bch2_trans_iter_exit(trans, &p_iter);
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
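+
+/* Editor's note: when a node is deleted, its parent's matching child
+ * pointer is cleared and the "children sorted descending" invariant is
+ * restored with one swap, as the code above does. A standalone sketch of
+ * just that fixup:
+ *
+ *    static void remove_child(unsigned children[2], unsigned id)
+ *    {
+ *            for (int i = 0; i < 2; i++)
+ *                    if (children[i] == id)
+ *                            children[i] = 0;
+ *
+ *            // restore "sorted descending": zeroed slots sink to the end
+ *            if (children[0] < children[1]) {
+ *                    unsigned tmp = children[0];
+ *
+ *                    children[0] = children[1];
+ *                    children[1] = tmp;
+ *            }
+ *    }
+ */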
+
+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
+                             u32 *new_snapids,
+                             u32 *snapshot_subvols,
+                             unsigned nr_snapids)
+{
+       struct btree_iter iter;
+       struct bkey_i_snapshot *n;
+       struct bkey_s_c k;
+       unsigned i;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
+                            POS_MIN, BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       for (i = 0; i < nr_snapids; i++) {
+               k = bch2_btree_iter_prev_slot(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (!k.k || !k.k->p.offset) {
+                       ret = -ENOSPC;
+                       goto err;
+               }
+
+               n = bch2_trans_kmalloc(trans, sizeof(*n));
+               ret = PTR_ERR_OR_ZERO(n);
+               if (ret)
+                       goto err;
+
+               bkey_snapshot_init(&n->k_i);
+               n->k.p          = iter.pos;
+               n->v.flags      = 0;
+               n->v.parent     = cpu_to_le32(parent);
+               n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
+               n->v.pad        = 0;
+               SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
+
+               ret   = bch2_trans_update(trans, &iter, &n->k_i, 0) ?:
+                       bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
+               if (ret)
+                       goto err;
+
+               new_snapids[i]  = iter.pos.offset;
+       }
+
+       if (parent) {
+               bch2_btree_iter_set_pos(&iter, POS(0, parent));
+               k = bch2_btree_iter_peek(&iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_snapshot) {
+                       bch_err(trans->c, "snapshot %u not found", parent);
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               n = bch2_trans_kmalloc(trans, sizeof(*n));
+               ret = PTR_ERR_OR_ZERO(n);
+               if (ret)
+                       goto err;
+
+               bkey_reassemble(&n->k_i, k);
+
+               if (n->v.children[0] || n->v.children[1]) {
+                       bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
+                       ret = -EINVAL;
+                       goto err;
+               }
+
+               n->v.children[0] = cpu_to_le32(new_snapids[0]);
+               n->v.children[1] = cpu_to_le32(new_snapids[1]);
+               SET_BCH_SNAPSHOT_SUBVOL(&n->v, false);
+               ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+               if (ret)
+                       goto err;
+       }
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int snapshot_id_add(struct snapshot_id_list *s, u32 id)
+{
+       BUG_ON(snapshot_list_has_id(s, id));
+
+       if (s->nr == s->size) {
+               size_t new_size = max(8U, s->size * 2);
+               void *n = krealloc(s->d,
+                                  new_size * sizeof(s->d[0]),
+                                  GFP_KERNEL);
+               if (!n) {
+                       pr_err("error allocating snapshot ID list");
+                       return -ENOMEM;
+               }
+
+               s->d    = n;
+               s->size = new_size;
+       }
+
+       s->d[s->nr++] = id;
+       return 0;
+}
+
+static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans,
+                                          struct snapshot_id_list *deleted,
+                                          enum btree_id btree_id)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct snapshot_id_list equiv_seen = { 0 };
+       struct bpos last_pos = POS_MIN;
+       int ret = 0;
+
+       /*
+        * XXX: We should also delete whiteouts that no longer overwrite
+        * anything
+        */
+
+       bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_PREFETCH|
+                            BTREE_ITER_NOT_EXTENTS|
+                            BTREE_ITER_ALL_SNAPSHOTS);
+
+       while ((bch2_trans_begin(trans),
+               (k = bch2_btree_iter_peek(&iter)).k) &&
+              !(ret = bkey_err(k))) {
+               u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv;
+
+               if (bkey_cmp(k.k->p, last_pos))
+                       equiv_seen.nr = 0;
+               last_pos = k.k->p;
+
+               if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
+                   snapshot_list_has_id(&equiv_seen, equiv)) {
+                       if (btree_id == BTREE_ID_inodes &&
+                           bch2_btree_key_cache_flush(trans, btree_id, iter.pos))
+                               continue;
+
+                       ret = __bch2_trans_do(trans, NULL, NULL,
+                                             BTREE_INSERT_NOFAIL,
+                               bch2_btree_iter_traverse(&iter) ?:
+                               bch2_btree_delete_at(trans, &iter,
+                                       BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE));
+                       if (ret)
+                               break;
+               } else {
+                       ret = snapshot_id_add(&equiv_seen, equiv);
+                       if (ret)
+                               break;
+               }
+
+               bch2_btree_iter_advance(&iter);
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       kfree(equiv_seen.d);
+
+       return ret;
+}
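+
+/* Editor's note: the equiv_seen logic above is a per-position filter: while
+ * walking every snapshot version of the same key (resetting the list
+ * whenever the position changes), the first version seen in a given
+ * equivalence class is kept, and later versions in that class, or versions
+ * in a deleted snapshot, are dropped. A minimal sketch with a fixed-size
+ * list (the real code grows the list dynamically):
+ *
+ *    #include <stdbool.h>
+ *
+ *    #define MAX_SEEN 8
+ *
+ *    struct seen {
+ *            unsigned        d[MAX_SEEN];
+ *            unsigned        nr;     // reset to 0 when the position changes
+ *    };
+ *
+ *    static bool seen_has(const struct seen *s, unsigned id)
+ *    {
+ *            for (unsigned i = 0; i < s->nr; i++)
+ *                    if (s->d[i] == id)
+ *                            return true;
+ *            return false;
+ *    }
+ *
+ *    // returns true if this snapshot version should be deleted
+ *    static bool filter_version(struct seen *s, unsigned equiv,
+ *                               bool snap_deleted)
+ *    {
+ *            if (snap_deleted || seen_has(s, equiv))
+ *                    return true;
+ *
+ *            if (s->nr < MAX_SEEN)
+ *                    s->d[s->nr++] = equiv;
+ *            return false;
+ *    }
+ */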
+
+static void bch2_delete_dead_snapshots_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_snapshot snap;
+       struct snapshot_id_list deleted = { 0 };
+       u32 i, id, children[2];
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       /*
+        * For every snapshot node: If we have no live children and it's not
+        * pointed to by a subvolume, delete it:
+        */
+       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               snap = bkey_s_c_to_snapshot(k);
+               if (BCH_SNAPSHOT_DELETED(snap.v) ||
+                   BCH_SNAPSHOT_SUBVOL(snap.v))
+                       continue;
+
+               children[0] = le32_to_cpu(snap.v->children[0]);
+               children[1] = le32_to_cpu(snap.v->children[1]);
+
+               ret   = snapshot_live(&trans, children[0]) ?:
+                       snapshot_live(&trans, children[1]);
+               if (ret < 0)
+                       break;
+               if (ret)
+                       continue;
+
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       bch2_snapshot_node_set_deleted(&trans, iter.pos.offset));
+               if (ret) {
+                       bch_err(c, "error deleting snapshot %llu: %i", iter.pos.offset, ret);
+                       break;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret) {
+               bch_err(c, "error walking snapshots: %i", ret);
+               goto err;
+       }
+
+       ret = bch2_snapshots_set_equiv(&trans);
+       if (ret)
+               goto err;
+
+       for_each_btree_key(&trans, iter, BTREE_ID_snapshots,
+                          POS_MIN, 0, k, ret) {
+               if (k.k->type != KEY_TYPE_snapshot)
+                       continue;
+
+               snap = bkey_s_c_to_snapshot(k);
+               if (BCH_SNAPSHOT_DELETED(snap.v)) {
+                       ret = snapshot_id_add(&deleted, k.k->p.offset);
+                       if (ret)
+                               break;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       if (ret) {
+               bch_err(c, "error walking snapshots: %i", ret);
+               goto err;
+       }
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               if (!btree_type_has_snapshots(id))
+                       continue;
+
+               ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id);
+               if (ret) {
+                       bch_err(c, "error deleting snapshot keys: %i", ret);
+                       goto err;
+               }
+       }
+
+       for (i = 0; i < deleted.nr; i++) {
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       bch2_snapshot_node_delete(&trans, deleted.d[i]));
+               if (ret) {
+                       bch_err(c, "error deleting snapshot %u: %i",
+                               deleted.d[i], ret);
+                       goto err;
+               }
+       }
+err:
+       kfree(deleted.d);
+       bch2_trans_exit(&trans);
+       percpu_ref_put(&c->writes);
+}
+
+static void bch2_delete_dead_snapshots(struct bch_fs *c)
+{
+       if (unlikely(!percpu_ref_tryget(&c->writes)))
+               return;
+
+       if (!queue_work(system_long_wq, &c->snapshot_delete_work))
+               percpu_ref_put(&c->writes);
+}
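+
+/* Editor's note: the queueing dance here is a common kernel pattern: take a
+ * write reference before queueing so the filesystem cannot go read-only
+ * under the work item, and give the reference back if queue_work() reports
+ * the item was already queued; the work function drops its reference on
+ * completion. A simplified single-threaded sketch, with a plain counter
+ * standing in for the percpu ref:
+ *
+ *    #include <stdbool.h>
+ *
+ *    struct fs {
+ *            int     writes;         // stand-in for percpu_ref &c->writes
+ *            bool    queued;
+ *    };
+ *
+ *    static bool queue_delete(struct fs *c)
+ *    {
+ *            if (c->writes <= 0)     // ~ percpu_ref_tryget() failing
+ *                    return false;
+ *            c->writes++;
+ *
+ *            if (c->queued) {        // ~ queue_work() returning false
+ *                    c->writes--;    // give the ref back, the queued
+ *                    return false;   //   work item already owns one
+ *            }
+ *            c->queued = true;       // work fn does c->writes-- when done
+ *            return true;
+ *    }
+ */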
+
+static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
+                                          struct btree_trans_commit_hook *h)
+{
+       bch2_delete_dead_snapshots(trans->c);
+       return 0;
+}
+
+/* Subvolumes: */
+
+const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0)
+               return "invalid pos";
+
+       if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+               return "invalid pos";
+
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume))
+               return "bad val size";
+
+       return NULL;
+}
+
+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
+                           struct bkey_s_c k)
+{
+       struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+       pr_buf(out, "root %llu snapshot id %u",
+              le64_to_cpu(s.v->inode),
+              le32_to_cpu(s.v->snapshot));
+}
+
+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol,
+                      bool inconsistent_if_not_found,
+                      int iter_flags,
+                      struct bch_subvolume *s)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol),
+                            iter_flags);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT;
+
+       if (ret == -ENOENT && inconsistent_if_not_found)
+               bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol);
+       if (!ret)
+               *s = *bkey_s_c_to_subvolume(k).v;
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
+                            struct bch_subvolume *subvol)
+{
+       struct bch_snapshot snap;
+
+       return  snapshot_lookup(trans, snapshot, &snap) ?:
+               bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
+}
+
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol,
+                               u32 *snapid)
+{
+       struct bch_subvolume s;
+       int ret;
+
+       ret = bch2_subvolume_get(trans, subvol, true,
+                                BTREE_ITER_CACHED|
+                                BTREE_ITER_WITH_UPDATES,
+                                &s);
+
+       *snapid = le32_to_cpu(s.snapshot);
+       return ret;
+}
+
+/*
+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot
+ * deletion/cleanup:
+ */
+int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_subvolume subvol;
+       struct btree_trans_commit_hook *h;
+       struct bkey_i *delete;
+       u32 snapid;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+                            POS(0, subvolid),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_subvolume) {
+               bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+               ret = -EIO;
+               goto err;
+       }
+
+       subvol = bkey_s_c_to_subvolume(k);
+       snapid = le32_to_cpu(subvol.v->snapshot);
+
+       delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+       ret = PTR_ERR_OR_ZERO(delete);
+       if (ret)
+               goto err;
+
+       bkey_init(&delete->k);
+       delete->k.p = iter.pos;
+       ret = bch2_trans_update(trans, &iter, delete, 0);
+       if (ret)
+               goto err;
+
+       ret = bch2_snapshot_node_set_deleted(trans, snapid);
+
+       h = bch2_trans_kmalloc(trans, sizeof(*h));
+       ret = PTR_ERR_OR_ZERO(h);
+       if (ret)
+               goto err;
+
+       h->fn = bch2_delete_dead_snapshots_hook;
+       bch2_trans_commit_hook(trans, h);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs,
+                               snapshot_wait_for_pagecache_and_delete_work);
+       struct snapshot_id_list s;
+       u32 *id;
+       int ret = 0;
+
+       while (!ret) {
+               mutex_lock(&c->snapshots_unlinked_lock);
+               s = c->snapshots_unlinked;
+               memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked));
+               mutex_unlock(&c->snapshots_unlinked_lock);
+
+               if (!s.nr)
+                       break;
+
+               bch2_evict_subvolume_inodes(c, &s);
+
+               for (id = s.d; id < s.d + s.nr; id++) {
+                       ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
+                                     bch2_subvolume_delete(&trans, *id));
+                       if (ret) {
+                               bch_err(c, "error %i deleting subvolume %u", ret, *id);
+                               break;
+                       }
+               }
+
+               kfree(s.d);
+       }
+
+       percpu_ref_put(&c->writes);
+}
+
+struct subvolume_unlink_hook {
+       struct btree_trans_commit_hook  h;
+       u32                             subvol;
+};
+
+int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
+                                                     struct btree_trans_commit_hook *_h)
+{
+       struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h);
+       struct bch_fs *c = trans->c;
+       int ret = 0;
+
+       mutex_lock(&c->snapshots_unlinked_lock);
+       if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol))
+               ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol);
+       mutex_unlock(&c->snapshots_unlinked_lock);
+
+       if (ret)
+               return ret;
+
+       if (unlikely(!percpu_ref_tryget(&c->writes)))
+               return -EROFS;
+
+       if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
+               percpu_ref_put(&c->writes);
+       return 0;
+}
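+
+/* Editor's note: the hook above runs only after a successful transaction
+ * commit; side effects that must not happen on a failed or restarted
+ * transaction (here, queueing the pagecache flush and deletion) are
+ * packaged as callbacks on the transaction. A minimal sketch of the
+ * commit-hook mechanism itself (hypothetical types, not the real
+ * btree_trans):
+ *
+ *    struct commit_hook {
+ *            int                     (*fn)(struct commit_hook *);
+ *            struct commit_hook      *next;
+ *    };
+ *
+ *    struct toy_trans {
+ *            struct commit_hook      *hooks;
+ *    };
+ *
+ *    static void trans_commit_hook(struct toy_trans *t, struct commit_hook *h)
+ *    {
+ *            h->next  = t->hooks;
+ *            t->hooks = h;
+ *    }
+ *
+ *    // called only on the success path of a commit
+ *    static void trans_committed(struct toy_trans *t)
+ *    {
+ *            for (struct commit_hook *h = t->hooks; h; h = h->next)
+ *                    h->fn(h);
+ *    }
+ */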
+
+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_subvolume *n;
+       struct subvolume_unlink_hook *h;
+       int ret = 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes,
+                            POS(0, subvolid),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_subvolume) {
+               bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid);
+               ret = -EIO;
+               goto err;
+       }
+
+       n = bch2_trans_kmalloc(trans, sizeof(*n));
+       ret = PTR_ERR_OR_ZERO(n);
+       if (ret)
+               goto err;
+
+       bkey_reassemble(&n->k_i, k);
+       SET_BCH_SUBVOLUME_UNLINKED(&n->v, true);
+
+       ret = bch2_trans_update(trans, &iter, &n->k_i, 0);
+       if (ret)
+               goto err;
+
+       h = bch2_trans_kmalloc(trans, sizeof(*h));
+       ret = PTR_ERR_OR_ZERO(h);
+       if (ret)
+               goto err;
+
+       h->h.fn         = bch2_subvolume_wait_for_pagecache_and_delete_hook;
+       h->subvol       = subvolid;
+       bch2_trans_commit_hook(trans, &h->h);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+                         u32 src_subvolid,
+                         u32 *new_subvolid,
+                         u32 *new_snapshotid,
+                         bool ro)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL };
+       struct bkey_i_subvolume *new_subvol = NULL;
+       struct bkey_i_subvolume *src_subvol = NULL;
+       struct bkey_s_c k;
+       u32 parent = 0, new_nodes[2], snapshot_subvols[2];
+       int ret = 0;
+
+       for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN,
+                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+               if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0)
+                       break;
+
+               /*
+                * bch2_subvolume_delete() doesn't flush the btree key cache -
+                * ideally it would but that's tricky
+                */
+               if (bkey_deleted(k.k) &&
+                   !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos))
+                       goto found_slot;
+       }
+
+       if (!ret)
+               ret = -ENOSPC;
+       goto err;
+found_slot:
+       snapshot_subvols[0] = dst_iter.pos.offset;
+       snapshot_subvols[1] = src_subvolid;
+
+       if (src_subvolid) {
+               /* Creating a snapshot: */
+               src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol));
+               ret = PTR_ERR_OR_ZERO(src_subvol);
+               if (ret)
+                       goto err;
+
+               bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes,
+                                    POS(0, src_subvolid),
+                                    BTREE_ITER_CACHED|
+                                    BTREE_ITER_INTENT);
+               k = bch2_btree_iter_peek_slot(&src_iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (k.k->type != KEY_TYPE_subvolume) {
+                       bch_err(c, "subvolume %u not found", src_subvolid);
+                       ret = -ENOENT;
+                       goto err;
+               }
+
+               bkey_reassemble(&src_subvol->k_i, k);
+               parent = le32_to_cpu(src_subvol->v.snapshot);
+       }
+
+       ret = bch2_snapshot_node_create(trans, parent, new_nodes,
+                                       snapshot_subvols,
+                                       src_subvolid ? 2 : 1);
+       if (ret)
+               goto err;
+
+       if (src_subvolid) {
+               src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]);
+               ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0);
+               if (ret)
+                       goto err;
+       }
+
+       new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+       ret = PTR_ERR_OR_ZERO(new_subvol);
+       if (ret)
+               goto err;
+
+       bkey_subvolume_init(&new_subvol->k_i);
+       new_subvol->v.flags     = 0;
+       new_subvol->v.snapshot  = cpu_to_le32(new_nodes[0]);
+       new_subvol->v.inode     = cpu_to_le64(inode);
+       SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
+       SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
+       new_subvol->k.p         = dst_iter.pos;
+       ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0);
+       if (ret)
+               goto err;
+
+       *new_subvolid   = new_subvol->k.p.offset;
+       *new_snapshotid = new_nodes[0];
+err:
+       bch2_trans_iter_exit(trans, &src_iter);
+       bch2_trans_iter_exit(trans, &dst_iter);
+       return ret;
+}
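+
+/* Editor's note: structurally, taking a snapshot turns the source
+ * subvolume's snapshot node into an interior parent: two fresh leaf ids
+ * are allocated beneath it, the new subvolume gets one, and the source is
+ * moved onto the other so it keeps writing into a leaf. A tiny sketch of
+ * that re-pointing (illustrative types only):
+ *
+ *    struct subvol {
+ *            unsigned        snapshot;
+ *    };
+ *
+ *    static void snapshot_subvol(struct subvol *src, struct subvol *dst,
+ *                                const unsigned new_leaf[2])
+ *    {
+ *            // new_leaf[] are two fresh ids parented to src's old node
+ *            dst->snapshot = new_leaf[0];    // the snapshot
+ *            src->snapshot = new_leaf[1];    // source writes to a leaf
+ *    }
+ */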
+
+int bch2_fs_subvolumes_init(struct bch_fs *c)
+{
+       INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
+       INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work,
+                 bch2_subvolume_wait_for_pagecache_and_delete);
+       mutex_init(&c->snapshots_unlinked_lock);
+       return 0;
+}
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
new file mode 100644 (file)
index 0000000..4abe53d
--- /dev/null
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_H
+#define _BCACHEFS_SUBVOLUME_H
+
+#include "subvolume_types.h"
+
+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_snapshot (struct bkey_ops) {             \
+       .key_invalid    = bch2_snapshot_invalid,                \
+       .val_to_text    = bch2_snapshot_to_text,                \
+}
+
+int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c,
+                      struct bkey_s_c, unsigned);
+
+static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
+{
+       return genradix_ptr(&c->snapshots, U32_MAX - id);
+}
+
+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
+{
+       return snapshot_t(c, id)->parent;
+}
+
+static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id)
+{
+       struct snapshot_t *s = snapshot_t(c, id);
+
+       return s->children[0] || s->children[1];
+}
+
+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+{
+       struct snapshot_t *s;
+       u32 parent = bch2_snapshot_parent(c, id);
+
+       if (!parent)
+               return 0;
+
+       s = snapshot_t(c, bch2_snapshot_parent(c, id));
+       if (id == s->children[0])
+               return s->children[1];
+       if (id == s->children[1])
+               return s->children[0];
+       return 0;
+}
+
+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
+{
+       while (id && id < ancestor)
+               id = bch2_snapshot_parent(c, id);
+
+       return id == ancestor;
+}
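+
+/* Editor's note: bch2_snapshot_is_ancestor() terminates because parent ids
+ * are strictly greater than child ids (parents are allocated first, from
+ * U32_MAX down), so following parent pointers is a strictly increasing walk
+ * that either lands on the candidate ancestor or climbs past it. The same
+ * loop against a plain array, for illustration:
+ *
+ *    #include <stdbool.h>
+ *
+ *    static bool is_ancestor(const unsigned *parent, unsigned id,
+ *                            unsigned ancestor)
+ *    {
+ *            // parent[id] > id always holds: the walk strictly increases
+ *            while (id && id < ancestor)
+ *                    id = parent[id];
+ *
+ *            return id == ancestor;
+ *    }
+ */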
+
+struct snapshots_seen {
+       struct bpos                     pos;
+       size_t                          nr;
+       size_t                          size;
+       u32                             *d;
+};
+
+static inline void snapshots_seen_exit(struct snapshots_seen *s)
+{
+       kfree(s->d);
+       s->d = NULL;
+}
+
+static inline void snapshots_seen_init(struct snapshots_seen *s)
+{
+       memset(s, 0, sizeof(*s));
+}
+
+static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id)
+{
+       if (s->nr == s->size) {
+               size_t new_size = max(s->size, (size_t) 128) * 2;
+               u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL);
+
+               if (!d) {
+                       bch_err(c, "error reallocating snapshots_seen table (new size %zu)",
+                               new_size);
+                       return -ENOMEM;
+               }
+
+               s->size = new_size;
+               s->d    = d;
+       }
+
+       s->d[s->nr++] = id;
+       return 0;
+}
+
+static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id)
+{
+       unsigned i;
+
+       for (i = 0; i < s->nr; i++)
+               if (id == s->d[i])
+                       return true;
+       return false;
+}
+
+int bch2_fs_snapshots_check(struct bch_fs *);
+void bch2_fs_snapshots_exit(struct bch_fs *);
+int bch2_fs_snapshots_start(struct bch_fs *);
+
+const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_subvolume (struct bkey_ops) {            \
+       .key_invalid    = bch2_subvolume_invalid,               \
+       .val_to_text    = bch2_subvolume_to_text,               \
+}
+
+int bch2_subvolume_get(struct btree_trans *, unsigned,
+                      bool, int, struct bch_subvolume *);
+int bch2_snapshot_get_subvol(struct btree_trans *, u32,
+                            struct bch_subvolume *);
+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
+
+/* only exported for tests: */
+int bch2_snapshot_node_create(struct btree_trans *, u32,
+                             u32 *, u32 *, unsigned);
+
+int bch2_subvolume_delete(struct btree_trans *, u32);
+int bch2_subvolume_unlink(struct btree_trans *, u32);
+int bch2_subvolume_create(struct btree_trans *, u64, u32,
+                         u32 *, u32 *, bool);
+
+int bch2_fs_subvolumes_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_SUBVOLUME_H */
diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h
new file mode 100644 (file)
index 0000000..9410b95
--- /dev/null
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H
+#define _BCACHEFS_SUBVOLUME_TYPES_H
+
+struct snapshot_id_list {
+       u32             nr;
+       u32             size;
+       u32             *d;
+};
+
+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */
index 3903b730bba31bd8f0f61433109a0fe76d4e9095..49dafdad77cd9d4bb7990b9f73687a754970d808 100644 (file)
@@ -27,8 +27,8 @@ const char * const bch2_sb_fields[] = {
        NULL
 };
 
-static const char *bch2_sb_field_validate(struct bch_sb *,
-                                         struct bch_sb_field *);
+static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
+                                 struct printbuf *);
 
 struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
                                      enum bch_sb_field_type type)
@@ -202,22 +202,31 @@ static inline void __bch2_sb_layout_size_assert(void)
        BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
 }
 
-static const char *validate_sb_layout(struct bch_sb_layout *layout)
+static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out)
 {
        u64 offset, prev_offset, max_sectors;
        unsigned i;
 
-       if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
-               return "Not a bcachefs superblock layout";
+       if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) {
+               pr_buf(out, "Not a bcachefs superblock layout");
+               return -EINVAL;
+       }
 
-       if (layout->layout_type != 0)
-               return "Invalid superblock layout type";
+       if (layout->layout_type != 0) {
+               pr_buf(out, "Invalid superblock layout type %u",
+                      layout->layout_type);
+               return -EINVAL;
+       }
 
-       if (!layout->nr_superblocks)
-               return "Invalid superblock layout: no superblocks";
+       if (!layout->nr_superblocks) {
+               pr_buf(out, "Invalid superblock layout: no superblocks");
+               return -EINVAL;
+       }
 
-       if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
-               return "Invalid superblock layout: too many superblocks";
+       if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) {
+               pr_buf(out, "Invalid superblock layout: too many superblocks");
+               return -EINVAL;
+       }
 
        max_sectors = 1 << layout->sb_max_size_bits;
 
@@ -226,126 +235,134 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
        for (i = 1; i < layout->nr_superblocks; i++) {
                offset = le64_to_cpu(layout->sb_offset[i]);
 
-               if (offset < prev_offset + max_sectors)
-                       return "Invalid superblock layout: superblocks overlap";
+               if (offset < prev_offset + max_sectors) {
+                       pr_buf(out, "Invalid superblock layout: superblocks overlap\n"
+                              "  (sb %u ends at %llu next starts at %llu)",
+                              i - 1, prev_offset + max_sectors, offset);
+                       return -EINVAL;
+               }
                prev_offset = offset;
        }
 
-       return NULL;
+       return 0;
 }
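This hunk shows the conversion pattern applied throughout the commit: validators that used to return a static const char * now take a caller-supplied printbuf and return a negative errno, so messages can carry formatted details (offsets, indices, limits) instead of fixed text. The new shape, sketched with snprintf() into a plain buffer and a hypothetical WIDGET_MAGIC:

    #include <errno.h>
    #include <stdio.h>

    #define WIDGET_MAGIC 0xCAFEu        /* hypothetical */

    struct widget { unsigned magic; };

    /* new shape: errno return, message written into the caller's buffer */
    static int validate_widget(const struct widget *w, char *err, size_t len)
    {
            if (w->magic != WIDGET_MAGIC) {
                    snprintf(err, len, "bad magic %x (expected %x)",
                             w->magic, WIDGET_MAGIC);
                    return -EINVAL;
            }
            return 0;
    }

    int main(void)
    {
            struct widget w = { 0xBEEF };
            char err[80];

            if (validate_widget(&w, err, sizeof(err)))
                    puts(err);
            return 0;
    }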
 
-const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out)
 {
        struct bch_sb *sb = disk_sb->sb;
        struct bch_sb_field *f;
        struct bch_sb_field_members *mi;
-       const char *err;
        u32 version, version_min;
        u16 block_size;
+       int ret;
 
        version         = le16_to_cpu(sb->version);
        version_min     = version >= bcachefs_metadata_version_new_versioning
                ? le16_to_cpu(sb->version_min)
                : version;
 
-       if (version    >= bcachefs_metadata_version_max ||
-           version_min < bcachefs_metadata_version_min)
-               return "Unsupported superblock version";
+       if (version    >= bcachefs_metadata_version_max) {
+               pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+                      version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+               return -EINVAL;
+       }
+
+       if (version_min < bcachefs_metadata_version_min) {
+               pr_buf(out, "Unsupported superblock version %u (min %u, max %u)",
+                      version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+               return -EINVAL;
+       }
 
-       if (version_min > version)
-               return "Bad minimum version";
+       if (version_min > version) {
+               pr_buf(out, "Bad minimum version %u, greater than version field %u",
+                      version_min, version);
+               return -EINVAL;
+       }
 
        if (sb->features[1] ||
-           (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR)))
-               return "Filesystem has incompatible features";
+           (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) {
+               pr_buf(out, "Filesystem has incompatible features");
+               return -EINVAL;
+       }
 
        block_size = le16_to_cpu(sb->block_size);
 
-       if (!is_power_of_2(block_size) ||
-           block_size > PAGE_SECTORS)
-               return "Bad block size";
+       if (block_size > PAGE_SECTORS) {
+               pr_buf(out, "Block size too big (got %u, max %u)",
+                      block_size, PAGE_SECTORS);
+               return -EINVAL;
+       }
 
-       if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
-               return "Bad user UUID";
+       if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) {
+               pr_buf(out, "Bad user UUID (got zeroes)");
+               return -EINVAL;
+       }
 
-       if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
-               return "Bad internal UUID";
+       if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) {
+               pr_buf(out, "Bad internal UUID (got zeroes)");
+               return -EINVAL;
+       }
 
        if (!sb->nr_devices ||
-           sb->nr_devices <= sb->dev_idx ||
-           sb->nr_devices > BCH_SB_MEMBERS_MAX)
-               return "Bad number of member devices";
-
-       if (!BCH_SB_META_REPLICAS_WANT(sb) ||
-           BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
-               return "Invalid number of metadata replicas";
-
-       if (!BCH_SB_META_REPLICAS_REQ(sb) ||
-           BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
-               return "Invalid number of metadata replicas";
-
-       if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
-           BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
-               return "Invalid number of data replicas";
-
-       if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
-           BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
-               return "Invalid number of data replicas";
-
-       if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
-               return "Invalid metadata checksum type";
-
-       if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
-               return "Invalid metadata checksum type";
-
-       if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
-               return "Invalid compression type";
-
-       if (!BCH_SB_BTREE_NODE_SIZE(sb))
-               return "Btree node size not set";
-
-       if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
-               return "Btree node size not a power of two";
+           sb->nr_devices > BCH_SB_MEMBERS_MAX) {
+               pr_buf(out, "Bad number of member devices %u (max %u)",
+                      sb->nr_devices, BCH_SB_MEMBERS_MAX);
+               return -EINVAL;
+       }
 
-       if (BCH_SB_GC_RESERVE(sb) < 5)
-               return "gc reserve percentage too small";
+       if (sb->dev_idx >= sb->nr_devices) {
+               pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)",
+                      sb->dev_idx, sb->nr_devices);
+               return -EINVAL;
+       }
 
        if (!sb->time_precision ||
-           le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
-               return "invalid time precision";
+           le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) {
+               pr_buf(out, "Invalid time precision: %u (min 1, max %lu)",
+                      le32_to_cpu(sb->time_precision), NSEC_PER_SEC);
+               return -EINVAL;
+       }
 
        /* validate layout */
-       err = validate_sb_layout(&sb->layout);
-       if (err)
-               return err;
+       ret = validate_sb_layout(&sb->layout, out);
+       if (ret)
+               return ret;
 
        vstruct_for_each(sb, f) {
-               if (!f->u64s)
-                       return "Invalid superblock: invalid optional field";
+               if (!f->u64s) {
+                       pr_buf(out, "Invalid superblock: optional field with size 0 (type %u)",
+                              le32_to_cpu(f->type));
+                       return -EINVAL;
+               }
 
-               if (vstruct_next(f) > vstruct_last(sb))
-                       return "Invalid superblock: invalid optional field";
+               if (vstruct_next(f) > vstruct_last(sb)) {
+                       pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)",
+                              le32_to_cpu(f->type));
+                       return -EINVAL;
+               }
        }
 
        /* members must be validated first: */
        mi = bch2_sb_get_members(sb);
-       if (!mi)
-               return "Invalid superblock: member info area missing";
+       if (!mi) {
+               pr_buf(out, "Invalid superblock: member info area missing");
+               return -EINVAL;
+       }
 
-       err = bch2_sb_field_validate(sb, &mi->field);
-       if (err)
-               return err;
+       ret = bch2_sb_field_validate(sb, &mi->field, out);
+       if (ret)
+               return ret;
 
        vstruct_for_each(sb, f) {
                if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
                        continue;
 
-               err = bch2_sb_field_validate(sb, f);
-               if (err)
-                       return err;
+               ret = bch2_sb_field_validate(sb, f, out);
+               if (ret)
+                       return ret;
        }
 
-       return NULL;
+       return 0;
 }
 
 /* device open: */
@@ -366,7 +383,6 @@ static void bch2_sb_update(struct bch_fs *c)
        c->sb.nr_devices        = src->nr_devices;
        c->sb.clean             = BCH_SB_CLEAN(src);
        c->sb.encryption_type   = BCH_SB_ENCRYPTION_TYPE(src);
-       c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
 
        c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
        c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
@@ -439,10 +455,8 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
 
        __copy_super(&c->disk_sb, src);
 
-       if (BCH_SB_HAS_ERRORS(c->disk_sb.sb))
-               set_bit(BCH_FS_ERROR, &c->flags);
-       if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))
-               set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+       if (BCH_SB_INITIALIZED(c->disk_sb.sb))
+               set_bit(BCH_FS_INITIALIZED, &c->flags);
 
        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret)
@@ -477,10 +491,12 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
 
 /* read superblock: */
 
-static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
+static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
 {
        struct bch_csum csum;
+       u32 version, version_min;
        size_t bytes;
+       int ret;
 reread:
        bio_reset(sb->bio);
        bio_set_dev(sb->bio, sb->bdev);
@@ -488,40 +504,65 @@ reread:
        bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
 
-       if (submit_bio_wait(sb->bio))
-               return "IO error";
+       ret = submit_bio_wait(sb->bio);
+       if (ret) {
+               pr_buf(err, "IO error: %i", ret);
+               return ret;
+       }
 
-       if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
-               return "Not a bcachefs superblock";
+       if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) {
+               pr_buf(err, "Not a bcachefs superblock");
+               return -EINVAL;
+       }
 
-       if (le16_to_cpu(sb->sb->version) <  bcachefs_metadata_version_min ||
-           le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max)
-               return "Unsupported superblock version";
+       version         = le16_to_cpu(sb->sb->version);
+       version_min     = version >= bcachefs_metadata_version_new_versioning
+               ? le16_to_cpu(sb->sb->version_min)
+               : version;
+
+       if (version    >= bcachefs_metadata_version_max) {
+               pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+                      version, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+               return -EINVAL;
+       }
+
+       if (version_min < bcachefs_metadata_version_min) {
+               pr_buf(err, "Unsupported superblock version %u (min %u, max %u)",
+                      version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max);
+               return -EINVAL;
+       }
 
        bytes = vstruct_bytes(sb->sb);
 
-       if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
-               return "Bad superblock: too big";
+       if (bytes > 512 << sb->sb->layout.sb_max_size_bits) {
+               pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)",
+                      bytes, 512UL << sb->sb->layout.sb_max_size_bits);
+               return -EINVAL;
+       }
 
        if (bytes > sb->buffer_size) {
                if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
-                       return "cannot allocate memory";
+                       return -ENOMEM;
                goto reread;
        }
 
-       if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
-               return "unknown csum type";
+       if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+               pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
+               return -EINVAL;
+       }
 
        /* XXX: verify MACs */
        csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
                            null_nonce(), sb->sb);
 
-       if (bch2_crc_cmp(csum, sb->sb->csum))
-               return "bad checksum reading superblock";
+       if (bch2_crc_cmp(csum, sb->sb->csum)) {
+               pr_buf(err, "bad checksum");
+               return -EINVAL;
+       }
 
        sb->seq = le64_to_cpu(sb->sb->seq);
 
-       return NULL;
+       return 0;
 }
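read_one_super() reads into a fixed-size buffer first and only learns the superblock's real size from the data it just read; when vstruct_bytes() exceeds the buffer, it reallocates and jumps back to reread. The same grow-and-reread loop in standalone form (read_blob() and its 4-byte length prefix are invented for the sketch; little-endian assumed for the toy prefix):

    #include <stdlib.h>
    #include <string.h>

    /* pretend device: the first 4 bytes encode the record's full length */
    static int read_blob(const unsigned char *dev, void *buf, size_t len)
    {
            memcpy(buf, dev, len);
            return 0;
    }

    static void *read_record(const unsigned char *dev)
    {
            size_t buf_size = 8;
            unsigned char *buf = malloc(buf_size);

            while (buf) {
                    unsigned need;
                    unsigned char *tmp;

                    if (read_blob(dev, buf, buf_size))
                            break;
                    memcpy(&need, buf, sizeof(need));   /* claimed size */
                    if (need <= buf_size)
                            return buf;                 /* whole record read */
                    tmp = realloc(buf, need);           /* grow, then reread */
                    if (!tmp)
                            break;
                    buf = tmp;
                    buf_size = need;
            }
            free(buf);
            return NULL;
    }

    int main(void)
    {
            unsigned char dev[64] = { 24 };     /* record claims 24 bytes */
            void *rec = read_record(dev);

            free(rec);
            return rec ? 0 : 1;
    }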
 
 int bch2_read_super(const char *path, struct bch_opts *opts,
@@ -529,10 +570,16 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
 {
        u64 offset = opt_get(*opts, sb);
        struct bch_sb_layout layout;
-       const char *err;
+       char *_err;
+       struct printbuf err;
        __le64 *i;
        int ret;
 
+       _err = kmalloc(4096, GFP_KERNEL);
+       if (!_err)
+               return -ENOMEM;
+       err = _PBUF(_err, 4096);
+
        pr_verbose_init(*opts, "");
 
        memset(sb, 0, sizeof(*sb));
@@ -561,25 +608,28 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
                goto out;
        }
 
-       err = "cannot allocate memory";
        ret = bch2_sb_realloc(sb, 0);
-       if (ret)
+       if (ret) {
+               pr_buf(&err, "error allocating memory for superblock");
                goto err;
+       }
 
-       ret = -EFAULT;
-       err = "dynamic fault";
-       if (bch2_fs_init_fault("read_super"))
+       if (bch2_fs_init_fault("read_super")) {
+               pr_buf(&err, "dynamic fault");
+               ret = -EFAULT;
                goto err;
+       }
 
-       ret = -EINVAL;
-       err = read_one_super(sb, offset);
-       if (!err)
+       ret = read_one_super(sb, offset, &err);
+       if (!ret)
                goto got_super;
 
        if (opt_defined(*opts, sb))
                goto err;
 
-       pr_err("error reading default superblock: %s", err);
+       printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s",
+              path, _err);
+       err = _PBUF(_err, 4096);
 
        /*
         * Error reading primary superblock - read location of backup
@@ -595,13 +645,15 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
         */
        bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
 
-       err = "IO error";
-       if (submit_bio_wait(sb->bio))
+       ret = submit_bio_wait(sb->bio);
+       if (ret) {
+               pr_buf(&err, "IO error: %i", ret);
                goto err;
+       }
 
        memcpy(&layout, sb->sb, sizeof(layout));
-       err = validate_sb_layout(&layout);
-       if (err)
+       ret = validate_sb_layout(&layout, &err);
+       if (ret)
                goto err;
 
        for (i = layout.sb_offset;
@@ -611,29 +663,41 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
                if (offset == opt_get(*opts, sb))
                        continue;
 
-               err = read_one_super(sb, offset);
-               if (!err)
+               ret = read_one_super(sb, offset, &err);
+               if (!ret)
                        goto got_super;
        }
 
-       ret = -EINVAL;
        goto err;
 
 got_super:
-       err = "Superblock block size smaller than device block size";
-       ret = -EINVAL;
        if (le16_to_cpu(sb->sb->block_size) << 9 <
-           bdev_logical_block_size(sb->bdev))
+           bdev_logical_block_size(sb->bdev)) {
+               pr_buf(&err, "block size (%u) smaller than device block size (%u)",
+                      le16_to_cpu(sb->sb->block_size) << 9,
+                      bdev_logical_block_size(sb->bdev));
+               ret = -EINVAL;
                goto err;
+       }
 
        ret = 0;
        sb->have_layout = true;
+
+       ret = bch2_sb_validate(sb, &err);
+       if (ret) {
+               printk(KERN_ERR "bcachefs (%s): error validating superblock: %s",
+                      path, _err);
+               goto err_no_print;
+       }
 out:
        pr_verbose_init(*opts, "ret %i", ret);
+       kfree(_err);
        return ret;
 err:
+       printk(KERN_ERR "bcachefs (%s): error reading superblock: %s",
+              path, _err);
+err_no_print:
        bch2_free_super(sb);
-       pr_err("error reading superblock: %s", err);
        goto out;
 }
 
@@ -706,7 +770,6 @@ int bch2_write_super(struct bch_fs *c)
        struct closure *cl = &c->sb_write;
        struct bch_dev *ca;
        unsigned i, sb = 0, nr_wrote;
-       const char *err;
        struct bch_devs_mask sb_written;
        bool wrote, can_mount_without_written, can_mount_with_written;
        unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
@@ -733,10 +796,19 @@ int bch2_write_super(struct bch_fs *c)
                bch2_sb_from_fs(c, ca);
 
        for_each_online_member(ca, c, i) {
-               err = bch2_sb_validate(&ca->disk_sb);
-               if (err) {
-                       bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
-                       ret = -1;
+               struct printbuf buf = { NULL, NULL };
+
+               ret = bch2_sb_validate(&ca->disk_sb, &buf);
+               if (ret) {
+                       char *_buf = kmalloc(4096, GFP_NOFS);
+                       if (_buf) {
+                               buf = _PBUF(_buf, 4096);
+                               bch2_sb_validate(&ca->disk_sb, &buf);
+                       }
+
+                       bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf);
+                       kfree(_buf);
+                       percpu_ref_put(&ca->io_ref);
                        goto out;
                }
        }
@@ -754,11 +826,24 @@ int bch2_write_super(struct bch_fs *c)
        closure_sync(cl);
 
        for_each_online_member(ca, c, i) {
-               if (!ca->sb_write_error &&
-                   ca->disk_sb.seq !=
-                   le64_to_cpu(ca->sb_read_scratch->seq)) {
+               if (ca->sb_write_error)
+                       continue;
+
+               if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
+                       bch2_fs_fatal_error(c,
+                               "Superblock write was silently dropped! (seq %llu expected %llu)",
+                               le64_to_cpu(ca->sb_read_scratch->seq),
+                               ca->disk_sb.seq);
+                       percpu_ref_put(&ca->io_ref);
+                       ret = -EROFS;
+                       goto out;
+               }
+
+               if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
                        bch2_fs_fatal_error(c,
-                               "Superblock modified by another process");
+                               "Superblock modified by another process (seq %llu expected %llu)",
+                               le64_to_cpu(ca->sb_read_scratch->seq),
+                               ca->disk_sb.seq);
                        percpu_ref_put(&ca->io_ref);
                        ret = -EROFS;
                        goto out;
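Where the old check collapsed any sequence-number mismatch into "modified by another process", the new code reads the direction of the mismatch: an on-disk seq below what we wrote means the device silently dropped our write, and a seq above it means another process wrote after us. The same logic as a pure function:

    #include <stdio.h>

    static const char *diagnose(unsigned long long on_disk,
                                unsigned long long expected)
    {
            if (on_disk < expected)
                    return "superblock write was silently dropped";
            if (on_disk > expected)
                    return "superblock modified by another process";
            return "ok";
    }

    int main(void)
    {
            printf("%s\n", diagnose(4, 5));     /* dropped write */
            printf("%s\n", diagnose(6, 5));     /* concurrent writer */
            return 0;
    }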
@@ -807,7 +892,8 @@ int bch2_write_super(struct bch_fs *c)
                                 !can_mount_with_written ||
                                 (can_mount_without_written &&
                                  !can_mount_with_written), c,
-               "Unable to write superblock to sufficient devices"))
+               "Unable to write superblock to sufficient devices (from %ps)",
+               (void *) _RET_IP_))
                ret = -1;
 out:
        /* Make new options visible after they're persistent: */
@@ -835,54 +921,57 @@ static int u64_cmp(const void *_l, const void *_r)
        return l < r ? -1 : l > r ? 1 : 0;
 }
 
-static const char *bch2_sb_validate_journal(struct bch_sb *sb,
-                                           struct bch_sb_field *f)
+static int bch2_sb_validate_journal(struct bch_sb *sb,
+                                   struct bch_sb_field *f,
+                                   struct printbuf *err)
 {
        struct bch_sb_field_journal *journal = field_to_type(f, journal);
        struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
-       const char *err;
+       int ret = -EINVAL;
        unsigned nr;
        unsigned i;
        u64 *b;
 
-       journal = bch2_sb_get_journal(sb);
-       if (!journal)
-               return NULL;
-
        nr = bch2_nr_journal_buckets(journal);
        if (!nr)
-               return NULL;
+               return 0;
 
        b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
        if (!b)
-               return "cannot allocate memory";
+               return -ENOMEM;
 
        for (i = 0; i < nr; i++)
                b[i] = le64_to_cpu(journal->buckets[i]);
 
        sort(b, nr, sizeof(u64), u64_cmp, NULL);
 
-       err = "journal bucket at sector 0";
-       if (!b[0])
+       if (!b[0]) {
+               pr_buf(err, "journal bucket at sector 0");
                goto err;
+       }
 
-       err = "journal bucket before first bucket";
-       if (m && b[0] < le16_to_cpu(m->first_bucket))
+       if (b[0] < le16_to_cpu(m->first_bucket)) {
+               pr_buf(err, "journal bucket %llu before first bucket %u",
+                      b[0], le16_to_cpu(m->first_bucket));
                goto err;
+       }
 
-       err = "journal bucket past end of device";
-       if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
+       if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+               pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+                      b[nr - 1], le64_to_cpu(m->nbuckets));
                goto err;
+       }
 
-       err = "duplicate journal buckets";
        for (i = 0; i + 1 < nr; i++)
-               if (b[i] == b[i + 1])
+               if (b[i] == b[i + 1]) {
+                       pr_buf(err, "duplicate journal bucket %llu", b[i]);
                        goto err;
+               }
 
-       err = NULL;
+       ret = 0;
 err:
        kfree(b);
-       return err;
+       return ret;
 }
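bch2_sb_validate_journal() copies the bucket numbers, sorts them, and then needs only one linear pass: the smallest and largest entries bound-check the whole set, and any duplicates must be adjacent after sorting. A standalone equivalent using qsort():

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    static int u64_cmp(const void *l, const void *r)
    {
            uint64_t a = *(const uint64_t *)l, b = *(const uint64_t *)r;

            return a < b ? -1 : a > b ? 1 : 0;
    }

    /* 0 if every bucket is in [first, nbuckets) and none repeats */
    static int check_buckets(const uint64_t *buckets, unsigned nr,
                             uint64_t first, uint64_t nbuckets)
    {
            uint64_t *b;
            int ret = -1;
            unsigned i;

            if (!nr)
                    return 0;
            b = malloc(nr * sizeof(*b));
            if (!b)
                    return -1;
            memcpy(b, buckets, nr * sizeof(*b));
            qsort(b, nr, sizeof(*b), u64_cmp);

            if (!b[0] || b[0] < first || b[nr - 1] >= nbuckets)
                    goto out;
            for (i = 0; i + 1 < nr; i++)
                    if (b[i] == b[i + 1])
                            goto out;               /* duplicate */
            ret = 0;
    out:
            free(b);
            return ret;
    }

    int main(void)
    {
            const uint64_t buckets[] = { 40, 12, 12, 90 };

            return check_buckets(buckets, 4, 8, 100) ? 0 : 1;  /* dup: fails */
    }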
 
 static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
@@ -891,39 +980,54 @@ static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
 
 /* BCH_SB_FIELD_members: */
 
-static const char *bch2_sb_validate_members(struct bch_sb *sb,
-                                           struct bch_sb_field *f)
+static int bch2_sb_validate_members(struct bch_sb *sb,
+                                   struct bch_sb_field *f,
+                                   struct printbuf *err)
 {
        struct bch_sb_field_members *mi = field_to_type(f, members);
-       struct bch_member *m;
+       unsigned i;
 
        if ((void *) (mi->members + sb->nr_devices) >
-           vstruct_end(&mi->field))
-               return "Invalid superblock: bad member info";
+           vstruct_end(&mi->field)) {
+               pr_buf(err, "too many devices for section size");
+               return -EINVAL;
+       }
+
+       for (i = 0; i < sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
 
-       for (m = mi->members;
-            m < mi->members + sb->nr_devices;
-            m++) {
                if (!bch2_member_exists(m))
                        continue;
 
-               if (le64_to_cpu(m->nbuckets) > LONG_MAX)
-                       return "Too many buckets";
+               if (le64_to_cpu(m->nbuckets) > LONG_MAX) {
+                       pr_buf(err, "device %u: too many buckets (got %llu, max %lu)",
+                              i, le64_to_cpu(m->nbuckets), LONG_MAX);
+                       return -EINVAL;
+               }
 
                if (le64_to_cpu(m->nbuckets) -
-                   le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS)
-                       return "Not enough buckets";
+                   le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) {
+                       pr_buf(err, "device %u: not enough buckets (got %llu, min %u)",
+                              i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS);
+                       return -EINVAL;
+               }
 
                if (le16_to_cpu(m->bucket_size) <
-                   le16_to_cpu(sb->block_size))
-                       return "bucket size smaller than block size";
+                   le16_to_cpu(sb->block_size)) {
+                       pr_buf(err, "device %u: bucket size %u smaller than block size %u",
+                              i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size));
+                       return -EINVAL;
+               }
 
                if (le16_to_cpu(m->bucket_size) <
-                   BCH_SB_BTREE_NODE_SIZE(sb))
-                       return "bucket size smaller than btree node size";
+                   BCH_SB_BTREE_NODE_SIZE(sb)) {
+                       pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu",
+                              i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb));
+                       return -EINVAL;
+               }
        }
 
-       return NULL;
+       return 0;
 }
 
 static const struct bch_sb_field_ops bch_sb_field_ops_members = {
@@ -932,18 +1036,24 @@ static const struct bch_sb_field_ops bch_sb_field_ops_members = {
 
 /* BCH_SB_FIELD_crypt: */
 
-static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
-                                         struct bch_sb_field *f)
+static int bch2_sb_validate_crypt(struct bch_sb *sb,
+                                 struct bch_sb_field *f,
+                                 struct printbuf *err)
 {
        struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
 
-       if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
-               return "invalid field crypt: wrong size";
+       if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) {
+               pr_buf(err, "wrong size (got %llu should be %zu)",
+                      vstruct_bytes(&crypt->field), sizeof(*crypt));
+               return -EINVAL;
+       }
 
-       if (BCH_CRYPT_KDF_TYPE(crypt))
-               return "invalid field crypt: bad kdf type";
+       if (BCH_CRYPT_KDF_TYPE(crypt)) {
+               pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
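Easy to miss in this hunk: the crypt section's size check is relaxed from != sizeof(*crypt) to < sizeof(*crypt), so a section larger than the struct this code knows about now validates, which lets an older reader accept a superblock whose crypt section grew in a later revision. The check in isolation:

    #include <stdio.h>

    /* 0 if a section of `bytes` can back a struct needing `need` bytes */
    static int check_section_size(unsigned long bytes, unsigned long need)
    {
            return bytes < need ? -22 /* -EINVAL */ : 0;
    }

    int main(void)
    {
            printf("%d\n", check_section_size(32, 32)); /* exact size: ok  */
            printf("%d\n", check_section_size(48, 32)); /* larger: now ok  */
            printf("%d\n", check_section_size(16, 32)); /* too small: fail */
            return 0;
    }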
 
 static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
@@ -1028,7 +1138,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
                                     struct jset_entry_usage, entry);
 
                u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = FS_USAGE_INODES;
+               u->entry.btree_id = BCH_FS_USAGE_inodes;
                u->v            = cpu_to_le64(c->usage_base->nr_inodes);
        }
 
@@ -1038,7 +1148,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
                                     struct jset_entry_usage, entry);
 
                u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = FS_USAGE_KEY_VERSION;
+               u->entry.btree_id = BCH_FS_USAGE_key_version;
                u->v            = cpu_to_le64(atomic64_read(&c->key_version));
        }
 
@@ -1048,7 +1158,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
                                     struct jset_entry_usage, entry);
 
                u->entry.type   = BCH_JSET_ENTRY_usage;
-               u->entry.btree_id = FS_USAGE_RESERVED;
+               u->entry.btree_id = BCH_FS_USAGE_reserved;
                u->entry.level  = i;
                u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
        }
@@ -1152,15 +1262,19 @@ out:
        mutex_unlock(&c->sb_lock);
 }
 
-static const char *bch2_sb_validate_clean(struct bch_sb *sb,
-                                         struct bch_sb_field *f)
+static int bch2_sb_validate_clean(struct bch_sb *sb,
+                                 struct bch_sb_field *f,
+                                 struct printbuf *err)
 {
        struct bch_sb_field_clean *clean = field_to_type(f, clean);
 
-       if (vstruct_bytes(&clean->field) < sizeof(*clean))
-               return "invalid field crypt: wrong size";
+       if (vstruct_bytes(&clean->field) < sizeof(*clean)) {
+               pr_buf(err, "wrong size (got %llu should be %zu)",
+                      vstruct_bytes(&clean->field), sizeof(*clean));
+               return -EINVAL;
+       }
 
-       return NULL;
+       return 0;
 }
 
 static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
@@ -1174,14 +1288,26 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
 #undef x
 };
 
-static const char *bch2_sb_field_validate(struct bch_sb *sb,
-                                         struct bch_sb_field *f)
+static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
+                                 struct printbuf *orig_err)
 {
        unsigned type = le32_to_cpu(f->type);
+       struct printbuf err = *orig_err;
+       int ret;
 
-       return type < BCH_SB_FIELD_NR
-               ? bch2_sb_field_ops[type]->validate(sb, f)
-               : NULL;
+       if (type >= BCH_SB_FIELD_NR)
+               return 0;
+
+       pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]);
+
+       ret = bch2_sb_field_ops[type]->validate(sb, f, &err);
+       if (ret) {
+               pr_buf(&err, "\n");
+               bch2_sb_field_to_text(&err, sb, f);
+               *orig_err = err;
+       }
+
+       return ret;
 }
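bch2_sb_field_validate() copies the caller's printbuf by value, writes the "Invalid superblock section" prefix into the copy, and assigns the copy back only on failure; on success the prefix evaporates because the caller's write position never advanced. The trick modelled with a bare position-tracking buffer:

    #include <stdio.h>
    #include <string.h>

    struct buf { char *pos, *end; };

    static void buf_puts(struct buf *b, const char *s)
    {
            size_t n = strlen(s);

            if (b->pos + n < b->end) {
                    memcpy(b->pos, s, n);
                    b->pos += n;
                    *b->pos = '\0';
            }
    }

    static int validate_with_prefix(struct buf *orig, int (*fn)(struct buf *))
    {
            struct buf b = *orig;       /* speculative copy */
            int ret;

            buf_puts(&b, "Invalid superblock section: ");
            ret = fn(&b);
            if (ret)
                    *orig = b;          /* commit prefix + message */
            return ret;                 /* on success the prefix is dropped */
    }

    static int failing_check(struct buf *b)
    {
            buf_puts(b, "bad field");
            return -22;                 /* -EINVAL */
    }

    int main(void)
    {
            char mem[128] = "";
            struct buf b = { mem, mem + sizeof(mem) };

            validate_with_prefix(&b, failing_check);
            puts(mem);  /* "Invalid superblock section: bad field" */
            return 0;
    }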
 
 void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
index b64ac2fbbf8bde6fdaff7f258fc84de88ba9948d..3b425bed17c48c51a552ae0e2b8e73bde01a91c3 100644 (file)
@@ -38,9 +38,8 @@ BCH_SB_FIELDS()
 extern const char * const bch2_sb_fields[];
 
 struct bch_sb_field_ops {
-       const char *    (*validate)(struct bch_sb *, struct bch_sb_field *);
-       void            (*to_text)(struct printbuf *, struct bch_sb *,
-                                  struct bch_sb_field *);
+       int     (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *);
+       void    (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *);
 };
 
 static inline __le64 bch2_sb_magic(struct bch_fs *c)
@@ -66,8 +65,6 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
 void bch2_free_super(struct bch_sb_handle *);
 int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
 
-const char *bch2_sb_validate(struct bch_sb_handle *);
-
 int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
 int bch2_write_super(struct bch_fs *);
 void __bch2_check_set_feature(struct bch_fs *, unsigned);
@@ -110,7 +107,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
                .bucket_size    = le16_to_cpu(mi->bucket_size),
                .group          = BCH_MEMBER_GROUP(mi),
                .state          = BCH_MEMBER_STATE(mi),
-               .replacement    = BCH_MEMBER_REPLACEMENT(mi),
                .discard        = BCH_MEMBER_DISCARD(mi),
                .data_allowed   = BCH_MEMBER_DATA_ALLOWED(mi),
                .durability     = BCH_MEMBER_DURABILITY(mi)
index ce8e5d4843d0e8f4bd9a596d0421b9a04860bd4b..b36e6216a8a10a2ec12b0de39ba954424d81d45d 100644 (file)
@@ -16,6 +16,7 @@
 #include "btree_key_cache.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
+#include "buckets_waiting_for_journal.h"
 #include "chardev.h"
 #include "checksum.h"
 #include "clock.h"
@@ -39,6 +40,7 @@
 #include "rebalance.h"
 #include "recovery.h"
 #include "replicas.h"
+#include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
 #include "sysfs.h"
@@ -404,13 +406,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
        if (ret)
                goto err;
 
-       /*
-        * We need to write out a journal entry before we start doing btree
-        * updates, to ensure that on unclean shutdown new journal blacklist
-        * entries are created:
-        */
-       bch2_journal_meta(&c->journal);
-
        clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
 
        for_each_rw_member(ca, c, i)
@@ -468,11 +463,13 @@ static void __bch2_fs_free(struct bch_fs *c)
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_exit(&c->times[i]);
 
+       bch2_fs_snapshots_exit(c);
        bch2_fs_quota_exit(c);
        bch2_fs_fsio_exit(c);
        bch2_fs_ec_exit(c);
        bch2_fs_encryption_exit(c);
        bch2_fs_io_exit(c);
+       bch2_fs_buckets_waiting_for_journal_exit(c);
        bch2_fs_btree_interior_update_exit(c);
        bch2_fs_btree_iter_exit(c);
        bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
@@ -486,12 +483,12 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_journal_entries_free(&c->journal_entries);
        percpu_free_rwsem(&c->mark_lock);
 
-       if (c->btree_iters_bufs)
+       if (c->btree_paths_bufs)
                for_each_possible_cpu(cpu)
-                       kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
+                       kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path);
 
        free_percpu(c->online_reserved);
-       free_percpu(c->btree_iters_bufs);
+       free_percpu(c->btree_paths_bufs);
        free_percpu(c->pcpu);
        mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
@@ -593,48 +590,53 @@ void bch2_fs_stop(struct bch_fs *c)
        bch2_fs_free(c);
 }
 
-static const char *bch2_fs_online(struct bch_fs *c)
+static int bch2_fs_online(struct bch_fs *c)
 {
        struct bch_dev *ca;
-       const char *err = NULL;
        unsigned i;
-       int ret;
+       int ret = 0;
 
        lockdep_assert_held(&bch_fs_list_lock);
 
-       if (!list_empty(&c->list))
-               return NULL;
-
-       if (__bch2_uuid_to_fs(c->sb.uuid))
-               return "filesystem UUID already open";
+       if (__bch2_uuid_to_fs(c->sb.uuid)) {
+               bch_err(c, "filesystem UUID already open");
+               return -EINVAL;
+       }
 
        ret = bch2_fs_chardev_init(c);
-       if (ret)
-               return "error creating character device";
+       if (ret) {
+               bch_err(c, "error creating character device");
+               return ret;
+       }
 
        bch2_fs_debug_init(c);
 
-       if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
-           kobject_add(&c->internal, &c->kobj, "internal") ||
-           kobject_add(&c->opts_dir, &c->kobj, "options") ||
-           kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
-           bch2_opts_create_sysfs_files(&c->opts_dir))
-               return "error creating sysfs objects";
+       ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
+           kobject_add(&c->internal, &c->kobj, "internal") ?:
+           kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+           kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+           bch2_opts_create_sysfs_files(&c->opts_dir);
+       if (ret) {
+               bch_err(c, "error creating sysfs objects");
+               return ret;
+       }
 
        down_write(&c->state_lock);
 
-       err = "error creating sysfs objects";
-       for_each_member_device(ca, c, i)
-               if (bch2_dev_sysfs_online(c, ca)) {
+       for_each_member_device(ca, c, i) {
+               ret = bch2_dev_sysfs_online(c, ca);
+               if (ret) {
+                       bch_err(c, "error creating sysfs objects");
                        percpu_ref_put(&ca->ref);
                        goto err;
                }
+       }
 
+       BUG_ON(!list_empty(&c->list));
        list_add(&c->list, &bch_fs_list);
-       err = NULL;
 err:
        up_write(&c->state_lock);
-       return err;
+       return ret;
 }
 
 static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
@@ -642,13 +644,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        struct bch_sb_field_members *mi;
        struct bch_fs *c;
        unsigned i, iter_size;
-       const char *err;
+       int ret = 0;
 
        pr_verbose_init(opts, "");
 
        c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
-       if (!c)
+       if (!c) {
+               c = ERR_PTR(-ENOMEM);
                goto out;
+       }
 
        __module_get(THIS_MODULE);
 
@@ -670,6 +674,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
 
        init_rwsem(&c->gc_lock);
+       mutex_init(&c->gc_gens_lock);
 
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_init(&c->times[i]);
@@ -686,6 +691,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        mutex_init(&c->usage_scratch_lock);
 
        mutex_init(&c->bio_bounce_pages_lock);
+       mutex_init(&c->snapshot_table_lock);
 
        spin_lock_init(&c->btree_write_error_lock);
 
@@ -704,6 +710,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_LIST_HEAD(&c->ec_stripe_new_list);
        mutex_init(&c->ec_stripe_new_lock);
 
+       INIT_LIST_HEAD(&c->data_progress_list);
+       mutex_init(&c->data_progress_lock);
+
        spin_lock_init(&c->ec_stripes_heap_lock);
 
        seqcount_init(&c->gc_pos_lock);
@@ -716,38 +725,59 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        c->rebalance.enabled            = 1;
        c->promote_whole_extents        = true;
 
-       c->journal.write_time   = &c->times[BCH_TIME_journal_write];
-       c->journal.delay_time   = &c->times[BCH_TIME_journal_delay];
-       c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
-       c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
+       c->journal.flush_write_time     = &c->times[BCH_TIME_journal_flush_write];
+       c->journal.noflush_write_time   = &c->times[BCH_TIME_journal_noflush_write];
+       c->journal.blocked_time         = &c->times[BCH_TIME_blocked_journal];
+       c->journal.flush_seq_time       = &c->times[BCH_TIME_journal_flush_seq];
 
        bch2_fs_btree_cache_init_early(&c->btree_cache);
 
        mutex_init(&c->sectors_available_lock);
 
-       if (percpu_init_rwsem(&c->mark_lock))
+       ret = percpu_init_rwsem(&c->mark_lock);
+       if (ret)
                goto err;
 
        mutex_lock(&c->sb_lock);
+       ret = bch2_sb_to_fs(c, sb);
+       mutex_unlock(&c->sb_lock);
 
-       if (bch2_sb_to_fs(c, sb)) {
-               mutex_unlock(&c->sb_lock);
+       if (ret)
                goto err;
-       }
 
-       mutex_unlock(&c->sb_lock);
+       uuid_unparse_lower(c->sb.user_uuid.b, c->name);
+
+       /* Compat: */
+       if (sb->version <= bcachefs_metadata_version_inode_v2 &&
+           !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
+               SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);
 
-       scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
+       if (sb->version <= bcachefs_metadata_version_inode_v2 &&
+           !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
+               SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);
 
        c->opts = bch2_opts_default;
-       bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb));
+       ret = bch2_opts_from_sb(&c->opts, sb);
+       if (ret)
+               goto err;
+
        bch2_opts_apply(&c->opts, opts);
 
-       c->block_bits           = ilog2(c->opts.block_size);
+       /* key cache currently disabled for inodes, because of snapshots: */
+       c->opts.inodes_use_key_cache = 0;
+
+       c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+       if (c->opts.inodes_use_key_cache)
+               c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+
+       c->block_bits           = ilog2(block_sectors(c));
        c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
 
-       if (bch2_fs_init_fault("fs_alloc"))
+       if (bch2_fs_init_fault("fs_alloc")) {
+               bch_err(c, "fs_alloc fault injected");
+               ret = -EFAULT;
                goto err;
+       }
 
        iter_size = sizeof(struct sort_iter) +
                (btree_blocks(c) + 1) * 2 *
@@ -771,33 +801,45 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
-           !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
+           !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) ||
            !(c->online_reserved = alloc_percpu(u64)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
            mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
            !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
-                                             sizeof(u64), GFP_KERNEL)) ||
-           bch2_io_clock_init(&c->io_clock[READ]) ||
-           bch2_io_clock_init(&c->io_clock[WRITE]) ||
-           bch2_fs_journal_init(&c->journal) ||
-           bch2_fs_replicas_init(c) ||
-           bch2_fs_btree_cache_init(c) ||
-           bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
-           bch2_fs_btree_iter_init(c) ||
-           bch2_fs_btree_interior_update_init(c) ||
-           bch2_fs_io_init(c) ||
-           bch2_fs_encryption_init(c) ||
-           bch2_fs_compress_init(c) ||
-           bch2_fs_ec_init(c) ||
-           bch2_fs_fsio_init(c))
+                                             sizeof(u64), GFP_KERNEL))) {
+               ret = -ENOMEM;
                goto err;
+       }
+
+       ret = bch2_io_clock_init(&c->io_clock[READ]) ?:
+           bch2_io_clock_init(&c->io_clock[WRITE]) ?:
+           bch2_fs_journal_init(&c->journal) ?:
+           bch2_fs_replicas_init(c) ?:
+           bch2_fs_btree_cache_init(c) ?:
+           bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
+           bch2_fs_btree_iter_init(c) ?:
+           bch2_fs_btree_interior_update_init(c) ?:
+           bch2_fs_buckets_waiting_for_journal_init(c) ?:
+           bch2_fs_subvolumes_init(c) ?:
+           bch2_fs_io_init(c) ?:
+           bch2_fs_encryption_init(c) ?:
+           bch2_fs_compress_init(c) ?:
+           bch2_fs_ec_init(c) ?:
+           bch2_fs_fsio_init(c);
+       if (ret)
+               goto err;
+
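The init sequence above chains through the GNU C a ?: b extension: a is evaluated once and, if nonzero, becomes the result; b is evaluated only when a is zero. The chain therefore runs the init functions in order and stops at the first nonzero errno, and every ?: matters: a stray semicolon mid-chain would silently discard the result of everything after it. A small demonstration, compilable with gcc or clang:

    #include <stdio.h>

    static int init_a(void) { puts("init_a"); return 0; }
    static int init_b(void) { puts("init_b"); return -12; /* -ENOMEM */ }
    static int init_c(void) { puts("init_c"); return 0; }  /* never runs */

    int main(void)
    {
            int ret = init_a() ?: init_b() ?: init_c();

            printf("ret = %d\n", ret);  /* prints init_a, init_b, ret = -12 */
            return 0;
    }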
+       if (c->opts.nochanges)
+               set_bit(JOURNAL_NOCHANGES, &c->journal.flags);
 
        mi = bch2_sb_get_members(c->disk_sb.sb);
        for (i = 0; i < c->sb.nr_devices; i++)
                if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
-                   bch2_dev_alloc(c, i))
+                   bch2_dev_alloc(c, i)) {
+                       ret = -EEXIST;
                        goto err;
+               }
 
        bch2_journal_entry_res_resize(&c->journal,
                        &c->btree_root_journal_res,
@@ -808,18 +850,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                        (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
 
        mutex_lock(&bch_fs_list_lock);
-       err = bch2_fs_online(c);
+       ret = bch2_fs_online(c);
        mutex_unlock(&bch_fs_list_lock);
-       if (err) {
-               bch_err(c, "bch2_fs_online() error: %s", err);
+
+       if (ret)
                goto err;
-       }
 out:
-       pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
+       pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
        return c;
 err:
        bch2_fs_free(c);
-       c = NULL;
+       c = ERR_PTR(ret);
        goto out;
 }
 
@@ -842,7 +883,7 @@ static void print_mount_opts(struct bch_fs *c)
                const struct bch_option *opt = &bch2_opt_table[i];
                u64 v = bch2_opt_get_by_id(&c->opts, i);
 
-               if (!(opt->mode & OPT_MOUNT))
+               if (!(opt->flags & OPT_MOUNT))
                        continue;
 
                if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
@@ -859,7 +900,6 @@ static void print_mount_opts(struct bch_fs *c)
 
 int bch2_fs_start(struct bch_fs *c)
 {
-       const char *err = "cannot allocate memory";
        struct bch_sb_field_members *mi;
        struct bch_dev *ca;
        time64_t now = ktime_get_real_seconds();
@@ -895,10 +935,11 @@ int bch2_fs_start(struct bch_fs *c)
        if (ret)
                goto err;
 
-       err = "dynamic fault";
        ret = -EINVAL;
-       if (bch2_fs_init_fault("fs_start"))
+       if (bch2_fs_init_fault("fs_start")) {
+               bch_err(c, "fs_start fault injected");
                goto err;
+       }
 
        set_bit(BCH_FS_STARTED, &c->flags);
 
@@ -919,7 +960,6 @@ int bch2_fs_start(struct bch_fs *c)
        if (c->opts.read_only || c->opts.nochanges) {
                bch2_fs_read_only(c);
        } else {
-               err = "error going read write";
                ret = !test_bit(BCH_FS_RW, &c->flags)
                        ? bch2_fs_read_write(c)
                        : bch2_fs_read_write_late(c);
@@ -937,25 +977,22 @@ err:
        case BCH_FSCK_ERRORS_NOT_FIXED:
                bch_err(c, "filesystem contains errors: please report this to the developers");
                pr_cont("mount with -o fix_errors to repair\n");
-               err = "fsck error";
                break;
        case BCH_FSCK_REPAIR_UNIMPLEMENTED:
                bch_err(c, "filesystem contains errors: please report this to the developers");
                pr_cont("repair unimplemented: inform the developers so that it can be added\n");
-               err = "fsck error";
                break;
        case BCH_FSCK_REPAIR_IMPOSSIBLE:
                bch_err(c, "filesystem contains errors, but repair impossible");
-               err = "fsck error";
                break;
        case BCH_FSCK_UNKNOWN_VERSION:
-               err = "unknown metadata version";;
+               bch_err(c, "unknown metadata version");
                break;
        case -ENOMEM:
-               err = "cannot allocate memory";
+               bch_err(c, "cannot allocate memory");
                break;
        case -EIO:
-               err = "IO error";
+               bch_err(c, "IO error");
                break;
        }
 
@@ -972,7 +1009,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
        if (!sb_mi)
                return "Invalid superblock: member info area missing";
 
-       if (le16_to_cpu(sb->block_size) != c->opts.block_size)
+       if (le16_to_cpu(sb->block_size) != block_sectors(c))
                return "mismatched block size";
 
        if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
@@ -1230,6 +1267,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
                ca->disk_sb.bdev->bd_holder = ca;
        memset(sb, 0, sizeof(*sb));
 
+       ca->dev = ca->disk_sb.bdev->bd_dev;
+
        percpu_ref_reinit(&ca->io_ref);
 
        return 0;
@@ -1375,7 +1414,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
        bch2_copygc_start(c);
 }
 
-static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 {
        lockdep_assert_held(&c->state_lock);
 
@@ -1384,10 +1423,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
        bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       if (bch2_dev_allocator_start(ca))
-               return "error starting allocator thread";
-
-       return NULL;
+       return bch2_dev_allocator_start(ca);
 }
 
 int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1413,9 +1449,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       if (new_state == BCH_MEMBER_STATE_rw &&
-           __bch2_dev_read_write(c, ca))
-               ret = -ENOMEM;
+       if (new_state == BCH_MEMBER_STATE_rw)
+               ret = __bch2_dev_read_write(c, ca);
 
        rebalance_wakeup(c);
 
@@ -1445,20 +1480,23 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
        bch2_trans_init(&trans, c, 0, 0);
 
        for (i = 0; i < ca->mi.nbuckets; i++) {
-               ret = bch2_btree_key_cache_flush(&trans,
-                               BTREE_ID_alloc, POS(ca->dev_idx, i));
+               ret = lockrestart_do(&trans,
+                       bch2_btree_key_cache_flush(&trans,
+                               BTREE_ID_alloc, POS(ca->dev_idx, i)));
                if (ret)
                        break;
        }
        bch2_trans_exit(&trans);
 
-       if (ret)
+       if (ret) {
+               bch_err(c, "error %i removing dev alloc info", ret);
                return ret;
+       }
 
        return bch2_btree_delete_range(c, BTREE_ID_alloc,
                                       POS(ca->dev_idx, 0),
                                       POS(ca->dev_idx + 1, 0),
-                                      NULL);
+                                      0, NULL);
 }
 
 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
@@ -1576,66 +1614,65 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        struct bch_sb_field_members *mi;
        struct bch_member dev_mi;
        unsigned dev_idx, nr_devices, u64s;
+       char *_errbuf;
+       struct printbuf errbuf;
        int ret;
 
-       ret = bch2_read_super(path, &opts, &sb);
-       if (ret)
-               return ret;
+       _errbuf = kmalloc(4096, GFP_KERNEL);
+       if (!_errbuf)
+               return -ENOMEM;
 
-       err = bch2_sb_validate(&sb);
-       if (err)
-               return -EINVAL;
+       errbuf = _PBUF(_errbuf, 4096);
+
+       ret = bch2_read_super(path, &opts, &sb);
+       if (ret) {
+               bch_err(c, "device add error: error reading super: %i", ret);
+               goto err;
+       }
 
        dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
 
        err = bch2_dev_may_add(sb.sb, c);
-       if (err)
-               return -EINVAL;
+       if (err) {
+               bch_err(c, "device add error: %s", err);
+               ret = -EINVAL;
+               goto err;
+       }
 
        ca = __bch2_dev_alloc(c, &dev_mi);
        if (!ca) {
                bch2_free_super(&sb);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto err;
        }
 
        ret = __bch2_dev_attach_bdev(ca, &sb);
        if (ret) {
                bch2_dev_free(ca);
-               return ret;
+               goto err;
        }
 
-       /*
-        * We want to allocate journal on the new device before adding the new
-        * device to the filesystem because allocating after we attach requires
-        * spinning up the allocator thread, and the allocator thread requires
-        * doing btree writes, which if the existing devices are RO isn't going
-        * to work
-        *
-        * So we have to mark where the superblocks are, but marking allocated
-        * data normally updates the filesystem usage too, so we have to mark,
-        * allocate the journal, reset all the marks, then remark after we
-        * attach...
-        */
-       bch2_mark_dev_superblock(NULL, ca, 0);
-
-       err = "journal alloc failed";
        ret = bch2_dev_journal_alloc(ca);
-       if (ret)
+       if (ret) {
+               bch_err(c, "device add error: journal alloc failed");
                goto err;
+       }
 
        down_write(&c->state_lock);
        mutex_lock(&c->sb_lock);
 
-       err = "insufficient space in new superblock";
        ret = bch2_sb_from_fs(c, ca);
-       if (ret)
+       if (ret) {
+               bch_err(c, "device add error: new device superblock too small");
                goto err_unlock;
+       }
 
        mi = bch2_sb_get_members(ca->disk_sb.sb);
 
        if (!bch2_sb_resize_members(&ca->disk_sb,
                                le32_to_cpu(mi->field.u64s) +
                                sizeof(dev_mi) / sizeof(u64))) {
+               bch_err(c, "device add error: new device superblock too small");
                ret = -ENOSPC;
                goto err_unlock;
        }
@@ -1648,7 +1685,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
                if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
                        goto have_slot;
 no_slot:
-       err = "no slots available in superblock";
+       bch_err(c, "device add error: already have maximum number of devices");
        ret = -ENOSPC;
        goto err_unlock;
 
@@ -1657,12 +1694,12 @@ have_slot:
        u64s = (sizeof(struct bch_sb_field_members) +
                sizeof(struct bch_member) * nr_devices) / sizeof(u64);
 
-       err = "no space in superblock for member info";
-       ret = -ENOSPC;
-
        mi = bch2_sb_resize_members(&c->disk_sb, u64s);
-       if (!mi)
+       if (!mi) {
+               bch_err(c, "device add error: no room in superblock for member info");
+               ret = -ENOSPC;
                goto err_unlock;
+       }
 
        /* success: */
 
@@ -1678,15 +1715,20 @@ have_slot:
 
        bch2_dev_usage_journal_reserve(c);
 
-       err = "error marking superblock";
        ret = bch2_trans_mark_dev_sb(c, ca);
-       if (ret)
+       if (ret) {
+               bch_err(c, "device add error: error marking new superblock: %i", ret);
                goto err_late;
+       }
+
+       ca->new_fs_bucket_idx = 0;
 
        if (ca->mi.state == BCH_MEMBER_STATE_rw) {
-               err = __bch2_dev_read_write(c, ca);
-               if (err)
+               ret = __bch2_dev_read_write(c, ca);
+               if (ret) {
+                       bch_err(c, "device add error: error going RW on new device: %i", ret);
                        goto err_late;
+               }
        }
 
        up_write(&c->state_lock);
@@ -1699,12 +1741,12 @@ err:
        if (ca)
                bch2_dev_free(ca);
        bch2_free_super(&sb);
-       bch_err(c, "Unable to add device: %s", err);
+       kfree(_errbuf);
        return ret;
 err_late:
        up_write(&c->state_lock);
-       bch_err(c, "Error going rw after adding device: %s", err);
-       return -EINVAL;
+       ca = NULL;
+       goto err;
 }
 
 /* Hot add existing device to running filesystem: */
@@ -1729,24 +1771,27 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
        dev_idx = sb.sb->dev_idx;
 
        err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
-       if (err)
+       if (err) {
+               bch_err(c, "error bringing %s online: %s", path, err);
                goto err;
+       }
 
-       if (bch2_dev_attach_bdev(c, &sb)) {
-               err = "bch2_dev_attach_bdev() error";
+       ret = bch2_dev_attach_bdev(c, &sb);
+       if (ret)
                goto err;
-       }
 
        ca = bch_dev_locked(c, dev_idx);
 
-       if (bch2_trans_mark_dev_sb(c, ca)) {
-               err = "bch2_trans_mark_dev_sb() error";
+       ret = bch2_trans_mark_dev_sb(c, ca);
+       if (ret) {
+               bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb",
+                       path, ret);
                goto err;
        }
 
        if (ca->mi.state == BCH_MEMBER_STATE_rw) {
-               err = __bch2_dev_read_write(c, ca);
-               if (err)
+               ret = __bch2_dev_read_write(c, ca);
+               if (ret)
                        goto err;
        }
 
@@ -1764,7 +1809,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 err:
        up_write(&c->state_lock);
        bch2_free_super(&sb);
-       bch_err(c, "error bringing %s online: %s", path, err);
        return -EINVAL;
 }
 
@@ -1836,20 +1880,14 @@ err:
 }
 
 /* return with ref on ca->ref: */
-struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
 {
        struct bch_dev *ca;
-       dev_t dev;
        unsigned i;
-       int ret;
-
-       ret = lookup_bdev(path, &dev);
-       if (ret)
-               return ERR_PTR(ret);
 
        rcu_read_lock();
        for_each_member_device_rcu(ca, c, i, NULL)
-               if (ca->disk_sb.bdev->bd_dev == dev)
+               if (!strcmp(name, ca->name))
                        goto found;
        ca = ERR_PTR(-ENOENT);
 found:
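
bch2_dev_lookup() no longer resolves its argument through lookup_bdev(); it compares against the member name cached in ca->name, which also works in contexts where the original path cannot be resolved. A hedged usage sketch, assuming the caller passes the short device name:

    struct bch_dev *ca = bch2_dev_lookup(c, "sdb");  /* member name, not "/dev/sdb" */

    if (IS_ERR(ca))
            return PTR_ERR(ca);
    /* ... use ca ... */
    percpu_ref_put(&ca->ref);    /* lookup returns with a ref held, per the comment above */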
@@ -1868,32 +1906,39 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
        struct bch_sb_field_members *mi;
        unsigned i, best_sb = 0;
        const char *err;
-       int ret = -ENOMEM;
+       char *_errbuf = NULL;
+       struct printbuf errbuf;
+       int ret = 0;
+
+       if (!try_module_get(THIS_MODULE))
+               return ERR_PTR(-ENODEV);
 
        pr_verbose_init(opts, "");
 
        if (!nr_devices) {
-               c = ERR_PTR(-EINVAL);
-               goto out2;
+               ret = -EINVAL;
+               goto err;
        }
 
-       if (!try_module_get(THIS_MODULE)) {
-               c = ERR_PTR(-ENODEV);
-               goto out2;
+       _errbuf = kmalloc(4096, GFP_KERNEL);
+       if (!_errbuf) {
+               ret = -ENOMEM;
+               goto err;
        }
 
+       errbuf = _PBUF(_errbuf, 4096);
+
        sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
-       if (!sb)
+       if (!sb) {
+               ret = -ENOMEM;
                goto err;
+       }
 
        for (i = 0; i < nr_devices; i++) {
                ret = bch2_read_super(devices[i], &opts, &sb[i]);
                if (ret)
                        goto err;
 
-               err = bch2_sb_validate(&sb[i]);
-               if (err)
-                       goto err_print;
        }
 
        for (i = 1; i < nr_devices; i++)
@@ -1921,18 +1966,20 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
                i++;
        }
 
-       ret = -ENOMEM;
        c = bch2_fs_alloc(sb[best_sb].sb, opts);
-       if (!c)
+       if (IS_ERR(c)) {
+               ret = PTR_ERR(c);
                goto err;
+       }
 
-       err = "bch2_dev_online() error";
        down_write(&c->state_lock);
-       for (i = 0; i < nr_devices; i++)
-               if (bch2_dev_attach_bdev(c, &sb[i])) {
+       for (i = 0; i < nr_devices; i++) {
+               ret = bch2_dev_attach_bdev(c, &sb[i]);
+               if (ret) {
                        up_write(&c->state_lock);
-                       goto err_print;
+                       goto err;
                }
+       }
        up_write(&c->state_lock);
 
        err = "insufficient devices";
@@ -1946,8 +1993,8 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
        }
 out:
        kfree(sb);
+       kfree(_errbuf);
        module_put(THIS_MODULE);
-out2:
        pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
        return c;
 err_print:
@@ -1955,89 +2002,15 @@ err_print:
               devices[0], err);
        ret = -EINVAL;
 err:
-       if (c)
+       if (!IS_ERR_OR_NULL(c))
                bch2_fs_stop(c);
-       for (i = 0; i < nr_devices; i++)
-               bch2_free_super(&sb[i]);
+       if (sb)
+               for (i = 0; i < nr_devices; i++)
+                       bch2_free_super(&sb[i]);
        c = ERR_PTR(ret);
        goto out;
 }
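
bch2_fs_alloc() now reports failure as an ERR_PTR() instead of a bare NULL, which is why bch2_fs_open() switches to IS_ERR()/PTR_ERR() and the unwind path guards with IS_ERR_OR_NULL(). The kernel idiom in miniature, as a sketch:

    struct bch_fs *c = ERR_PTR(-ENOMEM);    /* errno packed into a pointer value */

    if (IS_ERR(c))
            ret = PTR_ERR(c);               /* recovers -ENOMEM exactly */
    if (!IS_ERR_OR_NULL(c))
            bch2_fs_stop(c);                /* teardown only for a real object */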
 
-static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
-                                             struct bch_opts opts)
-{
-       const char *err;
-       struct bch_fs *c;
-       bool allocated_fs = false;
-       int ret;
-
-       err = bch2_sb_validate(sb);
-       if (err)
-               return err;
-
-       mutex_lock(&bch_fs_list_lock);
-       c = __bch2_uuid_to_fs(sb->sb->uuid);
-       if (c) {
-               closure_get(&c->cl);
-
-               err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
-               if (err)
-                       goto err;
-       } else {
-               c = bch2_fs_alloc(sb->sb, opts);
-               err = "cannot allocate memory";
-               if (!c)
-                       goto err;
-
-               allocated_fs = true;
-       }
-
-       err = "bch2_dev_online() error";
-
-       mutex_lock(&c->sb_lock);
-       if (bch2_dev_attach_bdev(c, sb)) {
-               mutex_unlock(&c->sb_lock);
-               goto err;
-       }
-       mutex_unlock(&c->sb_lock);
-
-       if (!c->opts.nostart && bch2_fs_may_start(c)) {
-               err = "error starting filesystem";
-               ret = bch2_fs_start(c);
-               if (ret)
-                       goto err;
-       }
-
-       closure_put(&c->cl);
-       mutex_unlock(&bch_fs_list_lock);
-
-       return NULL;
-err:
-       mutex_unlock(&bch_fs_list_lock);
-
-       if (allocated_fs)
-               bch2_fs_stop(c);
-       else if (c)
-               closure_put(&c->cl);
-
-       return err;
-}
-
-const char *bch2_fs_open_incremental(const char *path)
-{
-       struct bch_sb_handle sb;
-       struct bch_opts opts = bch2_opts_empty();
-       const char *err;
-
-       if (bch2_read_super(path, &opts, &sb))
-               return "error reading superblock";
-
-       err = __bch2_fs_open_incremental(&sb, opts);
-       bch2_free_super(&sb);
-
-       return err;
-}
-
 /* Global interfaces/init */
 
 static void bcachefs_exit(void)
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 739e8fd181763c3298d20c52df5c0cbf21d5868e..3f24ca5a853d2144b25babf1b66352418a566231 100644 (file)
@@ -194,6 +194,27 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
        return devs;
 }
 
+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
+{
+       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+       u64 b_offset    = bucket_to_sector(ca, b);
+       u64 b_end       = bucket_to_sector(ca, b + 1);
+       unsigned i;
+
+       if (!b)
+               return true;
+
+       for (i = 0; i < layout->nr_superblocks; i++) {
+               u64 offset = le64_to_cpu(layout->sb_offset[i]);
+               u64 end = offset + (1 << layout->sb_max_size_bits);
+
+               if (!(offset >= b_end || end <= b_offset))
+                       return true;
+       }
+
+       return false;
+}
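
is_superblock_bucket() treats bucket b as the half-open sector range [b_offset, b_end) and reports a hit when any superblock copy intersects it (bucket 0 is always reserved). The test is the standard overlap predicate for half-open intervals; standalone:

    #include <stdbool.h>
    #include <stdint.h>

    /* do [a0, a1) and [b0, b1) intersect? */
    static bool ranges_overlap(uint64_t a0, uint64_t a1,
                               uint64_t b0, uint64_t b1)
    {
            return !(b0 >= a1 || b1 <= a0);
    }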
+
 struct bch_fs *bch2_dev_to_fs(dev_t);
 struct bch_fs *bch2_uuid_to_fs(uuid_le);
 
@@ -233,6 +254,5 @@ void bch2_fs_stop(struct bch_fs *);
 
 int bch2_fs_start(struct bch_fs *);
 struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
-const char *bch2_fs_open_incremental(const char *path);
 
 #endif /* _BCACHEFS_SUPER_H */
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 96023f37afea7c400c2eac983eded6e45187c085..d8b159a5b7f78ccacdd87236ffc32224bdc1baee 100644 (file)
@@ -29,7 +29,6 @@ struct bch_member_cpu {
        u16                     bucket_size;    /* sectors */
        u16                     group;
        u8                      state;
-       u8                      replacement;
        u8                      discard;
        u8                      data_allowed;
        u8                      durability;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 9b1ffbf96e14784a63527ff6128ee925564eee8a..b727845dd64b73d4ea51fe8842c9f03ca899ad23 100644 (file)
@@ -10,6 +10,7 @@
 
 #include "bcachefs.h"
 #include "alloc_background.h"
+#include "alloc_foreground.h"
 #include "sysfs.h"
 #include "btree_cache.h"
 #include "btree_io.h"
@@ -131,7 +132,6 @@ do {                                                                        \
                return strtoi_h(buf, &var) ?: (ssize_t) size;           \
 } while (0)
 
-write_attribute(trigger_journal_flush);
 write_attribute(trigger_gc);
 write_attribute(prune_cache);
 rw_attribute(btree_gc_periodic);
@@ -140,8 +140,6 @@ rw_attribute(gc_gens_pos);
 read_attribute(uuid);
 read_attribute(minor);
 read_attribute(bucket_size);
-read_attribute(block_size);
-read_attribute(btree_node_size);
 read_attribute(first_bucket);
 read_attribute(nbuckets);
 read_attribute(durability);
@@ -155,11 +153,6 @@ read_attribute(congested);
 
 read_attribute(btree_avg_write_size);
 
-read_attribute(bucket_quantiles_last_read);
-read_attribute(bucket_quantiles_last_write);
-read_attribute(bucket_quantiles_fragmentation);
-read_attribute(bucket_quantiles_oldest_gen);
-
 read_attribute(reserve_stats);
 read_attribute(btree_cache_size);
 read_attribute(compression_stats);
@@ -183,11 +176,7 @@ read_attribute(read_realloc_races);
 read_attribute(extent_migrate_done);
 read_attribute(extent_migrate_raced);
 
-rw_attribute(journal_write_delay_ms);
-rw_attribute(journal_reclaim_delay_ms);
-
 rw_attribute(discard);
-rw_attribute(cache_replacement_policy);
 rw_attribute(label);
 
 rw_attribute(copy_gc_enabled);
@@ -203,6 +192,8 @@ read_attribute(new_stripes);
 read_attribute(io_timers_read);
 read_attribute(io_timers_write);
 
+read_attribute(data_jobs);
+
 #ifdef CONFIG_BCACHEFS_TESTS
 write_attribute(perf_test);
 #endif /* CONFIG_BCACHEFS_TESTS */
@@ -239,28 +230,36 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c)
        return nr ? div64_u64(sectors, nr) : 0;
 }
 
-static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
+static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
 {
-       struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
+       long ret = 0;
+       struct bch_move_stats *stats;
 
-       if (!fs_usage)
-               return -ENOMEM;
-
-       bch2_fs_usage_to_text(out, c, fs_usage);
-
-       percpu_up_read(&c->mark_lock);
+       mutex_lock(&c->data_progress_lock);
+       list_for_each_entry(stats, &c->data_progress_list, list) {
+               pr_buf(out, "%s: data type %s btree_id %s position: ",
+                      stats->name,
+                      bch2_data_types[stats->data_type],
+                      bch2_btree_ids[stats->btree_id]);
+               bch2_bpos_to_text(out, stats->pos);
+               pr_buf(out, "%s", "\n");
+       }
 
-       kfree(fs_usage);
-       return 0;
+       mutex_unlock(&c->data_progress_lock);
+       return ret;
 }
 
 static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
-       u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
+       enum btree_id id;
+       u64 nr_uncompressed_extents = 0,
            nr_compressed_extents = 0,
+           nr_incompressible_extents = 0,
+           uncompressed_sectors = 0,
+           incompressible_sectors = 0,
            compressed_sectors_compressed = 0,
            compressed_sectors_uncompressed = 0;
        int ret;
@@ -270,46 +269,72 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret)
-               if (k.k->type == KEY_TYPE_extent) {
-                       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               if (!((1U << id) & BTREE_ID_HAS_PTRS))
+                       continue;
+
+               for_each_btree_key(&trans, iter, id, POS_MIN,
+                                  BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+                       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
                        const union bch_extent_entry *entry;
                        struct extent_ptr_decoded p;
-
-                       extent_for_each_ptr_decode(e, p, entry) {
-                               if (!crc_is_compressed(p.crc)) {
-                                       nr_uncompressed_extents++;
-                                       uncompressed_sectors += e.k->size;
-                               } else {
-                                       nr_compressed_extents++;
+                       bool compressed = false, uncompressed = false, incompressible = false;
+
+                       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+                               switch (p.crc.compression_type) {
+                               case BCH_COMPRESSION_TYPE_none:
+                                       uncompressed = true;
+                                       uncompressed_sectors += k.k->size;
+                                       break;
+                               case BCH_COMPRESSION_TYPE_incompressible:
+                                       incompressible = true;
+                                       incompressible_sectors += k.k->size;
+                                       break;
+                               default:
                                        compressed_sectors_compressed +=
                                                p.crc.compressed_size;
                                        compressed_sectors_uncompressed +=
                                                p.crc.uncompressed_size;
+                                       compressed = true;
+                                       break;
                                }
-
-                               /* only looking at the first ptr */
-                               break;
                        }
+
+                       if (incompressible)
+                               nr_incompressible_extents++;
+                       else if (uncompressed)
+                               nr_uncompressed_extents++;
+                       else if (compressed)
+                               nr_compressed_extents++;
                }
+               bch2_trans_iter_exit(&trans, &iter);
+       }
+
+       bch2_trans_exit(&trans);
 
-       ret = bch2_trans_exit(&trans) ?: ret;
        if (ret)
                return ret;
 
-       pr_buf(out,
-              "uncompressed data:\n"
-              "        nr extents:                     %llu\n"
-              "        size (bytes):                   %llu\n"
-              "compressed data:\n"
-              "        nr extents:                     %llu\n"
-              "        compressed size (bytes):        %llu\n"
-              "        uncompressed size (bytes):      %llu\n",
-              nr_uncompressed_extents,
-              uncompressed_sectors << 9,
-              nr_compressed_extents,
-              compressed_sectors_compressed << 9,
-              compressed_sectors_uncompressed << 9);
+       pr_buf(out, "uncompressed:\n");
+       pr_buf(out, "   nr extents:             %llu\n", nr_uncompressed_extents);
+       pr_buf(out, "   size:                   ");
+       bch2_hprint(out, uncompressed_sectors << 9);
+       pr_buf(out, "\n");
+
+       pr_buf(out, "compressed:\n");
+       pr_buf(out, "   nr extents:             %llu\n", nr_compressed_extents);
+       pr_buf(out, "   compressed size:        ");
+       bch2_hprint(out, compressed_sectors_compressed << 9);
+       pr_buf(out, "\n");
+       pr_buf(out, "   uncompressed size:      ");
+       bch2_hprint(out, compressed_sectors_uncompressed << 9);
+       pr_buf(out, "\n");
+
+       pr_buf(out, "incompressible:\n");
+       pr_buf(out, "   nr extents:             %llu\n", nr_incompressible_extents);
+       pr_buf(out, "   size:                   ");
+       bch2_hprint(out, incompressible_sectors << 9);
+       pr_buf(out, "\n");
        return 0;
 }
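
The rewritten walk visits every btree whose keys carry pointers (the BTREE_ID_HAS_PTRS mask) and classifies each extent once even when it has several replicas: per-extent booleans are folded with incompressible taking precedence over uncompressed, and uncompressed over compressed. That precedence, isolated as a sketch:

    enum extent_class { EXT_COMPRESSED, EXT_UNCOMPRESSED, EXT_INCOMPRESSIBLE };

    static enum extent_class classify(bool compressed, bool uncompressed,
                                      bool incompressible)
    {
            if (incompressible)
                    return EXT_INCOMPRESSIBLE;
            if (uncompressed)
                    return EXT_UNCOMPRESSED;
            return EXT_COMPRESSED;  /* at least one pointer was decoded */
    }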
 
@@ -328,11 +353,6 @@ SHOW(bch2_fs)
        sysfs_print(minor,                      c->minor);
        sysfs_printf(internal_uuid, "%pU",      c->sb.uuid.b);
 
-       sysfs_print(journal_write_delay_ms,     c->journal.write_delay_ms);
-       sysfs_print(journal_reclaim_delay_ms,   c->journal.reclaim_delay_ms);
-
-       sysfs_print(block_size,                 block_bytes(c));
-       sysfs_print(btree_node_size,            btree_bytes(c));
        sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
        sysfs_hprint(btree_avg_write_size,      bch2_btree_avg_write_size(c));
 
@@ -367,9 +387,6 @@ SHOW(bch2_fs)
 
        /* Debugging: */
 
-       if (attr == &sysfs_alloc_debug)
-               return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf;
-
        if (attr == &sysfs_journal_debug) {
                bch2_journal_debug_to_text(&out, &c->journal);
                return out.pos - buf;
@@ -434,6 +451,11 @@ SHOW(bch2_fs)
                return out.pos - buf;
        }
 
+       if (attr == &sysfs_data_jobs) {
+               data_progress_to_text(&out, c);
+               return out.pos - buf;
+       }
+
        return 0;
 }
 
@@ -441,9 +463,6 @@ STORE(bch2_fs)
 {
        struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
 
-       sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
-       sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
-
        if (attr == &sysfs_btree_gc_periodic) {
                ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
                        ?: (ssize_t) size;
@@ -480,8 +499,16 @@ STORE(bch2_fs)
 
        /* Debugging: */
 
-       if (attr == &sysfs_trigger_journal_flush)
-               bch2_journal_meta(&c->journal);
+       if (!test_bit(BCH_FS_RW, &c->flags))
+               return -EROFS;
+
+       if (attr == &sysfs_prune_cache) {
+               struct shrink_control sc;
+
+               sc.gfp_mask = GFP_KERNEL;
+               sc.nr_to_scan = strtoul_or_return(buf);
+               c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+       }
 
        if (attr == &sysfs_trigger_gc) {
                /*
@@ -496,14 +523,6 @@ STORE(bch2_fs)
 #endif
        }
 
-       if (attr == &sysfs_prune_cache) {
-               struct shrink_control sc;
-
-               sc.gfp_mask = GFP_KERNEL;
-               sc.nr_to_scan = strtoul_or_return(buf);
-               c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
-       }
-
 #ifdef CONFIG_BCACHEFS_TESTS
        if (attr == &sysfs_perf_test) {
                char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -530,14 +549,9 @@ SYSFS_OPS(bch2_fs);
 
 struct attribute *bch2_fs_files[] = {
        &sysfs_minor,
-       &sysfs_block_size,
-       &sysfs_btree_node_size,
        &sysfs_btree_cache_size,
        &sysfs_btree_avg_write_size,
 
-       &sysfs_journal_write_delay_ms,
-       &sysfs_journal_reclaim_delay_ms,
-
        &sysfs_promote_whole_extents,
 
        &sysfs_compression_stats,
@@ -564,7 +578,6 @@ STORE(bch2_fs_internal)
 SYSFS_OPS(bch2_fs_internal);
 
 struct attribute *bch2_fs_internal_files[] = {
-       &sysfs_alloc_debug,
        &sysfs_journal_debug,
        &sysfs_journal_pins,
        &sysfs_btree_updates,
@@ -572,17 +585,20 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_btree_cache,
        &sysfs_btree_key_cache,
        &sysfs_btree_transactions,
+       &sysfs_new_stripes,
        &sysfs_stripes_heap,
        &sysfs_open_buckets,
+       &sysfs_io_timers_read,
+       &sysfs_io_timers_write,
+
+       &sysfs_trigger_gc,
+       &sysfs_prune_cache,
 
        &sysfs_read_realloc_races,
        &sysfs_extent_migrate_done,
        &sysfs_extent_migrate_raced,
 
-       &sysfs_trigger_journal_flush,
-       &sysfs_trigger_gc,
        &sysfs_gc_gens_pos,
-       &sysfs_prune_cache,
 
        &sysfs_copy_gc_enabled,
        &sysfs_copy_gc_wait,
@@ -591,10 +607,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_rebalance_work,
        sysfs_pd_controller_files(rebalance),
 
-       &sysfs_new_stripes,
-
-       &sysfs_io_timers_read,
-       &sysfs_io_timers_write,
+       &sysfs_data_jobs,
 
        &sysfs_internal_uuid,
        NULL
@@ -628,7 +641,7 @@ STORE(bch2_fs_opts_dir)
        if (!tmp)
                return -ENOMEM;
 
-       ret = bch2_opt_parse(c, opt, strim(tmp), &v);
+       ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v);
        kfree(tmp);
 
        if (ret < 0)
@@ -638,13 +651,7 @@ STORE(bch2_fs_opts_dir)
        if (ret < 0)
                return ret;
 
-       if (opt->set_sb != SET_NO_SB_OPT) {
-               mutex_lock(&c->sb_lock);
-               opt->set_sb(c->disk_sb.sb, v);
-               bch2_write_super(c);
-               mutex_unlock(&c->sb_lock);
-       }
-
+       bch2_opt_set_sb(c, opt, v);
        bch2_opt_set_by_id(&c->opts, id, v);
 
        if ((id == Opt_background_target ||
@@ -667,7 +674,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj)
        for (i = bch2_opt_table;
             i < bch2_opt_table + bch2_opts_nr;
             i++) {
-               if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME)))
+               if (!(i->flags & OPT_FS))
                        continue;
 
                ret = sysfs_create_file(kobj, &i->attr);
@@ -710,76 +717,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
        NULL
 };
 
-typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
-                                size_t, void *);
-
-static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
-                                 size_t b, void *private)
-{
-       int rw = (private ? 1 : 0);
-
-       return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
-}
-
-static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
-                                      size_t b, void *private)
-{
-       struct bucket *g = bucket(ca, b);
-       return bucket_sectors_used(g->mark);
-}
-
-static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
-                                    size_t b, void *private)
-{
-       return bucket_gc_gen(bucket(ca, b));
-}
-
-static int unsigned_cmp(const void *_l, const void *_r)
-{
-       const unsigned *l = _l;
-       const unsigned *r = _r;
-
-       return cmp_int(*l, *r);
-}
-
-static int quantiles_to_text(struct printbuf *out,
-                            struct bch_fs *c, struct bch_dev *ca,
-                            bucket_map_fn *fn, void *private)
-{
-       size_t i, n;
-       /* Compute 31 quantiles */
-       unsigned q[31], *p;
-
-       down_read(&ca->bucket_lock);
-       n = ca->mi.nbuckets;
-
-       p = vzalloc(n * sizeof(unsigned));
-       if (!p) {
-               up_read(&ca->bucket_lock);
-               return -ENOMEM;
-       }
-
-       for (i = ca->mi.first_bucket; i < n; i++)
-               p[i] = fn(c, ca, i, private);
-
-       sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
-       up_read(&ca->bucket_lock);
-
-       while (n &&
-              !p[n - 1])
-               --n;
-
-       for (i = 0; i < ARRAY_SIZE(q); i++)
-               q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
-
-       vfree(p);
-
-       for (i = 0; i < ARRAY_SIZE(q); i++)
-               pr_buf(out, "%u ", q[i]);
-       pr_buf(out, "\n");
-       return 0;
-}
-
 static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
 {
        enum alloc_reserve i;
@@ -807,7 +744,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
        memset(nr, 0, sizeof(nr));
 
        for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
-               nr[c->open_buckets[i].type]++;
+               nr[c->open_buckets[i].data_type]++;
 
        pr_buf(out,
               "\t\t buckets\t sectors      fragmented\n"
@@ -880,7 +817,6 @@ SHOW(bch2_dev)
        sysfs_printf(uuid,              "%pU\n", ca->uuid.b);
 
        sysfs_print(bucket_size,        bucket_bytes(ca));
-       sysfs_print(block_size,         block_bytes(c));
        sysfs_print(first_bucket,       ca->mi.first_bucket);
        sysfs_print(nbuckets,           ca->mi.nbuckets);
        sysfs_print(durability,         ca->mi.durability);
@@ -905,14 +841,6 @@ SHOW(bch2_dev)
                return out.pos - buf;
        }
 
-       if (attr == &sysfs_cache_replacement_policy) {
-               bch2_string_opt_to_text(&out,
-                                       bch2_cache_replacement_policies,
-                                       ca->mi.replacement);
-               pr_buf(&out, "\n");
-               return out.pos - buf;
-       }
-
        if (attr == &sysfs_state_rw) {
                bch2_string_opt_to_text(&out, bch2_member_states,
                                        ca->mi.state);
@@ -941,15 +869,6 @@ SHOW(bch2_dev)
                     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
                     * 100 / CONGESTED_MAX);
 
-       if (attr == &sysfs_bucket_quantiles_last_read)
-               return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf;
-       if (attr == &sysfs_bucket_quantiles_last_write)
-               return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf;
-       if (attr == &sysfs_bucket_quantiles_fragmentation)
-               return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL)  ?: out.pos - buf;
-       if (attr == &sysfs_bucket_quantiles_oldest_gen)
-               return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL)    ?: out.pos - buf;
-
        if (attr == &sysfs_reserve_stats) {
                reserve_stats_to_text(&out, ca);
                return out.pos - buf;
@@ -981,22 +900,6 @@ STORE(bch2_dev)
                mutex_unlock(&c->sb_lock);
        }
 
-       if (attr == &sysfs_cache_replacement_policy) {
-               ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
-
-               if (v < 0)
-                       return v;
-
-               mutex_lock(&c->sb_lock);
-               mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
-
-               if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
-                       SET_BCH_MEMBER_REPLACEMENT(mi, v);
-                       bch2_write_super(c);
-               }
-               mutex_unlock(&c->sb_lock);
-       }
-
        if (attr == &sysfs_label) {
                char *tmp;
                int ret;
@@ -1021,14 +924,12 @@ SYSFS_OPS(bch2_dev);
 struct attribute *bch2_dev_files[] = {
        &sysfs_uuid,
        &sysfs_bucket_size,
-       &sysfs_block_size,
        &sysfs_first_bucket,
        &sysfs_nbuckets,
        &sysfs_durability,
 
        /* settings: */
        &sysfs_discard,
-       &sysfs_cache_replacement_policy,
        &sysfs_state_rw,
        &sysfs_label,
 
@@ -1041,12 +942,6 @@ struct attribute *bch2_dev_files[] = {
        &sysfs_io_latency_stats_write,
        &sysfs_congested,
 
-       /* alloc info - other stats: */
-       &sysfs_bucket_quantiles_last_read,
-       &sysfs_bucket_quantiles_last_write,
-       &sysfs_bucket_quantiles_fragmentation,
-       &sysfs_bucket_quantiles_oldest_gen,
-
        &sysfs_reserve_stats,
 
        /* debug: */
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
index 4d8d50fd76428520595fdab1a38dc0886b3a7af7..de84ce83497598a867cdaa9cb737ef5743f8ca59 100644 (file)
@@ -4,6 +4,7 @@
 #include "bcachefs.h"
 #include "btree_update.h"
 #include "journal_reclaim.h"
+#include "subvolume.h"
 #include "tests.h"
 
 #include "linux/kthread.h"
@@ -14,12 +15,14 @@ static void delete_test_keys(struct bch_fs *c)
        int ret;
 
        ret = bch2_btree_delete_range(c, BTREE_ID_extents,
-                                     POS(0, 0), POS(0, U64_MAX),
+                                     POS_MIN, SPOS_MAX,
+                                     BTREE_ITER_ALL_SNAPSHOTS,
                                      NULL);
        BUG_ON(ret);
 
        ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
-                                     POS(0, 0), POS(0, U64_MAX),
+                                     POS_MIN, SPOS_MAX,
+                                     BTREE_ITER_ALL_SNAPSHOTS,
                                      NULL);
        BUG_ON(ret);
 }
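
With snapshots in the key space, wiping a whole btree has to name a snapshot range too: POS_MIN..SPOS_MAX with BTREE_ITER_ALL_SNAPSHOTS spans every snapshot ID, where the old POS(0, 0)..POS(0, U64_MAX) only covered inode 0. A sketch of the position constructors as this file uses them (field order: inode, offset, snapshot):

    struct bpos lo  = POS_MIN;              /* inode 0, offset 0, snapshot 0 */
    struct bpos hi  = SPOS_MAX;             /* maximum in all three fields */
    struct bpos pos = SPOS(0, 0, U32_MAX);  /* offset 0 in the "main" snapshot */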
@@ -29,7 +32,7 @@ static void delete_test_keys(struct bch_fs *c)
 static int test_delete(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_i_cookie k;
        int ret;
 
@@ -37,13 +40,12 @@ static int test_delete(struct bch_fs *c, u64 nr)
        k.k.p.snapshot = U32_MAX;
 
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
-                                  BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+                            BTREE_ITER_INTENT);
 
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(&trans, iter, &k.k_i, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(&trans, &iter, &k.k_i, 0));
        if (ret) {
                bch_err(c, "update error in test_delete: %i", ret);
                goto err;
@@ -51,8 +53,8 @@ static int test_delete(struct bch_fs *c, u64 nr)
 
        pr_info("deleting once");
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_btree_delete_at(&trans, iter, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
                bch_err(c, "delete error (first) in test_delete: %i", ret);
                goto err;
@@ -60,14 +62,14 @@ static int test_delete(struct bch_fs *c, u64 nr)
 
        pr_info("deleting twice");
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_btree_delete_at(&trans, iter, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
                bch_err(c, "delete error (second) in test_delete: %i", ret);
                goto err;
        }
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
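
The change threaded through all of these tests is the iterator lifecycle: iterators are now caller-owned structs initialized in place and torn down explicitly, rather than pointers handed out by bch2_trans_get_iter() and released with bch2_trans_iter_put(). The new shape in one place:

    struct btree_trans trans;
    struct btree_iter iter;

    bch2_trans_init(&trans, c, 0, 0);
    bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, pos,
                         BTREE_ITER_INTENT);

    /* ... bch2_btree_iter_peek(&iter), bch2_trans_update(&trans, &iter, ...) ... */

    bch2_trans_iter_exit(&trans, &iter);    /* replaces bch2_trans_iter_put() */
    bch2_trans_exit(&trans);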
@@ -75,7 +77,7 @@ err:
 static int test_delete_written(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_i_cookie k;
        int ret;
 
@@ -84,12 +86,12 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
-                                  BTREE_ITER_INTENT);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p,
+                            BTREE_ITER_INTENT);
 
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_trans_update(&trans, iter, &k.k_i, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_trans_update(&trans, &iter, &k.k_i, 0));
        if (ret) {
                bch_err(c, "update error in test_delete_written: %i", ret);
                goto err;
@@ -99,14 +101,14 @@ static int test_delete_written(struct bch_fs *c, u64 nr)
        bch2_journal_flush_all_pins(&c->journal);
 
        ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-               bch2_btree_iter_traverse(iter) ?:
-               bch2_btree_delete_at(&trans, iter, 0));
+               bch2_btree_iter_traverse(&iter) ?:
+               bch2_btree_delete_at(&trans, &iter, 0));
        if (ret) {
                bch_err(c, "delete error in test_delete_written: %i", ret);
                goto err;
        }
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -114,7 +116,7 @@ err:
 static int test_iterate(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -145,7 +147,7 @@ static int test_iterate(struct bch_fs *c, u64 nr)
        i = 0;
 
        for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
-                          POS_MIN, 0, k, ret) {
+                          SPOS(0, 0, U32_MAX), 0, k, ret) {
                if (k.k->p.inode)
                        break;
 
@@ -156,12 +158,12 @@ static int test_iterate(struct bch_fs *c, u64 nr)
 
        pr_info("iterating backwards");
 
-       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k))
+       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k))
                BUG_ON(k.k->p.offset != --i);
 
        BUG_ON(i);
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -169,7 +171,7 @@ err:
 static int test_iterate_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter = NULL;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -201,7 +203,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
        i = 0;
 
        for_each_btree_key(&trans, iter, BTREE_ID_extents,
-                          POS_MIN, 0, k, ret) {
+                          SPOS(0, 0, U32_MAX), 0, k, ret) {
                BUG_ON(bkey_start_offset(k.k) != i);
                i = k.k->p.offset;
        }
@@ -210,14 +212,14 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
 
        pr_info("iterating backwards");
 
-       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) {
+       while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) {
                BUG_ON(k.k->p.offset != i);
                i = bkey_start_offset(k.k);
        }
 
        BUG_ON(i);
 err:
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
@@ -225,7 +227,7 @@ err:
 static int test_iterate_slots(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -255,15 +257,15 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
-                          0, k, ret) {
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+                          SPOS(0, 0, U32_MAX), 0, k, ret) {
                if (k.k->p.inode)
                        break;
 
                BUG_ON(k.k->p.offset != i);
                i += 2;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(i != nr * 2);
 
@@ -271,7 +273,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+                          SPOS(0, 0, U32_MAX),
                           BTREE_ITER_SLOTS, k, ret) {
                BUG_ON(k.k->p.offset != i);
                BUG_ON(bkey_deleted(k.k) != (i & 1));
@@ -280,7 +283,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
                if (i == nr * 2)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 err:
        bch2_trans_exit(&trans);
        return ret;
@@ -289,7 +292,7 @@ err:
 static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter = { NULL };
        struct bkey_s_c k;
        u64 i;
        int ret = 0;
@@ -320,13 +323,13 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
-                          0, k, ret) {
+       for_each_btree_key(&trans, iter, BTREE_ID_extents,
+                          SPOS(0, 0, U32_MAX), 0, k, ret) {
                BUG_ON(bkey_start_offset(k.k) != i + 8);
                BUG_ON(k.k->size != 8);
                i += 16;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        BUG_ON(i != nr);
 
@@ -334,7 +337,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
 
        i = 0;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_extents,
+                          SPOS(0, 0, U32_MAX),
                           BTREE_ITER_SLOTS, k, ret) {
                BUG_ON(bkey_deleted(k.k) != !(i % 16));
 
@@ -345,7 +349,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
                if (i == nr)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 err:
        bch2_trans_exit(&trans);
        return 0;
@@ -358,21 +362,20 @@ err:
 static int test_peek_end(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
 
        bch2_trans_init(&trans, c, 0, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+                            SPOS(0, 0, U32_MAX), 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
-
-       k = bch2_btree_iter_peek(iter);
+       k = bch2_btree_iter_peek(&iter);
        BUG_ON(k.k);
 
-       k = bch2_btree_iter_peek(iter);
+       k = bch2_btree_iter_peek(&iter);
        BUG_ON(k.k);
 
-       bch2_trans_iter_put(&trans, iter);
-
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return 0;
 }
@@ -380,21 +383,20 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
 static int test_peek_end_extents(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
 
        bch2_trans_init(&trans, c, 0, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                            SPOS(0, 0, U32_MAX), 0);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0);
-
-       k = bch2_btree_iter_peek(iter);
+       k = bch2_btree_iter_peek(&iter);
        BUG_ON(k.k);
 
-       k = bch2_btree_iter_peek(iter);
+       k = bch2_btree_iter_peek(&iter);
        BUG_ON(k.k);
 
-       bch2_trans_iter_put(&trans, iter);
-
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return 0;
 }
@@ -409,8 +411,6 @@ static int insert_test_extent(struct bch_fs *c,
        struct bkey_i_cookie k;
        int ret;
 
-       //pr_info("inserting %llu-%llu v %llu", start, end, test_version);
-
        bkey_cookie_init(&k.k_i);
        k.k_i.k.p.offset = end;
        k.k_i.k.p.snapshot = U32_MAX;
@@ -462,6 +462,70 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
                __test_extent_overwrite(c, 32, 64, 32, 128);
 }
 
+/* snapshot unit tests */
+
+/* Test skipping over keys in unrelated snapshots: */
+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_cookie cookie;
+       int ret;
+
+       bkey_cookie_init(&cookie.k_i);
+       cookie.k.p.snapshot = snapid_hi;
+       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+                               NULL, NULL, 0);
+       if (ret)
+               return ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+                            SPOS(0, 0, snapid_lo), 0);
+       k = bch2_btree_iter_peek(&iter);
+
+       BUG_ON(k.k->p.snapshot != U32_MAX);
+
+       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+static int test_snapshots(struct bch_fs *c, u64 nr)
+{
+       struct bkey_i_cookie cookie;
+       u32 snapids[2];
+       u32 snapid_subvols[2] = { 1, 1 };
+       int ret;
+
+       bkey_cookie_init(&cookie.k_i);
+       cookie.k.p.snapshot = U32_MAX;
+       ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i,
+                               NULL, NULL, 0);
+       if (ret)
+               return ret;
+
+       ret = bch2_trans_do(c, NULL, NULL, 0,
+                     bch2_snapshot_node_create(&trans, U32_MAX,
+                                               snapids,
+                                               snapid_subvols,
+                                               2));
+       if (ret)
+               return ret;
+
+       if (snapids[0] > snapids[1])
+               swap(snapids[0], snapids[1]);
+
+       ret = test_snapshot_filter(c, snapids[0], snapids[1]);
+       if (ret) {
+               bch_err(c, "err %i from test_snapshot_filter", ret);
+               return ret;
+       }
+
+       return 0;
+}
+
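The snapshot test encodes the visibility rule: iterating from SPOS(0, 0, snapids[0]) must skip the cookie written in the sibling snapshot snapids[1] and land on the one written in the common ancestor U32_MAX. The tree bch2_snapshot_node_create() builds for it:

    /*
     *            U32_MAX           key written here: visible to both children
     *           /       \
     *    snapids[0]   snapids[1]   key written in [1]: invisible from [0]
     */
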
 /* perf tests */
 
 static u64 test_rand(void)
@@ -540,18 +604,19 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr)
 static int rand_lookup(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
        u64 i;
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+                            SPOS(0, 0, U32_MAX), 0);
 
        for (i = 0; i < nr; i++) {
-               bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
+               bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX));
 
-               k = bch2_btree_iter_peek(iter);
+               k = bch2_btree_iter_peek(&iter);
                ret = bkey_err(k);
                if (ret) {
                        bch_err(c, "error in rand_lookup: %i", ret);
@@ -559,63 +624,73 @@ static int rand_lookup(struct bch_fs *c, u64 nr)
                }
        }
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
 
+static int rand_mixed_trans(struct btree_trans *trans,
+                           struct btree_iter *iter,
+                           struct bkey_i_cookie *cookie,
+                           u64 i, u64 pos)
+{
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX));
+
+       k = bch2_btree_iter_peek(iter);
+       ret = bkey_err(k);
+       if (ret && ret != -EINTR)
+               bch_err(trans->c, "lookup error in rand_mixed: %i", ret);
+       if (ret)
+               return ret;
+
+       if (!(i & 3) && k.k) {
+               bkey_cookie_init(&cookie->k_i);
+               cookie->k.p = iter->pos;
+               ret = bch2_trans_update(trans, iter, &cookie->k_i, 0);
+       }
+
+       return ret;
+}
+
 static int rand_mixed(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bkey_s_c k;
+       struct btree_iter iter;
+       struct bkey_i_cookie cookie;
        int ret = 0;
-       u64 i;
+       u64 i, rand;
 
        bch2_trans_init(&trans, c, 0, 0);
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs,
+                            SPOS(0, 0, U32_MAX), 0);
 
        for (i = 0; i < nr; i++) {
-               bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
-
-               k = bch2_btree_iter_peek(iter);
-               ret = bkey_err(k);
+               rand = test_rand();
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       rand_mixed_trans(&trans, &iter, &cookie, i, rand));
                if (ret) {
-                       bch_err(c, "lookup error in rand_mixed: %i", ret);
+                       bch_err(c, "update error in rand_mixed: %i", ret);
                        break;
                }
-
-               if (!(i & 3) && k.k) {
-                       struct bkey_i_cookie k;
-
-                       bkey_cookie_init(&k.k_i);
-                       k.k.p = iter->pos;
-
-                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                               bch2_btree_iter_traverse(iter) ?:
-                               bch2_trans_update(&trans, iter, &k.k_i, 0));
-                       if (ret) {
-                               bch_err(c, "update error in rand_mixed: %i", ret);
-                               break;
-                       }
-               }
        }
 
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
        bch2_trans_exit(&trans);
        return ret;
 }
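
rand_mixed() now packages the lookup-plus-overwrite into rand_mixed_trans() and runs it under __bch2_trans_do(), so a transaction restart (-EINTR) re-executes the whole closure instead of surfacing as a failure mid-sequence. The general shape, with my_op() as an illustrative stand-in:

    static int my_op(struct btree_trans *trans, struct btree_iter *iter)
    {
            /* lookups/updates here; returning -EINTR requests a restart */
            return 0;
    }

    ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                          my_op(&trans, &iter));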
 
 static int __do_delete(struct btree_trans *trans, struct bpos pos)
 {
-       struct btree_iter *iter;
-       struct bkey_i delete;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos,
-                                  BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek(iter);
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek(&iter);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -623,12 +698,9 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
        if (!k.k)
                goto err;
 
-       bkey_init(&delete.k);
-       delete.k.p = k.k->p;
-
-       ret = bch2_trans_update(trans, iter, &delete, 0);
+       ret = bch2_btree_delete_at(trans, &iter, 0);
 err:
-       bch2_trans_iter_put(trans, iter);
+       bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
 
@@ -641,7 +713,7 @@ static int rand_delete(struct bch_fs *c, u64 nr)
        bch2_trans_init(&trans, c, 0, 0);
 
        for (i = 0; i < nr; i++) {
-               struct bpos pos = POS(0, test_rand());
+               struct bpos pos = SPOS(0, test_rand(), U32_MAX);
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
                        __do_delete(&trans, pos));
@@ -658,7 +730,7 @@ static int rand_delete(struct bch_fs *c, u64 nr)
 static int seq_insert(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_i_cookie insert;
        int ret = 0;
@@ -668,13 +740,13 @@ static int seq_insert(struct bch_fs *c, u64 nr)
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX),
                           BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
-               insert.k.p = iter->pos;
+               insert.k.p = iter.pos;
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(&trans, iter, &insert.k_i, 0));
+                       bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(&trans, &iter, &insert.k_i, 0));
                if (ret) {
                        bch_err(c, "error in seq_insert: %i", ret);
                        break;
@@ -683,7 +755,7 @@ static int seq_insert(struct bch_fs *c, u64 nr)
                if (++i == nr)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        return ret;
@@ -692,15 +764,16 @@ static int seq_insert(struct bch_fs *c, u64 nr)
 static int seq_lookup(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+                          SPOS(0, 0, U32_MAX), 0, k, ret)
                ;
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        return ret;
@@ -709,27 +782,28 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
 static int seq_overwrite(struct bch_fs *c, u64 nr)
 {
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
+       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
+                          SPOS(0, 0, U32_MAX),
                           BTREE_ITER_INTENT, k, ret) {
                struct bkey_i_cookie u;
 
                bkey_reassemble(&u.k_i, k);
 
                ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                       bch2_btree_iter_traverse(iter) ?:
-                       bch2_trans_update(&trans, iter, &u.k_i, 0));
+                       bch2_btree_iter_traverse(&iter) ?:
+                       bch2_trans_update(&trans, &iter, &u.k_i, 0));
                if (ret) {
                        bch_err(c, "error in seq_overwrite: %i", ret);
                        break;
                }
        }
-       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_iter_exit(&trans, &iter);
 
        bch2_trans_exit(&trans);
        return ret;
@@ -740,7 +814,8 @@ static int seq_delete(struct bch_fs *c, u64 nr)
        int ret;
 
        ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
-                                     POS(0, 0), POS(0, U64_MAX),
+                                     POS_MIN, SPOS_MAX,
+                                     BTREE_ITER_ALL_SNAPSHOTS,
                                      NULL);
        if (ret)
                bch_err(c, "error in seq_delete: %i", ret);
@@ -778,9 +853,11 @@ static int btree_perf_test_thread(void *data)
                wait_event(j->ready_wait, !atomic_read(&j->ready));
        }
 
-       ret = j->fn(j->c, j->nr / j->nr_threads);
-       if (ret)
+       ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads));
+       if (ret) {
+               bch_err(j->c, "%ps: error %i", j->fn, ret);
                j->ret = ret;
+       }
 
        if (atomic_dec_and_test(&j->done)) {
                j->finish = sched_clock();
@@ -833,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
        perf_test(test_extent_overwrite_middle);
        perf_test(test_extent_overwrite_all);
 
+       perf_test(test_snapshots);
+
        if (!j.fn) {
                pr_err("unknown test %s", testname);
                return -EINVAL;
@@ -854,11 +933,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
 
        scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
        bch2_hprint(&PBUF(nr_buf), nr);
-       bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time);
+       bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time));
        printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
                name_buf, nr_buf, nr_threads,
-               time / NSEC_PER_SEC,
-               time * nr_threads / nr,
+               div_u64(time, NSEC_PER_SEC),
+               div_u64(time * nr_threads, nr),
                per_sec_buf);
        return j.ret;
 }
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 463260c0458575c2492c165087e4e6eb05c3d58f..0bbea332fcaaf7cecc6ce62695fe0367c011bab2 100644 (file)
@@ -114,7 +114,7 @@ void bch2_hprint(struct printbuf *buf, s64 v)
         * 103 is magic: t is in the range [-1023, 1023] and we want
         * to turn it into [-9, 9]
         */
-       if (u && v < 100 && v > -100)
+       if (u && t && v < 100 && v > -100)
                pr_buf(buf, ".%i", t / 103);
        if (u)
                pr_buf(buf, "%c", si_units[u]);
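
The added `t` test keeps exact multiples from printing a spurious ".0": after one division by 1024, v = 3072 leaves v = 3 and t = 0, so it now prints "3k" rather than "3.0k", while v = 3276 (t = 204, and 204/103 = 1) still prints "3.1k". A standalone rendition of the helper, with si_units as the tree defines it:

    #include <stdio.h>

    static const char si_units[] = "?kMGTPEZY";

    static void hprint(long long v)
    {
            int u, t = 0;

            for (u = 0; v >= 1024 || v <= -1024; u++) {
                    t = v & 1023;
                    v >>= 10;
            }
            printf("%lld", v);
            if (u && t && v < 100 && v > -100)      /* the fix: skip ".0" when t == 0 */
                    printf(".%d", t / 103);
            if (u)
                    printf("%c", si_units[u]);
            printf("\n");
    }

    int main(void)
    {
            hprint(3072);   /* "3k", was "3.0k" before the fix */
            hprint(3276);   /* "3.1k", unchanged */
            return 0;
    }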
@@ -525,7 +525,11 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
                if (!page)
                        return -ENOMEM;
 
-               BUG_ON(!bio_add_page(bio, page, len, 0));
+               if (unlikely(!bio_add_page(bio, page, len, 0))) {
+                       __free_page(page);
+                       break;
+               }
+
                size -= len;
        }
 
@@ -887,9 +891,14 @@ void eytzinger0_find_test(void)
  */
 u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
 {
-       u64 *ret = this_cpu_ptr(p);
+       u64 *ret;
        int cpu;
 
+       /* access to pcpu vars has to be blocked by other locking */
+       preempt_disable();
+       ret = this_cpu_ptr(p);
+       preempt_enable();
+
        for_each_possible_cpu(cpu) {
                u64 *i = per_cpu_ptr(p, cpu);
 
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 84ef4d6d36f62c31a62dcdc9efb1b5ec742a6cc1..e55407dc53249d1a93106cd9e4b37c9df74b523e 100644 (file)
@@ -18,9 +18,6 @@
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
-#define PAGE_SECTOR_SHIFT      (PAGE_SHIFT - 9)
-#define PAGE_SECTORS           (1UL << PAGE_SECTOR_SHIFT)
-
 struct closure;
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -241,6 +238,7 @@ do {                                                                        \
 struct printbuf {
        char            *pos;
        char            *end;
+       unsigned        indent;
 };
 
 static inline size_t printbuf_remaining(struct printbuf *buf)
@@ -262,6 +260,27 @@ do {                                                                       \
                                 __VA_ARGS__);                          \
 } while (0)
 
+static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces)
+{
+       buf->indent += spaces;
+       while (spaces--)
+               pr_buf(buf, " ");
+}
+
+static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces)
+{
+       buf->indent -= spaces;
+}
+
+static inline void printbuf_newline(struct printbuf *buf)
+{
+       unsigned i;
+
+       pr_buf(buf, "\n");
+       for (i = 0; i < buf->indent; i++)
+               pr_buf(buf, " ");
+}
+
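A hedged usage sketch of the indent helpers: push emits the indent once and records it, printbuf_newline() re-emits it after every line break, and pop only adjusts the counter:

    char buf[128];
    struct printbuf out = _PBUF(buf, sizeof(buf));

    pr_buf(&out, "device:");
    printbuf_indent_push(&out, 2);
    printbuf_newline(&out);
    pr_buf(&out, "state: rw");      /* rendered under a 2-space indent */
    printbuf_indent_pop(&out, 2);
    printbuf_newline(&out);
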
 void bch_scnmemcpy(struct printbuf *, const char *, size_t);
 
 int bch2_strtoint_h(const char *, int *);
@@ -749,4 +768,13 @@ static inline int u8_cmp(u8 l, u8 r)
        return cmp_int(l, r);
 }
 
+#ifdef __KERNEL__
+static inline void uuid_unparse_lower(u8 *uuid, char *out)
+{
+       sprintf(out, "%plU", uuid);
+}
+#else
+#include <uuid/uuid.h>
+#endif
+
 #endif /* _BCACHEFS_UTIL_H */
diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c
index e6a041541792676d8936fdbc44dfac95314de076..a2d6bb7136c7d412d95469ac09b2c063fd0dd86d 100644 (file)
@@ -4,6 +4,10 @@
 #include <linux/string.h>
 #include <asm/unaligned.h>
 
+#ifdef CONFIG_VALGRIND
+#include <valgrind/memcheck.h>
+#endif
+
 #include "varint.h"
 
 /**
@@ -95,8 +99,11 @@ int bch2_varint_encode_fast(u8 *out, u64 v)
  */
 int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
 {
+#ifdef CONFIG_VALGRIND
+       VALGRIND_MAKE_MEM_DEFINED(in, 8);
+#endif
        u64 v = get_unaligned_le64(in);
-       unsigned bytes = ffz(v & 255) + 1;
+       unsigned bytes = ffz(*in) + 1;
 
        if (unlikely(in + bytes > end))
                return -1;
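
bcachefs varints store the encoded length as a unary run of one-bits in the low byte, so ffz() on the first byte alone yields the same count that ffz(v & 255) extracted from the 64-bit load, and the length no longer depends on bytes memcheck may consider undefined; the VALGRIND_MAKE_MEM_DEFINED() annotation then blesses the deliberate 8-byte read. The length rule, standalone:

    #include <stdio.h>
    #include <stdint.h>

    /* ffz(x), the index of the lowest clear bit, equals __builtin_ctz(~x) */
    static unsigned varint_bytes(uint8_t first_byte)
    {
            return __builtin_ctz(~(unsigned)first_byte) + 1;
    }

    int main(void)
    {
            printf("%u\n", varint_bytes(0x00));     /* 1 byte  */
            printf("%u\n", varint_bytes(0x01));     /* 2 bytes */
            printf("%u\n", varint_bytes(0xff));     /* 9 bytes */
            return 0;
    }
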
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index e4d400b16dbaf8da85a211c55ddc9b74195d86e0..4d7db64e3ef3085602c7017618b411f281a06b6b 100644 (file)
@@ -122,23 +122,22 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
                                const char *name, void *buffer, size_t size, int type)
 {
        struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode);
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c_xattr xattr;
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash,
-                               inode->v.i_ino,
-                               &X_SEARCH(type, name, strlen(name)),
-                               0);
-       ret = PTR_ERR_OR_ZERO(iter);
+       ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash,
+                              inode_inum(inode),
+                              &X_SEARCH(type, name, strlen(name)),
+                              0);
        if (ret)
-               goto err;
+               goto err1;
 
-       k = bch2_btree_iter_peek_slot(iter);
+       k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
-               goto err;
+               goto err2;
 
        xattr = bkey_s_c_to_xattr(k);
        ret = le16_to_cpu(xattr.v->x_val_len);
@@ -148,8 +147,9 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
                else
                        memcpy(buffer, xattr_val(xattr.v), ret);
        }
-       bch2_trans_iter_put(trans, iter);
-err:
+err2:
+       bch2_trans_iter_exit(trans, &iter);
+err1:
        return ret == -ENOENT ? -ENODATA : ret;
 }
 
@@ -160,13 +160,29 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
                bch2_xattr_get_trans(&trans, inode, name, buffer, size, type));
 }
 
-int bch2_xattr_set(struct btree_trans *trans, u64 inum,
+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
                   const struct bch_hash_info *hash_info,
                   const char *name, const void *value, size_t size,
                   int type, int flags)
 {
+       struct btree_iter inode_iter = { NULL };
+       struct bch_inode_unpacked inode_u;
        int ret;
 
+       /*
+        * We need to do an inode update so that bi_journal_sync gets updated
+        * and fsync works:
+        *
+        * Perhaps we should be updating bi_mtime too?
+        */
+
+       ret   = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?:
+               bch2_inode_write(trans, &inode_iter, &inode_u);
+       bch2_trans_iter_exit(trans, &inode_iter);
+
+       if (ret)
+               return ret;
+
        if (value) {
                struct bkey_i_xattr *xattr;
                unsigned namelen = strlen(name);
@@ -279,16 +295,24 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
        struct bch_fs *c = dentry->d_sb->s_fs_info;
        struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
        struct btree_trans trans;
-       struct btree_iter *iter;
+       struct btree_iter iter;
        struct bkey_s_c k;
        struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
-       u64 inum = dentry->d_inode->i_ino;
+       u64 offset = 0, inum = inode->ei_inode.bi_inum;
+       u32 snapshot;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
+retry:
+       bch2_trans_begin(&trans);
+       iter = (struct btree_iter) { NULL };
+
+       ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot);
+       if (ret)
+               goto err;
 
-       for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
-                          POS(inum, 0), 0, k, ret) {
+       for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs,
+                          SPOS(inum, offset, snapshot), 0, k, ret) {
                BUG_ON(k.k->p.inode < inum);
 
                if (k.k->p.inode > inum)
@@ -301,9 +325,14 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
                if (ret)
                        break;
        }
-       bch2_trans_iter_put(&trans, iter);
 
-       ret = bch2_trans_exit(&trans) ?: ret;
+       offset = iter.pos.offset;
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       if (ret == -EINTR)
+               goto retry;
+
+       bch2_trans_exit(&trans);
 
        if (ret)
                return ret;
@@ -339,8 +368,8 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
 
-       return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
-                       bch2_xattr_set(&trans, inode->v.i_ino, &hash,
+       return bch2_trans_do(c, NULL, NULL, 0,
+                       bch2_xattr_set(&trans, inode_inum(inode), &hash,
                                       name, value, size,
                                       handler->flags, flags));
 }
@@ -496,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
                memcpy(buf, value, size);
                buf[size] = '\0';
 
-               ret = bch2_opt_parse(c, opt, buf, &v);
+               ret = bch2_opt_parse(c, NULL, opt, buf, &v);
                kfree(buf);
 
                if (ret < 0)
index 4151065ab853546c3f071a831cfba10d9af03010..f4f896545e1c29f0ff35018263bf6b227250567b 100644 (file)
@@ -39,7 +39,8 @@ struct bch_inode_info;
 int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
                  const char *, void *, size_t, int);
 
-int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
+int bch2_xattr_set(struct btree_trans *, subvol_inum,
+                  const struct bch_hash_info *,
                   const char *, const void *, size_t, int, int);
 
 ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
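
The -EINTR handling introduced above follows the restartable-transaction pattern used throughout this snapshot; condensed to a sketch (do_lookup() is a hypothetical stand-in for the transaction body):

        struct btree_trans trans;
        int ret;

        bch2_trans_init(&trans, c, 0, 0);
retry:
        bch2_trans_begin(&trans);

        ret = do_lookup(&trans);
        if (ret == -EINTR)      /* lock restart: begin again from the top */
                goto retry;

        bch2_trans_exit(&trans);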
index 270d3c899471ec298cface7265cecc5fd5927eaa..762e5aa02a530e9a8b63e3da8da5227d87198f53 100644 (file)
 
 #include "tools-util.h"
 
+struct fops {
+       void (*init)(void);
+       void (*cleanup)(void);
+       void (*read)(struct bio *bio, struct iovec *iov, unsigned i);
+       void (*write)(struct bio *bio, struct iovec *iov, unsigned i);
+};
+
+static struct fops *fops;
 static io_context_t aio_ctx;
 static atomic_t running_requests;
 
@@ -66,35 +74,12 @@ void generic_make_request(struct bio *bio)
 #endif
        }
 
-       struct iocb iocb = {
-               .data           = bio,
-               .aio_fildes     = bio->bi_opf & REQ_FUA
-                       ? bio->bi_bdev->bd_sync_fd
-                       : bio->bi_bdev->bd_fd,
-       }, *iocbp = &iocb;
-
        switch (bio_op(bio)) {
        case REQ_OP_READ:
-               iocb.aio_lio_opcode     = IO_CMD_PREADV;
-               iocb.u.v.vec            = iov;
-               iocb.u.v.nr             = i;
-               iocb.u.v.offset         = bio->bi_iter.bi_sector << 9;
-
-               atomic_inc(&running_requests);
-               ret = io_submit(aio_ctx, 1, &iocbp);
-               if (ret != 1)
-                       die("io_submit err: %s", strerror(-ret));
+               fops->read(bio, iov, i);
                break;
        case REQ_OP_WRITE:
-               iocb.aio_lio_opcode     = IO_CMD_PWRITEV;
-               iocb.u.v.vec            = iov;
-               iocb.u.v.nr             = i;
-               iocb.u.v.offset         = bio->bi_iter.bi_sector << 9;
-
-               atomic_inc(&running_requests);
-               ret = io_submit(aio_ctx, 1, &iocbp);
-               if (ret != 1)
-                       die("io_submit err: %s", strerror(-ret));
+               fops->write(bio, iov, i);
                break;
        case REQ_OP_FLUSH:
                ret = fsync(bio->bi_bdev->bd_fd);
@@ -220,8 +205,8 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
        bdev->bd_sync_fd        = sync_fd;
        bdev->bd_holder         = holder;
        bdev->bd_disk           = &bdev->__bd_disk;
-       bdev->bd_bdi            = &bdev->__bd_bdi;
-       bdev->queue.backing_dev_info = bdev->bd_bdi;
+       bdev->bd_disk->bdi      = &bdev->bd_disk->__bdi;
+       bdev->queue.backing_dev_info = bdev->bd_disk->bdi;
 
        return bdev;
 }
@@ -236,6 +221,55 @@ int lookup_bdev(const char *path, dev_t *dev)
        return -EINVAL;
 }
 
+static void io_fallback(void)
+{
+       fops++;
+       if (fops->init == NULL)
+               die("no fallback possible, something is very wrong");
+       fops->init();
+}
+
+static void sync_check(struct bio *bio, int ret)
+{
+       if (ret != bio->bi_iter.bi_size)
+               die("IO error: %m");
+
+       if (bio->bi_opf & REQ_FUA) {
+               ret = fdatasync(bio->bi_bdev->bd_fd);
+               if (ret)
+                       die("fdatasync error: %m");
+       }
+       bio_endio(bio);
+}
+
+static void sync_init(void) {}
+
+static void sync_cleanup(void)
+{
+       /* not necessary? */
+       sync();
+}
+
+static void sync_read(struct bio *bio, struct iovec *iov, unsigned i)
+{
+       int fd = bio->bi_opf & REQ_FUA
+                       ? bio->bi_bdev->bd_sync_fd
+                       : bio->bi_bdev->bd_fd;
+       ssize_t ret = preadv(fd, iov, i, bio->bi_iter.bi_sector << 9);
+       sync_check(bio, ret);
+}
+
+static void sync_write(struct bio *bio, struct iovec *iov, unsigned i)
+{
+       int fd = bio->bi_opf & REQ_FUA
+                       ? bio->bi_bdev->bd_sync_fd
+                       : bio->bi_bdev->bd_fd;
+       ssize_t ret = pwritev(fd, iov, i, bio->bi_iter.bi_sector << 9);
+       sync_check(bio, ret);
+}
+
 static int aio_completion_thread(void *arg)
 {
        struct io_event events[8], *ev;
@@ -274,22 +308,24 @@ static int aio_completion_thread(void *arg)
 
 static struct task_struct *aio_task = NULL;
 
-__attribute__((constructor(102)))
-static void blkdev_init(void)
+static void aio_init(void)
 {
        struct task_struct *p;
+       long err = io_setup(256, &aio_ctx);
+       if (!err) {
+               p = kthread_run(aio_completion_thread, NULL, "aio_completion");
+               BUG_ON(IS_ERR(p));
 
-       if (io_setup(256, &aio_ctx))
-               die("io_setup() error: %m");
-
-       p = kthread_run(aio_completion_thread, NULL, "aio_completion");
-       BUG_ON(IS_ERR(p));
+               aio_task = p;
 
-       aio_task = p;
+       } else if (err == -ENOSYS) {
+               io_fallback();
+       } else {
+               die("io_setup() error: %s", strerror(-err));
+       }
 }
 
-__attribute__((destructor(102)))
-static void blkdev_cleanup(void)
+static void aio_cleanup(void)
 {
        struct task_struct *p = NULL;
        swap(aio_task, p);
@@ -322,3 +358,71 @@ static void blkdev_cleanup(void)
        close(fds[0]);
        close(fds[1]);
 }
+
+static void aio_op(struct bio *bio, struct iovec *iov, unsigned i, int opcode)
+{
+       ssize_t ret;
+       struct iocb iocb = {
+               .data           = bio,
+               .aio_fildes     = bio->bi_opf & REQ_FUA
+                       ? bio->bi_bdev->bd_sync_fd
+                       : bio->bi_bdev->bd_fd,
+               .aio_lio_opcode = opcode,
+               .u.c.buf        = iov,
+               .u.c.nbytes     = i,
+               .u.c.offset     = bio->bi_iter.bi_sector << 9,
+       }, *iocbp = &iocb;
+
+       atomic_inc(&running_requests);
+       ret = io_submit(aio_ctx, 1, &iocbp);
+       if (ret != 1)
+               die("io_submit err: %s", strerror(-ret));
+}
+
+static void aio_read(struct bio *bio, struct iovec *iov, unsigned i)
+{
+       aio_op(bio, iov, i, IO_CMD_PREADV);
+}
+
+static void aio_write(struct bio *bio, struct iovec *iov, unsigned i)
+{
+       aio_op(bio, iov, i, IO_CMD_PWRITEV);
+}
+
+/* io_uring backend: not implemented yet, fall straight through to aio */
+static void uring_init(void)
+{
+       io_fallback();
+}
+
+struct fops fops_list[] = {
+       {
+               .init           = uring_init,
+       }, {
+               .init           = aio_init,
+               .cleanup        = aio_cleanup,
+               .read           = aio_read,
+               .write          = aio_write,
+       }, {
+               .init           = sync_init,
+               .cleanup        = sync_cleanup,
+               .read           = sync_read,
+               .write          = sync_write,
+       }, {
+               /* NULL */
+       }
+};
+
+__attribute__((constructor(102)))
+static void blkdev_init(void)
+{
+       fops = fops_list;
+       fops->init();
+}
+
+__attribute__((destructor(102)))
+static void blkdev_cleanup(void)
+{
+       fops->cleanup();
+}
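
The ops-table fallback added here is a small pattern worth seeing on its own: walk an array of backends and take the first whose init() succeeds. A self-contained sketch under that assumption (hypothetical backends, simplified to return codes rather than die()/fallthrough):

        #include <stdio.h>

        struct ops {
                int (*init)(void);      /* 0 on success */
                const char *name;
        };

        static int try_uring(void) { return -1; }      /* pretend: unavailable */
        static int try_aio(void)   { return 0; }

        static const struct ops ops_list[] = {
                { try_uring, "io_uring" },
                { try_aio,   "aio" },
                { NULL }        /* sentinel, mirrors the NULL entry above */
        };

        int main(void)
        {
                for (const struct ops *o = ops_list; o->init; o++)
                        if (!o->init()) {
                                printf("using %s backend\n", o->name);
                                return 0;
                        }
                fprintf(stderr, "no usable backend\n");
                return 1;
        }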
diff --git a/linux/siphash.c b/linux/siphash.c
new file mode 100644 (file)
index 0000000..f8dbece
--- /dev/null
@@ -0,0 +1,552 @@
+/* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ *
+ * This file is provided under a dual BSD/GPLv2 license.
+ *
+ * SipHash: a fast short-input PRF
+ * https://131002.net/siphash/
+ *
+ * This implementation is specifically for SipHash2-4 for a secure PRF
+ * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
+ * hashtables.
+ */
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <asm/unaligned.h>
+
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+#include <linux/dcache.h>
+#include <asm/word-at-a-time.h>
+#endif
+
+#define SIPROUND \
+       do { \
+       v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \
+       v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \
+       v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \
+       v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \
+       } while (0)
+
+#define PREAMBLE(len) \
+       u64 v0 = 0x736f6d6570736575ULL; \
+       u64 v1 = 0x646f72616e646f6dULL; \
+       u64 v2 = 0x6c7967656e657261ULL; \
+       u64 v3 = 0x7465646279746573ULL; \
+       u64 b = ((u64)(len)) << 56; \
+       v3 ^= key->key[1]; \
+       v2 ^= key->key[0]; \
+       v1 ^= key->key[1]; \
+       v0 ^= key->key[0];
+
+#define POSTAMBLE \
+       v3 ^= b; \
+       SIPROUND; \
+       SIPROUND; \
+       v0 ^= b; \
+       v2 ^= 0xff; \
+       SIPROUND; \
+       SIPROUND; \
+       SIPROUND; \
+       SIPROUND; \
+       return (v0 ^ v1) ^ (v2 ^ v3);
+
+u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
+{
+       const u8 *end = data + len - (len % sizeof(u64));
+       const u8 left = len & (sizeof(u64) - 1);
+       u64 m;
+       PREAMBLE(len)
+       for (; data != end; data += sizeof(u64)) {
+               m = le64_to_cpup(data);
+               v3 ^= m;
+               SIPROUND;
+               SIPROUND;
+               v0 ^= m;
+       }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+       if (left)
+               b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+                                                 bytemask_from_count(left)));
+#else
+       switch (left) {
+       case 7: b |= ((u64)end[6]) << 48; fallthrough;
+       case 6: b |= ((u64)end[5]) << 40; fallthrough;
+       case 5: b |= ((u64)end[4]) << 32; fallthrough;
+       case 4: b |= le32_to_cpup(data); break;
+       case 3: b |= ((u64)end[2]) << 16; fallthrough;
+       case 2: b |= le16_to_cpup(data); break;
+       case 1: b |= end[0];
+       }
+#endif
+       POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
+{
+       const u8 *end = data + len - (len % sizeof(u64));
+       const u8 left = len & (sizeof(u64) - 1);
+       u64 m;
+       PREAMBLE(len)
+       for (; data != end; data += sizeof(u64)) {
+               m = get_unaligned_le64(data);
+               v3 ^= m;
+               SIPROUND;
+               SIPROUND;
+               v0 ^= m;
+       }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+       if (left)
+               b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+                                                 bytemask_from_count(left)));
+#else
+       switch (left) {
+       case 7: b |= ((u64)end[6]) << 48; fallthrough;
+       case 6: b |= ((u64)end[5]) << 40; fallthrough;
+       case 5: b |= ((u64)end[4]) << 32; fallthrough;
+       case 4: b |= get_unaligned_le32(end); break;
+       case 3: b |= ((u64)end[2]) << 16; fallthrough;
+       case 2: b |= get_unaligned_le16(end); break;
+       case 1: b |= end[0];
+       }
+#endif
+       POSTAMBLE
+}
+EXPORT_SYMBOL(__siphash_unaligned);
+#endif
+
+/**
+ * siphash_1u64 - compute 64-bit siphash PRF value of a u64
+ * @first: first u64
+ * @key: the siphash key
+ */
+u64 siphash_1u64(const u64 first, const siphash_key_t *key)
+{
+       PREAMBLE(8)
+       v3 ^= first;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= first;
+       POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u64);
+
+/**
+ * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
+ * @first: first u64
+ * @second: second u64
+ * @key: the siphash key
+ */
+u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
+{
+       PREAMBLE(16)
+       v3 ^= first;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= first;
+       v3 ^= second;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= second;
+       POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_2u64);
+
+/**
+ * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @key: the siphash key
+ */
+u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
+                const siphash_key_t *key)
+{
+       PREAMBLE(24)
+       v3 ^= first;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= first;
+       v3 ^= second;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= second;
+       v3 ^= third;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= third;
+       POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u64);
+
+/**
+ * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
+ * @first: first u64
+ * @second: second u64
+ * @third: third u64
+ * @forth: fourth u64
+ * @key: the siphash key
+ */
+u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
+                const u64 forth, const siphash_key_t *key)
+{
+       PREAMBLE(32)
+       v3 ^= first;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= first;
+       v3 ^= second;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= second;
+       v3 ^= third;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= third;
+       v3 ^= forth;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= forth;
+       POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_4u64);
+
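+/**
+ * siphash_1u32 - compute 64-bit siphash PRF value of a u32
+ * @first: first u32
+ * @key: the siphash key
+ */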
+u64 siphash_1u32(const u32 first, const siphash_key_t *key)
+{
+       PREAMBLE(4)
+       b |= first;
+       POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_1u32);
+
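+/**
+ * siphash_3u32 - compute 64-bit siphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the siphash key
+ */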
+u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
+                const siphash_key_t *key)
+{
+       u64 combined = (u64)second << 32 | first;
+       PREAMBLE(12)
+       v3 ^= combined;
+       SIPROUND;
+       SIPROUND;
+       v0 ^= combined;
+       b |= third;
+       POSTAMBLE
+}
+EXPORT_SYMBOL(siphash_3u32);
+
+#if BITS_PER_LONG == 64
+/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
+ * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
+ */
+
+#define HSIPROUND SIPROUND
+#define HPREAMBLE(len) PREAMBLE(len)
+#define HPOSTAMBLE \
+       v3 ^= b; \
+       HSIPROUND; \
+       v0 ^= b; \
+       v2 ^= 0xff; \
+       HSIPROUND; \
+       HSIPROUND; \
+       HSIPROUND; \
+       return (v0 ^ v1) ^ (v2 ^ v3);
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+       const u8 *end = data + len - (len % sizeof(u64));
+       const u8 left = len & (sizeof(u64) - 1);
+       u64 m;
+       HPREAMBLE(len)
+       for (; data != end; data += sizeof(u64)) {
+               m = le64_to_cpup(data);
+               v3 ^= m;
+               HSIPROUND;
+               v0 ^= m;
+       }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+       if (left)
+               b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+                                                 bytemask_from_count(left)));
+#else
+       switch (left) {
+       case 7: b |= ((u64)end[6]) << 48; fallthrough;
+       case 6: b |= ((u64)end[5]) << 40; fallthrough;
+       case 5: b |= ((u64)end[4]) << 32; fallthrough;
+       case 4: b |= le32_to_cpup(data); break;
+       case 3: b |= ((u64)end[2]) << 16; fallthrough;
+       case 2: b |= le16_to_cpup(data); break;
+       case 1: b |= end[0];
+       }
+#endif
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+                        const hsiphash_key_t *key)
+{
+       const u8 *end = data + len - (len % sizeof(u64));
+       const u8 left = len & (sizeof(u64) - 1);
+       u64 m;
+       HPREAMBLE(len)
+       for (; data != end; data += sizeof(u64)) {
+               m = get_unaligned_le64(data);
+               v3 ^= m;
+               HSIPROUND;
+               v0 ^= m;
+       }
+#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
+       if (left)
+               b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
+                                                 bytemask_from_count(left)));
+#else
+       switch (left) {
+       case 7: b |= ((u64)end[6]) << 48; fallthrough;
+       case 6: b |= ((u64)end[5]) << 40; fallthrough;
+       case 5: b |= ((u64)end[4]) << 32; fallthrough;
+       case 4: b |= get_unaligned_le32(end); break;
+       case 3: b |= ((u64)end[2]) << 16; fallthrough;
+       case 2: b |= get_unaligned_le16(end); break;
+       case 1: b |= end[0];
+       }
+#endif
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+       HPREAMBLE(4)
+       b |= first;
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+       u64 combined = (u64)second << 32 | first;
+       HPREAMBLE(8)
+       v3 ^= combined;
+       HSIPROUND;
+       v0 ^= combined;
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+                 const hsiphash_key_t *key)
+{
+       u64 combined = (u64)second << 32 | first;
+       HPREAMBLE(12)
+       v3 ^= combined;
+       HSIPROUND;
+       v0 ^= combined;
+       b |= third;
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: fourth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+                 const u32 forth, const hsiphash_key_t *key)
+{
+       u64 combined = (u64)second << 32 | first;
+       HPREAMBLE(16)
+       v3 ^= combined;
+       HSIPROUND;
+       v0 ^= combined;
+       combined = (u64)forth << 32 | third;
+       v3 ^= combined;
+       HSIPROUND;
+       v0 ^= combined;
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#else
+#define HSIPROUND \
+       do { \
+       v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \
+       v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \
+       v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \
+       v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \
+       } while (0)
+
+#define HPREAMBLE(len) \
+       u32 v0 = 0; \
+       u32 v1 = 0; \
+       u32 v2 = 0x6c796765U; \
+       u32 v3 = 0x74656462U; \
+       u32 b = ((u32)(len)) << 24; \
+       v3 ^= key->key[1]; \
+       v2 ^= key->key[0]; \
+       v1 ^= key->key[1]; \
+       v0 ^= key->key[0];
+
+#define HPOSTAMBLE \
+       v3 ^= b; \
+       HSIPROUND; \
+       v0 ^= b; \
+       v2 ^= 0xff; \
+       HSIPROUND; \
+       HSIPROUND; \
+       HSIPROUND; \
+       return v1 ^ v3;
+
+u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
+{
+       const u8 *end = data + len - (len % sizeof(u32));
+       const u8 left = len & (sizeof(u32) - 1);
+       u32 m;
+       HPREAMBLE(len)
+       for (; data != end; data += sizeof(u32)) {
+               m = le32_to_cpup(data);
+               v3 ^= m;
+               HSIPROUND;
+               v0 ^= m;
+       }
+       switch (left) {
+       case 3: b |= ((u32)end[2]) << 16; fallthrough;
+       case 2: b |= le16_to_cpup(data); break;
+       case 1: b |= end[0];
+       }
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_aligned);
+
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+u32 __hsiphash_unaligned(const void *data, size_t len,
+                        const hsiphash_key_t *key)
+{
+       const u8 *end = data + len - (len % sizeof(u32));
+       const u8 left = len & (sizeof(u32) - 1);
+       u32 m;
+       HPREAMBLE(len)
+       for (; data != end; data += sizeof(u32)) {
+               m = get_unaligned_le32(data);
+               v3 ^= m;
+               HSIPROUND;
+               v0 ^= m;
+       }
+       switch (left) {
+       case 3: b |= ((u32)end[2]) << 16; fallthrough;
+       case 2: b |= get_unaligned_le16(end); break;
+       case 1: b |= end[0];
+       }
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(__hsiphash_unaligned);
+#endif
+
+/**
+ * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
+ * @first: first u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
+{
+       HPREAMBLE(4)
+       v3 ^= first;
+       HSIPROUND;
+       v0 ^= first;
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_1u32);
+
+/**
+ * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
+ * @first: first u32
+ * @second: second u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
+{
+       HPREAMBLE(8)
+       v3 ^= first;
+       HSIPROUND;
+       v0 ^= first;
+       v3 ^= second;
+       HSIPROUND;
+       v0 ^= second;
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_2u32);
+
+/**
+ * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
+                 const hsiphash_key_t *key)
+{
+       HPREAMBLE(12)
+       v3 ^= first;
+       HSIPROUND;
+       v0 ^= first;
+       v3 ^= second;
+       HSIPROUND;
+       v0 ^= second;
+       v3 ^= third;
+       HSIPROUND;
+       v0 ^= third;
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_3u32);
+
+/**
+ * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
+ * @first: first u32
+ * @second: second u32
+ * @third: third u32
+ * @forth: fourth u32
+ * @key: the hsiphash key
+ */
+u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
+                 const u32 forth, const hsiphash_key_t *key)
+{
+       HPREAMBLE(16)
+       v3 ^= first;
+       HSIPROUND;
+       v0 ^= first;
+       v3 ^= second;
+       HSIPROUND;
+       v0 ^= second;
+       v3 ^= third;
+       HSIPROUND;
+       v0 ^= third;
+       v3 ^= forth;
+       HSIPROUND;
+       v0 ^= forth;
+       HPOSTAMBLE
+}
+EXPORT_SYMBOL(hsiphash_4u32);
+#endif
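
A minimal caller sketch for the API above (hash_key and hash_inum are hypothetical names; siphash_key_t is the two-u64 key struct from linux/siphash.h):

        #include <linux/siphash.h>
        #include <linux/random.h>

        static siphash_key_t hash_key;

        static void hash_key_init(void)
        {
                /* generate the secret key once, e.g. at init time */
                get_random_bytes(&hash_key, sizeof(hash_key));
        }

        static u64 hash_inum(u64 inum)
        {
                return siphash_1u64(inum, &hash_key);
        }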
diff --git a/mount/build.rs b/mount/build.rs
deleted file mode 100644 (file)
index 6542889..0000000
+++ /dev/null
@@ -1,67 +0,0 @@
-fn main() {
-       use std::path::PathBuf;
-       use std::process::Command;
-
-       let out_dir: PathBuf = std::env::var_os("OUT_DIR").unwrap().into();
-       let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR").unwrap().into();
-       let libbcachefs_inc_dir = std::env::var("LIBBCACHEFS_INCLUDE")
-               .unwrap_or_else(|_| top_dir.join("libbcachefs").display().to_string());
-       let libbcachefs_inc_dir = std::path::Path::new(&libbcachefs_inc_dir);
-       println!("{}", libbcachefs_inc_dir.display());
-
-       let libbcachefs_dir = top_dir.join("libbcachefs").join("libbcachefs");
-       let bindings = bindgen::builder()
-               .header(top_dir
-                       .join("src")
-                       .join("libbcachefs_wrapper.h")
-                       .display()
-                       .to_string())
-               .clang_arg(format!(
-                       "-I{}",
-                       libbcachefs_inc_dir.join("include").display()
-               ))
-               .clang_arg(format!("-I{}", libbcachefs_inc_dir.display()))
-               .clang_arg("-DZSTD_STATIC_LINKING_ONLY")
-               .clang_arg("-DNO_BCACHEFS_FS")
-               .clang_arg("-D_GNU_SOURCE")
-               .derive_debug(false)
-               .derive_default(true)
-               .default_enum_style(bindgen::EnumVariation::Rust {
-                       non_exhaustive: true,
-               })
-               .whitelist_function("bch2_read_super")
-               .whitelist_function("bch2_sb_field_.*")
-               .whitelist_function("bch2_chacha_encrypt_key")
-               .whitelist_function("derive_passphrase")
-               .whitelist_function("request_key")
-               .whitelist_function("add_key")
-               .whitelist_function("keyctl_search")
-               .whitelist_var("BCH_.*")
-               .whitelist_var("KEY_SPEC_.*")
-               .whitelist_type("bch_kdf_types")
-               .whitelist_type("bch_sb_field_.*")
-               .whitelist_type("bch_encrypted_key")
-               .whitelist_type("nonce")
-               .rustified_enum("bch_kdf_types")
-               .opaque_type("gendisk")
-               .opaque_type("bkey")
-               .generate()
-               .unwrap();
-       bindings.write_to_file(out_dir.join("bcachefs.rs")).unwrap();
-
-       let keyutils = pkg_config::probe_library("libkeyutils").unwrap();
-       let bindings = bindgen::builder()
-               .header(top_dir
-                       .join("src")
-                       .join("keyutils_wrapper.h")
-                       .display()
-                       .to_string())
-               .clang_args(
-                       keyutils.include_paths
-                               .iter()
-                               .map(|p| format!("-I{}", p.display())),
-               )
-               .generate()
-               .unwrap();
-       bindings.write_to_file(out_dir.join("keyutils.rs")).unwrap();
-}
diff --git a/mount/src/filesystem.rs b/mount/src/filesystem.rs
deleted file mode 100644 (file)
index 36af8c0..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-extern "C" {
-       pub static stdout: *mut libc::FILE;
-}
-
-use getset::{CopyGetters, Getters};
-use std::path::PathBuf;
-#[derive(Getters, CopyGetters)]
-pub struct FileSystem {
-       /// External UUID of the bcachefs filesystem
-       #[getset(get = "pub")]
-       uuid: uuid::Uuid,
-       /// Whether filesystem is encrypted
-       #[getset(get_copy = "pub")]
-       encrypted: bool,
-       /// Super block
-       #[getset(get = "pub")]
-       sb: bcachefs::bch_sb_handle,
-       /// Member devices for this filesystem
-       #[getset(get = "pub")]
-       devices: Vec<PathBuf>,
-}
-
-/// Parse comma-separated mount options and split out mount flags and
-/// filesystem-specific options.
-fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, u64) {
-       use either::Either::*;
-       let (opts, flags) = options
-               .as_ref()
-               .split(",")
-               .map(|o| match o {
-                       "dirsync" => Left(libc::MS_DIRSYNC),
-                       "lazytime" => Left(1 << 25), // MS_LAZYTIME
-                       "mand" => Left(libc::MS_MANDLOCK),
-                       "noatime" => Left(libc::MS_NOATIME),
-                       "nodev" => Left(libc::MS_NODEV),
-                       "nodiratime" => Left(libc::MS_NODIRATIME),
-                       "noexec" => Left(libc::MS_NOEXEC),
-                       "nosuid" => Left(libc::MS_NOSUID),
-                       "ro" => Left(libc::MS_RDONLY),
-                       "rw" => Left(0),
-                       "relatime" => Left(libc::MS_RELATIME),
-                       "strictatime" => Left(libc::MS_STRICTATIME),
-                       "sync" => Left(libc::MS_SYNCHRONOUS),
-                       "" => Left(0),
-                       o @ _ => Right(o),
-               })
-               .fold((Vec::new(), 0), |(mut opts, flags), next| match next {
-                       Left(f) => (opts, flags | f),
-                       Right(o) => {
-                               opts.push(o);
-                               (opts, flags)
-                       }
-               });
-
-       use itertools::Itertools;
-       (
-               if opts.len() == 0 {
-                       None
-               } else {
-                       Some(opts.iter().join(","))
-               },
-               flags,
-       )
-}
-
-impl FileSystem {
-       pub(crate) fn new(sb: bcachefs::bch_sb_handle) -> Self {
-               Self {
-                       uuid: sb.sb().uuid(),
-                       encrypted: sb.sb().crypt().is_some(),
-                       sb: sb,
-                       devices: vec![],
-               }
-       }
-
-       pub fn mount(
-               &self,
-               target: impl AsRef<std::path::Path>,
-               options: impl AsRef<str>,
-       ) -> anyhow::Result<()> {
-               use itertools::Itertools;
-               use std::ffi::c_void;
-               use std::os::raw::c_char;
-               use std::os::unix::ffi::OsStrExt;
-               let src = self.devices.iter().map(|d| d.display()).join(":");
-               let (data, mountflags) = parse_mount_options(options);
-               let fstype = c_str!("bcachefs");
-
-               let src = std::ffi::CString::new(src)?; // bind the CString to keep it alive
-               let target = std::ffi::CString::new(target.as_ref().as_os_str().as_bytes())?; // ditto
-               let data = data.map(|data| std::ffi::CString::new(data)).transpose()?; // ditto
-
-               let src = src.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
-               let target = target.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
-               let data = data.as_ref().map_or(std::ptr::null(), |data| {
-                       data.as_c_str().to_bytes_with_nul().as_ptr() as *const c_void
-               });
-
-               let ret = unsafe { libc::mount(src, target, fstype, mountflags, data) };
-               if ret == 0 {
-                       Ok(())
-               } else {
-                       Err(crate::ErrnoError(errno::errno()).into())
-               }
-       }
-}
-
-use crate::bcachefs;
-use std::collections::HashMap;
-use uuid::Uuid;
-pub fn probe_filesystems() -> anyhow::Result<HashMap<Uuid, FileSystem>> {
-       use std::os::unix::ffi::OsStrExt;
-       let mut udev = udev::Enumerator::new()?;
-       let mut fss = HashMap::new();
-       udev.match_subsystem("block")?;
-
-       {
-               // Stop libbcachefs from spamming the output
-               let _gag = gag::Gag::stdout().unwrap();
-               for dev in udev.scan_devices()? {
-                       if let Some(p) = dev.devnode() {
-                               let path =
-                                       std::ffi::CString::new(p.as_os_str().as_bytes()).unwrap();
-                               let result = unsafe {
-                                       let mut opts = std::mem::MaybeUninit::zeroed();
-                                       let mut sb = std::mem::MaybeUninit::zeroed();
-                                       let ret = bcachefs::bch2_read_super(
-                                               path.as_ptr(),
-                                               opts.as_mut_ptr(),
-                                               sb.as_mut_ptr(),
-                                       );
-                                       if ret == -libc::EACCES {
-                                               Err(std::io::Error::new(
-                                                       std::io::ErrorKind::PermissionDenied,
-                                                       "no permission",
-                                               ))
-                                       } else if ret != 0 {
-                                               Err(std::io::Error::new(
-                                                       std::io::ErrorKind::Other,
-                                                       "failed to read super",
-                                               ))
-                                       } else {
-                                               Ok((opts.assume_init(), sb.assume_init()))
-                                       }
-                               };
-                               match result {
-                                       Ok((_, sb)) => match fss.get_mut(&sb.sb().uuid()) {
-                                               None => {
-                                                       let mut fs = FileSystem::new(sb);
-                                                       fs.devices.push(p.to_owned());
-                                                       fss.insert(fs.uuid, fs);
-                                               }
-                                               Some(fs) => {
-                                                       fs.devices.push(p.to_owned());
-                                               }
-                                       },
-                                       Err(e) if e.kind()
-                                               != std::io::ErrorKind::PermissionDenied =>
-                                       {
-                                               ()
-                                       }
-                                       e @ Err(_) => {
-                                               e?;
-                                       }
-                               }
-                       }
-               }
-               // Flush stdout so buffered output doesn't get printed after we remove the gag
-               unsafe {
-                       libc::fflush(stdout);
-               }
-       }
-       Ok(fss)
-}
diff --git a/mount/src/lib.rs b/mount/src/lib.rs
deleted file mode 100644 (file)
index 751eab3..0000000
+++ /dev/null
@@ -1,190 +0,0 @@
-use structopt::StructOpt;
-use anyhow::anyhow;
-
-#[macro_export]
-macro_rules! c_str {
-       ($lit:expr) => {
-               unsafe { std::ffi::CStr::from_ptr(concat!($lit, "\0").as_ptr() as *const std::os::raw::c_char)
-                              .to_bytes_with_nul()
-                              .as_ptr() as *const std::os::raw::c_char }
-       };
-}
-
-#[derive(Debug)]
-struct ErrnoError(errno::Errno);
-impl std::fmt::Display for ErrnoError {
-       fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
-               self.0.fmt(f)
-       }
-}
-impl std::error::Error for ErrnoError {}
-
-#[derive(Debug)]
-pub(crate) enum KeyLocation {
-       Fail,
-       Wait,
-       Ask,
-}
-
-impl std::str::FromStr for KeyLocation {
-       type Err = anyhow::Error;
-       fn from_str(s: &str) -> anyhow::Result<Self> {
-               use anyhow::anyhow;
-               match s {
-                       "fail" => Ok(Self::Fail),
-                       "wait" => Ok(Self::Wait),
-                       "ask" => Ok(Self::Ask),
-                       _ => Err(anyhow!("invalid password option"))
-               }
-       }
-}
-
-#[derive(StructOpt, Debug)]
-/// Mount a bcachefs filesystem by its UUID.
-struct Options {
-       /// Where the password will be loaded from.
-       ///
-       /// Possible values are:
-       /// "fail" - don't ask for password, fail if filesystem is encrypted;
-       /// "wait" - wait for password to become available before mounting;
-       /// "ask" -  prompt the user for password;
-       #[structopt(short, long, default_value = "fail")]
-       key_location: KeyLocation,
-
-       /// External UUID of the bcachefs filesystem
-       uuid: uuid::Uuid,
-
-       /// Where the filesystem should be mounted. If not set, then the filesystem
-       /// won't actually be mounted. But all steps preceding mounting the
-       /// filesystem (e.g. asking for passphrase) will still be performed.
-       mountpoint: Option<std::path::PathBuf>,
-
-       /// Mount options
-       #[structopt(short, default_value = "")]
-       options: String,
-}
-
-mod filesystem;
-mod key;
-mod keyutils {
-       #![allow(non_upper_case_globals)]
-       #![allow(non_camel_case_types)]
-       #![allow(non_snake_case)]
-       #![allow(unused)]
-
-       include!(concat!(env!("OUT_DIR"), "/keyutils.rs"));
-}
-
-mod bcachefs {
-       #![allow(non_upper_case_globals)]
-       #![allow(non_camel_case_types)]
-       #![allow(non_snake_case)]
-       #![allow(unused)]
-
-       include!(concat!(env!("OUT_DIR"), "/bcachefs.rs"));
-
-       use bitfield::bitfield;
-       bitfield! {
-               pub struct bch_scrypt_flags(u64);
-               pub N, _: 15, 0;
-               pub R, _: 31, 16;
-               pub P, _: 47, 32;
-       }
-       bitfield! {
-               pub struct bch_crypt_flags(u64);
-               TYPE, _: 4, 0;
-       }
-       use memoffset::offset_of;
-       impl bch_sb_field_crypt {
-               pub fn scrypt_flags(&self) -> Option<bch_scrypt_flags> {
-                       let t = bch_crypt_flags(self.flags);
-                       if t.TYPE() != bch_kdf_types::BCH_KDF_SCRYPT as u64 {
-                               None
-                       } else {
-                               Some(bch_scrypt_flags(self.kdf_flags))
-                       }
-               }
-               pub fn key(&self) -> &bch_encrypted_key {
-                       &self.key
-               }
-       }
-       impl bch_sb {
-               pub fn crypt(&self) -> Option<&bch_sb_field_crypt> {
-                       unsafe {
-                               let ptr = bch2_sb_field_get(
-                                       self as *const _ as *mut _,
-                                       bch_sb_field_type::BCH_SB_FIELD_crypt,
-                               ) as *const u8;
-                               if ptr.is_null() {
-                                       None
-                               } else {
-                                       let offset = offset_of!(bch_sb_field_crypt, field);
-                                       Some(&*((ptr.sub(offset)) as *const _))
-                               }
-                       }
-               }
-               pub fn uuid(&self) -> uuid::Uuid {
-                       uuid::Uuid::from_bytes(self.user_uuid.b)
-               }
-
-               /// Get the nonce used to encrypt the superblock
-               pub fn nonce(&self) -> nonce {
-                       use byteorder::{ReadBytesExt, LittleEndian};
-                       let mut internal_uuid = &self.uuid.b[..];
-                       let dword1 = internal_uuid.read_u32::<LittleEndian>().unwrap();
-                       let dword2 = internal_uuid.read_u32::<LittleEndian>().unwrap();
-                       nonce { d: [0, 0, dword1, dword2] }
-               }
-       }
-       impl bch_sb_handle {
-               pub fn sb(&self) -> &bch_sb {
-                       unsafe { &*self.sb }
-               }
-       }
-}
-
-fn main_inner() -> anyhow::Result<()> {
-       use itertools::Itertools;
-       use log::{info, trace};
-
-       env_logger::init();
-       let opt = Options::from_args();
-       trace!("{:?}", opt);
-
-       let fss = filesystem::probe_filesystems()?;
-       info!("Found {} bcachefs filesystems: ", fss.len());
-       for fs in fss.values() {
-               info!(
-                       "{} ({}): {}",
-                       fs.uuid(),
-                       if fs.encrypted() {
-                               "encrypted"
-                       } else {
-                               "unencrypted"
-                       },
-                       fs.devices().iter().map(|d| d.display()).join(" ")
-               );
-       }
-
-       if let Some(fs) = fss.get(&opt.uuid) {
-               if fs.encrypted() {
-                       info!("Making sure key is loaded for this filesystem");
-                       key::prepare_key(&fs, opt.key_location)?;
-               }
-
-               if let Some(p) = opt.mountpoint {
-                       fs.mount(&p, &opt.options)
-               } else {
-                       Ok(())
-               }
-       } else {
-               Err(anyhow!("Filesystem {} not found", opt.uuid))
-       }
-}
-
-#[no_mangle]
-pub extern "C" fn main() {
-       if let Err(e) = main_inner() {
-               println!("Error: {:?}", e);
-       }
-}
diff --git a/nix/bcachefs-kernel.nix b/nix/bcachefs-kernel.nix
new file mode 100644 (file)
index 0000000..c937df4
--- /dev/null
@@ -0,0 +1,34 @@
+{ lib
+, fetchpatch
+, fetchgit
+, fetchFromGitHub
+, buildLinux
+, commit
+, sha256 ? lib.fakeSha256
+, kernelVersion ? "5.13.0"
+, kernelPatches ? [] # must always be defined in bcachefs' all-packages.nix entry because it's also a top-level attribute supplied by callPackage
+, argsOverride ? {}
+, versionString ? (builtins.substring 0 8 commit)
+, ...
+} @ args:
+
+buildLinux {
+       inherit kernelPatches;
+
+       # pname = "linux";
+       version = "${kernelVersion}-bcachefs-${versionString}";
+
+       modDirVersion = kernelVersion;
+
+       src = fetchFromGitHub {
+               name = "bcachefs-kernel-src";
+               owner = "koverstreet";
+               repo = "bcachefs";
+               rev = commit;
+               inherit sha256;
+       };
+
+       extraConfig = "BCACHEFS_FS m";
+       # NIX_DEBUG=5;
+}
\ No newline at end of file
diff --git a/nix/bcachefs.rev.sha256 b/nix/bcachefs.rev.sha256
new file mode 100644 (file)
index 0000000..3f06215
--- /dev/null
@@ -0,0 +1 @@
+sha256-JsWrbuxrs047YKGES+r7mMfPdDWIMAGrg1fWi8qU4+A=
\ No newline at end of file
diff --git a/nix/overlay.nix b/nix/overlay.nix
new file mode 100644 (file)
index 0000000..42d3fb2
--- /dev/null
@@ -0,0 +1,29 @@
+{ filter, self, ... }:
+final: prev: {
+       bcachefs = {
+               tools = final.callPackage ../default.nix {
+                       testWithValgrind = false;
+                       filter = filter.lib;
+                       lastModified = builtins.substring 0 8 self.lastModifiedDate;
+                       versionString = self.version;
+               };
+               toolsValgrind = final.bcachefs.tools.override {
+                       testWithValgrind = true;
+               };
+               toolsDebug = final.bcachefs.toolsValgrind.override {
+                       debugMode = true;
+               };
+
+               bch_bindgen = final.callPackage ../rust-src/bch_bindgen {};
+
+               mount = final.callPackage ../rust-src/mount {};
+
+               kernelPackages = final.recurseIntoAttrs (final.linuxPackagesFor final.bcachefs.kernel);
+               kernel = final.callPackage ./bcachefs-kernel.nix {
+                       commit = final.bcachefs.tools.bcachefs_revision;
+                       # This needs to be recalculated for every revision change
+                       sha256 = builtins.readFile ./bcachefs.rev.sha256;
+                       kernelPatches = [];
+               };
+       };
+}
index 4946cef947dc641e2970433d9e1df30c17e86fe0..00d0fbb47e35e2c6e332ea36bbdaa4685341452f 100644 (file)
@@ -15,7 +15,6 @@ BuildRequires:  keyutils-libs-devel
 BuildRequires:  libaio-devel
 BuildRequires:  libattr-devel
 BuildRequires:  libblkid-devel
-BuildRequires:  libscrypt-devel
 BuildRequires:  libsodium-devel
 BuildRequires:  libtool-ltdl-devel
 BuildRequires:  libuuid-devel
@@ -32,7 +31,6 @@ Requires:   keyutils-libs
 Requires:   libaio
 Requires:   libattr
 Requires:   libblkid
-Requires:   libscrypt
 Requires:   libsodium
 Requires:   libtool-ltdl
 Requires:   libuuid
diff --git a/qcow2.c b/qcow2.c
index b7aa8c26e0ca037f2d66de6057613b23a64796fd..7cf4992fbd9ffd43bd1d7478bf85d1e41dbfe5ee 100644 (file)
--- a/qcow2.c
+++ b/qcow2.c
@@ -46,7 +46,8 @@ static void flush_l2(struct qcow2_image *img)
        if (img->l1_index != -1) {
                img->l1_table[img->l1_index] =
                        cpu_to_be64(img->offset|QCOW_OFLAG_COPIED);
-               xpwrite(img->fd, img->l2_table, img->block_size, img->offset);
+               xpwrite(img->fd, img->l2_table, img->block_size, img->offset,
+                       "qcow2 l2 table");
                img->offset += img->block_size;
 
                memset(img->l2_table, 0, img->block_size);
@@ -101,7 +102,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
                        img.offset += img.block_size;
 
                        xpread(infd, buf, block_size, src_offset);
-                       xpwrite(outfd, buf, block_size, dst_offset);
+                       xpwrite(outfd, buf, block_size, dst_offset,
+                               "qcow2 data");
 
                        add_l2(&img, src_offset / block_size, dst_offset);
                }
@@ -111,7 +113,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
        /* Write L1 table: */
        dst_offset              = img.offset;
        img.offset              += round_up(l1_size * sizeof(u64), block_size);
-       xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset);
+       xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset,
+               "qcow2 l1 table");
 
        /* Write header: */
        hdr.magic               = cpu_to_be32(QCOW_MAGIC);
@@ -123,7 +126,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data,
 
        memset(buf, 0, block_size);
        memcpy(buf, &hdr, sizeof(hdr));
-       xpwrite(img.fd, buf, block_size, 0);
+       xpwrite(img.fd, buf, block_size, 0,
+               "qcow2 header");
 
        free(img.l2_table);
        free(img.l1_table);
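
For context, a sketch of the xpwrite() helper these call sites imply: write fully at an offset and die with the supplied label on failure (the real helper lives in the tools' utility code; this signature is inferred from the calls above):

        #include <unistd.h>

        static void xpwrite(int fd, const void *buf, size_t count, off_t offset,
                            const char *msg)
        {
                ssize_t r = pwrite(fd, buf, count, offset);

                if (r != (ssize_t) count)
                        die("error writing %s (%zi): %m", msg, r);
        }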
diff --git a/rust-src/bch_bindgen/.gitignore b/rust-src/bch_bindgen/.gitignore
new file mode 100644 (file)
index 0000000..0aa133a
--- /dev/null
@@ -0,0 +1,15 @@
+# Generated by Cargo
+# will have compiled files and executables
+debug/
+target/
+
+# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
+# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+# Required By Nix
+# Cargo.lock
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
diff --git a/rust-src/bch_bindgen/Cargo.lock b/rust-src/bch_bindgen/Cargo.lock
new file mode 100644 (file)
index 0000000..2138d33
--- /dev/null
@@ -0,0 +1,484 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "anyhow"
+version = "1.0.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1"
+
+[[package]]
+name = "autocfg"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
+
+[[package]]
+name = "bch_bindgen"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bindgen",
+ "bitfield",
+ "byteorder",
+ "gag",
+ "libc",
+ "memoffset",
+ "pkg-config",
+ "tracing",
+ "tracing-attributes",
+ "udev",
+ "uuid",
+]
+
+[[package]]
+name = "bindgen"
+version = "0.59.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375"
+dependencies = [
+ "bitflags",
+ "cexpr",
+ "clang-sys",
+ "lazy_static",
+ "lazycell",
+ "peeking_take_while",
+ "proc-macro2",
+ "quote",
+ "regex",
+ "rustc-hash",
+ "shlex",
+]
+
+[[package]]
+name = "bitfield"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719"
+
+[[package]]
+name = "bitflags"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
+
+[[package]]
+name = "bitvec"
+version = "0.19.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321"
+dependencies = [
+ "funty",
+ "radium",
+ "tap",
+ "wyz",
+]
+
+[[package]]
+name = "byteorder"
+version = "1.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
+
+[[package]]
+name = "cexpr"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89"
+dependencies = [
+ "nom",
+]
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "clang-sys"
+version = "1.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10612c0ec0e0a1ff0e97980647cb058a6e7aedb913d01d009c406b8b7d0b26ee"
+dependencies = [
+ "glob",
+ "libc",
+]
+
+[[package]]
+name = "filedescriptor"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f"
+dependencies = [
+ "libc",
+ "thiserror",
+ "winapi",
+]
+
+[[package]]
+name = "funty"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
+
+[[package]]
+name = "gag"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972"
+dependencies = [
+ "filedescriptor",
+ "tempfile",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "wasi",
+]
+
+[[package]]
+name = "glob"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
+
+[[package]]
+name = "lazy_static"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
+
+[[package]]
+name = "lazycell"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
+
+[[package]]
+name = "libc"
+version = "0.2.103"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd8f7255a17a627354f321ef0055d63b898c6fb27eff628af4d1b66b7331edf6"
+
+[[package]]
+name = "libudev-sys"
+version = "0.1.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3c8469b4a23b962c1396b9b451dda50ef5b283e8dd309d69033475fa9b334324"
+dependencies = [
+ "libc",
+ "pkg-config",
+]
+
+[[package]]
+name = "memchr"
+version = "2.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525"
+
+[[package]]
+name = "memoffset"
+version = "0.5.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "nom"
+version = "6.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9c5c51b9083a3c620fa67a2a635d1ce7d95b897e957d6b28ff9a5da960a103a6"
+dependencies = [
+ "bitvec",
+ "funty",
+ "memchr",
+ "version_check",
+]
+
+[[package]]
+name = "peeking_take_while"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443"
+
+[[package]]
+name = "pkg-config"
+version = "0.3.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c9b1041b4387893b91ee6746cddfc28516aff326a3519fb2adf820932c5e6cb"
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857"
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d"
+dependencies = [
+ "unicode-xid",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "radium"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
+
+[[package]]
+name = "rand"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8"
+dependencies = [
+ "libc",
+ "rand_chacha",
+ "rand_core",
+ "rand_hc",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
+dependencies = [
+ "getrandom",
+]
+
+[[package]]
+name = "rand_hc"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7"
+dependencies = [
+ "rand_core",
+]
+
+[[package]]
+name = "redox_syscall"
+version = "0.2.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff"
+dependencies = [
+ "bitflags",
+]
+
+[[package]]
+name = "regex"
+version = "1.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461"
+dependencies = [
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.6.25"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
+
+[[package]]
+name = "remove_dir_all"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "rustc-hash"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
+
+[[package]]
+name = "shlex"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+
+[[package]]
+name = "syn"
+version = "1.0.77"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5239bc68e0fef57495900cfea4e8dc75596d9a319d7e16b1e0a440d24e6fe0a0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-xid",
+]
+
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
+[[package]]
+name = "tempfile"
+version = "3.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "rand",
+ "redox_syscall",
+ "remove_dir_all",
+ "winapi",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing"
+version = "0.1.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "84f96e095c0c82419687c20ddf5cb3eadb61f4e1405923c9dc8e53a1adacbda8"
+dependencies = [
+ "cfg-if",
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "98863d0dd09fa59a1b79c6750ad80dbda6b75f4e71c437a6a1a8cb91a8bcbd77"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "46125608c26121c81b0c6d693eab5a420e416da7e43c426d2e8f7df8da8a3acf"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "udev"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24953d50a3bce0f5f5a9a2766567072dc9af8096f8c40ea81815da651066bc9f"
+dependencies = [
+ "libc",
+ "libudev-sys",
+]
+
+[[package]]
+name = "unicode-xid"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
+
+[[package]]
+name = "uuid"
+version = "0.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
+
+[[package]]
+name = "version_check"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
+
+[[package]]
+name = "wasi"
+version = "0.10.2+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "wyz"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214"
diff --git a/rust-src/bch_bindgen/Cargo.toml b/rust-src/bch_bindgen/Cargo.toml
new file mode 100644 (file)
index 0000000..91cc77f
--- /dev/null
@@ -0,0 +1,26 @@
+[package]
+name = "bch_bindgen"
+version = "0.1.0"
+authors = [ "Kayla Firestack <dev@kaylafire.me>", "Yuxuan Shui <yshuiv7@gmail.com>" ]
+edition = "2018"
+
+[lib]
+crate-type = ["lib"]
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+tracing = "0.1.26"
+anyhow = "1.0"
+udev = "0.4"
+uuid = "0.8"
+bitfield = "0.13"
+memoffset = "0.5"
+byteorder = "1.3"
+tracing-attributes = "0.1.15"
+libc = "0.2.69"
+gag = "1.0.0"
+
+
+[build-dependencies]
+pkg-config = "0.3"
+bindgen = { version = "0.59.1", default-features = false }
diff --git a/rust-src/bch_bindgen/build.rs b/rust-src/bch_bindgen/build.rs
new file mode 100644 (file)
index 0000000..fd570db
--- /dev/null
@@ -0,0 +1,74 @@
+fn main() {
+       use std::path::PathBuf;
+       // use std::process::Command;
+
+       let out_dir: PathBuf = std::env::var_os("OUT_DIR").expect("ENV Var 'OUT_DIR' Expected").into();
+       let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR")
+               .expect("ENV Var 'CARGO_MANIFEST_DIR' Expected")
+               .into();
+       let libbcachefs_inc_dir =
+               std::env::var("LIBBCACHEFS_INCLUDE").unwrap_or_else(|_| top_dir.join("libbcachefs").display().to_string());
+       let libbcachefs_inc_dir = std::path::Path::new(&libbcachefs_inc_dir);
+       println!("{}", libbcachefs_inc_dir.display());
+
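+	// Note: LIBBCACHEFS_LIB is resolved at compile time via env!(), so it must
+	// be set when building; it is expected to name the directory holding the
+	// built bcachefs shared library that the next lines link against.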
+       println!("cargo:rustc-link-lib=dylib=bcachefs");
+       println!("cargo:rustc-link-search={}", env!("LIBBCACHEFS_LIB"));
+
+       let _libbcachefs_dir = top_dir.join("libbcachefs").join("libbcachefs");
+       let bindings = bindgen::builder()
+               .header(top_dir.join("src").join("libbcachefs_wrapper.h").display().to_string())
+               .clang_arg(format!("-I{}", libbcachefs_inc_dir.join("include").display()))
+               .clang_arg(format!("-I{}", libbcachefs_inc_dir.display()))
+               .clang_arg("-DZSTD_STATIC_LINKING_ONLY")
+               .clang_arg("-DNO_BCACHEFS_FS")
+               .clang_arg("-D_GNU_SOURCE")
+               .derive_debug(true)
+               .derive_default(true)
+               .derive_eq(true)
+               .layout_tests(true)
+               .default_enum_style(bindgen::EnumVariation::Rust { non_exhaustive: true })
+               .allowlist_function(".*bch2_.*")
+               // .allowlist_function("bch2_read_super")
+               // .allowlist_function("bch2_sb_field_.*")
+               // .allowlist_function("bch2_super_write")
+               // .allowlist_function("bch2_chacha_encrypt_key")
+               // .allowlist_function("__bch2_super_read")
+               .allowlist_function("bio_.*")
+               .allowlist_function("bch2_super_write_fd")
+               .allowlist_function("derive_passphrase")
+               .allowlist_function("request_key")
+               .allowlist_function("add_key")
+               .allowlist_function("keyctl_search")
+               .blocklist_type("bch_extent_ptr")
+               .blocklist_type("btree_node")
+               .blocklist_type("bch_extent_crc32")
+               .blocklist_type("rhash_lock_head")
+               .blocklist_type("srcu_struct")
+               .allowlist_var("BCH_.*")
+               .allowlist_var("KEY_SPEC_.*")
+               .allowlist_type("bch_kdf_types")
+               .allowlist_type("bch_sb_field_.*")
+               .allowlist_type("bch_encrypted_key")
+               .allowlist_type("nonce")
+               .newtype_enum("bch_kdf_types")
+               .opaque_type("gendisk")
+               .opaque_type("bkey")
+               // .opaque_type("bch_extent_ptr")
+               // .opaque_type("bch_extent_crc32")
+               .opaque_type("open_bucket.*")
+               .generate()
+               .expect("BindGen Generation Failiure: [libbcachefs_wrapper]");
+       bindings
+               .write_to_file(out_dir.join("bcachefs.rs"))
+               .expect("Writing to output file failed for: `bcachefs.rs`");
+
+       let keyutils = pkg_config::probe_library("libkeyutils").expect("Failed to find keyutils lib");
+       let bindings = bindgen::builder()
+               .header(top_dir.join("src").join("keyutils_wrapper.h").display().to_string())
+               .clang_args(keyutils.include_paths.iter().map(|p| format!("-I{}", p.display())))
+               .generate()
+               .expect("BindGen Generation Failiure: [Keyutils]");
+       bindings
+               .write_to_file(out_dir.join("keyutils.rs"))
+               .expect("Writing to output file failed for: `keyutils.rs`");
+}
diff --git a/rust-src/bch_bindgen/default.nix b/rust-src/bch_bindgen/default.nix
new file mode 100644 (file)
index 0000000..f6053d5
--- /dev/null
@@ -0,0 +1,76 @@
+{ lib
+, stdenv
+, rustPlatform
+, llvmPackages
+, bcachefs
+, pkg-config
+
+, udev
+, liburcu
+, zstd
+, keyutils
+, libaio
+               
+, lz4 # liblz4
+, libsodium
+, libuuid
+, zlib # zlib1g
+, libscrypt
+
+, rustfmt
+
+, glibc
+, ...
+}: let 
+       include = {
+               glibc = "${glibc.dev}/include";
+               clang = let libc = llvmPackages.libclang; in
+                       "${libc.lib}/lib/clang/${libc.version}/include";
+               urcu = "${liburcu}/include";
+               zstd = "${zstd.dev}/include";
+       };
+       cargo = lib.trivial.importTOML ./Cargo.toml;
+in rustPlatform.buildRustPackage {
+       pname = cargo.package.name;
+       version = cargo.package.version;
+       
+       src = builtins.path { path = ./.; name = "bch_bindgen"; };
+
+       cargoLock = { lockFile = ./Cargo.lock; };
+
+       nativeBuildInputs = [ rustfmt pkg-config ];
+       buildInputs = [
+               
+               # libaio
+               keyutils # libkeyutils
+               lz4 # liblz4
+               libsodium
+               liburcu
+               libuuid
+               zstd # libzstd
+               zlib # zlib1g
+               udev
+               libscrypt
+               libaio
+       ];
+       
+       LIBBCACHEFS_LIB ="${bcachefs.tools}/lib";
+       LIBBCACHEFS_INCLUDE = bcachefs.tools.src;
+       LIBCLANG_PATH = "${llvmPackages.libclang.lib}/lib";
+       BINDGEN_EXTRA_CLANG_ARGS = lib.replaceStrings ["\n" "\t"] [" " ""] ''
+               -std=gnu99
+               -I${include.glibc}
+               -I${include.clang}
+               -I${include.urcu}
+               -I${include.zstd}
+       '';
+
+       postPatch = ''
+               cp ${./Cargo.lock} Cargo.lock
+       '';
+       
+
+       doCheck = true;
+       
+       # NIX_DEBUG = 4;
+}
\ No newline at end of file
diff --git a/rust-src/bch_bindgen/rustfmt.toml b/rust-src/bch_bindgen/rustfmt.toml
new file mode 100644 (file)
index 0000000..a2b7f32
--- /dev/null
@@ -0,0 +1,2 @@
+max_width=120
+hard_tabs = true
diff --git a/rust-src/bch_bindgen/src/bcachefs.rs b/rust-src/bch_bindgen/src/bcachefs.rs
new file mode 100644 (file)
index 0000000..cc98ffc
--- /dev/null
@@ -0,0 +1,124 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(unused)]
+
+include!(concat!(env!("OUT_DIR"), "/bcachefs.rs"));
+
+use bitfield::bitfield;
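+// The scrypt KDF cost parameters (N, r, p), packed into the superblock's
+// kdf_flags field; see bch_sb_field_crypt::scrypt_flags() below.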
+bitfield! {
+       pub struct bch_scrypt_flags(u64);
+       pub N, _: 15, 0;
+       pub R, _: 31, 16;
+       pub P, _: 47, 32;
+}
+bitfield! {
+       pub struct bch_crypt_flags(u64);
+       pub TYPE, _: 4, 0;
+}
+use memoffset::offset_of;
+impl bch_sb_field_crypt {
+       pub fn scrypt_flags(&self) -> Option<bch_scrypt_flags> {
+               use std::convert::TryInto;
+               match bch_kdf_types(bch_crypt_flags(self.flags).TYPE().try_into().ok()?) {
+                       bch_kdf_types::BCH_KDF_SCRYPT => Some(bch_scrypt_flags(self.kdf_flags)),
+                       _ => None,
+               }
+       }
+       pub fn key(&self) -> &bch_encrypted_key {
+               &self.key
+       }
+}
+impl PartialEq for bch_sb {
+       fn eq(&self, other: &Self) -> bool {
+               self.magic.b == other.magic.b
+               && self.user_uuid.b == other.user_uuid.b
+               && self.block_size == other.block_size
+               && self.version == other.version
+               && self.uuid.b == other.uuid.b
+               && self.seq == other.seq
+       }
+}
+
+impl std::fmt::Debug for bch_sb {
+       fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+               f.debug_struct("bch_sb")
+                       .field("uuid", &self.uuid())
+                       .field("version", &(self.version, self.version_min))
+                       .field("block_size", &self.block_size)
+                       .field("device_idx", &self.dev_idx)
+                       .field("seq", &self.seq)
+                       .field("csum", &(self.csum.lo, self.csum.hi))
+                       .field("offset", &self.offset)
+               .finish_non_exhaustive()
+    }
+}
+
+
+impl bch_sb {
+       pub fn crypt(&self) -> Option<&bch_sb_field_crypt> {
+               unsafe {
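+			// bch2_sb_field_get() returns a pointer to the embedded `field`
+			// member; step back by its offset to recover the containing
+			// bch_sb_field_crypt (a container_of-style cast).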
+                       let ptr = bch2_sb_field_get(self as *const _ as *mut _, bch_sb_field_type::BCH_SB_FIELD_crypt) as *const u8;
+                       if ptr.is_null() {
+                               None
+                       } else {
+                               let offset = offset_of!(bch_sb_field_crypt, field);
+                               Some(&*((ptr.sub(offset)) as *const _))
+                       }
+               }
+       }
+       pub fn uuid(&self) -> uuid::Uuid {
+               uuid::Uuid::from_bytes(self.user_uuid.b)
+       }
+
+       /// Get the nonce used to encrypt the superblock
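+	/// (built from the first eight bytes of the internal UUID, read as two
+	/// little-endian u32s into the upper half of the nonce)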
+       pub fn nonce(&self) -> nonce {
+               use byteorder::{LittleEndian, ReadBytesExt};
+               let mut internal_uuid = &self.uuid.b[..];
+               let dword1 = internal_uuid.read_u32::<LittleEndian>().unwrap();
+               let dword2 = internal_uuid.read_u32::<LittleEndian>().unwrap();
+               nonce {
+                       d: [0, 0, dword1, dword2],
+               }
+       }
+}
+impl bch_sb_handle {
+       pub fn sb(&self) -> &bch_sb {
+               unsafe { &*self.sb }
+       }
+
+       pub fn bdev(&self) -> &block_device {
+               unsafe { &*self.bdev }
+       }
+}
+
+#[repr(C)]
+// #[repr(align(8))]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct bch_extent_ptr {
+       pub _bitfield_1: __BindgenBitfieldUnit<[u8; 8usize]>,
+}
+
+#[repr(C, packed(8))]
+pub struct btree_node {
+       pub csum: bch_csum,
+       pub magic: __le64,
+       pub flags: __le64,
+       pub min_key: bpos,
+       pub max_key: bpos,
+       pub _ptr: bch_extent_ptr,
+       pub format: bkey_format,
+       pub __bindgen_anon_1: btree_node__bindgen_ty_1,
+}
+
+#[repr(C, packed(8))]
+// #[repr(align(8))]
+#[derive(Debug, Default, Copy, Clone)]
+pub struct bch_extent_crc32 {
+       pub _bitfield_1: __BindgenBitfieldUnit<[u8; 4usize]>,
+       pub csum: __u32,
+}
+
+// #[repr(u8)]
+pub enum rhash_lock_head {}
+pub enum srcu_struct {}
diff --git a/rust-src/bch_bindgen/src/keyutils.rs b/rust-src/bch_bindgen/src/keyutils.rs
new file mode 100644 (file)
index 0000000..30fc56f
--- /dev/null
@@ -0,0 +1,6 @@
+#![allow(non_upper_case_globals)]
+#![allow(non_camel_case_types)]
+#![allow(non_snake_case)]
+#![allow(unused)]
+
+include!(concat!(env!("OUT_DIR"), "/keyutils.rs"));
diff --git a/rust-src/bch_bindgen/src/lib.rs b/rust-src/bch_bindgen/src/lib.rs
new file mode 100644 (file)
index 0000000..c19b5a2
--- /dev/null
@@ -0,0 +1,7 @@
+pub mod bcachefs;
+pub mod keyutils;
+pub mod rs;
+
+pub mod c {
+       pub use crate::bcachefs::*;
+}
similarity index 59%
rename from mount/src/libbcachefs_wrapper.h
rename to rust-src/bch_bindgen/src/libbcachefs_wrapper.h
index 9d9754c12e63386c2fa12b7ca5247a18ab5edee7..2a0e7026068582c3f2b33ff855e0f726f347c4a3 100644 (file)
@@ -1,4 +1,8 @@
 #include "../libbcachefs/super-io.h"
 #include "../libbcachefs/checksum.h"
 #include "../libbcachefs/bcachefs_format.h"
+#include "../libbcachefs/opts.h"
+#include "../libbcachefs.h"
 #include "../crypto.h"
+#include "../include/linux/bio.h"
+
diff --git a/rust-src/bch_bindgen/src/rs.rs b/rust-src/bch_bindgen/src/rs.rs
new file mode 100644 (file)
index 0000000..4452f0b
--- /dev/null
@@ -0,0 +1,58 @@
+use crate::bcachefs;
+
+pub const SUPERBLOCK_MAGIC: uuid::Uuid = uuid::Uuid::from_u128(
+       0x_c68573f6_4e1a_45ca_8265_f57f48ba6d81
+);
+       
+extern "C" {
+       pub static stdout: *mut libc::FILE;
+}
+
+pub enum ReadSuperErr {
+       Io(std::io::Error),
+}
+
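+// Nested-result convention used below: the outer Result carries errors hit
+// before the superblock can be parsed (bad path, EACCES), the inner one
+// carries superblock read/validation errors.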
+type RResult<T> = std::io::Result<std::io::Result<T>>;
+
+#[tracing_attributes::instrument(skip(opts))]
+pub fn read_super_opts(path: &std::path::Path, mut opts: bcachefs::bch_opts) -> RResult<bcachefs::bch_sb_handle> {
+       // let devp = camino::Utf8Path::from_path(devp).unwrap();
+
+       use std::os::unix::ffi::OsStrExt;
+       let path = std::ffi::CString::new(path.as_os_str().as_bytes())?;
+
+       let mut sb = std::mem::MaybeUninit::zeroed();
+
+       // use gag::{BufferRedirect};
+       // // Stop libbcachefs from spamming the output
+       // let gag = BufferRedirect::stderr().unwrap();
+       // tracing::trace!("entering libbcachefs");
+
+       let ret = unsafe { crate::bcachefs::bch2_read_super(path.as_ptr(), &mut opts, sb.as_mut_ptr()) };
+       tracing::trace!(%ret);
+
+       match -ret {
+               libc::EACCES => Err(std::io::Error::new(
+                       std::io::ErrorKind::PermissionDenied,
+                       "Access Permission Denied",
+               )),
+               0 => Ok(Ok(unsafe { sb.assume_init() })),
+               22 => Ok(Err(std::io::Error::new(
+                       std::io::ErrorKind::InvalidData,
+                       "Not a BCacheFS SuperBlock",
+               ))),
+               code => {
+                       tracing::debug!(msg = "BCacheFS return error code", ?code);
+                       Ok(Err(std::io::Error::new(
+                               std::io::ErrorKind::Other,
+                               "Failed to Read SuperBlock",
+                       )))
+               }
+       }
+}
+
+#[tracing_attributes::instrument]
+pub fn read_super(path: &std::path::Path) -> RResult<bcachefs::bch_sb_handle> {
+       let opts = bcachefs::bch_opts::default(); //unsafe {std::mem::MaybeUninit::zeroed().assume_init()};
+       read_super_opts(path, opts)
+}
diff --git a/rust-src/mount/.gitignore b/rust-src/mount/.gitignore
new file mode 100644 (file)
index 0000000..644cd42
--- /dev/null
@@ -0,0 +1,15 @@
+# Generated by Cargo
+# will have compiled files and executables
+debug/
+target/
+
+# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
+# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
+# Needed by nix
+# Cargo.lock
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
similarity index 68%
rename from mount/Cargo.lock
rename to rust-src/mount/Cargo.lock
index 77ccbba753cb14e535d5ff0933e146b0ba5863cf..92d13cf5b370443f7964a3598d4e2409da7e0888 100644 (file)
@@ -1,5 +1,7 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
+version = 3
+
 [[package]]
 name = "aho-corasick"
 version = "0.7.10"
@@ -18,6 +20,15 @@ dependencies = [
  "winapi",
 ]
 
+[[package]]
+name = "ansi_term"
+version = "0.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
+dependencies = [
+ "winapi",
+]
+
 [[package]]
 name = "anyhow"
 version = "1.0.28"
@@ -43,39 +54,56 @@ checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d"
 
 [[package]]
 name = "bcachefs-mount"
-version = "0.1.0"
+version = "0.3.1"
 dependencies = [
  "anyhow",
- "bindgen",
- "bitfield",
+ "bch_bindgen",
  "byteorder",
+ "camino",
  "clap",
  "either",
- "env_logger",
  "errno",
  "gag",
  "getset",
  "itertools",
  "libc",
- "log",
- "memoffset",
  "parse-display",
- "pkg-config",
  "rpassword",
  "structopt",
+ "tracing",
+ "tracing-attributes",
+ "tracing-log",
+ "tracing-subscriber",
+ "udev",
+ "uuid",
+]
+
+[[package]]
+name = "bch_bindgen"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bindgen",
+ "bitfield",
+ "byteorder",
+ "gag",
+ "libc",
+ "memoffset",
+ "pkg-config",
+ "tracing",
+ "tracing-attributes",
  "udev",
  "uuid",
 ]
 
 [[package]]
 name = "bindgen"
-version = "0.53.2"
+version = "0.59.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6bb26d6a69a335b8cb0e7c7e9775cd5666611dc50a37177c3f2cedcfc040e8c8"
+checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375"
 dependencies = [
  "bitflags",
  "cexpr",
- "cfg-if",
  "clang-sys",
  "lazy_static",
  "lazycell",
@@ -99,17 +127,35 @@ version = "1.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
 
+[[package]]
+name = "bitvec"
+version = "0.19.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321"
+dependencies = [
+ "funty",
+ "radium",
+ "tap",
+ "wyz",
+]
+
 [[package]]
 name = "byteorder"
 version = "1.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
 
+[[package]]
+name = "camino"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "52d74260d9bf6944e2208aa46841b4b8f0d7ffc0849a06837b2f510337f86b2b"
+
 [[package]]
 name = "cexpr"
-version = "0.4.0"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27"
+checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89"
 dependencies = [
  "nom",
 ]
@@ -120,11 +166,29 @@ version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
 
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "chrono"
+version = "0.4.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
+dependencies = [
+ "libc",
+ "num-integer",
+ "num-traits",
+ "winapi",
+]
+
 [[package]]
 name = "clang-sys"
-version = "0.29.3"
+version = "1.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a"
+checksum = "10612c0ec0e0a1ff0e97980647cb058a6e7aedb913d01d009c406b8b7d0b26ee"
 dependencies = [
  "glob",
  "libc",
@@ -136,7 +200,7 @@ version = "2.33.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
 dependencies = [
- "ansi_term",
+ "ansi_term 0.11.0",
  "atty",
  "bitflags",
  "strsim",
@@ -152,15 +216,6 @@ version = "1.5.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
 
-[[package]]
-name = "env_logger"
-version = "0.7.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36"
-dependencies = [
- "log",
-]
-
 [[package]]
 name = "errno"
 version = "0.2.5"
@@ -183,12 +238,29 @@ dependencies = [
 ]
 
 [[package]]
-name = "gag"
-version = "0.1.10"
+name = "filedescriptor"
+version = "0.8.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8cc0b9f53275dc5fada808f1d2f82e3688a6c14d735633d1590b7be8eb2307b5"
+checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f"
 dependencies = [
  "libc",
+ "thiserror",
+ "winapi",
+]
+
+[[package]]
+name = "funty"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7"
+
+[[package]]
+name = "gag"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972"
+dependencies = [
+ "filedescriptor",
  "tempfile",
 ]
 
@@ -204,7 +276,7 @@ version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb"
 dependencies = [
- "cfg-if",
+ "cfg-if 0.1.10",
  "libc",
  "wasi",
 ]
@@ -254,6 +326,12 @@ dependencies = [
  "either",
 ]
 
+[[package]]
+name = "itoa"
+version = "0.4.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
+
 [[package]]
 name = "lazy_static"
 version = "1.4.0"
@@ -288,7 +366,16 @@ version = "0.4.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7"
 dependencies = [
- "cfg-if",
+ "cfg-if 0.1.10",
+]
+
+[[package]]
+name = "matchers"
+version = "0.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1"
+dependencies = [
+ "regex-automata",
 ]
 
 [[package]]
@@ -308,14 +395,35 @@ dependencies = [
 
 [[package]]
 name = "nom"
-version = "5.1.1"
+version = "6.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b471253da97532da4b61552249c521e01e736071f71c1a4f7ebbfbf0a06aad6"
+checksum = "9c5c51b9083a3c620fa67a2a635d1ce7d95b897e957d6b28ff9a5da960a103a6"
 dependencies = [
+ "bitvec",
+ "funty",
  "memchr",
  "version_check",
 ]
 
+[[package]]
+name = "num-integer"
+version = "0.1.44"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db"
+dependencies = [
+ "autocfg",
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
+dependencies = [
+ "autocfg",
+]
+
 [[package]]
 name = "parse-display"
 version = "0.1.1"
@@ -347,6 +455,12 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
 
+[[package]]
+name = "pin-project-lite"
+version = "0.2.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443"
+
 [[package]]
 name = "pkg-config"
 version = "0.3.17"
@@ -429,6 +543,12 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "radium"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -488,6 +608,15 @@ dependencies = [
  "thread_local",
 ]
 
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax",
+]
+
 [[package]]
 name = "regex-syntax"
 version = "0.6.17"
@@ -519,11 +648,49 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
 
+[[package]]
+name = "ryu"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
+
+[[package]]
+name = "serde"
+version = "1.0.130"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913"
+
+[[package]]
+name = "serde_json"
+version = "1.0.67"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950"
+dependencies = [
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "sharded-slab"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982"
+dependencies = [
+ "lazy_static",
+]
+
 [[package]]
 name = "shlex"
-version = "0.1.1"
+version = "1.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3"
+
+[[package]]
+name = "smallvec"
+version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2"
+checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e"
 
 [[package]]
 name = "strsim"
@@ -533,9 +700,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
 
 [[package]]
 name = "structopt"
-version = "0.3.14"
+version = "0.3.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef"
+checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa"
 dependencies = [
  "clap",
  "lazy_static",
@@ -544,9 +711,9 @@ dependencies = [
 
 [[package]]
 name = "structopt-derive"
-version = "0.4.7"
+version = "0.4.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a"
+checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba"
 dependencies = [
  "heck",
  "proc-macro-error 1.0.2",
@@ -577,13 +744,19 @@ dependencies = [
  "syn",
 ]
 
+[[package]]
+name = "tap"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
+
 [[package]]
 name = "tempfile"
 version = "3.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
 dependencies = [
- "cfg-if",
+ "cfg-if 0.1.10",
  "libc",
  "rand",
  "redox_syscall",
@@ -611,6 +784,26 @@ dependencies = [
  "unicode-width",
 ]
 
+[[package]]
+name = "thiserror"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "318234ffa22e0920fe9a40d7b8369b5f649d490980cf7aadcf1eb91594869b42"
+dependencies = [
+ "thiserror-impl",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cae2447b6282786c3493999f40a9be2a6ad20cb8bd268b0a0dbf5a065535c0ab"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "thread_local"
 version = "1.0.1"
@@ -620,6 +813,81 @@ dependencies = [
  "lazy_static",
 ]
 
+[[package]]
+name = "tracing"
+version = "0.1.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d"
+dependencies = [
+ "cfg-if 1.0.0",
+ "pin-project-lite",
+ "tracing-attributes",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-attributes"
+version = "0.1.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c42e6fa53307c8a17e4ccd4dc81cf5ec38db9209f59b222210375b54ee40d1e2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tracing-core"
+version = "0.1.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2ca517f43f0fb96e0c3072ed5c275fe5eece87e8cb52f4a77b69226d3b1c9df8"
+dependencies = [
+ "lazy_static",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3"
+dependencies = [
+ "lazy_static",
+ "log",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-serde"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b"
+dependencies = [
+ "serde",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.2.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b9cbe87a2fa7e35900ce5de20220a582a9483a7063811defce79d7cbd59d4cfe"
+dependencies = [
+ "ansi_term 0.12.1",
+ "chrono",
+ "lazy_static",
+ "matchers",
+ "regex",
+ "serde",
+ "serde_json",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
+ "tracing-serde",
+]
+
 [[package]]
 name = "udev"
 version = "0.4.0"
@@ -693,3 +961,9 @@ name = "winapi-x86_64-pc-windows-gnu"
 version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "wyz"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214"
similarity index 54%
rename from mount/Cargo.toml
rename to rust-src/mount/Cargo.toml
index 4fd0d497bdeeab204064d3da695a025ade0db368..d48d4f727d917b01350764f75ce9fa11e6e7c9d1 100644 (file)
@@ -1,34 +1,30 @@
 [package]
 name = "bcachefs-mount"
-version = "0.1.0"
-authors = ["Yuxuan Shui <yshuiv7@gmail.com>"]
+version = "0.3.1"
+authors = ["Yuxuan Shui <yshuiv7@gmail.com>", "Kayla Firestack <dev@kaylafire.me>"]
 edition = "2018"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-log = "0.4"
+tracing = "0.1.26"
+tracing-log = "0.1.2"
+tracing-subscriber = "0.2.20"
+tracing-attributes = "0.1.15"
 clap = { version = "2.33", features = [ "wrap_help" ] }
-env_logger = { version = "0.7", default-features = false }
 anyhow = "1.0"
-udev = "0.4"
-uuid = "0.8"
 libc = "0.2.69"
-gag = "0.1"
-bitfield = "0.13"
-memoffset = "0.5"
+uuid = "0.8"
+udev = "0.4"
+gag = "1.0.0"
 getset = "0.1"
 itertools = "0.9"
-structopt = "0.3"
+structopt = "0.3.23"
 parse-display = "0.1"
 errno = "0.2"
 either = "1.5"
 rpassword = "4"
+camino = "1.0.5"
+bch_bindgen = { path = "../bch_bindgen" }
 byteorder = "1.3"
 
-[lib]
-crate-type = ["staticlib"]
-
-[build-dependencies]
-pkg-config = "0.3"
-bindgen = { version = "0.53", default-features = false }
diff --git a/rust-src/mount/README.md b/rust-src/mount/README.md
new file mode 100644 (file)
index 0000000..e4700f6
--- /dev/null
@@ -0,0 +1,62 @@
+Usage
+=====
+
+```
+bcachefs-mount 0.1.0
+Mount a bcachefs filesystem by its UUID
+
+USAGE:
+    bcachefs-mount [OPTIONS] <uuid> <mountpoint>
+
+FLAGS:
+    -h, --help       
+            Prints help information
+
+    -V, --version    
+            Prints version information
+
+
+OPTIONS:
+    -o <options>                 
+            Mount options [default: ]
+
+    -p, --password <password>    
+            Where the password would be loaded from.
+            
+            Possible values are: "fail" - don't ask for password, fail if filesystem is encrypted; "wait" - wait for
+            password to become available before mounting; "ask" -  prompt the user for password; [default: fail]
+
+ARGS:
+    <uuid>          
+            External UUID of the bcachefs filesystem
+
+    <mountpoint>    
+            Where the filesystem should be mounted
+```
+
+Caveats
+=======
+
+* `--password ask` is not yet implemented, but you can use `--password wait`, and load the key with `bcachefs unlock`.
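+
+A minimal sketch of that workaround (assuming the filesystem lives on
+`/dev/sda`; substitute your own device, UUID and mountpoint):
+
+```sh
+# load the encryption key into the kernel keyring (prompts for the passphrase)
+bcachefs unlock /dev/sda
+# then mount, waiting for the key to become available
+bcachefs-mount -p wait <uuid> <mountpoint>
+```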
+
+Build
+=====
+
+```sh
+$ git submodule update --init --recursive
+$ cargo build --release
+```
+
+The binary will be built at `target/release/bcachefs-mount`
+
+Dependencies:
+
+* rust
+* blkid
+* uuid
+* liburcu
+* libsodium
+* zlib
+* liblz4
+* libzstd
+* libkeyutils
diff --git a/rust-src/mount/default.nix b/rust-src/mount/default.nix
new file mode 100644 (file)
index 0000000..dab7db7
--- /dev/null
@@ -0,0 +1,41 @@
+{ lib
+
+, stdenv
+, glibc
+, llvmPackages
+, rustPlatform
+
+, bcachefs
+
+, ...
+}: rustPlatform.buildRustPackage ( let 
+       cargo = lib.trivial.importTOML ./Cargo.toml;
+in {
+       pname = "mount.bcachefs";
+       version = cargo.package.version;
+       
+       src = builtins.path { path = ../.; name = "rust-src"; };
+       sourceRoot = "rust-src/mount";
+
+       cargoLock = { lockFile = ./Cargo.lock; };
+
+       nativeBuildInputs = bcachefs.bch_bindgen.nativeBuildInputs;
+       buildInputs = bcachefs.bch_bindgen.buildInputs;
+       inherit (bcachefs.bch_bindgen)
+               LIBBCACHEFS_INCLUDE
+               LIBBCACHEFS_LIB
+               LIBCLANG_PATH
+               BINDGEN_EXTRA_CLANG_ARGS;
+       
+       postInstall = ''
+               ln $out/bin/${cargo.package.name} $out/bin/mount.bcachefs
+               ln -s $out/bin $out/sbin
+       '';
+       # -isystem ${llvmPackages.libclang.lib}/lib/clang/${lib.getVersion llvmPackages.libclang}/include";
+       # CFLAGS = "-I${llvmPackages.libclang.lib}/include";
+       # LDFLAGS = "-L${libcdev}";
+
+       doCheck = false;
+       
+       # NIX_DEBUG = 4;
+})
\ No newline at end of file
diff --git a/rust-src/mount/module.nix b/rust-src/mount/module.nix
new file mode 100644 (file)
index 0000000..b62aa7d
--- /dev/null
@@ -0,0 +1,54 @@
+## Mirrors: https://github.com/NixOS/nixpkgs/blob/nixos-unstable/nixos/modules/tasks/filesystems/bcachefs.nix
+## with changes to use flakes and import mount.bcachefs
+{ config, lib, pkgs, utils, ... }:
+
+with lib;
+
+let
+
+       bootFs = filterAttrs (n: fs: (fs.fsType == "bcachefs") && (utils.fsNeededForBoot fs)) config.fileSystems;
+       cfg = config.filesystems.bcachefs;
+in
+
+{
+       options.filesystems.bcachefs.packages.tools = lib.mkOption {
+               description = "Which package to use to link in the bcachefs tools package";
+               default = pkgs.bcachefs.tools;
+               type = lib.types.package;
+       };
+       options.filesystems.bcachefs.packages.mount = lib.mkOption {
+               description = "Which package to use to link in the bcachefs mount package";
+               default = pkgs.bcachefs.mount;
+               type = lib.types.package;
+       };
+       options.filesystems.bcachefs.packages.kernelPackages = lib.mkOption {
+               description = "Which package to use to link in the kernel package to use";
+               default = pkgs.bcachefs.kernelPackages;
+               type = lib.types.attrs;
+
+       };
+
+       config = mkIf (elem "bcachefs" config.boot.supportedFilesystems) (mkMerge [
+               {
+                       system.fsPackages = [ cfg.packages.tools cfg.packages.mount ];
+
+                       # use kernel package with bcachefs support until it's in mainline
+                       boot.kernelPackages = cfg.packages.kernelPackages;
+               }
+
+               (mkIf ((elem "bcachefs" config.boot.initrd.supportedFilesystems) || (bootFs != {})) {
+                       # chacha20 and poly1305 are required only for decryption attempts
+                       boot.initrd.availableKernelModules = [ "sha256" "chacha20" "poly1305" ];
+                       boot.initrd.kernelModules = [ "bcachefs" ];
+
+                       boot.initrd.extraUtilsCommands = ''
+                               copy_bin_and_libs ${cfg.packages.tools}/bin/bcachefs
+                               copy_bin_and_libs ${cfg.packages.mount}/bin/mount.bcachefs
+                       '';
+                       boot.initrd.extraUtilsCommandsTest = ''
+                               $out/bin/bcachefs version
+                               $out/bin/mount.bcachefs --version
+                       '';
+               })
+       ]);
+}
diff --git a/rust-src/mount/rustfmt.toml b/rust-src/mount/rustfmt.toml
new file mode 100644 (file)
index 0000000..a2b7f32
--- /dev/null
@@ -0,0 +1,2 @@
+max_width=120
+hard_tabs = true
diff --git a/rust-src/mount/src/filesystem.rs b/rust-src/mount/src/filesystem.rs
new file mode 100644 (file)
index 0000000..b1575c2
--- /dev/null
@@ -0,0 +1,208 @@
+extern "C" {
+       pub static stdout: *mut libc::FILE;
+}
+
+use getset::{CopyGetters, Getters};
+use std::path::PathBuf;
+#[derive(Getters, CopyGetters)]
+pub struct FileSystem {
+       /// External UUID of the bcachefs
+       #[getset(get = "pub")]
+       uuid: uuid::Uuid,
+       /// Whether filesystem is encrypted
+       #[getset(get_copy = "pub")]
+       encrypted: bool,
+       /// Super block
+       #[getset(get = "pub")]
+       sb: bcachefs::bch_sb_handle,
+       /// Member devices for this filesystem
+       #[getset(get = "pub")]
+       devices: Vec<PathBuf>,
+}
+impl std::fmt::Debug for FileSystem {
+       fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+               f.debug_struct("FileSystem")
+                       .field("uuid", &self.uuid)
+                       .field("encrypted", &self.encrypted)
+                       .field("devices", &self.device_string())
+                       .finish()
+       }
+}
+use std::fmt;
+impl std::fmt::Display for FileSystem {
+       fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+               let devs = self.device_string();
+               write!(
+                       f,
+                       "{:?}: locked?={lock} ({}) ",
+                       self.uuid,
+                       devs,
+                       lock = self.encrypted
+               )
+       }
+}
+
+impl FileSystem {
+       pub(crate) fn new(sb: bcachefs::bch_sb_handle) -> Self {
+               Self {
+                       uuid: sb.sb().uuid(),
+                       encrypted: sb.sb().crypt().is_some(),
+                       sb: sb,
+                       devices: Vec::new(),
+               }
+       }
+
+       pub fn device_string(&self) -> String {
+               use itertools::Itertools;
+               self.devices.iter().map(|d| d.display()).join(":")
+       }
+
+       pub fn mount(
+               &self,
+               target: impl AsRef<std::path::Path>,
+               options: impl AsRef<str>,
+       ) -> anyhow::Result<()> {
+               tracing::info_span!("mount").in_scope(|| {
+                       let src = self.device_string();
+                       let (data, mountflags) = parse_mount_options(options);
+                       // let fstype = c_str!("bcachefs");
+
+                       tracing::info!(msg="mounting bcachefs filesystem", target=%target.as_ref().display());
+                       mount_inner(src, target, "bcachefs", mountflags, data)
+               })
+       }
+}
+
+fn mount_inner(
+       src: String,
+       target: impl AsRef<std::path::Path>,
+       fstype: &str,
+       mountflags: u64,
+       data: Option<String>,
+) -> anyhow::Result<()> {
+       use std::{
+               ffi::{c_void, CString},
+               os::{raw::c_char, unix::ffi::OsStrExt},
+       };
+
+       // bind the CStrings to keep them alive
+       let src = CString::new(src)?;
+       let target = CString::new(target.as_ref().as_os_str().as_bytes())?;
+       let data = data.map(CString::new).transpose()?;
+       let fstype = CString::new(fstype)?;
+
+       // convert to pointers for ffi
+       let src = src.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
+       let target = target.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
+       let data = data.as_ref().map_or(std::ptr::null(), |data| {
+               data.as_c_str().to_bytes_with_nul().as_ptr() as *const c_void
+       });
+       let fstype = fstype.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char;
+       
+       let ret = {let _entered = tracing::info_span!("libc::mount").entered();
+               tracing::info!("mounting filesystem");
+               // REQUIRES: CAP_SYS_ADMIN
+               unsafe { libc::mount(src, target, fstype, mountflags, data) }
+       };
+       match ret {
+               0 => Ok(()),
+               _ => Err(crate::ErrnoError(errno::errno()).into()),
+       }
+}
+
+/// Parse comma-separated mount options and split them into mount flags and
+/// filesystem-specific options.
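+///
+/// Illustrative example: "noatime,ro,degraded" yields the data string
+/// "degraded" (unrecognized options are passed through to the filesystem)
+/// together with the flags MS_NOATIME | MS_RDONLY.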
+#[tracing_attributes::instrument(skip(options))]
+fn parse_mount_options(options: impl AsRef<str>) -> (Option<String>, u64) {
+       use either::Either::*;
+       tracing::debug!(msg="parsing mount options", options=?options.as_ref());
+       let (opts, flags) = options
+               .as_ref()
+               .split(",")
+               .map(|o| match o {
+                       "dirsync" => Left(libc::MS_DIRSYNC),
+                       "lazytime" => Left(1 << 25), // MS_LAZYTIME
+                       "mand" => Left(libc::MS_MANDLOCK),
+                       "noatime" => Left(libc::MS_NOATIME),
+                       "nodev" => Left(libc::MS_NODEV),
+                       "nodiratime" => Left(libc::MS_NODIRATIME),
+                       "noexec" => Left(libc::MS_NOEXEC),
+                       "nosuid" => Left(libc::MS_NOSUID),
+                       "ro" => Left(libc::MS_RDONLY),
+                       "rw" => Left(0),
+                       "relatime" => Left(libc::MS_RELATIME),
+                       "strictatime" => Left(libc::MS_STRICTATIME),
+                       "sync" => Left(libc::MS_SYNCHRONOUS),
+                       "" => Left(0),
+                       o @ _ => Right(o),
+               })
+               .fold((Vec::new(), 0), |(mut opts, flags), next| match next {
+                       Left(f) => (opts, flags | f),
+                       Right(o) => {
+                               opts.push(o);
+                               (opts, flags)
+                       }
+               });
+
+       use itertools::Itertools;
+       (
+               if opts.len() == 0 {
+                       None
+               } else {
+                       Some(opts.iter().join(","))
+               },
+               flags,
+       )
+}
+
+use bch_bindgen::bcachefs;
+use std::collections::HashMap;
+use uuid::Uuid;
+
+#[tracing_attributes::instrument]
+pub fn probe_filesystems() -> anyhow::Result<HashMap<Uuid, FileSystem>> {
+       tracing::trace!("enumerating udev devices");
+       let mut udev = udev::Enumerator::new()?;
+
+       udev.match_subsystem("block")?; // find kernel block devices
+
+       let mut fs_map = HashMap::new();
+       let devresults = 
+                       udev.scan_devices()?
+                       .into_iter()
+                       .filter_map(|dev| dev.devnode().map(ToOwned::to_owned));
+       
+       for pathbuf in devresults {
+               match get_super_block_uuid(&pathbuf)? {
+
+                               Ok((uuid_key, superblock)) => {
+                                       let fs = fs_map.entry(uuid_key).or_insert_with(|| {
+                                               tracing::info!(msg="found bcachefs pool", uuid=?uuid_key);
+                                               FileSystem::new(superblock)
+                                       });
+
+                                       fs.devices.push(pathbuf);
+                               },
+
+                               Err(e) => { tracing::debug!(inner2_error=?e);}
+               }
+       }
+
+       
+       tracing::info!(msg = "found filesystems", count = fs_map.len());
+       Ok(fs_map)
+}
+
+// #[tracing_attributes::instrument(skip(dev, fs_map))]
+fn get_super_block_uuid(path: &std::path::Path) -> std::io::Result<std::io::Result<(Uuid, bcachefs::bch_sb_handle)>> {
+       let sb = bch_bindgen::rs::read_super(&path)?;
+       let super_block = match sb { 
+               Err(e) => { return Ok(Err(e)); }
+               Ok(sb) => sb,
+       };
+
+       let uuid = (&super_block).sb().uuid();
+       tracing::debug!(found="bcachefs superblock", devnode=?path, ?uuid);
+
+       Ok(Ok((uuid, super_block)))
+}
similarity index 77%
rename from mount/src/key.rs
rename to rust-src/mount/src/key.rs
index 6769f52c4a8b127db63caaf70585921f6499c180..91c92d1c22997ee4bcba12e81927b27f7953ca32 100644 (file)
@@ -1,12 +1,11 @@
-use log::info;
+use tracing::info;
 
 fn check_for_key(key_name: &std::ffi::CStr) -> anyhow::Result<bool> {
-       use crate::keyutils::{self, keyctl_search};
+       use bch_bindgen::keyutils::{self, keyctl_search};
        let key_name = key_name.to_bytes_with_nul().as_ptr() as *const _;
        let key_type = c_str!("logon");
 
-       let key_id =
-               unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) };
+       let key_id = unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) };
        if key_id > 0 {
                info!("Key has became avaiable");
                Ok(true)
@@ -31,9 +30,9 @@ fn wait_for_key(uuid: &uuid::Uuid) -> anyhow::Result<()> {
 const BCH_KEY_MAGIC: &str = "bch**key";
 use crate::filesystem::FileSystem;
 fn ask_for_key(fs: &FileSystem) -> anyhow::Result<()> {
-       use crate::bcachefs::{self, bch2_chacha_encrypt_key, bch_encrypted_key, bch_key};
        use anyhow::anyhow;
        use byteorder::{LittleEndian, ReadBytesExt};
+       use bch_bindgen::bcachefs::{self, bch2_chacha_encrypt_key, bch_encrypted_key, bch_key};
        use std::os::raw::c_char;
 
        let key_name = std::ffi::CString::new(format!("bcachefs:{}", fs.uuid())).unwrap();
@@ -62,19 +61,18 @@ fn ask_for_key(fs: &FileSystem) -> anyhow::Result<()> {
                )
        };
        if ret != 0 {
-               Err(anyhow!("chache decryption failure"))
+               Err(anyhow!("chacha decryption failure"))
        } else if key.magic != bch_key_magic {
                Err(anyhow!("failed to verify the password"))
        } else {
                let key_type = c_str!("logon");
                let ret = unsafe {
-                       crate::keyutils::add_key(
+                       bch_bindgen::keyutils::add_key(
                                key_type,
-                               key_name.as_c_str().to_bytes_with_nul() as *const _
-                                       as *const c_char,
+                               key_name.as_c_str().to_bytes_with_nul() as *const _ as *const c_char,
                                &output as *const _ as *const _,
                                std::mem::size_of::<bch_key>() as u64,
-                               crate::keyutils::KEY_SPEC_USER_KEYRING,
+                               bch_bindgen::keyutils::KEY_SPEC_USER_KEYRING,
                        )
                };
                if ret == -1 {
@@ -85,9 +83,12 @@ fn ask_for_key(fs: &FileSystem) -> anyhow::Result<()> {
        }
 }
 
-pub(crate) fn prepare_key(fs: &FileSystem, password: crate::KeyLocation) -> anyhow::Result<()> {
+#[tracing_attributes::instrument]
+pub fn prepare_key(fs: &FileSystem, password: crate::KeyLocation) -> anyhow::Result<()> {
        use crate::KeyLocation::*;
        use anyhow::anyhow;
+
+       tracing::info!(msg = "checking if key exists for filesystem");
        match password {
                Fail => Err(anyhow!("no key available")),
                Wait => Ok(wait_for_key(fs.uuid())?),
diff --git a/rust-src/mount/src/lib.rs b/rust-src/mount/src/lib.rs
new file mode 100644 (file)
index 0000000..4e918e1
--- /dev/null
@@ -0,0 +1,91 @@
+use anyhow::anyhow;
+use structopt::StructOpt;
+
+pub mod err {
+       pub enum GError {
+               Unknown{
+                       message: std::borrow::Cow<'static, String>
+                }
+       }
+       pub type GResult<T, E, OE> =::core::result::Result< ::core::result::Result<T, E>, OE>;
+       pub type Result<T, E> = GResult<T, E, GError>;
+}
+
+#[macro_export]
+macro_rules! c_str {
+       ($lit:expr) => {
+               unsafe {
+                       std::ffi::CStr::from_ptr(concat!($lit, "\0").as_ptr() as *const std::os::raw::c_char)
+                               .to_bytes_with_nul()
+                               .as_ptr() as *const std::os::raw::c_char
+               }
+       };
+}
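+// e.g. `let key_type = c_str!("logon");` yields a NUL-terminated
+// *const c_char suitable for the keyutils FFI calls in key.rs.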
+
+#[derive(Debug)]
+struct ErrnoError(errno::Errno);
+impl std::fmt::Display for ErrnoError {
+       fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
+               self.0.fmt(f)
+       }
+}
+impl std::error::Error for ErrnoError {}
+
+#[derive(Debug)]
+pub enum KeyLocation {
+       Fail,
+       Wait,
+       Ask,
+}
+
+#[derive(Debug)]
+pub struct KeyLoc(pub Option<KeyLocation>);
+impl std::ops::Deref for KeyLoc {
+       type Target = Option<KeyLocation>;
+       fn deref(&self) -> &Self::Target {
+               &self.0
+       }
+}
+impl std::str::FromStr for KeyLoc {
+       type Err = anyhow::Error;
+       fn from_str(s: &str) -> anyhow::Result<Self> {
+               // use anyhow::anyhow;
+               match s {
+                       "" => Ok(KeyLoc(None)),
+                       "fail" => Ok(KeyLoc(Some(KeyLocation::Fail))),
+                       "wait" => Ok(KeyLoc(Some(KeyLocation::Wait))),
+                       "ask" => Ok(KeyLoc(Some(KeyLocation::Ask))),
+                       _ => Err(anyhow!("invalid password option")),
+               }
+       }
+}
+
+#[derive(StructOpt, Debug)]
+/// Mount a bcachefs filesystem by its UUID.
+pub struct Options {
+       /// Where the password will be loaded from.
+       ///
+       /// Possible values are:
+       /// "fail" - don't ask for the password, fail if the filesystem is encrypted;
+       /// "wait" - wait for the password to become available before mounting;
+       /// "ask" - prompt the user for the password.
+       #[structopt(short, long, default_value = "")]
+       pub key_location: KeyLoc,
+
+       /// External UUID of the bcachefs filesystem
+       pub uuid: uuid::Uuid,
+
+       /// Where the filesystem should be mounted. If not set, the filesystem
+       /// won't actually be mounted, but all steps preceding the mount
+       /// (e.g. asking for the passphrase) will still be performed.
+       pub mountpoint: Option<std::path::PathBuf>,
+
+       /// Mount options
+       #[structopt(short, default_value = "")]
+       pub options: String,
+}
+
+pub mod filesystem;
+pub mod key;
+
+// pub fn mnt_in_use()
diff --git a/rust-src/mount/src/main.rs b/rust-src/mount/src/main.rs
new file mode 100644 (file)
index 0000000..92b6917
--- /dev/null
@@ -0,0 +1,63 @@
+fn main() {
+       // convert existing log statements to tracing events
+       // tracing_log::LogTracer::init().expect("logtracer init failed!");
+       // format tracing log data to env_logger like stdout
+       tracing_subscriber::fmt::init();
+
+       if let Err(e) = crate::main_inner() {
+               tracing::error!(fatal_error = ?e);
+       }
+}
+
+#[tracing_attributes::instrument("main")]
+pub fn main_inner() -> anyhow::Result<()> {
+       use structopt::StructOpt;
+       use bcachefs_mount::{Options, filesystem, key};
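+       // put the C library's stdout into unbuffered mode (_IONBF) so output
+       // printed from the C side shows up immediately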
+       unsafe {
+               libc::setvbuf(
+                       filesystem::stdout,
+                       std::ptr::null_mut(),
+                       libc::_IONBF,
+                       0,
+               );
+               // libc::fflush(filesystem::stdout);
+       }
+       let opt = Options::from_args();
+
+       tracing::trace!(?opt);
+
+       let fss = filesystem::probe_filesystems()?;
+       let fs = fss
+               .get(&opt.uuid)
+               .ok_or_else(|| anyhow::anyhow!("filesystem was not found"))?;
+
+       tracing::info!(msg = "found filesystem", %fs);
+       if fs.encrypted() {
+               let key = opt
+                       .key_location
+                       .0
+                       .ok_or_else(|| anyhow::anyhow!("no key option specified for locked filesystem"))?;
+
+               key::prepare_key(&fs, key)?;
+       }
+
+       let mountpoint = opt
+               .mountpoint
+               .ok_or_else(|| anyhow::anyhow!("mountpoint option was not specified"))?;
+
+       fs.mount(&mountpoint, &opt.options)?;
+
+       Ok(())
+}
+
+#[cfg(test)]
+mod test {
+       // use insta::assert_debug_snapshot;
+       // #[test]
+       // fn snapshot_testing() {
+       //      insta::assert_debug_snapshot!();
+       // }
+}
index 15a9fcee1ac1c438e898036a667b57955e94a3ff..112280889c63099a912f49d1cbecdca8550cbd24 100755 (executable)
@@ -21,7 +21,7 @@
 set -e
 
 PYTEST="${PYTEST:-pytest-3}"
-spam=$(tempfile)
+spam=$(mktemp)
 unset BCACHEFS_FUSE BCACHEFS_TEST_USE_VALGRIND BCACHEFS_DEBUG
 
 trap "set +x; cat ${spam}; rm -f ${spam} ; echo; echo FAILED." EXIT
@@ -44,7 +44,6 @@ function build() {
 function test() {
     echo Running tests.
     (
-        cd tests
         ${PYTEST} -n${JOBS}
     ) > ${spam} 2>&1
 }
@@ -53,7 +52,6 @@ function test_vg() {
     echo Running tests with valgrind.
     (
         export BCACHEFS_TEST_USE_VALGRIND=yes
-        cd tests
         ${PYTEST} -n${JOBS}
     ) > ${spam} 2>&1
 }
@@ -71,13 +69,13 @@ test
 echo -- Test: debug with valgrind --
 test_vg
 
-echo -- Test: fuse debug --
-export BCACHEFS_FUSE=1
-build
-test
+#echo -- Test: fuse debug --
+#export BCACHEFS_FUSE=1
+#build
+#test
 
-echo -- Test: fuse debug with valgrind --
-test_vg
+#echo -- Test: fuse debug with valgrind --
+#test_vg
 
 rm -f ${spam}
 trap "set +x; echo; echo SUCCESS." EXIT
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
index 45c62732dd1d9bbe6db3074ceb79132e28bbf1b5..ffc2bad28da7cf6a6d9d03f412697f59806e1103 100644 (file)
@@ -3,7 +3,7 @@
 # pytest fixture definitions.
 
 import pytest
-import util
+from tests import util
 
 @pytest.fixture
 def bfuse(tmpdir):
index 6278102838bda7b32334156a22ec677a620e2f0a..a2e95c5972d919bc8cd80ed231e0ad1f6ef2f3db 100644 (file)
@@ -3,7 +3,7 @@
 # Basic bcachefs functionality tests.
 
 import re
-import util
+from tests import util
 
 def test_help():
     ret = util.run_bch(valgrind=True)
@@ -38,9 +38,6 @@ def test_list(tmpdir):
     assert len(ret.stderr) == 0
     assert "recovering from clean shutdown" in ret.stdout
 
-    # Totally arbitrary, feel free to update or remove after inspecting.
-    assert len(ret.stdout.splitlines()) == 97
-
 def test_list_inodes(tmpdir):
     dev = util.format_1g(tmpdir)
 
index d8d3819e814427473b4f171a75c09f79d1650312..d96ce88d2311563d80f83ee3e494b6fdf80a5cc2 100644 (file)
@@ -2,16 +2,15 @@
 #
 # Tests of the functions in util.py
 
-import pytest
 import signal
 import subprocess
 import time
+import os
+import pytest
 
-import util
-from pathlib import Path
+from tests import util
 
-#helper = Path('.') / 'test_helper'
-helper = './test_helper'
+helper = os.path.abspath(os.path.join(util.BASE_PATH, 'test_helper'))
 
 def test_sparse_file(tmpdir):
     dev = util.sparse_file(tmpdir / '1k', 1024)
@@ -32,32 +31,32 @@ def test_segfault():
 @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
 def test_check():
     with pytest.raises(subprocess.CalledProcessError):
-        ret = util.run(helper, 'abort', check=True)
+        util.run(helper, 'abort', check=True)
 
 @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
 def test_leak():
     with pytest.raises(util.ValgrindFailedError):
-        ret = util.run(helper, 'leak', valgrind=True)
+        util.run(helper, 'leak', valgrind=True)
 
 @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
 def test_undefined():
     with pytest.raises(util.ValgrindFailedError):
-        ret = util.run(helper, 'undefined', valgrind=True)
+        util.run(helper, 'undefined', valgrind=True)
 
 @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
 def test_undefined_branch():
     with pytest.raises(util.ValgrindFailedError):
-        ret = util.run(helper, 'undefined_branch', valgrind=True)
+        util.run(helper, 'undefined_branch', valgrind=True)
 
 @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
 def test_read_after_free():
     with pytest.raises(util.ValgrindFailedError):
-        ret = util.run(helper, 'read_after_free', valgrind=True)
+        util.run(helper, 'read_after_free', valgrind=True)
 
 @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind")
 def test_write_after_free():
     with pytest.raises(util.ValgrindFailedError):
-        ret = util.run(helper, 'write_after_free', valgrind=True)
+        util.run(helper, 'write_after_free', valgrind=True)
 
 def test_mountpoint(tmpdir):
     path = util.mountpoint(tmpdir)
index 69c512e914fdd95fd30c21752eeb82f523abefd8..48288e6a9f43bf8677e5ef7c41c218eb7ecb5d06 100644 (file)
@@ -4,7 +4,7 @@
 
 import pytest
 import os
-import util
+from tests import util
 
 pytestmark = pytest.mark.skipif(
     not util.have_fuse(), reason="bcachefs not built with fuse support.")
index b5e02c13f9aee1623a5452c7614e8f1ac32dbc57..00314f4c24d6d7749e9def9bd6db29ad0bff4224 100644 (file)
@@ -2,18 +2,18 @@
 
 import errno
 import os
-import pytest
 import re
 import subprocess
-import sys
 import tempfile
 import threading
 import time
 
 from pathlib import Path
 
-DIR = Path('..')
-BCH_PATH = DIR / 'bcachefs'
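+# Resolve paths relative to this file so the tests work from any working directory.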
+BASE_PATH = os.path.dirname(__file__)
+BCH_PATH = os.path.abspath(os.path.join(BASE_PATH, '..', 'bcachefs'))
+VALGRIND_PATH = os.path.abspath(os.path.join(BASE_PATH,
+    'valgrind-suppressions.txt'))
 
 VPAT = re.compile(r'ERROR SUMMARY: (\d+) errors from (\d+) contexts')
 
@@ -46,21 +46,22 @@ def run(cmd, *args, valgrind=False, check=False):
     cmds = [cmd] + list(args)
     valgrind = valgrind and ENABLE_VALGRIND
 
+    print("Running '{}'".format(cmds))
     if valgrind:
         vout = tempfile.NamedTemporaryFile()
         vcmd = ['valgrind',
                '--leak-check=full',
                '--gen-suppressions=all',
-               '--suppressions=valgrind-suppressions.txt',
+               '--suppressions={}'.format(VALGRIND_PATH),
                '--log-file={}'.format(vout.name)]
         cmds = vcmd + cmds
 
-    print("Running '{}'".format(cmds))
-    res = subprocess.run(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                         encoding='utf-8', check=check)
-
-    if valgrind:
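+        # run the command and check valgrind's log while vout is still open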
+        res = subprocess.run(cmds, stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE, encoding='utf-8', check=check)
         check_valgrind(vout.read().decode('utf-8'))
+    else:
+        res = subprocess.run(cmds, stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE, encoding='utf-8', check=check)
 
     return res
 
@@ -75,7 +76,7 @@ def sparse_file(lpath, size):
     This is typically used to create device files for bcachefs.
     """
     path = Path(lpath)
-    f = path.touch(mode = 0o600, exist_ok = False)
+    path.touch(mode = 0o600, exist_ok = False)
     os.truncate(path, size)
 
     return path
@@ -195,7 +196,8 @@ class BFuse:
 
         self.stdout = out1 + out2
         self.stderr = err.read()
-        self.vout = vlog.read().decode('utf-8')
+        if vlog:
+            self.vout = vlog.read().decode('utf-8')
 
     def expect(self, pipe, regex):
         """Wait for the child process to mount."""
@@ -230,7 +232,8 @@ class BFuse:
             print("Waiting for thread to exit.")
             self.thread.join(timeout)
             if self.thread.is_alive():
-                self.proc.kill()
+                if self.proc:
+                    self.proc.kill()
                 self.thread.join()
         else:
             print("Thread was already done.")
@@ -242,6 +245,9 @@ class BFuse:
             check_valgrind(self.vout)
 
     def verify(self):
+        # check output was captured first, so len() below doesn't raise on None
+        assert self.stdout is not None
+        assert self.stderr is not None
         assert self.returncode == 0
         assert len(self.stdout) > 0
         assert len(self.stderr) == 0
index 4ed4de34ba59da6a9448e2ea29413f7e8aaa9a88..d83e0529e8a67025bae097596332da57d9484628 100644 (file)
@@ -1,8 +1,17 @@
 {
-   <insert_a_suppression_name_here>
+   call_rcu_memb
    Memcheck:Leak
    match-leak-kinds: possible,definite
    ...
    fun:get_default_call_rcu_data_memb
    fun:call_rcu_memb
 }
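+# TLS block allocated by the dynamic loader for the thread started by
+# liburcu's call_rcu_data_init; memcheck flags it as a possible leak.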
+{
+   call_rcu_data_init
+   Memcheck:Leak
+   match-leak-kinds: possible
+   fun:calloc
+   fun:_dl_allocate_tls
+   ...
+   fun:call_rcu_data_init
+}
index 3cc0de444cf95b32d2def2e16c476c6bd726bcb6..9491779baffb59e5c83f8d3e9fde10285b1bc8be 100644 (file)
@@ -94,12 +94,12 @@ void xpread(int fd, void *buf, size_t count, off_t offset)
        }
 }
 
-void xpwrite(int fd, const void *buf, size_t count, off_t offset)
+void xpwrite(int fd, const void *buf, size_t count, off_t offset, const char *msg)
 {
        ssize_t r = pwrite(fd, buf, count, offset);
 
        if (r != count)
-               die("write error (ret %zi err %m)", r);
+               die("error writing %s (ret %zi err %m)", msg, r);
 }
 
 struct stat xfstatat(int dirfd, const char *path, int flags)
@@ -211,7 +211,7 @@ u64 read_file_u64(int dirfd, const char *path)
 {
        char *buf = read_file_str(dirfd, path);
        u64 v;
-       if (kstrtou64(buf, 10, &v))
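+       /* the _h helper also parses human-readable size suffixes (k, M, G) */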
+       if (bch2_strtou64_h(buf, &v))
                die("read_file_u64: error parsing %s (got %s)", path, buf);
        free(buf);
        return v;
@@ -242,17 +242,17 @@ u64 get_size(const char *path, int fd)
        return ret;
 }
 
-/* Returns blocksize in units of 512 byte sectors: */
+/* Returns blocksize, in bytes: */
 unsigned get_blocksize(const char *path, int fd)
 {
        struct stat statbuf = xfstat(fd);
 
        if (!S_ISBLK(statbuf.st_mode))
-               return statbuf.st_blksize >> 9;
+               return statbuf.st_blksize;
 
        unsigned ret;
        xioctl(fd, BLKPBSZGET, &ret);
-       return ret >> 9;
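+       /* BLKPBSZGET reports the physical block size in bytes */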
+       return ret;
 }
 
 /* Open a block device, do magic blkid stuff to probe for existing filesystems: */
@@ -262,7 +262,9 @@ int open_for_format(const char *dev, bool force)
        const char *fs_type = NULL, *fs_label = NULL;
        size_t fs_type_len, fs_label_len;
 
-       int fd = xopen(dev, O_RDWR|O_EXCL);
+       int fd = open(dev, O_RDWR|O_EXCL);
+       if (fd < 0)
+               die("Error opening device to format %s: %m", dev);
 
        if (force)
                return fd;
@@ -401,24 +403,6 @@ char *strcmp_prefix(char *a, const char *a_prefix)
        return *a_prefix ? NULL : a;
 }
 
-unsigned hatoi_validate(const char *s, const char *msg)
-{
-       u64 v;
-
-       if (bch2_strtoull_h(s, &v))
-               die("bad %s %s", msg, s);
-
-       v /= 512;
-
-       if (v > USHRT_MAX)
-               die("%s too large\n", msg);
-
-       if (!v)
-               die("%s too small\n", msg);
-
-       return v;
-}
-
 /* crc32c */
 
 static u32 crc32c_default(u32 crc, const void *buf, size_t size)
index 568707bc2f54b9fe6ba4e82a07fab8a8f23eab11..9468f070f3729d369de681a413199b5156c0e8cd 100644 (file)
 
 #define noreturn __attribute__((noreturn))
 
-void die(const char *, ...) noreturn;
+void die(const char *, ...)
+       __attribute__ ((format (printf, 1, 2))) noreturn;
 char *mprintf(const char *, ...)
        __attribute__ ((format (printf, 1, 2)));
 void *xcalloc(size_t, size_t);
 void *xmalloc(size_t);
 void *xrealloc(void *, size_t);
 void xpread(int, void *, size_t, off_t);
-void xpwrite(int, const void *, size_t, off_t);
+void xpwrite(int, const void *, size_t, off_t, const char *);
 struct stat xfstatat(int, const char *, int);
 struct stat xfstat(int);
 struct stat xstat(const char *);
@@ -150,8 +151,6 @@ struct fiemap_extent fiemap_iter_next(struct fiemap_iter *);
 
 char *strcmp_prefix(char *, const char *);
 
-unsigned hatoi_validate(const char *, const char *);
-
 u32 crc32c(u32, const void *, size_t);
 
 char *dev_to_name(dev_t);