From dafd26afd98f92d8f66346aca7b6361d15f4dd3c Mon Sep 17 00:00:00 2001
From: Jonathan Carter
Date: Wed, 16 Feb 2022 14:53:14 +0200
Subject: [PATCH] New upstream snapshot

---
 .gitignore | 2 +-
 .travis.yml | 2 +-
 INSTALL | 10 +-
 Makefile | 94 +-
 bcachefs.8 | 4 +-
 bcachefs.c | 80 +-
 cmd_data.c | 13 +
 cmd_debug.c | 77 +-
 cmd_device.c | 52 +-
 cmd_format.c | 26 +-
 cmd_fs.c | 12 +
 cmd_fusemount.c | 18 +-
 cmd_migrate.c | 61 +-
 cmd_subvolume.c | 188 +
 cmds.h | 8 +
 crypto.c | 21 +-
 debian/changelog | 9 +
 debian/copyright | 4 +-
 debian/files | 2 +-
 debian/rules | 6 +
 default.nix | 172 +-
 flake.lock | 59 +
 flake.nix | 96 +
 include/linux/backing-dev-defs.h | 0
 include/linux/blk_types.h | 5 +-
 include/linux/blkdev.h | 13 +-
 include/linux/compiler.h | 1 +
 include/linux/poison.h | 3 +
 include/linux/siphash.h | 145 +
 include/linux/slab.h | 44 +-
 include/linux/vmalloc.h | 14 +-
 include/trace/events/bcachefs.h | 502 +--
 libbcachefs.c | 140 +-
 libbcachefs.h | 8 +-
 libbcachefs/acl.c | 74 +-
 libbcachefs/acl.h | 13 +-
 libbcachefs/alloc_background.c | 551 ++-
 libbcachefs/alloc_background.h | 70 +-
 libbcachefs/alloc_foreground.c | 202 +-
 libbcachefs/alloc_foreground.h | 41 +-
 libbcachefs/alloc_types.h | 13 +-
 libbcachefs/bcachefs.h | 101 +-
 libbcachefs/bcachefs_format.h | 225 +-
 libbcachefs/bcachefs_ioctl.h | 15 +
 libbcachefs/bkey.h | 33 +-
 libbcachefs/bkey_methods.c | 55 +-
 libbcachefs/bkey_methods.h | 2 -
 libbcachefs/bkey_sort.c | 65 +-
 libbcachefs/bkey_sort.h | 5 -
 libbcachefs/bset.c | 149 +-
 libbcachefs/bset.h | 1 -
 libbcachefs/btree_cache.c | 108 +-
 libbcachefs/btree_cache.h | 8 +-
 libbcachefs/btree_gc.c | 890 +++--
 libbcachefs/btree_gc.h | 1 -
 libbcachefs/btree_io.c | 58 +-
 libbcachefs/btree_io.h | 3 +-
 libbcachefs/btree_iter.c | 3113 ++++++++++-------
 libbcachefs/btree_iter.h | 390 ++-
 libbcachefs/btree_key_cache.c | 205 +-
 libbcachefs/btree_key_cache.h | 8 +-
 libbcachefs/btree_locking.h | 170 +-
 libbcachefs/btree_types.h | 187 +-
 libbcachefs/btree_update.h | 27 +-
 libbcachefs/btree_update_interior.c | 528 +--
 libbcachefs/btree_update_interior.h | 43 +-
 libbcachefs/btree_update_leaf.c | 1171 +++++--
 libbcachefs/buckets.c | 1219 +++----
 libbcachefs/buckets.h | 71 +-
 libbcachefs/buckets_types.h | 17 +-
 libbcachefs/buckets_waiting_for_journal.c | 167 +
 libbcachefs/buckets_waiting_for_journal.h | 15 +
 .../buckets_waiting_for_journal_types.h | 23 +
 libbcachefs/chardev.c | 5 +-
 libbcachefs/checksum.c | 93 +-
 libbcachefs/checksum.h | 20 +-
 libbcachefs/compress.c | 23 +-
 libbcachefs/debug.c | 38 +-
 libbcachefs/dirent.c | 366 +-
 libbcachefs/dirent.h | 36 +-
 libbcachefs/disk_groups.c | 62 +-
 libbcachefs/ec.c | 322 +-
 libbcachefs/ec.h | 9 +-
 libbcachefs/ec_types.h | 9 +
 libbcachefs/errcode.h | 12 +
 libbcachefs/error.c | 4 +-
 libbcachefs/extent_update.c | 36 +-
 libbcachefs/extent_update.h | 8 +-
 libbcachefs/extents.c | 132 +-
 libbcachefs/extents.h | 15 +-
 libbcachefs/eytzinger.h | 48 +-
 libbcachefs/fs-common.c | 360 +-
 libbcachefs/fs-common.h | 26 +-
 libbcachefs/fs-io.c | 932 +++--
 libbcachefs/fs-ioctl.c | 183 +-
 libbcachefs/fs.c | 591 +++-
 libbcachefs/fs.h | 30 +-
 libbcachefs/fsck.c | 1975 ++++++++---
 libbcachefs/inode.c | 389 +-
 libbcachefs/inode.h | 37 +-
 libbcachefs/io.c | 431 ++-
 libbcachefs/io.h | 27 +-
 libbcachefs/io_types.h | 2 +
 libbcachefs/journal.c | 224 +-
 libbcachefs/journal.h | 15 +-
 libbcachefs/journal_io.c | 197 +-
 libbcachefs/journal_io.h | 6 +-
 libbcachefs/journal_reclaim.c | 21 +-
 libbcachefs/journal_seq_blacklist.c | 107 +-
 libbcachefs/journal_types.h | 10 +-
 libbcachefs/migrate.c | 57 +-
 libbcachefs/move.c | 322 +-
 libbcachefs/move.h | 4 +
 libbcachefs/move_types.h | 2 +
 libbcachefs/movinggc.c | 199 +-
 libbcachefs/opts.c | 202 +-
 libbcachefs/opts.h | 186 +-
 libbcachefs/quota.c | 114 +-
 libbcachefs/rebalance.c | 11 +-
 libbcachefs/rebalance_types.h | 1 -
 libbcachefs/recovery.c | 606 ++--
 libbcachefs/recovery.h | 14 +-
 libbcachefs/reflink.c | 133 +-
 libbcachefs/reflink.h | 4 +-
 libbcachefs/replicas.c | 218 +-
 libbcachefs/replicas.h | 3 -
 libbcachefs/str_hash.h | 158 +-
 libbcachefs/subvolume.c | 1089 ++++++
 libbcachefs/subvolume.h | 136 +
 libbcachefs/subvolume_types.h | 11 +
 libbcachefs/super-io.c | 510 ++-
 libbcachefs/super-io.h | 8 +-
 libbcachefs/super.c | 461 ++-
 libbcachefs/super.h | 22 +-
 libbcachefs/super_types.h | 1 -
 libbcachefs/sysfs.c | 309 +-
 libbcachefs/tests.c | 315 +-
 libbcachefs/util.c | 15 +-
 libbcachefs/util.h | 34 +-
 libbcachefs/varint.c | 9 +-
 libbcachefs/xattr.c | 71 +-
 libbcachefs/xattr.h | 3 +-
 linux/blkdev.c | 178 +-
 linux/siphash.c | 552 +++
 mount/build.rs | 67 -
 mount/src/filesystem.rs | 174 -
 mount/src/lib.rs | 190 -
 nix/bcachefs-kernel.nix | 34 +
 nix/bcachefs.rev.sha256 | 1 +
 nix/overlay.nix | 29 +
 packaging/bcachefs-tools.spec | 2 -
 qcow2.c | 12 +-
 rust-src/bch_bindgen/.gitignore | 15 +
 rust-src/bch_bindgen/Cargo.lock | 484 +++
 rust-src/bch_bindgen/Cargo.toml | 26 +
 rust-src/bch_bindgen/build.rs | 74 +
 rust-src/bch_bindgen/default.nix | 76 +
 rust-src/bch_bindgen/rustfmt.toml | 2 +
 rust-src/bch_bindgen/src/bcachefs.rs | 124 +
 rust-src/bch_bindgen/src/keyutils.rs | 6 +
 .../bch_bindgen}/src/keyutils_wrapper.h | 0
 rust-src/bch_bindgen/src/lib.rs | 7 +
 .../bch_bindgen}/src/libbcachefs_wrapper.h | 4 +
 rust-src/bch_bindgen/src/rs.rs | 58 +
 rust-src/mount/.gitignore | 15 +
 {mount => rust-src/mount}/Cargo.lock | 350 +-
 {mount => rust-src/mount}/Cargo.toml | 28 +-
 rust-src/mount/README.md | 62 +
 rust-src/mount/default.nix | 41 +
 rust-src/mount/module.nix | 54 +
 rust-src/mount/rustfmt.toml | 2 +
 rust-src/mount/src/filesystem.rs | 208 ++
 {mount => rust-src/mount}/src/key.rs | 23 +-
 rust-src/mount/src/lib.rs | 91 +
 rust-src/mount/src/main.rs | 63 +
 smoke_test | 16 +-
 tests/__init__.py | 0
 tests/conftest.py | 2 +-
 tests/test_basic.py | 5 +-
 tests/test_fixture.py | 21 +-
 tests/test_fuse.py | 2 +-
 tests/util.py | 32 +-
 tests/valgrind-suppressions.txt | 11 +-
 tools-util.c | 34 +-
 tools-util.h | 7 +-
 185 files changed, 17422 insertions(+), 9717 deletions(-)
 create mode 100644 cmd_subvolume.c
 create mode 100644 flake.lock
 create mode 100644 flake.nix
 create mode 100644 include/linux/backing-dev-defs.h
 create mode 100644 include/linux/siphash.h
 create mode 100644 libbcachefs/buckets_waiting_for_journal.c
 create mode 100644 libbcachefs/buckets_waiting_for_journal.h
 create mode 100644 libbcachefs/buckets_waiting_for_journal_types.h
 create mode 100644 libbcachefs/errcode.h
 create mode 100644 libbcachefs/subvolume.c
 create mode 100644 libbcachefs/subvolume.h
 create mode 100644 libbcachefs/subvolume_types.h
 create mode 100644 linux/siphash.c
 delete mode 100644 mount/build.rs
 delete mode 100644 mount/src/filesystem.rs
 delete mode 100644 mount/src/lib.rs
 create mode 100644 nix/bcachefs-kernel.nix
 create mode 100644 nix/bcachefs.rev.sha256
 create mode 100644 nix/overlay.nix
 create mode 100644 rust-src/bch_bindgen/.gitignore
 create mode 100644 rust-src/bch_bindgen/Cargo.lock
 create mode 100644 rust-src/bch_bindgen/Cargo.toml
 create mode 100644 rust-src/bch_bindgen/build.rs
 create mode 100644 rust-src/bch_bindgen/default.nix
 create mode 100644 rust-src/bch_bindgen/rustfmt.toml
 create mode 100644 rust-src/bch_bindgen/src/bcachefs.rs
 create mode 100644 rust-src/bch_bindgen/src/keyutils.rs
 rename {mount => rust-src/bch_bindgen}/src/keyutils_wrapper.h (100%)
 create mode 100644 rust-src/bch_bindgen/src/lib.rs
 rename {mount => rust-src/bch_bindgen}/src/libbcachefs_wrapper.h (59%)
 create mode 100644 rust-src/bch_bindgen/src/rs.rs
 create mode 100644 rust-src/mount/.gitignore
 rename {mount => rust-src/mount}/Cargo.lock (68%)
 rename {mount => rust-src/mount}/Cargo.toml (54%)
 create mode 100644 rust-src/mount/README.md
 create mode 100644 rust-src/mount/default.nix
 create mode 100644 rust-src/mount/module.nix
 create mode 100644 rust-src/mount/rustfmt.toml
 create mode 100644 rust-src/mount/src/filesystem.rs
 rename {mount => rust-src/mount}/src/key.rs (77%)
 create mode 100644 rust-src/mount/src/lib.rs
 create mode 100644 rust-src/mount/src/main.rs
 create mode 100644 tests/__init__.py

diff --git a/.gitignore b/.gitignore
index 8feb598..b1c03cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,4 +18,4 @@ tests/__pycache__/
 mount/target
 mount.bcachefs
 
-doc/bcachefs.5.rst
+bcachefs-principles-of-operation.*
diff --git a/.travis.yml b/.travis.yml
index 3b90b73..e66f0c2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,6 +9,7 @@ addons:
   apt:
     packages:
       - valgrind
+      - python3-docutils
       - python3-pytest
       - python3-pytest-xdist
       - meson
@@ -18,7 +19,6 @@ addons:
       - libblkid-dev
       - libkeyutils-dev
       - liblz4-dev
-      - libscrypt-dev
       - libsodium-dev
       - liburcu-dev
       - libzstd-dev
diff --git a/INSTALL b/INSTALL
index 85c09a2..b4d60bf 100644
--- a/INSTALL
+++ b/INSTALL
@@ -6,7 +6,6 @@ Dependencies:
  * libblkid
  * libkeyutils
  * liblz4
- * libscrypt
  * libsodium
  * liburcu
  * libuuid
@@ -17,17 +16,18 @@ Dependencies:
 
 Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with
     apt install -y pkg-config libaio-dev libblkid-dev libkeyutils-dev \
-        liblz4-dev libscrypt-dev libsodium-dev liburcu-dev libzstd-dev \
-        uuid-dev zlib1g-dev valgrind libudev-dev
+        liblz4-dev libsodium-dev liburcu-dev libzstd-dev \
+        uuid-dev zlib1g-dev valgrind libudev-dev git build-essential \
+        python3 python3-docutils
 
 Fedora: install the "Development tools" group along with:
     dnf install -y libaio-devel libsodium-devel \
         libblkid-devel libzstd-devel zlib-devel userspace-rcu-devel \
         lz4-devel libuuid-devel valgrind-devel keyutils-libs-devel \
-        libscrypt-devel findutils
+        findutils
 
 Arch: install bcachefs-tools-git from the AUR.
-Or to build from source, install libscrypt from the AUR along with, +Or to build from source, install build dependencies with pacman -S base-devel libaio keyutils libsodium liburcu zstd valgrind Then, just make && make install diff --git a/Makefile b/Makefile index 23e0508..e49534e 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,8 @@ PREFIX?=/usr/local PKG_CONFIG?=pkg-config INSTALL=install -PYTEST=pytest-3 -CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \ + +CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall -fPIC \ -Wno-pointer-sign \ -fno-strict-aliasing \ -fno-delete-null-pointer-checks \ @@ -20,6 +20,21 @@ CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \ $(EXTRA_CFLAGS) LDFLAGS+=$(CFLAGS) $(EXTRA_LDFLAGS) +## Configure Tools +PYTEST_ARGS?= +PYTEST_CMD?=$(shell \ + command -v pytest-3 \ + || which pytest-3 \ +) +PYTEST:=$(PYTEST_CMD) $(PYTEST_ARGS) + +CARGO_ARGS= +CARGO=cargo $(CARGO_ARGS) +CARGO_PROFILE=release +# CARGO_PROFILE=debug + +CARGO_BUILD_ARGS=--$(CARGO_PROFILE) +CARGO_BUILD=$(CARGO) build $(CARGO_BUILD_ARGS) VERSION?=$(shell git describe --dirty=+ 2>/dev/null || echo v0.1-nogit) include Makefile.compiler @@ -49,8 +64,7 @@ endif CFLAGS+=$(PKGCONFIG_CFLAGS) LDLIBS+=$(PKGCONFIG_LDLIBS) - -LDLIBS+=-lm -lpthread -lrt -lscrypt -lkeyutils -laio -ldl +LDLIBS+=-lm -lpthread -lrt -lkeyutils -laio -ldl LDLIBS+=$(EXTRA_LDLIBS) ifeq ($(PREFIX),/usr) @@ -61,31 +75,22 @@ else INITRAMFS_DIR=/etc/initramfs-tools endif -var := $(shell rst2man -V 2>/dev/null) -ifeq ($(.SHELLSTATUS),0) - RST2MAN=rst2man -endif - -var := $(shell rst2man.py -V 2>/dev/null) -ifeq ($(.SHELLSTATUS),0) - RST2MAN=rst2man.py -endif - -undefine var - -ifeq (,$(RST2MAN)) - @echo "WARNING: no RST2MAN found!" -endif - .PHONY: all -all: bcachefs bcachefs.5 +all: bcachefs lib + +.PHONY: lib +lib: libbcachefs.so .PHONY: tests tests: tests/test_helper .PHONY: check check: tests bcachefs - cd tests; $(PYTEST) +ifneq (,$(PYTEST_CMD)) + $(PYTEST) +else + @echo "WARNING: pytest not found or specified, tests could not be run." +endif .PHONY: TAGS tags TAGS: @@ -94,34 +99,32 @@ TAGS: tags: ctags -R . -DOCSRC := opts_macro.h bcachefs.5.rst.tmpl -DOCGENERATED := bcachefs.5 doc/bcachefs.5.rst -DOCDEPS := $(addprefix ./doc/,$(DOCSRC)) -bcachefs.5: $(DOCDEPS) libbcachefs/opts.h - $(CC) doc/opts_macro.h -I libbcachefs -I include -E 2>/dev/null \ - | doc/macro2rst.py - $(RST2MAN) doc/bcachefs.5.rst bcachefs.5 - -SRCS=$(shell find . -type f -iname '*.c') +SRCS=$(shell find . -type f ! 
-path '*/.*/*' -iname '*.c') DEPS=$(SRCS:.c=.d) -include $(DEPS) OBJS=$(SRCS:.c=.o) bcachefs: $(filter-out ./tests/%.o, $(OBJS)) -MOUNT_SRCS=$(shell find mount/src -type f -iname '*.rs') \ - mount/Cargo.toml mount/Cargo.lock mount/build.rs +RUST_SRCS=$(shell find rust-src/ -type f -iname '*.rs') +MOUNT_SRCS=$(filter %mount, $(RUST_SRCS)) debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y debug: bcachefs -libbcachefs_mount.a: $(MOUNT_SRCS) - LIBBCACHEFS_INCLUDE=$(CURDIR) cargo build --manifest-path mount/Cargo.toml --release - cp mount/target/release/libbcachefs_mount.a $@ - MOUNT_OBJ=$(filter-out ./bcachefs.o ./tests/%.o ./cmd_%.o , $(OBJS)) -mount.bcachefs: libbcachefs_mount.a $(MOUNT_OBJ) - $(CC) -Wl,--gc-sections libbcachefs_mount.a $(MOUNT_OBJ) -o $@ $(LDLIBS) +libbcachefs.so: LDFLAGS+=-shared +libbcachefs.so: $(MOUNT_OBJ) + $(CC) $(LDFLAGS) $+ -o $@ $(LDLIBS) + +MOUNT_TOML=rust-src/mount/Cargo.toml +mount.bcachefs: lib $(MOUNT_SRCS) + LIBBCACHEFS_LIB=$(CURDIR) \ + LIBBCACHEFS_INCLUDE=$(CURDIR) \ + $(CARGO_BUILD) --manifest-path $(MOUNT_TOML) + + ln -f rust-src/mount/target/$(CARGO_PROFILE)/bcachefs-mount $@ + tests/test_helper: $(filter ./tests/%.o, $(OBJS)) @@ -138,7 +141,7 @@ cmd_version.o : .version .PHONY: install install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs -install: bcachefs +install: bcachefs lib $(INSTALL) -m0755 -D bcachefs -t $(DESTDIR)$(ROOT_SBINDIR) $(INSTALL) -m0755 fsck.bcachefs $(DESTDIR)$(ROOT_SBINDIR) $(INSTALL) -m0755 mkfs.bcachefs $(DESTDIR)$(ROOT_SBINDIR) @@ -146,18 +149,26 @@ install: bcachefs $(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT) $(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_HOOK) $(INSTALL) -m0755 -D mount.bcachefs.sh $(DESTDIR)$(ROOT_SBINDIR) + $(INSTALL) -m0755 -D libbcachefs.so -t $(DESTDIR)$(PREFIX)/lib/ + sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK) echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK) .PHONY: clean clean: $(RM) bcachefs mount.bcachefs libbcachefs_mount.a tests/test_helper .version $(OBJS) $(DEPS) $(DOCGENERATED) - $(RM) -rf mount/target + $(RM) -rf rust-src/*/target .PHONY: deb deb: all debuild -us -uc -nc -b -i -I +bcachefs-principles-of-operation.pdf: doc/bcachefs-principles-of-operation.tex + pdflatex doc/bcachefs-principles-of-operation.tex + pdflatex doc/bcachefs-principles-of-operation.tex + +doc: bcachefs-principles-of-operation.pdf + .PHONY: update-bcachefs-sources update-bcachefs-sources: git rm -rf --ignore-unmatch libbcachefs @@ -184,6 +195,7 @@ update-bcachefs-sources: git -C $(LINUX_DIR) rev-parse HEAD | tee .bcachefs_revision git add .bcachefs_revision + .PHONY: update-commit-bcachefs-sources update-commit-bcachefs-sources: update-bcachefs-sources git commit -m "Update bcachefs sources to $(shell git -C $(LINUX_DIR) show --oneline --no-patch)" diff --git a/bcachefs.8 b/bcachefs.8 index 61af7f4..874068c 100644 --- a/bcachefs.8 +++ b/bcachefs.8 @@ -99,7 +99,7 @@ Format one or a list of devices with bcachefs data structures. You need to do this before you create a volume. .Pp Device specific options must come before corresponding devices, e.g. -.Dl bcachefs format --group=ssd /dev/sda --group=hdd /dev/sdb +.Dl bcachefs format --group=ssd /dev/sda --label=hdd /dev/sdb .Bl -tag -width Ds .It Fl b , Fl -block Ns = Ns Ar size block size, in bytes (e.g. 
4k) @@ -111,7 +111,7 @@ Set metadata checksum type (default: .It Fl -data_checksum_type Ns = Ns ( Cm none | crc32c | crc64 ) Set data checksum type (default: .Cm crc32c ) . -.It Fl -compression_type Ns = Ns ( Cm none | lz4 | gzip ) +.It Fl -compression Ns = Ns ( Cm none | lz4 | gzip | zstd ) Set compression type (default: .Cm none ) . .It Fl -data_replicas Ns = Ns Ar number diff --git a/bcachefs.c b/bcachefs.c index 239b114..4f2cd55 100644 --- a/bcachefs.c +++ b/bcachefs.c @@ -37,14 +37,14 @@ static void usage(void) "Repair:\n" " fsck Check an existing filesystem for errors\n" "\n" - "Startup/shutdown, assembly of multi device filesystems:\n" #if 0 + "Startup/shutdown, assembly of multi device filesystems:\n" " assemble Assemble an existing multi device filesystem\n" " incremental Incrementally assemble an existing multi device filesystem\n" " run Start a partially assembled filesystem\n" " stop Stop a running filesystem\n" -#endif "\n" +#endif "Commands for managing a running filesystem:\n" " fs usage Show disk usage\n" "\n" @@ -56,7 +56,12 @@ static void usage(void) " device evacuate Migrate data off of a specific device\n" " device set-state Mark a device as failed\n" " device resize Resize filesystem on a device\n" - " device journal-resize Resize journal on a device\n" + " device resize-journal Resize journal on a device\n" + "\n" + "Commands for managing subvolumes and snapshots:\n" + " subvolume create Create a new subvolume\n" + " subvolume delete Delete an existing subvolume\n" + " subvolume snapshot Create a snapshot\n" "\n" "Commands for managing filesystem data:\n" " data rereplicate Rereplicate degraded data\n" @@ -87,14 +92,9 @@ static char *full_cmd; static char *pop_cmd(int *argc, char *argv[]) { - if (*argc < 2) { - printf("%s: missing command\n", argv[0]); - usage(); - exit(EXIT_FAILURE); - } - char *cmd = argv[1]; - memmove(&argv[1], &argv[2], *argc * sizeof(argv[0])); + if (!(*argc < 2)) + memmove(&argv[1], &argv[2], *argc * sizeof(argv[0])); (*argc)--; full_cmd = mprintf("%s %s", full_cmd, cmd); @@ -105,10 +105,11 @@ static int fs_cmds(int argc, char *argv[]) { char *cmd = pop_cmd(&argc, argv); + if (argc < 1) + return fs_usage(); if (!strcmp(cmd, "usage")) return cmd_fs_usage(argc, argv); - usage(); return 0; } @@ -116,6 +117,8 @@ static int device_cmds(int argc, char *argv[]) { char *cmd = pop_cmd(&argc, argv); + if (argc < 1) + return device_usage(); if (!strcmp(cmd, "add")) return cmd_device_add(argc, argv); if (!strcmp(cmd, "remove")) @@ -133,7 +136,6 @@ static int device_cmds(int argc, char *argv[]) if (!strcmp(cmd, "resize-journal")) return cmd_device_resize_journal(argc, argv); - usage(); return 0; } @@ -141,12 +143,28 @@ static int data_cmds(int argc, char *argv[]) { char *cmd = pop_cmd(&argc, argv); + if (argc < 1) + return data_usage(); if (!strcmp(cmd, "rereplicate")) return cmd_data_rereplicate(argc, argv); if (!strcmp(cmd, "job")) return cmd_data_job(argc, argv); - usage(); + return 0; +} + +static int subvolume_cmds(int argc, char *argv[]) +{ + char *cmd = pop_cmd(&argc, argv); + if (argc < 1) + return subvolume_usage(); + if (!strcmp(cmd, "create")) + return cmd_subvolume_create(argc, argv); + if (!strcmp(cmd, "delete")) + return cmd_subvolume_delete(argc, argv); + if (!strcmp(cmd, "snapshot")) + return cmd_subvolume_snapshot(argc, argv); + return 0; } @@ -159,16 +177,34 @@ int main(int argc, char *argv[]) setvbuf(stdout, NULL, _IOLBF, 0); char *cmd = pop_cmd(&argc, argv); + if (argc < 1) { + puts("missing command\n"); + goto usage; + } - if 
(!strcmp(cmd, "version")) - return cmd_version(argc, argv); + /* these subcommands display usage when argc < 2 */ + if (!strcmp(cmd, "device")) + return device_cmds(argc, argv); + if (!strcmp(cmd, "fs")) + return fs_cmds(argc, argv); + if (!strcmp(cmd, "data")) + return data_cmds(argc, argv); + if (!strcmp(cmd, "subvolume")) + return subvolume_cmds(argc, argv); if (!strcmp(cmd, "format")) return cmd_format(argc, argv); + if (!strcmp(cmd, "fsck")) + return cmd_fsck(argc, argv); + if (!strcmp(cmd, "version")) + return cmd_version(argc, argv); if (!strcmp(cmd, "show-super")) return cmd_show_super(argc, argv); - if (!strcmp(cmd, "fsck")) - return cmd_fsck(argc, argv); + if (argc < 2) { + printf("%s: missing command\n", argv[0]); + usage(); + exit(EXIT_FAILURE); + } #if 0 if (!strcmp(cmd, "assemble")) @@ -181,15 +217,6 @@ int main(int argc, char *argv[]) return cmd_stop(argc, argv); #endif - if (!strcmp(cmd, "fs")) - return fs_cmds(argc, argv); - - if (!strcmp(cmd, "device")) - return device_cmds(argc, argv); - - if (!strcmp(cmd, "data")) - return data_cmds(argc, argv); - if (!strcmp(cmd, "unlock")) return cmd_unlock(argc, argv); if (!strcmp(cmd, "set-passphrase")) @@ -223,6 +250,7 @@ int main(int argc, char *argv[]) } printf("Unknown command %s\n", cmd); +usage: usage(); exit(EXIT_FAILURE); } diff --git a/cmd_data.c b/cmd_data.c index 25a2dcb..d78598d 100644 --- a/cmd_data.c +++ b/cmd_data.c @@ -9,6 +9,19 @@ #include "cmds.h" #include "libbcachefs.h" +int data_usage(void) +{ + puts("bcachefs data - manage filesystem data\n" + "Usage: bcachefs data [OPTIONS]\n" + "\n" + "Commands:\n" + " rereplicate Rereplicate degraded data\n" + " job Kick off low level data jobs\n" + "\n" + "Report bugs to "); + return 0; +} + static void data_rereplicate_usage(void) { puts("bcachefs data rereplicate\n" diff --git a/cmd_debug.c b/cmd_debug.c index 2f56e41..6ff58a9 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -37,6 +37,7 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) struct bch_sb *sb = ca->disk_sb.sb; ranges data; unsigned i; + int ret; darray_init(data); @@ -64,12 +65,12 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) const struct bch_extent_ptr *ptr; struct bkey_ptrs_c ptrs; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; bch2_trans_init(&trans, c, 0, 0); - __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b) { + __for_each_btree_node(&trans, iter, i, POS_MIN, 0, 1, 0, b, ret) { struct btree_node_iter iter; struct bkey u; struct bkey_s_c k; @@ -85,6 +86,9 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) } } + if (ret) + die("error %s walking btree nodes", strerror(-ret)); + b = c->btree_roots[i].b; if (!btree_node_fake(b)) { ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); @@ -95,6 +99,8 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) ptr->offset << 9, btree_bytes(c)); } + + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); } @@ -181,7 +187,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id, struct bpos start, struct bpos end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; char buf[512]; int ret; @@ -189,6 +195,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id, bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, btree_id, start, + BTREE_ITER_ALL_SNAPSHOTS| BTREE_ITER_PREFETCH, k, ret) { if (bkey_cmp(k.k->p, end) > 0) break; 
@@ -196,7 +203,7 @@ static void list_keys(struct bch_fs *c, enum btree_id btree_id, bch2_bkey_val_to_text(&PBUF(buf), c, k); puts(buf); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); } @@ -205,20 +212,24 @@ static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigne struct bpos start, struct bpos end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; char buf[4096]; + int ret; bch2_trans_init(&trans, c, 0, 0); - __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) { + __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) { if (bkey_cmp(b->key.k.p, end) > 0) break; bch2_btree_node_to_text(&PBUF(buf), c, b); puts(buf); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + die("error %s walking btree nodes", strerror(-ret)); bch2_trans_exit(&trans); } @@ -227,13 +238,14 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos start, struct bpos end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; char buf[4096]; + int ret; bch2_trans_init(&trans, c, 0, 0); - __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) { + __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) { if (bkey_cmp(b->key.k.p, end) > 0) break; @@ -241,7 +253,10 @@ static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level, fputs(buf, stdout); putchar('\n'); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + die("error %s walking btree nodes", strerror(-ret)); bch2_trans_exit(&trans); } @@ -280,7 +295,7 @@ static void print_node_ondisk(struct bch_fs *c, struct btree *b) bio_put(bio); percpu_ref_put(&ca->io_ref); - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { struct bset *i; struct nonce nonce; struct bch_csum csum; @@ -346,13 +361,14 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned struct bpos start, struct bpos end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; char buf[4096]; + int ret; bch2_trans_init(&trans, c, 0, 0); - __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) { + __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) { if (bkey_cmp(b->key.k.p, end) > 0) break; @@ -362,7 +378,10 @@ static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned print_node_ondisk(c, b); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + die("error %s walking btree nodes", strerror(-ret)); bch2_trans_exit(&trans); } @@ -371,16 +390,17 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l struct bpos start, struct bpos end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree_node_iter node_iter; struct bkey unpacked; struct bkey_s_c k; struct btree *b; char buf[4096]; + int ret; bch2_trans_init(&trans, c, 0, 0); - __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b) { + __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) { if (bkey_cmp(b->key.k.p, end) > 0) break; @@ -393,7 +413,10 @@ static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned l puts(buf); } } - bch2_trans_iter_put(&trans, iter); + 
bch2_trans_iter_exit(&trans, &iter); + + if (ret) + die("error %s walking btree nodes", strerror(-ret)); bch2_trans_exit(&trans); } @@ -443,9 +466,9 @@ int cmd_list(int argc, char *argv[]) enum btree_id btree_id_start = 0; enum btree_id btree_id_end = BTREE_ID_NR; enum btree_id btree_id; - unsigned level; + unsigned level = 0; struct bpos start = POS_MIN, end = POS_MAX; - u64 inum; + u64 inum = 0; int mode = 0, opt; opt_set(opts, nochanges, true); @@ -572,9 +595,6 @@ int cmd_list_journal(int argc, char *argv[]) struct journal_replay *p; struct jset_entry *entry; - struct bkey_i *k, *_n; - - /* This could be greatly expanded: */ list_for_each_entry(p, &c->journal_entries, list) { printf("journal entry %8llu\n" @@ -585,14 +605,13 @@ int cmd_list_journal(int argc, char *argv[]) le32_to_cpu(p->j.version), le64_to_cpu(p->j.last_seq)); - for_each_jset_key(k, _n, entry, &p->j) { - char buf[200]; + vstruct_for_each(&p->j, entry) { + char _buf[4096]; + struct printbuf buf = PBUF(_buf); - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); - printf("btree %s l %u: %s\n", - bch2_btree_ids[entry->btree_id], - entry->level, - buf); + printbuf_indent_push(&buf, 2); + bch2_journal_entry_to_text(&buf, c, entry); + printf("%s\n", _buf); } } diff --git a/cmd_device.c b/cmd_device.c index b18bdd8..ef2dfa1 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -21,6 +21,25 @@ #include "libbcachefs/opts.h" #include "tools-util.h" +int device_usage(void) +{ + puts("bcachefs device - manage devices within a running filesystem\n" + "Usage: bcachefs device [OPTION]\n" + "\n" + "Commands:\n" + " add add a new device to an existing filesystem\n" + " remove remove a device from an existing filesystem\n" + " online re-add an existing member to a filesystem\n" + " offline take a device offline, without removing it\n" + " evacuate migrate data off a specific device\n" + " set-state mark a device as failed\n" + " resize resize filesystem on a device\n" + " resize-journal resize journal on a device\n" + "\n" + "Report bugs to "); + return 0; +} + static void device_add_usage(void) { puts("bcachefs device add - add a device to an existing filesystem\n" @@ -30,7 +49,7 @@ static void device_add_usage(void) " -S, --fs_size=size Size of filesystem on device\n" " -B, --bucket=size Bucket size\n" " -D, --discard Enable discards\n" - " -g, --group=group Disk group\n" + " -l, --label=label Disk label\n" " -f, --force Use device even if it appears to already be formatted\n" " -h, --help Display this help and exit\n" "\n" @@ -43,7 +62,7 @@ int cmd_device_add(int argc, char *argv[]) { "fs_size", required_argument, NULL, 'S' }, { "bucket", required_argument, NULL, 'B' }, { "discard", no_argument, NULL, 'D' }, - { "group", required_argument, NULL, 'g' }, + { "label", required_argument, NULL, 'l' }, { "force", no_argument, NULL, 'f' }, { "help", no_argument, NULL, 'h' }, { NULL } @@ -59,18 +78,16 @@ int cmd_device_add(int argc, char *argv[]) case 'S': if (bch2_strtoull_h(optarg, &dev_opts.size)) die("invalid filesystem size"); - - dev_opts.size >>= 9; break; case 'B': - dev_opts.bucket_size = - hatoi_validate(optarg, "bucket size"); + if (bch2_strtoull_h(optarg, &dev_opts.bucket_size)) + die("bad bucket_size %s", optarg); break; case 'D': dev_opts.discard = true; break; - case 'g': - dev_opts.group = strdup(optarg); + case 'l': + dev_opts.label = strdup(optarg); break; case 'f': force = true; @@ -85,8 +102,8 @@ int cmd_device_add(int argc, char *argv[]) if (!fs_path) die("Please supply a filesystem"); - char *dev_path = arg_pop(); - if 
(!dev_path) + dev_opts.path = arg_pop(); + if (!dev_opts.path) die("Please supply a device"); if (argc) @@ -94,7 +111,6 @@ int cmd_device_add(int argc, char *argv[]) struct bchfs_handle fs = bcache_fs_open(fs_path); - dev_opts.path = dev_path; dev_opts.fd = open_for_format(dev_opts.path, force); struct bch_opt_strs fs_opt_strs; @@ -103,9 +119,9 @@ int cmd_device_add(int argc, char *argv[]) struct bch_opts fs_opts = bch2_parse_opts(fs_opt_strs); opt_set(fs_opts, block_size, - read_file_u64(fs.sysfs_fd, "block_size") >> 9); + read_file_u64(fs.sysfs_fd, "options/block_size")); opt_set(fs_opts, btree_node_size, - read_file_u64(fs.sysfs_fd, "btree_node_size") >> 9); + read_file_u64(fs.sysfs_fd, "options/btree_node_size")); struct bch_sb *sb = bch2_format(fs_opt_strs, fs_opts, @@ -498,6 +514,9 @@ int cmd_device_resize(int argc, char *argv[]) u64 nbuckets = size / le16_to_cpu(m->bucket_size); + if (nbuckets < le64_to_cpu(m->nbuckets)) + die("Shrinking not supported yet"); + printf("resizing %s to %llu buckets\n", dev, nbuckets); bchu_disk_resize(fs, idx, nbuckets); } else { @@ -519,6 +538,9 @@ int cmd_device_resize(int argc, char *argv[]) u64 nbuckets = size / le16_to_cpu(resize->mi.bucket_size); + if (nbuckets < le64_to_cpu(resize->mi.nbuckets)) + die("Shrinking not supported yet"); + printf("resizing %s to %llu buckets\n", dev, nbuckets); int ret = bch2_dev_resize(c, resize, nbuckets); if (ret) @@ -533,7 +555,7 @@ int cmd_device_resize(int argc, char *argv[]) static void device_resize_journal_usage(void) { puts("bcachefs device resize-journal \n" - "Usage: bcachefs device resize-journal device [ size ]\n" + "Usage: bcachefs device resize-journal device size\n" "\n" "Options:\n" " -h, --help display this help and exit\n" @@ -565,7 +587,7 @@ int cmd_device_resize_journal(int argc, char *argv[]) char *size_arg = arg_pop(); if (!size_arg) - size = get_size(dev, dev_fd); + die("Please supply a journal size"); else if (bch2_strtoull_h(size_arg, &size)) die("invalid size"); diff --git a/cmd_format.c b/cmd_format.c index 3f96f5d..cc16b31 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -33,12 +33,12 @@ x(0, replicas, required_argument) \ x(0, encrypted, no_argument) \ x(0, no_passphrase, no_argument) \ -x('L', label, required_argument) \ +x('L', fs_label, required_argument) \ x('U', uuid, required_argument) \ x(0, fs_size, required_argument) \ x(0, superblock_size, required_argument) \ x(0, bucket_size, required_argument) \ -x('g', group, required_argument) \ +x('l', label, required_argument) \ x(0, discard, no_argument) \ x(0, data_allowed, required_argument) \ x(0, durability, required_argument) \ @@ -61,7 +61,7 @@ static void usage(void) " --replicas=# Sets both data and metadata replicas\n" " --encrypted Enable whole filesystem encryption (chacha20/poly1305)\n" " --no_passphrase Don't encrypt master encryption key\n" - " -L, --label=label\n" + " -L, --fs_label=label\n" " -U, --uuid=uuid\n" " --superblock_size=size\n" "\n" @@ -69,14 +69,14 @@ static void usage(void) bch2_opts_usage(OPT_DEVICE); - puts(" -g, --group=label Disk group\n" + puts(" -l, --label=label Disk label\n" "\n" " -f, --force\n" " -q, --quiet Only print errors\n" " -h, --help Display this help and exit\n" "\n" "Device specific options must come before corresponding devices, e.g.\n" - " bcachefs format --group cache /dev/sdb /dev/sdc\n" + " bcachefs format --label cache /dev/sdb /dev/sdc\n" "\n" "Report bugs to "); } @@ -147,7 +147,7 @@ int cmd_format(int argc, char *argv[]) case O_no_passphrase: no_passphrase = true; break; - case 
O_label: + case O_fs_label: case 'L': opts.label = optarg; break; @@ -163,8 +163,6 @@ int cmd_format(int argc, char *argv[]) case O_fs_size: if (bch2_strtoull_h(optarg, &dev_opts.size)) die("invalid filesystem size"); - - dev_opts.size >>= 9; break; case O_superblock_size: if (bch2_strtouint_h(optarg, &opts.superblock_size)) @@ -173,12 +171,12 @@ int cmd_format(int argc, char *argv[]) opts.superblock_size >>= 9; break; case O_bucket_size: - dev_opts.bucket_size = - hatoi_validate(optarg, "bucket size"); + if (bch2_strtoull_h(optarg, &dev_opts.bucket_size)) + die("bad bucket_size %s", optarg); break; - case O_group: - case 'g': - dev_opts.group = optarg; + case O_label: + case 'l': + dev_opts.label = optarg; break; case O_discard: dev_opts.discard = true; @@ -258,7 +256,7 @@ int cmd_format(int argc, char *argv[]) darray_size(device_paths), bch2_opts_empty()); if (IS_ERR(c)) - die("error opening %s: %s", device_paths.item, + die("error opening %s: %s", device_paths.item[0], strerror(-PTR_ERR(c))); bch2_fs_stop(c); diff --git a/cmd_fs.c b/cmd_fs.c index 8b9d91b..f8c4642 100644 --- a/cmd_fs.c +++ b/cmd_fs.c @@ -195,6 +195,18 @@ static void print_fs_usage(const char *path, enum units units) bcache_fs_close(fs); } +int fs_usage(void) +{ + puts("bcachefs fs - manage a running filesystem\n" + "Usage: bcachefs fs [OPTION]... path\n" + "\n" + "Commands:\n" + " usage show disk usage\n" + "\n" + "Report bugs to "); + return 0; +} + int cmd_fs_usage(int argc, char *argv[]) { enum units units = BYTES; diff --git a/cmd_fusemount.c b/cmd_fusemount.c index 2b6b2d7..216094f 100644 --- a/cmd_fusemount.c +++ b/cmd_fusemount.c @@ -171,7 +171,7 @@ static void bcachefs_fuse_setattr(fuse_req_t req, fuse_ino_t inum, struct bch_fs *c = fuse_req_userdata(req); struct bch_inode_unpacked inode_u; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; u64 now; int ret; @@ -185,8 +185,7 @@ retry: bch2_trans_begin(&trans); now = bch2_current_time(c); - iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); + ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -208,11 +207,11 @@ retry: inode_u.bi_mtime = now; /* TODO: CTIME? 
*/ - ret = bch2_inode_write(&trans, iter, &inode_u) ?: + ret = bch2_inode_write(&trans, &iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -523,7 +522,7 @@ static void bcachefs_fuse_read(fuse_req_t req, fuse_ino_t inum, static int inode_update_times(struct bch_fs *c, fuse_ino_t inum) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bch_inode_unpacked inode_u; int ret = 0; u64 now; @@ -533,15 +532,14 @@ retry: bch2_trans_begin(&trans); now = bch2_current_time(c); - iter = bch2_inode_peek(&trans, &inode_u, inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); + ret = bch2_inode_peek(&trans, &iter, &inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; inode_u.bi_mtime = now; inode_u.bi_ctime = now; - ret = bch2_inode_write(&trans, iter, &inode_u); + ret = bch2_inode_write(&trans, &iter, &inode_u); if (ret) goto err; @@ -549,7 +547,7 @@ retry: BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; diff --git a/cmd_migrate.c b/cmd_migrate.c index 5126090..4da3ab1 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -123,6 +123,7 @@ static void update_inode(struct bch_fs *c, int ret; bch2_inode_pack(c, &packed, inode); + packed.inode.k.p.snapshot = U32_MAX; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, NULL, NULL, 0); if (ret) @@ -138,8 +139,9 @@ static void create_link(struct bch_fs *c, struct bch_inode_unpacked inode; int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_link_trans(&trans, parent->bi_inum, inum, - &parent_u, &inode, &qstr)); + bch2_link_trans(&trans, + (subvol_inum) { 1, parent->bi_inum }, &parent_u, + (subvol_inum) { 1, inum }, &inode, &qstr)); if (ret) die("error creating hardlink: %s", strerror(-ret)); } @@ -153,13 +155,16 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c, struct qstr qstr = QSTR(name); struct bch_inode_unpacked new_inode; + bch2_inode_init_early(c, &new_inode); + int ret = bch2_trans_do(c, NULL, NULL, 0, bch2_create_trans(&trans, - parent->bi_inum, parent, + (subvol_inum) { 1, parent->bi_inum }, parent, &new_inode, &qstr, - uid, gid, mode, rdev, NULL, NULL)); + uid, gid, mode, rdev, NULL, NULL, + (subvol_inum) {}, 0)); if (ret) - die("error creating file: %s", strerror(-ret)); + die("error creating %s: %s", name, strerror(-ret)); return new_inode; } @@ -225,44 +230,48 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst, const struct xattr_handler *h = xattr_resolve_name(&attr); int ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_set(&trans, dst->bi_inum, &hash_info, attr, + bch2_xattr_set(&trans, + (subvol_inum) { 1, dst->bi_inum }, + &hash_info, attr, val, val_size, h->flags, 0)); if (ret < 0) die("error creating xattr: %s", strerror(-ret)); } } -static char buf[1 << 20] __aligned(PAGE_SIZE); +#define WRITE_DATA_BUF (1 << 20) + +static char buf[WRITE_DATA_BUF] __aligned(PAGE_SIZE); static void write_data(struct bch_fs *c, struct bch_inode_unpacked *dst_inode, u64 dst_offset, void *buf, size_t len) { - struct { - struct bch_write_op op; - struct bio_vec bv[sizeof(buf) / PAGE_SIZE]; - } o; + struct bch_write_op op; + struct bio_vec bv[WRITE_DATA_BUF / PAGE_SIZE]; struct closure cl; BUG_ON(dst_offset & (block_bytes(c) - 1)); BUG_ON(len & (block_bytes(c) - 1)); + BUG_ON(len > WRITE_DATA_BUF); closure_init_stack(&cl); - bio_init(&o.op.wbio.bio, 
o.bv, ARRAY_SIZE(o.bv)); - bch2_bio_map(&o.op.wbio.bio, buf, len); + bio_init(&op.wbio.bio, bv, ARRAY_SIZE(bv)); + bch2_bio_map(&op.wbio.bio, buf, len); - bch2_write_op_init(&o.op, c, bch2_opts_to_inode_opts(c->opts)); - o.op.write_point = writepoint_hashed(0); - o.op.nr_replicas = 1; - o.op.pos = POS(dst_inode->bi_inum, dst_offset >> 9); + bch2_write_op_init(&op, c, bch2_opts_to_inode_opts(c->opts)); + op.write_point = writepoint_hashed(0); + op.nr_replicas = 1; + op.subvol = 1; + op.pos = SPOS(dst_inode->bi_inum, dst_offset >> 9, U32_MAX); - int ret = bch2_disk_reservation_get(c, &o.op.res, len >> 9, + int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, c->opts.data_replicas, 0); if (ret) die("error reserving space in new filesystem: %s", strerror(-ret)); - closure_call(&o.op.cl, bch2_write, NULL, &cl); + closure_call(&op.cl, bch2_write, NULL, &cl); closure_sync(&cl); dst_inode->bi_sectors += len >> 9; @@ -314,11 +323,12 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, e = bkey_extent_init(&k.k); e->k.p.inode = dst->bi_inum; e->k.p.offset = logical + sectors; + e->k.p.snapshot = U32_MAX; e->k.size = sectors; bch2_bkey_append_ptr(&e->k_i, (struct bch_extent_ptr) { .offset = physical, .dev = 0, - .gen = bucket(ca, b)->mark.gen, + .gen = *bucket_gen(ca, b), }); ret = bch2_disk_reservation_get(c, &res, sectors, 1, @@ -327,8 +337,6 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_mark_bkey_replicas(c, extent_i_to_s_c(e).s_c); - ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, NULL, 0); if (ret) @@ -428,6 +436,7 @@ static void copy_dir(struct copy_fs_state *s, if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, "..") || + !strcmp(d->d_name, "lost+found") || stat.st_ino == s->bcachefs_inum) continue; @@ -569,7 +578,8 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path, syncfs(src_fd); struct bch_inode_unpacked root_inode; - int ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, &root_inode); + int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO }, + &root_inode); if (ret) die("error looking up root directory: %s", strerror(-ret)); @@ -595,8 +605,6 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path, darray_free(s.extents); genradix_free(&s.hardlinks); - - bch2_alloc_write(c, false); } static void find_superblock_space(ranges extents, @@ -672,7 +680,7 @@ static int migrate_fs(const char *fs_path, u64 bcachefs_inum; ranges extents = reserve_new_fs_space(file_path, - fs_opts.block_size << 9, + fs_opts.block_size >> 9, get_size(dev.path, dev.fd) / 5, &bcachefs_inum, stat.st_dev, force); @@ -694,6 +702,7 @@ static int migrate_fs(const char *fs_path, opt_set(opts, sb, sb_offset); opt_set(opts, nostart, true); opt_set(opts, noexcl, true); + opt_set(opts, buckets_nouse, true); c = bch2_fs_open(path, 1, opts); if (IS_ERR(c)) diff --git a/cmd_subvolume.c b/cmd_subvolume.c new file mode 100644 index 0000000..99a302b --- /dev/null +++ b/cmd_subvolume.c @@ -0,0 +1,188 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libbcachefs/bcachefs.h" +#include "libbcachefs/bcachefs_ioctl.h" +#include "cmds.h" +#include "libbcachefs.h" +#include "libbcachefs/opts.h" +#include "tools-util.h" + +int subvolume_usage(void) +{ + puts("bcachefs subvolume - manage subvolumes and snapshots\n" + "Usage: bcachefs subvolume 
[OPTION]\n" + "\n" + "Commands:\n" + " create create a subvolume\n" + " delete delete a subvolume\n" + " snapshot create a snapshot\n" + "\n" + "Report bugs to "); + return 0; +} + +static void subvolume_create_usage(void) +{ + puts("bcachefs subvolume create - create a new subvolume\n" + "Usage: bcachefs subvolume create [OPTION]... path\n" + "\n" + "Options:\n" + " -h, --help Display this help and exit\n" + "\n" + "Report bugs to "); +} + +int cmd_subvolume_create(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { NULL } + }; + char *path; + int opt; + + while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1) + switch (opt) { + case 'h': + subvolume_create_usage(); + exit(EXIT_SUCCESS); + } + args_shift(optind); + + while ((path = arg_pop())) { + char *dir = dirname(strdup(path)); + + struct bchfs_handle fs = bcache_fs_open(dir); + + struct bch_ioctl_subvolume i = { + .dirfd = AT_FDCWD, + .mode = 0777, + .dst_ptr = (unsigned long)path, + }; + + xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_CREATE, &i); + bcache_fs_close(fs); + } + + return 0; +} + +static void subvolume_delete_usage(void) +{ + puts("bcachefs subvolume delete - delete an existing subvolume\n" + "Usage: bcachefs subvolume delete [OPTION]... path\n" + "\n" + "Options:\n" + " -h, --help Display this help and exit\n" + "\n" + "Report bugs to "); +} + +int cmd_subvolume_delete(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { NULL } + }; + char *path; + int opt; + + while ((opt = getopt_long(argc, argv, "h", longopts, NULL)) != -1) + switch (opt) { + case 'h': + subvolume_delete_usage(); + exit(EXIT_SUCCESS); + } + args_shift(optind); + + while ((path = arg_pop())) { + char *dir = dirname(strdup(path)); + + struct bchfs_handle fs = bcache_fs_open(dir); + + struct bch_ioctl_subvolume i = { + .dirfd = AT_FDCWD, + .mode = 0777, + .dst_ptr = (unsigned long)path, + }; + + xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_DESTROY, &i); + bcache_fs_close(fs); + } + + return 0; +} + +static void snapshot_create_usage(void) +{ + puts("bcachefs subvolume snapshot - create a snapshot \n" + "Usage: bcachefs subvolume snapshot [OPTION]... \n" + "\n" + "Create a snapshot of at . 
If specified, must be a subvolume;\n" + "if not specified the snapshot will be of the subvolume containing .\n" + "Options:\n" + " -r Make snapshot read only\n" + " -h, --help Display this help and exit\n" + "\n" + "Report bugs to "); +} + +int cmd_subvolume_snapshot(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "help", no_argument, NULL, 'h' }, + { NULL } + }; + unsigned flags = BCH_SUBVOL_SNAPSHOT_CREATE; + int opt; + + while ((opt = getopt_long(argc, argv, "rh", longopts, NULL)) != -1) + switch (opt) { + case 'r': + flags |= BCH_SUBVOL_SNAPSHOT_RO; + break; + case 'h': + snapshot_create_usage(); + exit(EXIT_SUCCESS); + } + args_shift(optind); + + char *src = arg_pop(); + char *dst = arg_pop(); + + if (argc) + die("Too many arguments"); + + if (!dst) + swap(src, dst); + if (!dst) + die("Please specify a path to create"); + + char *dir = dirname(strdup(dst)); + + struct bchfs_handle fs = bcache_fs_open(dir); + + struct bch_ioctl_subvolume i = { + .flags = flags, + .dirfd = AT_FDCWD, + .mode = 0777, + .src_ptr = (unsigned long)src, + .dst_ptr = (unsigned long)dst, + }; + + xioctl(fs.ioctl_fd, BCH_IOCTL_SUBVOLUME_CREATE, &i); + bcache_fs_close(fs); + return 0; +} diff --git a/cmds.h b/cmds.h index cc49084..52db63f 100644 --- a/cmds.h +++ b/cmds.h @@ -19,8 +19,10 @@ int cmd_run(int argc, char *argv[]); int cmd_stop(int argc, char *argv[]); #endif +int fs_usage(void); int cmd_fs_usage(int argc, char *argv[]); +int device_usage(void); int cmd_device_add(int argc, char *argv[]); int cmd_device_remove(int argc, char *argv[]); int cmd_device_online(int argc, char *argv[]); @@ -30,6 +32,7 @@ int cmd_device_set_state(int argc, char *argv[]); int cmd_device_resize(int argc, char *argv[]); int cmd_device_resize_journal(int argc, char *argv[]); +int data_usage(void); int cmd_data_rereplicate(int argc, char *argv[]); int cmd_data_job(int argc, char *argv[]); @@ -50,6 +53,11 @@ int cmd_version(int argc, char *argv[]); int cmd_setattr(int argc, char *argv[]); +int subvolume_usage(void); +int cmd_subvolume_create(int argc, char *argv[]); +int cmd_subvolume_delete(int argc, char *argv[]); +int cmd_subvolume_snapshot(int argc, char *argv[]); + int cmd_fusemount(int argc, char *argv[]); #endif /* _CMDS_H */ diff --git a/crypto.c b/crypto.c index 7f7fbd5..43753a3 100644 --- a/crypto.c +++ b/crypto.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include "libbcachefs/checksum.h" @@ -84,12 +84,13 @@ struct bch_key derive_passphrase(struct bch_sb_field_crypt *crypt, switch (BCH_CRYPT_KDF_TYPE(crypt)) { case BCH_KDF_SCRYPT: - ret = libscrypt_scrypt((void *) passphrase, strlen(passphrase), - salt, sizeof(salt), - 1ULL << BCH_KDF_SCRYPT_N(crypt), - 1ULL << BCH_KDF_SCRYPT_R(crypt), - 1ULL << BCH_KDF_SCRYPT_P(crypt), - (void *) &key, sizeof(key)); + ret = crypto_pwhash_scryptsalsa208sha256_ll( + (void *) passphrase, strlen(passphrase), + salt, sizeof(salt), + 1ULL << BCH_KDF_SCRYPT_N(crypt), + 1ULL << BCH_KDF_SCRYPT_R(crypt), + 1ULL << BCH_KDF_SCRYPT_P(crypt), + (void *) &key, sizeof(key)); if (ret) die("scrypt error: %i", ret); break; @@ -170,9 +171,9 @@ void bch_sb_crypt_init(struct bch_sb *sb, if (passphrase) { SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT); - SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N)); - SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r)); - SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p)); + SET_BCH_KDF_SCRYPT_N(crypt, ilog2(16384)); + SET_BCH_KDF_SCRYPT_R(crypt, ilog2(8)); + SET_BCH_KDF_SCRYPT_P(crypt, ilog2(16)); struct bch_key passphrase_key =
derive_passphrase(crypt, passphrase); diff --git a/debian/changelog b/debian/changelog index 64bcbcb..3cb0882 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,12 @@ +bcachefs-tools (0.1+git20220216.a1e928a-1) unstable; urgency=medium + + * New upstream snapshot + * Grab patch from Ubuntu to reduce memory on amd64 builders + (http://launchpadlibrarian.net/580140160/bcachefs-tools_0.1+git20210805.6c42566-2_0.1+git20210805.6c42566-2ubuntu1.diff.gz) + * Update copyright years + + -- Jonathan Carter Wed, 16 Feb 2022 14:42:20 +0200 + bcachefs-tools (0.1+git20210805.6c42566-2) unstable; urgency=medium * Remove valgrind as build-dependency, seems unneeded unless diff --git a/debian/copyright b/debian/copyright index 3125db1..7fe4f5b 100644 --- a/debian/copyright +++ b/debian/copyright @@ -4,7 +4,7 @@ Upstream-Contact: kmo@daterainc.com Source: https://evilpiepirate.org/git/bcachefs-tools.git Files: * -Copyright: 2013-2020 Kent Overstreet +Copyright: 2013-2022 Kent Overstreet 2013 Gabriel de Perthuis 2008 Intel Corporation License: GPL-2 @@ -38,7 +38,7 @@ License: expat THE SOFTWARE. Files: debian/* -Copyright: 2019-2020 Jonathan Carter +Copyright: 2019-2022 Jonathan Carter 2014 Tom Strickx , 2014 David Mohr License: GPL-2+ diff --git a/debian/files b/debian/files index 66d7523..2ea4bfc 100644 --- a/debian/files +++ b/debian/files @@ -1 +1 @@ -bcachefs-tools_0.1+git20210805.6c42566-2_source.buildinfo utils optional +bcachefs-tools_0.1+git20220216.a1e928a-1_source.buildinfo utils optional diff --git a/debian/rules b/debian/rules index b3ee89d..748186e 100755 --- a/debian/rules +++ b/debian/rules @@ -3,6 +3,12 @@ export DEB_BUILD_MAINT_OPTIONS=hardening=+all PREFIX := /usr +DEB_BUILD_ARCH ?= $(shell dpkg-architecture -qDEB_BUILD_ARCH) + +ifeq ($(DEB_BUILD_ARCH),amd64) + DEB_BUILD_MAINT_OPTIONS += optimize=-lto +endif + %: dh $@ diff --git a/default.nix b/default.nix index f19ff10..48f2aa9 100644 --- a/default.nix +++ b/default.nix @@ -1,32 +1,142 @@ -{ nixpkgs ? (import ./nix/nixpkgs.nix) -}: - -with nixpkgs; - -stdenv.mkDerivation rec { - name = "bcachefs-tools-${version}"; - version = "git"; - - src = lib.cleanSource ./.; # NOTE: ignore .git, otherwise things get weird! - - nativeBuildInputs = [ git pkgconfig ]; - buildInputs = - [ liburcu libuuid libaio zlib attr keyutils - libsodium libscrypt - ]; - - enableParallelBuilding = true; - makeFlags = - [ "PREFIX=$(out)" - ]; - - meta = with stdenv.lib; { - description = "Userspace tools for bcachefs"; - homepage = http://bcachefs.org; - license = licenses.gpl2; - platforms = platforms.linux; - maintainers = - [ "Kent Overstreet " - ]; - }; +{ lib +, filter + +, stdenv +, pkg-config +, attr +, libuuid +, libsodium +, keyutils + +, liburcu +, zlib +, libaio +, udev +, zstd +, lz4 + +, python39 +, python39Packages +, docutils +, nixosTests + +, lastModified +, versionString ? lastModified + +, inShell ? false +, debugMode ? inShell + +, testWithValgrind ? true +, valgrind + +, fuseSupport ? false +, fuse3 ? 
null }: + +assert fuseSupport -> fuse3 != null; +assert testWithValgrind -> valgrind != null; +stdenv.mkDerivation { + pname = "bcachefs-tools"; + + version = "v0.1-flake-${versionString}"; + VERSION = "v0.1-flake-${versionString}"; + + src = filter.filter { + name = "bcachefs-tools"; + root = ./.; + exclude = [ + ./rust-src + + ./.git + ./nix + + ./flake.nix + ./flake.lock + ]; + }; + + postPatch = "patchShebangs --build doc/macro2rst.py"; + + nativeBuildInputs = [ + # used to find dependencies + ## see ./INSTALL + pkg-config + ]; + buildInputs = [ + # bcachefs explicit dependencies + ## see ./INSTALL + libaio + + # libblkid + keyutils # libkeyutils + lz4 # liblz4 + + libsodium + liburcu + libuuid + zstd # libzstd + zlib # zlib1g + valgrind + + # unspecified dependencies + attr + udev + + # documentation depenedencies + docutils + python39Packages.pygments + ] ++ (lib.optional fuseSupport fuse3) + ++ (lib.optional testWithValgrind valgrind) ; + + makeFlags = [ + "PREFIX=${placeholder "out"}" + ] ++ lib.optional debugMode "EXTRA_CFLAGS=-ggdb"; + + installFlags = [ + "INITRAMFS_DIR=${placeholder "out"}/etc/initramfs-tools" + ]; + + doCheck = true; # needs bcachefs module loaded on builder + + checkInputs = [ + python39Packages.pytest + python39Packages.pytest-xdist + ] ++ lib.optional testWithValgrind valgrind; + + checkFlags = [ + "BCACHEFS_TEST_USE_VALGRIND=${if testWithValgrind then "yes" else "no"}" + # cannot escape spaces within make flags, quotes are stripped + "PYTEST_CMD=pytest" # "PYTEST_ARGS='-n4 --version'" + ]; + + preCheck = + '' + makeFlagsArray+=(PYTEST_ARGS="--verbose -n2") + '' + + lib.optionalString fuseSupport '' + rm tests/test_fuse.py + ''; + + dontStrip = debugMode == true; + passthru = { + bcachefs_revision = let + file = builtins.readFile ./.bcachefs_revision; + removeLineFeeds = str: lib.lists.foldr (lib.strings.removeSuffix) str ["\r" "\n"]; + in removeLineFeeds file; + + tests = { + smoke-test = nixosTests.bcachefs; + }; + }; + + enableParallelBuilding = true; + meta = with lib; { + description = "Userspace tools for bcachefs"; + homepage = http://bcachefs.org; + license = licenses.gpl2; + platforms = platforms.linux; + maintainers = + [ "Kent Overstreet " + ]; + + }; } diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..2c9c15b --- /dev/null +++ b/flake.lock @@ -0,0 +1,59 @@ +{ + "nodes": { + "filter": { + "locked": { + "lastModified": 1620202920, + "narHash": "sha256-BOkm3eKT45Dk4NNxJT0xL9NnyYeZcF+t79zPnJkggac=", + "owner": "numtide", + "repo": "nix-filter", + "rev": "3c9e33ed627e009428197b07216613206f06ed80", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "nix-filter", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1633351077, + "narHash": "sha256-z38JG4Bb0GtM1aF1pANVdp1dniMP23Yb3HnRoJRy2uU=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "14aef06d9b3ad1d07626bdbb16083b83f92dc6c1", + "type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "filter": "filter", + "nixpkgs": "nixpkgs", + "utils": "utils" + } + }, + "utils": { + "locked": { + "lastModified": 1629481132, + "narHash": "sha256-JHgasjPR0/J1J3DRm4KxM4zTyAj4IOJY8vIl75v/kPI=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "997f7efcb746a9c140ce1f13c72263189225f482", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + } + }, + "root": "root", + "version": 7 
+} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..b52bc7e --- /dev/null +++ b/flake.nix @@ -0,0 +1,96 @@ +{ + description = "Userspace tools for bcachefs"; + + # Nixpkgs / NixOS version to use. + inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable"; + inputs.utils.url = "github:numtide/flake-utils"; + inputs.filter.url = "github:numtide/nix-filter"; + + outputs = { self, nixpkgs, utils, filter, ... }@inputs: + let + # System types to support. + supportedSystems = [ "x86_64-linux" ]; + in + { + version = "${builtins.substring 0 8 self.lastModifiedDate}-${self.shortRev or "dirty"}"; + + overlay = import ./nix/overlay.nix inputs; + nixosModule = self.nixosModules.bcachefs; + nixosModules.bcachefs = import ./rust-src/mount/module.nix; + nixosModules.bcachefs-enable-boot = ({config, pkgs, lib, ... }:{ + # Disable Upstream NixOS Module when this is in use + disabledModules = [ "tasks/filesystems/bcachefs.nix" ]; + # Import needed packages + nixpkgs.overlays = [ self.overlay ]; + + # Add bcachefs to boot and kernel + boot.initrd.supportedFilesystems = [ "bcachefs" ]; + boot.supportedFilesystems = [ "bcachefs" ]; + }); + + nixosConfigurations.netboot-bcachefs = self.systems.netboot-bcachefs "x86_64-linux"; + systems.netboot-bcachefs = system: (nixpkgs.lib.nixosSystem { + inherit system; modules = [ + self.nixosModule + self.nixosModules.bcachefs-enable-boot + ("${nixpkgs}/nixos/modules/installer/netboot/netboot-minimal.nix") + ({ lib, pkgs, config, ... }: { + # installation disk autologin + services.getty.autologinUser = lib.mkForce "root"; + users.users.root.initialPassword = "toor"; + + # Symlink everything together + system.build.netboot = pkgs.symlinkJoin { + name = "netboot"; + paths = with config.system.build; [ + netbootRamdisk + kernel + netbootIpxeScript + ]; + preferLocalBuild = true; + }; + }) + ]; + }); + } + // utils.lib.eachSystem supportedSystems (system: + let pkgs = import nixpkgs { + inherit system; + overlays = [ self.overlay ]; + }; + in rec { + + # A Nixpkgs overlay. + + # Provide some binary packages for selected system types. 
+ defaultPackage = pkgs.bcachefs.tools; + packages = { + inherit (pkgs.bcachefs) + tools + toolsValgrind + toolsDebug + mount + bch_bindgen + kernel; + + tools-musl = pkgs.pkgsMusl.bcachefs.tools; + mount-musl = pkgs.pkgsMusl.bcachefs.mount; + }; + + checks = { + kernelSrc = packages.kernel.src; + inherit (packages) + mount + bch_bindgen + toolsValgrind; + + # Build and test initrd with bcachefs and bcachefs.mount installed + # Disabled Test because it takes a while to build the kernel + # bootStage1Module = self.nixosConfigurations.netboot-bcachefs.config.system.build.bootStage1; + }; + + devShell = devShells.tools; + devShells.tools = pkgs.bcachefs.tools.override { inShell = true; }; + devShells.mount = pkgs.bcachefs.mount.override { inShell = true; }; + }); +} diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h new file mode 100644 index 0000000..e69de29 diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 8aef4bb..be736c8 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -21,6 +21,8 @@ struct request_queue { }; struct gendisk { + struct backing_dev_info *bdi; + struct backing_dev_info __bdi; }; struct hd_struct { @@ -38,9 +40,6 @@ struct block_device { struct gendisk __bd_disk; int bd_fd; int bd_sync_fd; - - struct backing_dev_info *bd_bdi; - struct backing_dev_info __bd_bdi; }; #define bdev_kobj(_bdev) (&((_bdev)->kobj)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 35082ae..4ce43b5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -6,7 +6,7 @@ #include #include -#define BIO_MAX_VECS 256 +#define BIO_MAX_VECS 256U typedef unsigned fmode_t; @@ -74,6 +74,17 @@ int blkdev_issue_discard(struct block_device *, sector_t, #define bdev_get_queue(bdev) (&((bdev)->queue)) +#ifndef SECTOR_SHIFT +#define SECTOR_SHIFT 9 +#endif +#ifndef SECTOR_SIZE +#define SECTOR_SIZE (1 << SECTOR_SHIFT) +#endif + +#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) +#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) +#define SECTOR_MASK (PAGE_SECTORS - 1) + #define blk_queue_discard(q) ((void) (q), 0) #define blk_queue_nonrot(q) ((void) (q), 0) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 2bfbfad..6d039ea 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -60,6 +60,7 @@ #define unlikely(x) __builtin_expect(!!(x), 0) #define unreachable() __builtin_unreachable() #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) +#define fallthrough __attribute__((__fallthrough__)) #define ___PASTE(a,b) a##b #define __PASTE(a,b) ___PASTE(a,b) diff --git a/include/linux/poison.h b/include/linux/poison.h index aff1c92..d62ef5a 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -78,4 +78,7 @@ /********** security/ **********/ #define KEY_DESTROY 0xbd +/********** net/core/page_pool.c **********/ +#define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) + #endif diff --git a/include/linux/siphash.h b/include/linux/siphash.h new file mode 100644 index 0000000..bf21591 --- /dev/null +++ b/include/linux/siphash.h @@ -0,0 +1,145 @@ +/* Copyright (C) 2016 Jason A. Donenfeld . All Rights Reserved. + * + * This file is provided under a dual BSD/GPLv2 license. + * + * SipHash: a fast short-input PRF + * https://131002.net/siphash/ + * + * This implementation is specifically for SipHash2-4 for a secure PRF + * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for + * hashtables. 
+ */ + +#ifndef _LINUX_SIPHASH_H +#define _LINUX_SIPHASH_H + +#include +#include + +#define SIPHASH_ALIGNMENT __alignof__(u64) +typedef struct { + u64 key[2]; +} siphash_key_t; + +static inline bool siphash_key_is_zero(const siphash_key_t *key) +{ + return !(key->key[0] | key->key[1]); +} + +u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key); +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key); +#endif + +u64 siphash_1u64(const u64 a, const siphash_key_t *key); +u64 siphash_2u64(const u64 a, const u64 b, const siphash_key_t *key); +u64 siphash_3u64(const u64 a, const u64 b, const u64 c, + const siphash_key_t *key); +u64 siphash_4u64(const u64 a, const u64 b, const u64 c, const u64 d, + const siphash_key_t *key); +u64 siphash_1u32(const u32 a, const siphash_key_t *key); +u64 siphash_3u32(const u32 a, const u32 b, const u32 c, + const siphash_key_t *key); + +static inline u64 siphash_2u32(const u32 a, const u32 b, + const siphash_key_t *key) +{ + return siphash_1u64((u64)b << 32 | a, key); +} +static inline u64 siphash_4u32(const u32 a, const u32 b, const u32 c, + const u32 d, const siphash_key_t *key) +{ + return siphash_2u64((u64)b << 32 | a, (u64)d << 32 | c, key); +} + + +static inline u64 ___siphash_aligned(const __le64 *data, size_t len, + const siphash_key_t *key) +{ + if (__builtin_constant_p(len) && len == 4) + return siphash_1u32(le32_to_cpup((const __le32 *)data), key); + if (__builtin_constant_p(len) && len == 8) + return siphash_1u64(le64_to_cpu(data[0]), key); + if (__builtin_constant_p(len) && len == 16) + return siphash_2u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), + key); + if (__builtin_constant_p(len) && len == 24) + return siphash_3u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), + le64_to_cpu(data[2]), key); + if (__builtin_constant_p(len) && len == 32) + return siphash_4u64(le64_to_cpu(data[0]), le64_to_cpu(data[1]), + le64_to_cpu(data[2]), le64_to_cpu(data[3]), + key); + return __siphash_aligned(data, len, key); +} + +/** + * siphash - compute 64-bit siphash PRF value + * @data: buffer to hash + * @size: size of @data + * @key: the siphash key + */ +static inline u64 siphash(const void *data, size_t len, + const siphash_key_t *key) +{ +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (!IS_ALIGNED((unsigned long)data, SIPHASH_ALIGNMENT)) + return __siphash_unaligned(data, len, key); +#endif + return ___siphash_aligned(data, len, key); +} + +#define HSIPHASH_ALIGNMENT __alignof__(unsigned long) +typedef struct { + unsigned long key[2]; +} hsiphash_key_t; + +u32 __hsiphash_aligned(const void *data, size_t len, + const hsiphash_key_t *key); +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, + const hsiphash_key_t *key); +#endif + +u32 hsiphash_1u32(const u32 a, const hsiphash_key_t *key); +u32 hsiphash_2u32(const u32 a, const u32 b, const hsiphash_key_t *key); +u32 hsiphash_3u32(const u32 a, const u32 b, const u32 c, + const hsiphash_key_t *key); +u32 hsiphash_4u32(const u32 a, const u32 b, const u32 c, const u32 d, + const hsiphash_key_t *key); + +static inline u32 ___hsiphash_aligned(const __le32 *data, size_t len, + const hsiphash_key_t *key) +{ + if (__builtin_constant_p(len) && len == 4) + return hsiphash_1u32(le32_to_cpu(data[0]), key); + if (__builtin_constant_p(len) && len == 8) + return hsiphash_2u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), + key); + if (__builtin_constant_p(len) && len == 12) + return 
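The interface above follows the kernel's siphash API: a 128-bit key held in a siphash_key_t, length-specialized helpers (siphash_1u64() through siphash_4u64() plus the u32 variants), and a generic siphash() that picks the aligned or unaligned path and, for small constant lengths, folds down to the specialized helpers at compile time. A minimal usage sketch, assuming get_random_bytes() from linux/random.h is available in this shim; the struct and function names below are illustrative only, not part of the patch:

/* Illustrative only: keyed hashing of a small fixed-size struct. */
#include <linux/siphash.h>
#include <linux/random.h>

struct addr_key {
	u64	ino;
	u64	offset;
};

static siphash_key_t addr_hash_key;

static void addr_hash_init(void)
{
	/* Per-boot secret key; prevents hash-flooding of the table. */
	get_random_bytes(&addr_hash_key, sizeof(addr_hash_key));
}

static u64 addr_hash(const struct addr_key *k)
{
	/*
	 * sizeof(*k) is a constant 16, so ___siphash_aligned() reduces
	 * this to the siphash_2u64() fast path at compile time.
	 */
	return siphash(k, sizeof(*k), &addr_hash_key);
}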
hsiphash_3u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), + le32_to_cpu(data[2]), key); + if (__builtin_constant_p(len) && len == 16) + return hsiphash_4u32(le32_to_cpu(data[0]), le32_to_cpu(data[1]), + le32_to_cpu(data[2]), le32_to_cpu(data[3]), + key); + return __hsiphash_aligned(data, len, key); +} + +/** + * hsiphash - compute 32-bit hsiphash PRF value + * @data: buffer to hash + * @size: size of @data + * @key: the hsiphash key + */ +static inline u32 hsiphash(const void *data, size_t len, + const hsiphash_key_t *key) +{ +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + if (!IS_ALIGNED((unsigned long)data, HSIPHASH_ALIGNMENT)) + return __hsiphash_unaligned(data, len, key); +#endif + return ___hsiphash_aligned(data, len, key); +} + +#endif /* _LINUX_SIPHASH_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index ef86153..bc99973 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,20 +16,23 @@ static inline void *kmalloc(size_t size, gfp_t flags) { + unsigned i = 0; void *p; - run_shrinkers(); - - if (size) { - size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE); - alignment = max(sizeof(void *), alignment); - if (posix_memalign(&p, alignment, size)) - p = NULL; - } else { - p = malloc(0); - } - if (p && (flags & __GFP_ZERO)) - memset(p, 0, size); + do { + run_shrinkers(); + + if (size) { + size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE); + alignment = max(sizeof(void *), alignment); + if (posix_memalign(&p, alignment, size)) + p = NULL; + } else { + p = malloc(0); + } + if (p && (flags & __GFP_ZERO)) + memset(p, 0, size); + } while (!p && i++ < 10); return p; } @@ -38,8 +41,6 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) { void *new; - run_shrinkers(); - new = kmalloc(size, flags); if (!new) return NULL; @@ -62,6 +63,10 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) ((size) != 0 && (n) > SIZE_MAX / (size) \ ? NULL : kmalloc((n) * (size), flags)) +#define kvmalloc_array(n, size, flags) \ + ((size) != 0 && (n) > SIZE_MAX / (size) \ + ? 
NULL : kmalloc((n) * (size), flags)) + #define kcalloc(n, size, flags) kmalloc_array(n, size, flags|__GFP_ZERO) #define kfree(p) free(p) @@ -74,13 +79,16 @@ static inline void *krealloc(void *old, size_t size, gfp_t flags) static inline struct page *alloc_pages(gfp_t flags, unsigned int order) { size_t size = PAGE_SIZE << order; + unsigned i = 0; void *p; - run_shrinkers(); + do { + run_shrinkers(); - p = aligned_alloc(PAGE_SIZE, size); - if (p && (flags & __GFP_ZERO)) - memset(p, 0, size); + p = aligned_alloc(PAGE_SIZE, size); + if (p && (flags & __GFP_ZERO)) + memset(p, 0, size); + } while (!p && i++ < 10); return p; } diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c674d9a..ccb319e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -14,18 +14,18 @@ static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask) { + unsigned i = 0; void *p; size = round_up(size, PAGE_SIZE); - run_shrinkers(); + do { + run_shrinkers(); - p = aligned_alloc(PAGE_SIZE, size); - if (!p) - return NULL; - - if (gfp_mask & __GFP_ZERO) - memset(p, 0, size); + p = aligned_alloc(PAGE_SIZE, size); + if (p && gfp_mask & __GFP_ZERO) + memset(p, 0, size); + } while (!p && i++ < 10); return p; } diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index a11bb5f..a21a392 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -298,28 +298,6 @@ TRACE_EVENT(btree_reserve_get_fail, __entry->required, __entry->cl) ); -TRACE_EVENT(btree_insert_key, - TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), - TP_ARGS(c, b, k), - - TP_STRUCT__entry( - __field(u8, id ) - __field(u64, inode ) - __field(u64, offset ) - __field(u32, size ) - ), - - TP_fast_assign( - __entry->id = b->c.btree_id; - __entry->inode = k->k.p.inode; - __entry->offset = k->k.p.offset; - __entry->size = k->k.size; - ), - - TP_printk("btree %u: %llu:%llu len %u", __entry->id, - __entry->inode, __entry->offset, __entry->size) -); - DEFINE_EVENT(btree_node, btree_split, TP_PROTO(struct bch_fs *c, struct btree *b), TP_ARGS(c, b) @@ -340,6 +318,80 @@ DEFINE_EVENT(btree_node, btree_set_root, TP_ARGS(c, b) ); +TRACE_EVENT(btree_cache_scan, + TP_PROTO(unsigned long nr_to_scan_pages, + unsigned long nr_to_scan_nodes, + unsigned long can_free_nodes, + long ret), + TP_ARGS(nr_to_scan_pages, nr_to_scan_nodes, can_free_nodes, ret), + + TP_STRUCT__entry( + __field(unsigned long, nr_to_scan_pages ) + __field(unsigned long, nr_to_scan_nodes ) + __field(unsigned long, can_free_nodes ) + __field(long, ret ) + ), + + TP_fast_assign( + __entry->nr_to_scan_pages = nr_to_scan_pages; + __entry->nr_to_scan_nodes = nr_to_scan_nodes; + __entry->can_free_nodes = can_free_nodes; + __entry->ret = ret; + ), + + TP_printk("scanned for %lu pages, %lu nodes, can free %lu nodes, ret %li", + __entry->nr_to_scan_pages, + __entry->nr_to_scan_nodes, + __entry->can_free_nodes, + __entry->ret) +); + +TRACE_EVENT(btree_node_relock_fail, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos, + unsigned long node, + u32 iter_lock_seq, + u32 node_lock_seq), + TP_ARGS(trans_fn, caller_ip, btree_id, pos, node, iter_lock_seq, node_lock_seq), + + TP_STRUCT__entry( + __array(char, trans_fn, 24 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u64, pos_inode ) + __field(u64, pos_offset ) + __field(u32, pos_snapshot ) + __field(unsigned long, node ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) + ), + + 
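The slab.h and vmalloc.h shims above now share one shape: run the shrinkers, attempt the allocation, and retry a bounded number of times (ten here) before giving up, instead of running the shrinkers only once up front. A stripped-down, standalone sketch of that loop; the names are invented for illustration, and the real shims keep the alignment and __GFP_ZERO handling shown in the diff:

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

#define ALLOC_RETRIES	10

/*
 * Stand-in for the shim's run_shrinkers(): the real one walks the
 * registered shrinkers to release cached memory before a retry.
 */
static void run_shrinkers_stub(void) {}

static void *alloc_with_reclaim(size_t size, bool zero)
{
	unsigned i = 0;
	void *p;

	do {
		/* Give caches a chance to shrink before (re)trying. */
		run_shrinkers_stub();

		p = malloc(size);
		if (p && zero)
			memset(p, 0, size);
	} while (!p && i++ < ALLOC_RETRIES);

	return p;
}

The kmalloc_array()/kvmalloc_array() macros added alongside guard the n * size multiplication by testing n > SIZE_MAX / size first, so an overflowing request returns NULL rather than a short allocation.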
TP_fast_assign( + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = btree_id; + __entry->pos_inode = pos->inode; + __entry->pos_offset = pos->offset; + __entry->pos_snapshot = pos->snapshot; + __entry->node = node; + __entry->iter_lock_seq = iter_lock_seq; + __entry->node_lock_seq = node_lock_seq; + ), + + TP_printk("%s %pS btree %u pos %llu:%llu:%u, node %lu iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->btree_id, + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->node, + __entry->iter_lock_seq, + __entry->node_lock_seq) +); + /* Garbage collection */ DEFINE_EVENT(btree_node, btree_gc_rewrite_node, @@ -381,7 +433,7 @@ TRACE_EVENT(alloc_scan, ), TP_fast_assign( - __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->dev = ca->dev; __entry->found = found; __entry->inc_gen = inc_gen; __entry->inc_gen_skipped = inc_gen_skipped; @@ -403,7 +455,7 @@ TRACE_EVENT(invalidate, ), TP_fast_assign( - __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->dev = ca->dev; __entry->offset = offset, __entry->sectors = sectors; ), @@ -425,7 +477,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, ), TP_fast_assign( - __entry->dev = ca->disk_sb.bdev->bd_dev; + __entry->dev = ca->dev; __entry->reserve = reserve; ), @@ -540,157 +592,87 @@ TRACE_EVENT(copygc_wait, __entry->wait_amount, __entry->until) ); -TRACE_EVENT(trans_get_iter, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *got_pos, - unsigned got_locks, - unsigned got_uptodate, - struct bpos *src_pos, - unsigned src_locks, - unsigned src_uptodate), - TP_ARGS(trans_ip, caller_ip, btree_id, - got_pos, got_locks, got_uptodate, - src_pos, src_locks, src_uptodate), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u64, got_pos_inode ) - __field(u64, got_pos_offset ) - __field(u32, got_pos_snapshot ) - __field(u8, got_locks ) - __field(u8, got_uptodate ) - __field(u64, src_pos_inode ) - __field(u64, src_pos_offset ) - __field(u32, src_pos_snapshot ) - __field(u8, src_locks ) - __field(u8, src_uptodate ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->got_pos_inode = got_pos->inode; - __entry->got_pos_offset = got_pos->offset; - __entry->got_pos_snapshot = got_pos->snapshot; - __entry->got_locks = got_locks; - __entry->got_uptodate = got_uptodate; - __entry->src_pos_inode = src_pos->inode; - __entry->src_pos_offset = src_pos->offset; - __entry->src_pos_snapshot = src_pos->snapshot; - __entry->src_locks = src_locks; - __entry->src_uptodate = src_uptodate; - ), - - TP_printk("%ps %pS btree %u got %llu:%llu:%u l %u u %u " - "src %llu:%llu:%u l %u u %u", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->btree_id, - __entry->got_pos_inode, - __entry->got_pos_offset, - __entry->got_pos_snapshot, - __entry->got_locks, - __entry->got_uptodate, - __entry->src_pos_inode, - __entry->src_pos_offset, - __entry->src_pos_snapshot, - __entry->src_locks, - __entry->src_uptodate) -); - -TRACE_EVENT(transaction_restart_ip, - TP_PROTO(unsigned long caller, unsigned long ip), - TP_ARGS(caller, ip), - - TP_STRUCT__entry( - __field(unsigned long, caller ) - __field(unsigned long, ip ) - ), - - TP_fast_assign( - __entry->caller = caller; - __entry->ip = ip; - ), - - TP_printk("%ps %pS", (void *) __entry->caller, (void *) 
__entry->ip) -); - DECLARE_EVENT_CLASS(transaction_restart, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip), + TP_ARGS(trans_fn, caller_ip), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) ), TP_fast_assign( - __entry->trans_ip = trans_ip; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; ), - TP_printk("%ps %pS", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip) + TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) +); + +DEFINE_EVENT(transaction_restart, transaction_restart_ip, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_traverse_all, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip), + TP_ARGS(trans_fn, caller_ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_key_cache_raced, + TP_PROTO(const char *trans_fn, unsigned long caller_ip), - TP_ARGS(trans_ip, caller_ip) + TP_ARGS(trans_fn, caller_ip) ); DECLARE_EVENT_CLASS(transaction_restart_iter, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(u8, btree_id ) __field(u64, pos_inode ) @@ -699,7 +681,7 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, ), TP_fast_assign( - __entry->trans_ip = trans_ip; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->btree_id = btree_id; __entry->pos_inode = pos->inode; @@ -707,8 +689,8 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, __entry->pos_snapshot = pos->snapshot; ), - TP_printk("%ps %pS btree %u pos %llu:%llu:%u", - (void *) __entry->trans_ip, + TP_printk("%s %pS btree %u pos %llu:%llu:%u", + __entry->trans_fn, (void *) __entry->caller_ip, __entry->btree_id, 
__entry->pos_inode, @@ -717,153 +699,111 @@ DECLARE_EVENT_CLASS(transaction_restart_iter, ); DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_mark, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_upgrade, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_iter_upgrade, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); -DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, - TP_PROTO(unsigned long trans_ip, +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, enum btree_id btree_id, struct bpos *pos), - TP_ARGS(trans_ip, caller_ip, btree_id, pos) + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); -TRACE_EVENT(iter_traverse, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - bool key_cache, - enum btree_id btree_id, - struct bpos *pos, - int ret), - TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, ret), - - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, key_cache ) - __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) - __field(s32, ret ) - ), - - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->key_cache = key_cache; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->ret = ret; - ), +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); - TP_printk("%ps %pS key cache %u btree %u %llu:%llu:%u ret %i", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->key_cache, - __entry->btree_id, - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->ret) +DEFINE_EVENT(transaction_restart_iter, 
trans_restart_relock_after_fill, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); -TRACE_EVENT(iter_set_search_pos, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - enum btree_id btree_id, - struct bpos *old_pos, - struct bpos *new_pos, - unsigned good_level), - TP_ARGS(trans_ip, caller_ip, btree_id, old_pos, new_pos, good_level), +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); - TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u64, old_pos_inode ) - __field(u64, old_pos_offset ) - __field(u32, old_pos_snapshot ) - __field(u64, new_pos_inode ) - __field(u64, new_pos_offset ) - __field(u32, new_pos_snapshot ) - __field(u8, good_level ) - ), +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); - TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->btree_id = btree_id; - __entry->old_pos_inode = old_pos->inode; - __entry->old_pos_offset = old_pos->offset; - __entry->old_pos_snapshot = old_pos->snapshot; - __entry->new_pos_inode = new_pos->inode; - __entry->new_pos_offset = new_pos->offset; - __entry->new_pos_snapshot = new_pos->snapshot; - __entry->good_level = good_level; - ), +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) +); - TP_printk("%ps %pS btree %u old pos %llu:%llu:%u new pos %llu:%llu:%u l %u", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->btree_id, - __entry->old_pos_inode, - __entry->old_pos_offset, - __entry->old_pos_snapshot, - __entry->new_pos_inode, - __entry->new_pos_offset, - __entry->new_pos_snapshot, - __entry->good_level) +DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + enum btree_id btree_id, + struct bpos *pos), + TP_ARGS(trans_fn, caller_ip, btree_id, pos) ); TRACE_EVENT(trans_restart_would_deadlock, - TP_PROTO(unsigned long trans_ip, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, bool in_traverse_all, unsigned reason, @@ -873,12 +813,12 @@ TRACE_EVENT(trans_restart_would_deadlock, enum btree_id want_btree_id, unsigned want_iter_type, struct bpos *want_pos), - TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason, + TP_ARGS(trans_fn, caller_ip, in_traverse_all, reason, have_btree_id, have_iter_type, have_pos, want_btree_id, want_iter_type, want_pos), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) __field(u8, in_traverse_all ) __field(u8, reason ) @@ -896,7 +836,7 @@ TRACE_EVENT(trans_restart_would_deadlock, ), TP_fast_assign( - __entry->trans_ip = trans_ip; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); __entry->caller_ip = caller_ip; __entry->in_traverse_all = in_traverse_all; __entry->reason = reason; @@ -914,8 +854,8 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_pos_snapshot = 
want_pos->snapshot; ), - TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", - (void *) __entry->trans_ip, + TP_printk("%s %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u", + __entry->trans_fn, (void *) __entry->caller_ip, __entry->in_traverse_all, __entry->reason, @@ -931,99 +871,43 @@ TRACE_EVENT(trans_restart_would_deadlock, __entry->want_pos_snapshot) ); -TRACE_EVENT(trans_restart_mem_realloced, - TP_PROTO(unsigned long trans_ip, unsigned long caller_ip, - unsigned long bytes), - TP_ARGS(trans_ip, caller_ip, bytes), +TRACE_EVENT(trans_restart_would_deadlock_write, + TP_PROTO(const char *trans_fn), + TP_ARGS(trans_fn), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) - __field(unsigned long, caller_ip ) - __field(unsigned long, bytes ) + __array(char, trans_fn, 24 ) ), TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->bytes = bytes; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); ), - TP_printk("%ps %pS bytes %lu", - (void *) __entry->trans_ip, - (void *) __entry->caller_ip, - __entry->bytes) + TP_printk("%s", __entry->trans_fn) ); -DECLARE_EVENT_CLASS(node_lock_fail, - TP_PROTO(unsigned long trans_ip, +TRACE_EVENT(trans_restart_mem_realloced, + TP_PROTO(const char *trans_fn, unsigned long caller_ip, - bool key_cache, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, - level, iter_seq, node, node_seq), + unsigned long bytes), + TP_ARGS(trans_fn, caller_ip, bytes), TP_STRUCT__entry( - __field(unsigned long, trans_ip ) + __array(char, trans_fn, 24 ) __field(unsigned long, caller_ip ) - __field(u8, key_cache ) - __field(u8, btree_id ) - __field(u64, pos_inode ) - __field(u64, pos_offset ) - __field(u32, pos_snapshot ) - __field(u32, level ) - __field(u32, iter_seq ) - __field(u32, node ) - __field(u32, node_seq ) + __field(unsigned long, bytes ) ), TP_fast_assign( - __entry->trans_ip = trans_ip; - __entry->caller_ip = caller_ip; - __entry->key_cache = key_cache; - __entry->btree_id = btree_id; - __entry->pos_inode = pos->inode; - __entry->pos_offset = pos->offset; - __entry->pos_snapshot = pos->snapshot; - __entry->level = level; - __entry->iter_seq = iter_seq; - __entry->node = node; - __entry->node_seq = node_seq; + strncpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; ), - TP_printk("%ps %pS key cache %u btree %u pos %llu:%llu:%u level %u iter seq %u node %u node seq %u", - (void *) __entry->trans_ip, + TP_printk("%s %pS bytes %lu", + __entry->trans_fn, (void *) __entry->caller_ip, - __entry->key_cache, - __entry->btree_id, - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->level, __entry->iter_seq, - __entry->node, __entry->node_seq) -); - -DEFINE_EVENT(node_lock_fail, node_upgrade_fail, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - bool key_cache, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, key_cache, btree_id, pos, - level, iter_seq, node, node_seq) -); - -DEFINE_EVENT(node_lock_fail, node_relock_fail, - TP_PROTO(unsigned long trans_ip, - unsigned long caller_ip, - bool key_cache, - enum btree_id btree_id, - struct bpos *pos, - unsigned level, u32 iter_seq, unsigned node, u32 node_seq), - TP_ARGS(trans_ip, caller_ip, 
key_cache, btree_id, pos, - level, iter_seq, node, node_seq) + __entry->bytes) ); #endif /* _TRACE_BCACHE_H */ diff --git a/libbcachefs.c b/libbcachefs.c index 34246dc..f78ebf0 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -29,17 +29,12 @@ #define NSEC_PER_SEC 1000000000L -/* minimum size filesystem we can create, given a bucket size: */ -static u64 min_size(unsigned bucket_size) -{ - return BCH_MIN_NR_NBUCKETS * bucket_size; -} - static void init_layout(struct bch_sb_layout *l, unsigned block_size, unsigned sb_size, u64 sb_start, u64 sb_end) { + u64 sb_pos = sb_start; unsigned i; memset(l, 0, sizeof(*l)); @@ -51,25 +46,32 @@ static void init_layout(struct bch_sb_layout *l, /* Create two superblocks in the allowed range: */ for (i = 0; i < l->nr_superblocks; i++) { - if (sb_start != BCH_SB_SECTOR) - sb_start = round_up(sb_start, block_size); + if (sb_pos != BCH_SB_SECTOR) + sb_pos = round_up(sb_pos, block_size); - l->sb_offset[i] = cpu_to_le64(sb_start); - sb_start += sb_size; + l->sb_offset[i] = cpu_to_le64(sb_pos); + sb_pos += sb_size; } - if (sb_start >= sb_end) - die("insufficient space for superblocks"); + if (sb_pos > sb_end) + die("insufficient space for superblocks: start %llu end %llu > %llu size %u", + sb_start, sb_pos, sb_end, sb_size); +} + +/* minimum size filesystem we can create, given a bucket size: */ +static u64 min_size(unsigned bucket_size) +{ + return BCH_MIN_NR_NBUCKETS * bucket_size; } void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev) { if (!dev->size) - dev->size = get_size(dev->path, dev->fd) >> 9; + dev->size = get_size(dev->path, dev->fd); if (!dev->bucket_size) { if (dev->size < min_size(opts.block_size)) - die("cannot format %s, too small (%llu sectors, min %llu)", + die("cannot format %s, too small (%llu bytes, min %llu)", dev->path, dev->size, min_size(opts.block_size)); /* Bucket size must be >= block size: */ @@ -81,16 +83,16 @@ void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev) opts.btree_node_size); /* Want a bucket size of at least 128k, if possible: */ - dev->bucket_size = max(dev->bucket_size, 256U); + dev->bucket_size = max(dev->bucket_size, 128ULL << 10); if (dev->size >= min_size(dev->bucket_size)) { unsigned scale = max(1, - ilog2(dev->size / min_size(dev->bucket_size)) / 4); + ilog2(dev->size / min_size(dev->bucket_size)) / 4); scale = rounddown_pow_of_two(scale); /* max bucket size 1 mb */ - dev->bucket_size = min(dev->bucket_size * scale, 1U << 11); + dev->bucket_size = min(dev->bucket_size * scale, 1ULL << 20); } else { do { dev->bucket_size /= 2; @@ -98,19 +100,24 @@ void bch2_pick_bucket_size(struct bch_opts opts, struct dev_opts *dev) } } - dev->nbuckets = dev->size / dev->bucket_size; + dev->nbuckets = dev->size / dev->bucket_size; if (dev->bucket_size < opts.block_size) - die("Bucket size cannot be smaller than block size"); + die("Bucket size (%llu) cannot be smaller than block size (%u)", + dev->bucket_size, opts.block_size); if (opt_defined(opts, btree_node_size) && dev->bucket_size < opts.btree_node_size) - die("Bucket size cannot be smaller than btree node size"); + die("Bucket size (%llu) cannot be smaller than btree node size (%u)", + dev->bucket_size, opts.btree_node_size); if (dev->nbuckets < BCH_MIN_NR_NBUCKETS) - die("Not enough buckets: %llu, need %u (bucket size %u)", + die("Not enough buckets: %llu, need %u (bucket size %llu)", dev->nbuckets, BCH_MIN_NR_NBUCKETS, dev->bucket_size); + if (dev->bucket_size > (u32) U16_MAX << 9) + die("Bucket size (%llu) too big (max %u)", + 
dev->bucket_size, (u32) U16_MAX << 9); } static unsigned parse_target(struct bch_sb_handle *sb, @@ -165,7 +172,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, /* calculate btree node size: */ if (!opt_defined(fs_opts, btree_node_size)) { /* 256k default btree node size */ - opt_set(fs_opts, btree_node_size, 512); + opt_set(fs_opts, btree_node_size, 256 << 10); for (i = devs; i < devs + nr_devs; i++) fs_opts.btree_node_size = @@ -173,12 +180,6 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, i->bucket_size); } - if (!is_power_of_2(fs_opts.block_size)) - die("block size must be power of 2"); - - if (!is_power_of_2(fs_opts.btree_node_size)) - die("btree node size must be power of 2"); - if (uuid_is_null(opts.uuid.b)) uuid_generate(opts.uuid.b); @@ -188,7 +189,6 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, sb.sb->version = le16_to_cpu(opts.version); sb.sb->version_min = le16_to_cpu(opts.version); sb.sb->magic = BCACHE_MAGIC; - sb.sb->block_size = cpu_to_le16(fs_opts.block_size); sb.sb->user_uuid = opts.uuid; sb.sb->nr_devices = nr_devs; @@ -205,22 +205,15 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { - const struct bch_option *opt = &bch2_opt_table[opt_id]; u64 v; - if (opt->set_sb == SET_NO_SB_OPT) - continue; - v = bch2_opt_defined_by_id(&fs_opts, opt_id) ? bch2_opt_get_by_id(&fs_opts, opt_id) : bch2_opt_get_by_id(&bch2_opts_default, opt_id); - opt->set_sb(sb.sb, v); + __bch2_opt_set_sb(sb.sb, &bch2_opt_table[opt_id], v); } - SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb.sb, - ilog2(opts.encoded_extent_max)); - struct timespec now; if (clock_gettime(CLOCK_REALTIME, &now)) die("error getting current time: %m"); @@ -231,7 +224,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, /* Member info: */ mi = bch2_sb_resize_members(&sb, (sizeof(*mi) + sizeof(struct bch_member) * - nr_devs) / sizeof(u64)); + nr_devs) / sizeof(u64)); for (i = devs; i < devs + nr_devs; i++) { struct bch_member *m = mi->members + (i - devs); @@ -239,25 +232,31 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, uuid_generate(m->uuid.b); m->nbuckets = cpu_to_le64(i->nbuckets); m->first_bucket = 0; - m->bucket_size = cpu_to_le16(i->bucket_size); + m->bucket_size = cpu_to_le16(i->bucket_size >> 9); - SET_BCH_MEMBER_REPLACEMENT(m, BCH_CACHE_REPLACEMENT_lru); SET_BCH_MEMBER_DISCARD(m, i->discard); SET_BCH_MEMBER_DATA_ALLOWED(m, i->data_allowed); SET_BCH_MEMBER_DURABILITY(m, i->durability + 1); } - /* Disk groups */ + /* Disk labels*/ for (i = devs; i < devs + nr_devs; i++) { - struct bch_member *m = mi->members + (i - devs); + struct bch_member *m; int idx; - if (!i->group) + if (!i->label) continue; - idx = bch2_disk_path_find_or_create(&sb, i->group); + idx = bch2_disk_path_find_or_create(&sb, i->label); if (idx < 0) - die("error creating disk path: %s", idx); + die("error creating disk path: %s", strerror(-idx)); + + /* + * Recompute mi and m after each sb modification: its location + * in memory may have changed due to reallocation. 
+ */ + mi = bch2_sb_get_members(sb.sb); + m = mi->members + (i - devs); SET_BCH_MEMBER_GROUP(m, idx + 1); } @@ -281,11 +280,13 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, } for (i = devs; i < devs + nr_devs; i++) { + u64 size_sectors = i->size >> 9; + sb.sb->dev_idx = i - devs; if (!i->sb_offset) { i->sb_offset = BCH_SB_SECTOR; - i->sb_end = i->size; + i->sb_end = size_sectors; } init_layout(&sb.sb->layout, fs_opts.block_size, @@ -301,9 +302,9 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, */ if (i->sb_offset == BCH_SB_SECTOR) { struct bch_sb_layout *l = &sb.sb->layout; - u64 backup_sb = i->size - (1 << l->sb_max_size_bits); + u64 backup_sb = size_sectors - (1 << l->sb_max_size_bits); - backup_sb = rounddown(backup_sb, i->bucket_size); + backup_sb = rounddown(backup_sb, i->bucket_size >> 9); l->sb_offset[l->nr_superblocks++] = cpu_to_le64(backup_sb); } @@ -311,7 +312,8 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, /* Zero start of disk */ static const char zeroes[BCH_SB_SECTOR << 9]; - xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0); + xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0, + "zeroing start of disk"); } bch2_super_write(i->fd, sb.sb); @@ -332,12 +334,14 @@ void bch2_super_write(int fd, struct bch_sb *sb) if (sb->offset == BCH_SB_SECTOR) { /* Write backup layout */ xpwrite(fd, &sb->layout, sizeof(sb->layout), - BCH_SB_LAYOUT_SECTOR << 9); + BCH_SB_LAYOUT_SECTOR << 9, + "backup layout"); } sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb); xpwrite(fd, sb, vstruct_bytes(sb), - le64_to_cpu(sb->offset) << 9); + le64_to_cpu(sb->offset) << 9, + "superblock"); } fsync(fd); @@ -410,10 +414,10 @@ static int bch2_sb_get_target(struct bch_sb *sb, char *buf, size_t len, u64 v) struct bch_disk_group *g = gi->entries + t.group; if (t.group < disk_groups_nr(gi) && !BCH_GROUP_DELETED(g)) { - ret = scnprintf(buf, len, "Group %u (%.*s)", t.group, + ret = scnprintf(buf, len, "Label %u (%.*s)", t.group, BCH_SB_LABEL_SIZE, g->label); } else { - ret = scnprintf(buf, len, "Bad group %u", t.group); + ret = scnprintf(buf, len, "Bad label %u", t.group); } break; } @@ -475,7 +479,7 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f, char member_uuid_str[40]; char data_allowed_str[100]; char data_has_str[100]; - char group[BCH_SB_LABEL_SIZE+10]; + char label [BCH_SB_LABEL_SIZE+10]; char time_str[64]; if (!bch2_member_exists(m)) @@ -487,14 +491,14 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f, unsigned idx = BCH_MEMBER_GROUP(m) - 1; if (idx < disk_groups_nr(gi)) { - snprintf(group, sizeof(group), "%.*s (%u)", + scnprintf(label, sizeof(label), "%.*s (%u)", BCH_SB_LABEL_SIZE, gi->entries[idx].label, idx); } else { - strcpy(group, "(bad disk groups section)"); + strcpy(label, "(bad disk labels section)"); } } else { - strcpy(group, "(none)"); + strcpy(label, "(none)"); } bch2_flags_to_text(&PBUF(data_allowed_str), @@ -531,7 +535,6 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f, " Has data: %s\n" - " Replacement policy: %s\n" " Discard: %llu\n", i, member_uuid_str, pr_units(le16_to_cpu(m->bucket_size) * @@ -545,14 +548,10 @@ static void bch2_sb_print_members(struct bch_sb *sb, struct bch_sb_field *f, ? bch2_member_states[BCH_MEMBER_STATE(m)] : "unknown", - group, + label, data_allowed_str, data_has_str, - BCH_MEMBER_REPLACEMENT(m) < BCH_CACHE_REPLACEMENT_NR - ? 
bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)] - : "unknown", - BCH_MEMBER_DISCARD(m)); } } @@ -573,7 +572,7 @@ static void bch2_sb_print_crypt(struct bch_sb *sb, struct bch_sb_field *f, } static void bch2_sb_print_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) + enum units units) { struct bch_sb_field_replicas_v0 *replicas = field_to_type(f, replicas_v0); struct bch_replicas_entry_v0 *e; @@ -636,7 +635,7 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f, } static void bch2_sb_print_journal_seq_blacklist(struct bch_sb *sb, struct bch_sb_field *f, - enum units units) + enum units units) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); unsigned i, nr = blacklist_nr_entries(bl); @@ -899,7 +898,9 @@ struct bchfs_handle bcache_fs_open(const char *path) free(ctl); } else { /* It's a path: */ - ret.ioctl_fd = xopen(path, O_RDONLY); + ret.ioctl_fd = open(path, O_RDONLY); + if (ret.ioctl_fd < 0) + die("Error opening filesystem at %s: %m", path); struct bch_ioctl_query_uuid uuid; if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid) < 0) @@ -1062,7 +1063,7 @@ struct bch_opt_strs bch2_cmdline_opts_get(int *argc, char *argv[], optid = bch2_opt_lookup(optstr); if (optid < 0 || - !(bch2_opt_table[optid].mode & opt_types)) { + !(bch2_opt_table[optid].flags & opt_types)) { i++; goto next; } @@ -1102,7 +1103,8 @@ struct bch_opts bch2_parse_opts(struct bch_opt_strs strs) bch2_opt_table[i].type == BCH_OPT_FN) continue; - ret = bch2_opt_parse(NULL, &bch2_opt_table[i], + ret = bch2_opt_parse(NULL, "option", + &bch2_opt_table[i], strs.by_id[i], &v); if (ret < 0) die("Invalid %s: %s", @@ -1130,7 +1132,7 @@ void bch2_opts_usage(unsigned opt_types) for (opt = bch2_opt_table; opt < bch2_opt_table + bch2_opts_nr; opt++) { - if (!(opt->mode & opt_types)) + if (!(opt->flags & opt_types)) continue; c += printf(" --%s", opt->attr.name); diff --git a/libbcachefs.h b/libbcachefs.h index 7cdbf69..ab4f0cd 100644 --- a/libbcachefs.h +++ b/libbcachefs.h @@ -35,7 +35,6 @@ struct format_opts { uuid_le uuid; unsigned version; unsigned superblock_size; - unsigned encoded_extent_max; bool encrypted; char *passphrase; }; @@ -45,16 +44,15 @@ static inline struct format_opts format_opts_default() return (struct format_opts) { .version = bcachefs_metadata_version_current, .superblock_size = SUPERBLOCK_SIZE_DEFAULT, - .encoded_extent_max = 128, }; } struct dev_opts { int fd; char *path; - u64 size; /* 512 byte sectors */ - unsigned bucket_size; - const char *group; + u64 size; /* bytes*/ + u64 bucket_size; /* bytes */ + const char *label; unsigned data_allowed; unsigned durability; bool discard; diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index eb907e5..5070caf 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -212,36 +212,38 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct inode *vinode, int type) +struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c_xattr xattr; struct posix_acl *acl = NULL; struct bkey_s_c k; int ret; + if (rcu) + return ERR_PTR(-ECHILD); + bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); - iter = bch2_hash_lookup(&trans, 
bch2_xattr_hash_desc, - &hash, inode->v.i_ino, + ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, + &hash, inode_inum(inode), &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); - if (IS_ERR(iter)) { - if (PTR_ERR(iter) == -EINTR) + if (ret) { + if (ret == -EINTR) goto retry; - - if (PTR_ERR(iter) != -ENOENT) - acl = ERR_CAST(iter); + if (ret != -ENOENT) + acl = ERR_PTR(ret); goto out; } - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) { acl = ERR_PTR(ret); @@ -254,17 +256,17 @@ retry: if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); - bch2_trans_iter_put(&trans, iter); out: + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return acl; } -int bch2_set_acl_trans(struct btree_trans *trans, +int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); int ret; if (type == ACL_TYPE_DEFAULT && @@ -277,14 +279,14 @@ int bch2_set_acl_trans(struct btree_trans *trans, if (IS_ERR(xattr)) return PTR_ERR(xattr); - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &xattr->k_i, 0); + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, + inum, &xattr->k_i, 0); } else { struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &search); + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, + inum, &search); } return ret == -ENOENT ? 0 : ret; @@ -296,9 +298,8 @@ int bch2_set_acl(struct user_namespace *mnt_userns, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; - struct bch_hash_info hash_info; struct posix_acl *acl; umode_t mode; int ret; @@ -309,9 +310,8 @@ retry: bch2_trans_begin(&trans); acl = _acl; - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -323,27 +323,24 @@ retry: goto btree_err; } - hash_info = bch2_hash_info_init(c, &inode_u); - - ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); + ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); if (ret) goto btree_err; inode_u.bi_ctime = bch2_current_time(c); inode_u.bi_mode = mode; - ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, 0); + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, 0); btree_err: - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, &inode_iter); if (ret == -EINTR) goto retry; if (unlikely(ret)) goto err; - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); @@ -354,28 +351,27 @@ err: return ret; } -int bch2_acl_chmod(struct btree_trans *trans, +int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, 
inode); - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; struct posix_acl *acl; struct bkey_s_c k; int ret; - iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, - &hash_info, inode->bi_inum, + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); if (ret) return ret == -ENOENT ? 0 : ret; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); xattr = bkey_s_c_to_xattr(k); if (ret) goto err; @@ -396,12 +392,12 @@ int bch2_acl_chmod(struct btree_trans *trans, goto err; } - new->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, &new->k_i, 0); + new->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); *new_acl = acl; acl = NULL; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); if (!IS_ERR_OR_NULL(acl)) kfree(acl); return ret; diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index 25fc54d..2d76a48 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -26,27 +26,26 @@ typedef struct { __le32 a_version; } bch_acl_header; -struct posix_acl *bch2_get_acl(struct inode *, int); +struct posix_acl *bch2_get_acl(struct inode *, int, bool); -int bch2_set_acl_trans(struct btree_trans *, +int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, - const struct bch_hash_info *, struct posix_acl *, int); int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, +int bch2_acl_chmod(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, umode_t, struct posix_acl **); #else -static inline int bch2_set_acl_trans(struct btree_trans *trans, +static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { return 0; } -static inline int bch2_acl_chmod(struct btree_trans *trans, +static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 886861a..023db62 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -9,6 +9,7 @@ #include "btree_update_interior.h" #include "btree_gc.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "clock.h" #include "debug.h" #include "ec.h" @@ -147,10 +148,44 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, return 0; } -static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, +static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + out->journal_seq = le64_to_cpu(a.v->journal_seq); + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static void 
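The acl.c rework above illustrates the iterator convention this snapshot moves the tree to: the btree_iter lives on the caller's stack, lookups fill it in and return an int, exit paths call bch2_trans_iter_exit(), and -EINTR from any step restarts the whole transaction from bch2_trans_begin(). A schematic of that calling convention using the functions shown in the diff; do_lookup_and_update() is a hypothetical placeholder for the caller's actual btree work:

/* Schematic only; mirrors the shape of the updated callers above. */
int update_something(struct bch_fs *c, subvol_inum inum)
{
	struct btree_trans trans;
	struct btree_iter iter = { NULL };
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	ret = do_lookup_and_update(&trans, &iter, inum);	/* hypothetical */
	if (!ret)
		ret = bch2_trans_commit(&trans, NULL, NULL, 0);

	/* Release the iterator before retrying, as bch2_set_acl() does. */
	bch2_trans_iter_exit(&trans, &iter);
	if (ret == -EINTR)
		goto retry;

	bch2_trans_exit(&trans);
	return ret;
}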
bch2_alloc_pack_v3(struct bkey_alloc_buf *dst, const struct bkey_alloc_unpacked src) { - struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); + struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k); unsigned nr_fields = 0, last_nonzero_fieldnr = 0; u8 *out = a->v.data; u8 *end = (void *) &dst[1]; @@ -161,6 +196,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, a->v.gen = src.gen; a->v.oldest_gen = src.oldest_gen; a->v.data_type = src.data_type; + a->v.journal_seq = cpu_to_le64(src.journal_seq); #define x(_name, _bits) \ nr_fields++; \ @@ -194,19 +230,40 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) .gen = 0, }; - if (k.k->type == KEY_TYPE_alloc_v2) - bch2_alloc_unpack_v2(&ret, k); - else if (k.k->type == KEY_TYPE_alloc) + switch (k.k->type) { + case KEY_TYPE_alloc: bch2_alloc_unpack_v1(&ret, k); + break; + case KEY_TYPE_alloc_v2: + bch2_alloc_unpack_v2(&ret, k); + break; + case KEY_TYPE_alloc_v3: + bch2_alloc_unpack_v3(&ret, k); + break; + } return ret; } -void bch2_alloc_pack(struct bch_fs *c, - struct bkey_alloc_buf *dst, - const struct bkey_alloc_unpacked src) +struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans, + const struct bkey_alloc_unpacked src) { - bch2_alloc_pack_v2(dst, src); + struct bkey_alloc_buf *dst; + + dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + if (!IS_ERR(dst)) + bch2_alloc_pack_v3(dst, src); + + return dst; +} + +int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_alloc_unpacked *u, unsigned trigger_flags) +{ + struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u); + + return PTR_ERR_OR_ZERO(a) ?: + bch2_trans_update(trans, iter, &a->k, trigger_flags); } static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) @@ -249,137 +306,81 @@ const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - - pr_buf(out, "gen %u oldest_gen %u data_type %s", - u.gen, u.oldest_gen, bch2_data_types[u.data_type]); -#define x(_name, ...) 
pr_buf(out, " " #_name " %llu", (u64) u._name); - BCH_ALLOC_FIELDS_V2() -#undef x -} - -static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k) +const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bch_dev *ca; - struct bucket *g; struct bkey_alloc_unpacked u; - if (k.k->type != KEY_TYPE_alloc && - k.k->type != KEY_TYPE_alloc_v2) - return 0; - - ca = bch_dev_bkey_exists(c, k.k->p.inode); - g = bucket(ca, k.k->p.offset); - u = bch2_alloc_unpack(k); + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; - g->_mark.gen = u.gen; - g->_mark.data_type = u.data_type; - g->_mark.dirty_sectors = u.dirty_sectors; - g->_mark.cached_sectors = u.cached_sectors; - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; - g->gen_valid = 1; + if (bch2_alloc_unpack_v3(&u, k)) + return "unpack error"; - return 0; + return NULL; } -int bch2_alloc_read(struct bch_fs *c) +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) { - int ret; - - down_read(&c->gc_lock); - ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn); - up_read(&c->gc_lock); - if (ret) { - bch_err(c, "error reading alloc info: %i", ret); - return ret; - } + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); - return 0; + pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu", + u.gen, u.oldest_gen, bch2_data_types[u.data_type], + u.journal_seq); +#define x(_name, ...) pr_buf(out, " " #_name " %llu", (u64) u._name); + BCH_ALLOC_FIELDS_V2() +#undef x } -static int bch2_alloc_write_key(struct btree_trans *trans, - struct btree_iter *iter, - unsigned flags) +int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only) { - struct bch_fs *c = trans->c; + struct btree_trans trans; + struct btree_iter iter; struct bkey_s_c k; struct bch_dev *ca; struct bucket *g; - struct bucket_mark m; - struct bkey_alloc_unpacked old_u, new_u; - struct bkey_alloc_buf a; + struct bkey_alloc_unpacked u; int ret; -retry: - bch2_trans_begin(trans); - - ret = bch2_btree_key_cache_flush(trans, - BTREE_ID_alloc, iter->pos); - if (ret) - goto err; - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - goto err; - - old_u = bch2_alloc_unpack(k); - - percpu_down_read(&c->mark_lock); - ca = bch_dev_bkey_exists(c, iter->pos.inode); - g = bucket(ca, iter->pos.offset); - m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(iter, g, m); - percpu_up_read(&c->mark_lock); - - if (!bkey_alloc_unpacked_cmp(old_u, new_u)) - return 0; - bch2_alloc_pack(c, &a, new_u); - ret = bch2_trans_update(trans, iter, &a.k, - BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags); -err: - if (ret == -EINTR) - goto retry; - return ret; -} - -int bch2_alloc_write(struct bch_fs *c, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bch_dev *ca; - unsigned i; - int ret = 0; + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = __bucket(ca, k.k->p.offset, gc); + u = bch2_alloc_unpack(k); + + if (!gc) + *bucket_gen(ca, k.k->p.offset) = u.gen; + + g->_mark.gen = u.gen; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = !gc ? 
u.oldest_gen : u.gen; + g->gen_valid = 1; + + if (!gc || + (metadata_only && + (u.data_type == BCH_DATA_user || + u.data_type == BCH_DATA_cached || + u.data_type == BCH_DATA_parity))) { + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->_mark.stripe = u.stripe != 0; + g->stripe = u.stripe; + g->stripe_redundancy = u.stripe_redundancy; + } - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + } + bch2_trans_iter_exit(&trans, &iter); - for_each_member_device(ca, c, i) { - bch2_btree_iter_set_pos(iter, - POS(ca->dev_idx, ca->mi.first_bucket)); + bch2_trans_exit(&trans); - while (iter->pos.offset < ca->mi.nbuckets) { - bch2_trans_cond_resched(&trans); + if (ret) + bch_err(c, "error reading alloc info: %i", ret); - ret = bch2_alloc_write_key(&trans, iter, flags); - if (ret) { - percpu_ref_put(&ca->ref); - goto err; - } - bch2_btree_iter_advance(iter); - } - } -err: - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); return ret; } @@ -389,31 +390,21 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { struct bch_fs *c = trans->c; - struct bch_dev *ca = bch_dev_bkey_exists(c, dev); - struct btree_iter *iter; - struct bucket *g; - struct bkey_alloc_buf *a; + struct btree_iter iter; + struct bkey_s_c k; struct bkey_alloc_unpacked u; u64 *time, now; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; - - a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - ret = PTR_ERR_OR_ZERO(a); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) goto out; - percpu_down_read(&c->mark_lock); - g = bucket(ca, bucket_nr); - u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); - percpu_up_read(&c->mark_lock); + u = bch2_alloc_unpack(k); time = rw == READ ? 
&u.read_time : &u.write_time; now = atomic64_read(&c->io_clock[rw].now); @@ -422,11 +413,10 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, *time = now; - bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, iter, &a->k, 0) ?: + ret = bch2_alloc_write(trans, &iter, &u, 0) ?: bch2_trans_commit(trans, NULL, NULL, 0); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -453,6 +443,18 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, test_bit(b, ca->buckets_nouse)) return false; + if (ca->new_fs_bucket_idx) { + /* + * Device or filesystem is still being initialized, and we + * haven't fully marked superblocks & journal: + */ + if (is_superblock_bucket(ca, b)) + return false; + + if (b < ca->new_fs_bucket_idx) + return false; + } + gc_gen = bucket_gc_gen(bucket(ca, b)); ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2; @@ -469,7 +471,7 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, u64 now, u64 last_seq_ondisk) { - unsigned used = bucket_sectors_used(m); + unsigned used = m.cached_sectors; if (used) { /* @@ -488,8 +490,7 @@ static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, * keys when there's only a small difference, so that we can * keep sequential buckets together: */ - return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| - (bucket_gc_gen(g) >> 4); + return bucket_gc_gen(g) >> 4; } } @@ -521,7 +522,7 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) buckets = bucket_array(ca); ca->alloc_heap.used = 0; now = atomic64_read(&c->io_clock[READ].now); - last_seq_ondisk = c->journal.last_seq_ondisk; + last_seq_ondisk = c->journal.flushed_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -538,6 +539,14 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) if (!bch2_can_invalidate_bucket(ca, b, m)) continue; + if (!m.data_type && + bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + last_seq_ondisk, + ca->dev_idx, b)) { + ca->buckets_waiting_on_journal++; + continue; + } + if (e.nr && e.bucket + e.nr == b && e.key == key) { e.nr++; } else { @@ -568,94 +577,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) up_read(&ca->bucket_lock); } -static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t b, start; - - if (ca->fifo_last_bucket < ca->mi.first_bucket || - ca->fifo_last_bucket >= ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - start = ca->fifo_last_bucket; - - do { - ca->fifo_last_bucket++; - if (ca->fifo_last_bucket == ca->mi.nbuckets) - ca->fifo_last_bucket = ca->mi.first_bucket; - - b = ca->fifo_last_bucket; - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) { - struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - if (heap_full(&ca->alloc_heap)) - break; - } - - cond_resched(); - } while (ca->fifo_last_bucket != start); -} - -static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -{ - struct bucket_array *buckets = bucket_array(ca); - struct bucket_mark m; - size_t checked, i; - - for (checked = 0; - checked < ca->mi.nbuckets / 2; - checked++) { - size_t b = 
bch2_rand_range(ca->mi.nbuckets - - ca->mi.first_bucket) + - ca->mi.first_bucket; - - m = READ_ONCE(buckets->b[b].mark); - - if (bch2_can_invalidate_bucket(ca, b, m)) { - struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; - - heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); - if (heap_full(&ca->alloc_heap)) - break; - } - - cond_resched(); - } - - sort(ca->alloc_heap.data, - ca->alloc_heap.used, - sizeof(ca->alloc_heap.data[0]), - bucket_idx_cmp, NULL); - - /* remove duplicates: */ - for (i = 0; i + 1 < ca->alloc_heap.used; i++) - if (ca->alloc_heap.data[i].bucket == - ca->alloc_heap.data[i + 1].bucket) - ca->alloc_heap.data[i].nr = 0; -} - static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) { size_t i, nr = 0; ca->inc_gen_needs_gc = 0; ca->inc_gen_really_needs_gc = 0; + ca->buckets_waiting_on_journal = 0; - switch (ca->mi.replacement) { - case BCH_CACHE_REPLACEMENT_lru: - find_reclaimable_buckets_lru(c, ca); - break; - case BCH_CACHE_REPLACEMENT_fifo: - find_reclaimable_buckets_fifo(c, ca); - break; - case BCH_CACHE_REPLACEMENT_random: - find_reclaimable_buckets_random(c, ca); - break; - } + find_reclaimable_buckets_lru(c, ca); heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); @@ -665,92 +595,61 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) return nr; } -/* - * returns sequence number of most recent journal entry that updated this - * bucket: - */ -static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) -{ - if (m.journal_seq_valid) { - u64 journal_seq = atomic64_read(&c->journal.seq); - u64 bucket_seq = journal_seq; - - bucket_seq &= ~((u64) U16_MAX); - bucket_seq |= m.journal_seq; - - if (bucket_seq > journal_seq) - bucket_seq -= 1 << 16; - - return bucket_seq; - } else { - return 0; - } -} - static int bucket_invalidate_btree(struct btree_trans *trans, - struct bch_dev *ca, u64 b) + struct bch_dev *ca, u64 b, + struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; - struct bkey_alloc_buf *a; - struct bkey_alloc_unpacked u; - struct bucket *g; - struct bucket_mark m; - struct btree_iter *iter = - bch2_trans_get_iter(trans, BTREE_ID_alloc, - POS(ca->dev_idx, b), - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); + struct btree_iter iter; + struct bkey_s_c k; int ret; - a = bch2_trans_kmalloc(trans, sizeof(*a)); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, b), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); if (ret) goto err; - percpu_down_read(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - u = alloc_mem_to_key(iter, g, m); - percpu_up_read(&c->mark_lock); - - u.gen++; - u.data_type = 0; - u.dirty_sectors = 0; - u.cached_sectors = 0; - u.read_time = atomic64_read(&c->io_clock[READ].now); - u.write_time = atomic64_read(&c->io_clock[WRITE].now); + *u = bch2_alloc_unpack(k); + u->gen++; + u->data_type = 0; + u->dirty_sectors = 0; + u->cached_sectors = 0; + u->read_time = atomic64_read(&c->io_clock[READ].now); + u->write_time = atomic64_read(&c->io_clock[WRITE].now); - bch2_alloc_pack(c, a, u); - ret = bch2_trans_update(trans, iter, &a->k, - BTREE_TRIGGER_BUCKET_INVALIDATE); + ret = bch2_alloc_write(trans, &iter, u, + BTREE_TRIGGER_BUCKET_INVALIDATE); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } static int bch2_invalidate_one_bucket(struct 
bch_fs *c, struct bch_dev *ca, u64 *journal_seq, unsigned flags) { - struct bucket *g; - struct bucket_mark m; + struct bkey_alloc_unpacked u; size_t b; + u64 commit_seq = 0; int ret = 0; + /* + * If the read-only path is trying to shut down, we can't be generating + * new btree updates: + */ + if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) + return 1; + BUG_ON(!ca->alloc_heap.used || !ca->alloc_heap.data[0].nr); b = ca->alloc_heap.data[0].bucket; /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); - g = bucket(ca, b); - m = READ_ONCE(g->mark); - - BUG_ON(m.dirty_sectors); bch2_mark_alloc_bucket(c, ca, b, true); @@ -759,37 +658,15 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(!fifo_push(&ca->free_inc, b)); spin_unlock(&c->freelist_lock); - /* - * If we're not invalidating cached data, we only increment the bucket - * gen in memory here, the incremented gen will be updated in the btree - * by bch2_trans_mark_pointer(): - */ - if (!m.cached_sectors && - !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { - BUG_ON(m.data_type); - bucket_cmpxchg(g, m, m.gen++); - percpu_up_read(&c->mark_lock); - goto out; - } - percpu_up_read(&c->mark_lock); - /* - * If the read-only path is trying to shut down, we can't be generating - * new btree updates: - */ - if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { - ret = 1; - goto out; - } - - ret = bch2_trans_do(c, NULL, journal_seq, + ret = bch2_trans_do(c, NULL, &commit_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| BTREE_INSERT_JOURNAL_RESERVED| flags, - bucket_invalidate_btree(&trans, ca, b)); -out: + bucket_invalidate_btree(&trans, ca, b, &u)); + if (!ret) { /* remove from alloc_heap: */ struct alloc_heap_entry e, *top = ca->alloc_heap.data; @@ -801,11 +678,17 @@ out: heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); /* - * Make sure we flush the last journal entry that updated this - * bucket (i.e. 
deleting the last reference) before writing to - * this bucket again: + * If we invalidating cached data then we need to wait on the + * journal commit: */ - *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); + if (u.data_type) + *journal_seq = max(*journal_seq, commit_seq); + + /* + * We already waiting on u.alloc_seq when we filtered out + * buckets that need journal commit: + */ + BUG_ON(*journal_seq > u.journal_seq); } else { size_t b2; @@ -856,10 +739,10 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) /* If we used NOWAIT, don't return the error: */ if (!fifo_empty(&ca->free_inc)) ret = 0; - if (ret) { + if (ret < 0) bch_err(ca, "error invalidating buckets: %i", ret); + if (ret) return ret; - } if (journal_seq) ret = bch2_journal_flush_seq(&c->journal, journal_seq); @@ -972,8 +855,14 @@ static int bch2_allocator_thread(void *arg) gc_count = c->gc_count; nr = find_reclaimable_buckets(c, ca); - trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, - ca->inc_gen_really_needs_gc); + if (!nr && ca->buckets_waiting_on_journal) { + ret = bch2_journal_flush(&c->journal); + if (ret) + goto stop; + } else if (nr < (ca->mi.nbuckets >> 6) && + ca->buckets_waiting_on_journal >= nr / 2) { + bch2_journal_flush_async(&c->journal, NULL); + } if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || ca->inc_gen_really_needs_gc) && @@ -981,6 +870,9 @@ static int bch2_allocator_thread(void *arg) atomic_inc(&c->kick_gc); wake_up_process(c->gc_thread); } + + trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc, + ca->inc_gen_really_needs_gc); } ret = bch2_invalidate_buckets(c, ca); @@ -1015,7 +907,7 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); for_each_online_member(ca, c, i) { - struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; + struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ra_pages += bdi->ra_pages; } @@ -1085,7 +977,7 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ob++) { spin_lock(&ob->lock); if (ob->valid && !ob->on_partial_list && - ob->ptr.dev == ca->dev_idx) + ob->dev == ca->dev_idx) ret = true; spin_unlock(&ob->lock); } @@ -1232,22 +1124,3 @@ void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); } - -void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct open_bucket *ob; - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) { - pr_buf(out, "%zu ref %u type %s\n", - ob - c->open_buckets, - atomic_read(&ob->pin), - bch2_data_types[ob->type]); - } - spin_unlock(&ob->lock); - } - -} diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index a4f6bf5..98c7866 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -4,11 +4,14 @@ #include "bcachefs.h" #include "alloc_types.h" +#include "buckets.h" #include "debug.h" +#include "super.h" extern const char * const bch2_allocator_states[]; struct bkey_alloc_unpacked { + u64 journal_seq; u64 bucket; u8 dev; u8 gen; @@ -19,23 +22,6 @@ struct bkey_alloc_unpacked { #undef x }; -struct bkey_alloc_buf { - struct bkey_i k; - - union { - struct { -#define x(_name, _bits) + _bits / 8 - u8 _pad[8 + BCH_ALLOC_FIELDS_V1()]; -#undef x - } _v1; - struct { -#define x(_name, _bits) + 8 + _bits / 8 - u8 _pad[8 + BCH_ALLOC_FIELDS_V2()]; -#undef x - } _v2; - }; -} __attribute__((packed, aligned(8))); - /* How out of date a pointer gen is 
allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -52,33 +38,28 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ; } +struct bkey_alloc_buf { + struct bkey_i k; + struct bch_alloc_v3 v; + +#define x(_name, _bits) + _bits / 8 + u8 _pad[0 + BCH_ALLOC_FIELDS_V2()]; +#undef x +} __attribute__((packed, aligned(8))); + struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *, - const struct bkey_alloc_unpacked); +struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *, + const struct bkey_alloc_unpacked); +int bch2_alloc_write(struct btree_trans *, struct btree_iter *, + struct bkey_alloc_unpacked *, unsigned); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -static inline struct bkey_alloc_unpacked -alloc_mem_to_key(struct btree_iter *iter, - struct bucket *g, struct bucket_mark m) -{ - return (struct bkey_alloc_unpacked) { - .dev = iter->pos.inode, - .bucket = iter->pos.offset, - .gen = m.gen, - .oldest_gen = g->oldest_gen, - .data_type = m.data_type, - .dirty_sectors = m.dirty_sectors, - .cached_sectors = m.cached_sectors, - .read_time = g->io_time[READ], - .write_time = g->io_time[WRITE], - }; -} - #define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c); const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc (struct bkey_ops) { \ @@ -91,7 +72,19 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ } -int bch2_alloc_read(struct bch_fs *); +#define bch2_bkey_ops_alloc_v3 (struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ +} + +static inline bool bkey_is_alloc(const struct bkey *k) +{ + return k->type == KEY_TYPE_alloc || + k->type == KEY_TYPE_alloc_v2 || + k->type == KEY_TYPE_alloc_v3; +} + +int bch2_alloc_read(struct bch_fs *, bool, bool); static inline void bch2_wake_allocator(struct bch_dev *ca) { @@ -129,9 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); -int bch2_alloc_write(struct bch_fs *, unsigned); void bch2_fs_allocator_background_init(struct bch_fs *); -void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); - #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 412fed4..9b81ed2 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -43,9 +43,32 @@ * reference _after_ doing the index update that makes its allocation reachable. 
*/ +static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + ob->hash = *slot; + *slot = idx; +} + +static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + while (*slot != idx) { + BUG_ON(!*slot); + slot = &c->open_buckets[*slot].hash; + } + + *slot = ob->hash; + ob->hash = 0; +} + void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (ob->ec) { bch2_ec_bucket_written(c, ob); @@ -55,14 +78,16 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false); + bch2_mark_alloc_bucket(c, ca, ob->bucket, false); ob->valid = false; - ob->type = 0; + ob->data_type = 0; spin_unlock(&ob->lock); percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); + bch2_open_bucket_hash_remove(c, ob); + ob->freelist = c->open_buckets_freelist; c->open_buckets_freelist = ob - c->open_buckets; @@ -81,8 +106,7 @@ void bch2_open_bucket_write_error(struct bch_fs *c, unsigned i; open_bucket_for_each(c, obs, ob, i) - if (ob->ptr.dev == dev && - ob->ec) + if (ob->dev == dev && ob->ec) bch2_ec_bucket_cancel(c, ob); } @@ -95,7 +119,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ob = c->open_buckets + c->open_buckets_freelist; c->open_buckets_freelist = ob->freelist; atomic_set(&ob->pin, 1); - ob->type = 0; + ob->data_type = 0; c->open_buckets_nr_free--; return ob; @@ -105,8 +129,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct write_point *wp, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - bool may_realloc = wp->type == BCH_DATA_user; + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + bool may_realloc = wp->data_type == BCH_DATA_user; BUG_ON(ca->open_buckets_partial_nr > ARRAY_SIZE(ca->open_buckets_partial)); @@ -127,37 +151,18 @@ static void open_bucket_free_unused(struct bch_fs *c, } } -static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, obs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - - BUG_ON(ptr_stale(ca, &ob->ptr)); - } -#endif -} - /* _only_ for allocating the journal on a new device: */ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) { - struct bucket_array *buckets; - ssize_t b; + while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { + u64 b = ca->new_fs_bucket_idx++; - rcu_read_lock(); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) - if (is_available_bucket(buckets->b[b].mark) && - !buckets->b[b].mark.owned_by_allocator) - goto success; - b = -1; -success: - rcu_read_unlock(); - return b; + if (!is_superblock_bucket(ca, b) && + (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) + return b; + } + + return -1; } static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) @@ -251,15 +256,14 @@ out: ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->alloc_reserve = reserve; - ob->ptr = (struct bch_extent_ptr) { - .type = 1 << 
BCH_EXTENT_ENTRY_ptr, - .gen = bucket(ca, b)->mark.gen, - .offset = bucket_to_sector(ca, b), - .dev = ca->dev_idx, - }; - + ob->dev = ca->dev_idx; + ob->gen = *bucket_gen(ca, b); + ob->bucket = b; spin_unlock(&ob->lock); + ca->nr_open_buckets++; + bch2_open_bucket_hash_add(c, ob); + if (c->blocked_allocate_open_bucket) { bch2_time_stats_update( &c->times[BCH_TIME_blocked_allocate_open_bucket], @@ -274,7 +278,6 @@ out: c->blocked_allocate = 0; } - ca->nr_open_buckets++; spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); @@ -338,9 +341,9 @@ static void add_new_bucket(struct bch_fs *c, struct open_bucket *ob) { unsigned durability = - bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; + bch_dev_bkey_exists(c, ob->dev)->mi.durability; - __clear_bit(ob->ptr.dev, devs_may_alloc->d); + __clear_bit(ob->dev, devs_may_alloc->d); *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ? durability : 1; *have_cache |= !durability; @@ -348,8 +351,7 @@ static void add_new_bucket(struct bch_fs *c, ob_push(c, ptrs, ob); } -enum bucket_alloc_ret -bch2_bucket_alloc_set(struct bch_fs *c, +int bch2_bucket_alloc_set(struct bch_fs *c, struct open_buckets *ptrs, struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, @@ -363,7 +365,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); struct bch_dev *ca; - enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; + int ret = -INSUFFICIENT_DEVICES; unsigned i; BUG_ON(*nr_effective >= nr_replicas); @@ -381,7 +383,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, ob = bch2_bucket_alloc(c, ca, reserve, flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { - ret = -PTR_ERR(ob); + ret = PTR_ERR(ob); if (cl) return ret; @@ -394,7 +396,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, bch2_dev_stripe_increment(ca, stripe); if (*nr_effective >= nr_replicas) - return ALLOC_SUCCESS; + return 0; } return ret; @@ -408,8 +410,7 @@ bch2_bucket_alloc_set(struct bch_fs *c, * it's to a device we don't want: */ -static enum bucket_alloc_ret -bucket_alloc_from_stripe(struct bch_fs *c, +static int bucket_alloc_from_stripe(struct bch_fs *c, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_mask *devs_may_alloc, @@ -452,13 +453,13 @@ bucket_alloc_from_stripe(struct bch_fs *c, continue; ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->ptr.dev == devs_sorted.devs[i] && + if (ob->dev == devs_sorted.devs[i] && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) goto got_bucket; } goto out_put_head; got_bucket: - ca = bch_dev_bkey_exists(c, ob->ptr.dev); + ca = bch_dev_bkey_exists(c, ob->dev); ob->ec_idx = ec_idx; ob->ec = h->s; @@ -488,12 +489,12 @@ static void get_buckets_from_writepoint(struct bch_fs *c, unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (*nr_effective < nr_replicas && - test_bit(ob->ptr.dev, devs_may_alloc->d) && + test_bit(ob->dev, devs_may_alloc->d) && (ca->mi.durability || - (wp->type == BCH_DATA_user && !*have_cache)) && + (wp->data_type == BCH_DATA_user && !*have_cache)) && (ob->ec || !need_ec)) { add_new_bucket(c, ptrs, devs_may_alloc, nr_effective, have_cache, @@ -505,8 +506,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c, wp->ptrs = ptrs_skip; } -static enum bucket_alloc_ret -open_bucket_add_buckets(struct bch_fs *c, +static int open_bucket_add_buckets(struct bch_fs *c, struct open_buckets *ptrs, struct write_point *wp, 
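(Illustrative aside, not part of the patch: the open_bucket rework in this file drops the embedded bch_extent_ptr in favour of separate dev/gen/bucket fields and tracks every open bucket in a small hash table chained through open_bucket_idx_t indices, with index 0 reserved as the end-of-chain marker. The stand-alone sketch below mirrors the pattern used by bch2_open_bucket_hash_add(), bch2_open_bucket_hash_remove() and bch2_bucket_is_open(); the table sizes, types and hash function are simplified placeholders, not the real ones.)

/* Stand-alone sketch of an index-chained open-coded hash table. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_OBS          16      /* placeholder for OPEN_BUCKETS_COUNT */
#define HASH_SLOTS      8       /* must be a power of two */

struct ob {
        uint8_t         dev;
        uint64_t        bucket;
        uint16_t        hash;   /* index of next entry in chain; 0 == end */
};

static struct ob        obs[NR_OBS];            /* index 0 is never used */
static uint16_t         hash_heads[HASH_SLOTS];

static uint16_t *hashslot(uint8_t dev, uint64_t bucket)
{
        /* stand-in for jhash_3words(dev, bucket, bucket >> 32, 0) */
        return &hash_heads[(dev * 31 + (unsigned) bucket) & (HASH_SLOTS - 1)];
}

static void hash_add(uint16_t idx)
{
        uint16_t *slot = hashslot(obs[idx].dev, obs[idx].bucket);

        obs[idx].hash = *slot;          /* push onto the front of the chain */
        *slot = idx;
}

static void hash_remove(uint16_t idx)
{
        uint16_t *slot = hashslot(obs[idx].dev, obs[idx].bucket);

        while (*slot != idx)            /* caller guarantees idx is present */
                slot = &obs[*slot].hash;

        *slot = obs[idx].hash;          /* unlink */
        obs[idx].hash = 0;
}

static bool bucket_is_open(uint8_t dev, uint64_t bucket)
{
        uint16_t i = *hashslot(dev, bucket);

        while (i) {
                if (obs[i].dev == dev && obs[i].bucket == bucket)
                        return true;
                i = obs[i].hash;
        }
        return false;
}

int main(void)
{
        obs[1] = (struct ob) { .dev = 0, .bucket = 42 };
        hash_add(1);
        printf("open: %d\n", bucket_is_open(0, 42));    /* prints 1 */
        hash_remove(1);
        printf("open: %d\n", bucket_is_open(0, 42));    /* prints 0 */
        return 0;
}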
struct bch_devs_list *devs_have, @@ -522,11 +522,11 @@ open_bucket_add_buckets(struct bch_fs *c, struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; - enum bucket_alloc_ret ret; + int ret; unsigned i; rcu_read_lock(); - devs = target_rw_devs(c, wp->type, target); + devs = target_rw_devs(c, wp->data_type, target); rcu_read_unlock(); /* Don't allocate from devices we already have pointers to: */ @@ -534,7 +534,7 @@ open_bucket_add_buckets(struct bch_fs *c, __clear_bit(devs_have->devs[i], devs.d); open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->ptr.dev, devs.d); + __clear_bit(ob->dev, devs.d); if (erasure_code) { if (!ec_open_bucket(c, ptrs)) { @@ -550,8 +550,8 @@ open_bucket_add_buckets(struct bch_fs *c, target, erasure_code, nr_replicas, nr_effective, have_cache, flags, _cl); - if (ret == FREELIST_EMPTY || - ret == OPEN_BUCKETS_EMPTY) + if (ret == -FREELIST_EMPTY || + ret == -OPEN_BUCKETS_EMPTY) return ret; if (*nr_effective >= nr_replicas) return 0; @@ -575,7 +575,7 @@ retry_blocking: ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, reserve, flags, cl); - if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { + if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) { cl = _cl; goto retry_blocking; } @@ -594,7 +594,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, unsigned i, j; open_bucket_for_each(c, obs, ob, i) { - bool drop = !ca || ob->ptr.dev == ca->dev_idx; + bool drop = !ca || ob->dev == ca->dev_idx; if (!drop && ob->ec) { mutex_lock(&ob->ec->lock); @@ -603,7 +603,7 @@ void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, continue; ob2 = c->open_buckets + ob->ec->blocks[j]; - drop |= ob2->ptr.dev == ca->dev_idx; + drop |= ob2->dev == ca->dev_idx; } mutex_unlock(&ob->ec->lock); } @@ -772,7 +772,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned nr_effective, write_points_nr; unsigned ob_flags = 0; bool have_cache; - enum bucket_alloc_ret ret; + int ret; int i; if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) @@ -787,11 +787,11 @@ retry: wp = writepoint_find(c, write_point.v); - if (wp->type == BCH_DATA_user) + if (wp->data_type == BCH_DATA_user) ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; /* metadata may not allocate on cache devices: */ - if (wp->type != BCH_DATA_user) + if (wp->data_type != BCH_DATA_user) have_cache = true; if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { @@ -821,7 +821,7 @@ alloc_done: if (erasure_code && !ec_open_bucket(c, &ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); - if (ret == INSUFFICIENT_DEVICES && + if (ret == -INSUFFICIENT_DEVICES && nr_effective >= nr_replicas_required) ret = 0; @@ -841,8 +841,6 @@ alloc_done: BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - verify_not_stale(c, &wp->ptrs); - return wp; err: open_bucket_for_each(c, &wp->ptrs, ob, i) @@ -854,27 +852,42 @@ err: mutex_unlock(&wp->lock); - if (ret == FREELIST_EMPTY && + if (ret == -FREELIST_EMPTY && try_decrease_writepoints(c, write_points_nr)) goto retry; switch (ret) { - case OPEN_BUCKETS_EMPTY: - case FREELIST_EMPTY: + case -OPEN_BUCKETS_EMPTY: + case -FREELIST_EMPTY: return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); - case INSUFFICIENT_DEVICES: + case -INSUFFICIENT_DEVICES: return ERR_PTR(-EROFS); default: BUG(); } } +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} + /* * Append pointers to the space we just allocated to @k, and mark @sectors space * as allocated out of @ob */ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i *k, unsigned sectors) + struct bkey_i *k, unsigned sectors, + bool cached) { struct open_bucket *ob; @@ -884,14 +897,14 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, wp->sectors_free -= sectors; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - struct bch_extent_ptr tmp = ob->ptr; + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); - tmp.cached = !ca->mi.durability && - wp->type == BCH_DATA_user; + ptr.cached = cached || + (!ca->mi.durability && + wp->data_type == BCH_DATA_user); - tmp.offset += ca->mi.bucket_size - ob->sectors_free; - bch2_bkey_append_ptr(k, tmp); + bch2_bkey_append_ptr(k, ptr); BUG_ON(sectors > ob->sectors_free); ob->sectors_free -= sectors; @@ -921,7 +934,7 @@ static inline void writepoint_init(struct write_point *wp, enum bch_data_type type) { mutex_init(&wp->lock); - wp->type = type; + wp->data_type = type; } void bch2_fs_allocator_foreground_init(struct bch_fs *c) @@ -958,3 +971,22 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) writepoint_hash(c, wp->write_point)); } } + +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct open_bucket *ob; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list) { + pr_buf(out, "%zu ref %u type %s\n", + ob - c->open_buckets, + atomic_read(&ob->pin), + bch2_data_types[ob->data_type]); + } + spin_unlock(&ob->lock); + } + +} diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index c658295..d466bda 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -12,13 +12,6 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; -enum bucket_alloc_ret { - ALLOC_SUCCESS, - OPEN_BUCKETS_EMPTY, - FREELIST_EMPTY, /* Allocator thread not keeping up */ - INSUFFICIENT_DEVICES, -}; - struct dev_alloc_list { unsigned nr; u8 devs[BCH_SB_MEMBERS_MAX]; @@ -92,14 +85,37 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) { - ob->type = wp->type; + ob->data_type = wp->data_type; atomic_inc(&ob->pin); ob_push(c, ptrs, ob); } } -enum bucket_alloc_ret -bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, +static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, + unsigned dev, u64 bucket) +{ + return c->open_buckets_hash + + (jhash_3words(dev, bucket, bucket >> 32, 0) & + (OPEN_BUCKETS_COUNT - 1)); +} + +static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) +{ + open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); + + while (slot) { + struct open_bucket *ob = &c->open_buckets[slot]; + + if (ob->dev == dev && ob->bucket 
== bucket) + return true; + + slot = ob->hash; + } + + return false; +} + +int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, unsigned, unsigned *, bool *, enum alloc_reserve, unsigned, struct closure *); @@ -113,8 +129,9 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *, unsigned, struct closure *); +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i *, unsigned); + struct bkey_i *, unsigned, bool); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, @@ -135,4 +152,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 4a1cd8b..409232e 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -37,24 +37,31 @@ typedef FIFO(long) alloc_fifo; #define WRITE_POINT_HASH_NR 32 #define WRITE_POINT_MAX 32 +/* + * 0 is never a valid open_bucket_idx_t: + */ typedef u16 open_bucket_idx_t; struct open_bucket { spinlock_t lock; atomic_t pin; open_bucket_idx_t freelist; + open_bucket_idx_t hash; /* * When an open bucket has an ec_stripe attached, this is the index of * the block in the stripe this open_bucket corresponds to: */ u8 ec_idx; - u8 type; + enum bch_data_type data_type:3; unsigned valid:1; unsigned on_partial_list:1; int alloc_reserve:3; + unsigned sectors_free; - struct bch_extent_ptr ptr; + u8 dev; + u8 gen; + u64 bucket; struct ec_stripe_new *ec; }; @@ -74,7 +81,7 @@ struct write_point { struct mutex lock; u64 last_used; unsigned long write_point; - enum bch_data_type type; + enum bch_data_type data_type; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 051aba6..0e9689f 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -177,8 +177,13 @@ */ #undef pr_fmt +#ifdef __KERNEL__ #define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ +#else +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ +#endif +#include #include #include #include @@ -199,6 +204,7 @@ #include #include "bcachefs_format.h" +#include "errcode.h" #include "fifo.h" #include "opts.h" #include "util.h" @@ -275,9 +281,6 @@ do { \ "significantly affect performance") \ BCH_DEBUG_PARAM(debug_check_iterators, \ "Enables extra verification for btree iterators") \ - BCH_DEBUG_PARAM(debug_check_bkeys, \ - "Run bkey_debugcheck (primarily checking GC/allocation "\ - "information) when iterating over keys") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ BCH_DEBUG_PARAM(journal_seq_verify, \ @@ -319,8 +322,12 @@ BCH_DEBUG_PARAMS_DEBUG() #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ + x(btree_node_compact) \ + x(btree_node_merge) \ x(btree_node_sort) \ x(btree_node_read) \ + x(btree_interior_update_foreground) \ + x(btree_interior_update_total) \ x(btree_gc) \ x(btree_lock_contended_read) \ x(btree_lock_contended_intent) \ @@ -328,8 +335,8 @@ BCH_DEBUG_PARAMS_DEBUG() x(data_write) \ x(data_read) \ x(data_promote) \ - x(journal_write) \ - x(journal_delay) \ + 
x(journal_flush_write) \ + x(journal_noflush_write) \ x(journal_flush_seq) \ x(blocked_journal) \ x(blocked_allocate) \ @@ -345,6 +352,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" #include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" #include "clock_types.h" #include "ec_types.h" #include "journal_types.h" @@ -352,6 +360,7 @@ enum bch_time_stats { #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" +#include "subvolume_types.h" #include "super_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ @@ -380,6 +389,8 @@ enum gc_phase { GC_PHASE_BTREE_alloc, GC_PHASE_BTREE_quotas, GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, GC_PHASE_PENDING_DELETE, }; @@ -423,6 +434,7 @@ struct bch_dev { struct bch_sb_handle disk_sb; struct bch_sb *sb_read_scratch; int sb_write_error; + dev_t dev; struct bch_devs_mask self; @@ -436,6 +448,8 @@ struct bch_dev { * Or rcu_read_lock(), but only for ptr_stale(): */ struct bucket_array __rcu *buckets[2]; + struct bucket_gens __rcu *bucket_gens; + u8 *oldest_gen; unsigned long *buckets_nouse; struct rw_semaphore bucket_lock; @@ -444,6 +458,7 @@ struct bch_dev { struct bch_dev_usage __percpu *usage_gc; /* Allocator: */ + u64 new_fs_bucket_idx; struct task_struct __rcu *alloc_thread; /* @@ -466,6 +481,7 @@ struct bch_dev { size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; + size_t buckets_waiting_on_journal; enum allocator_states allocator_state; @@ -491,6 +507,7 @@ struct bch_dev { enum { /* startup: */ + BCH_FS_INITIALIZED, BCH_FS_ALLOC_READ_DONE, BCH_FS_ALLOC_CLEAN, BCH_FS_ALLOCATOR_RUNNING, @@ -498,7 +515,6 @@ enum { BCH_FS_INITIAL_GC_DONE, BCH_FS_INITIAL_GC_UNFIXED, BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_BTREE_INTERIOR_REPLAY_DONE, BCH_FS_FSCK_DONE, BCH_FS_STARTED, BCH_FS_RW, @@ -518,7 +534,6 @@ enum { /* misc: */ BCH_FS_NEED_ANOTHER_GC, BCH_FS_DELETED_NODES, - BCH_FS_NEED_ALLOC_WRITE, BCH_FS_REBUILD_REPLICAS, BCH_FS_HOLD_BTREE_WRITES, }; @@ -548,6 +563,7 @@ struct journal_keys { enum btree_id btree_id:8; unsigned level:8; bool allocated; + bool overwritten; struct bkey_i *k; u32 journal_seq; u32 journal_offset; @@ -557,12 +573,27 @@ struct journal_keys { u64 journal_seq_base; }; -struct btree_iter_buf { - struct btree_iter *iter; +struct btree_path_buf { + struct btree_path *path; }; #define REPLICAS_DELTA_LIST_MAX (1U << 16) +struct snapshot_t { + u32 parent; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 equiv; +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + struct bch_fs { struct closure cl; @@ -609,7 +640,6 @@ struct bch_fs { u16 version; u16 version_min; - u16 encoded_extent_max; u8 nr_devices; u8 clean; @@ -634,6 +664,15 @@ struct bch_fs { struct closure sb_write; struct mutex sb_lock; + /* snapshot.c: */ + GENRADIX(struct snapshot_t) snapshots; + struct bch_snapshot_table __rcu *snapshot_table; + struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; + struct snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; + /* BTREE CACHE */ struct bio_set btree_bio; struct workqueue_struct *io_complete_wq; @@ -666,13 +705,15 @@ struct bch_fs { /* btree_iter.c: */ struct mutex btree_trans_lock; struct list_head btree_trans_list; - mempool_t 
btree_iters_pool; + mempool_t btree_paths_pool; mempool_t btree_trans_mem_pool; - struct btree_iter_buf __percpu *btree_iters_bufs; + struct btree_path_buf __percpu *btree_paths_bufs; struct srcu_struct btree_trans_barrier; + bool btree_trans_barrier_initialized; struct btree_key_cache btree_key_cache; + unsigned btree_key_cache_btrees; struct workqueue_struct *btree_update_wq; struct workqueue_struct *btree_io_complete_wq; @@ -721,10 +762,12 @@ struct bch_fs { struct closure_waitlist freelist_wait; u64 blocked_allocate; u64 blocked_allocate_open_bucket; + open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; struct closure_waitlist open_buckets_wait; struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; struct write_point btree_write_point; struct write_point rebalance_write_point; @@ -734,6 +777,8 @@ struct bch_fs { struct mutex write_points_hash_lock; unsigned write_points_nr; + struct buckets_waiting_for_journal buckets_waiting_for_journal; + /* GARBAGE COLLECTION */ struct task_struct *gc_thread; atomic_t kick_gc; @@ -759,6 +804,7 @@ struct bch_fs { * it's not while a gc is in progress. */ struct rw_semaphore gc_lock; + struct mutex gc_gens_lock; /* IO PATH */ struct semaphore io_in_flight; @@ -791,8 +837,13 @@ struct bch_fs { struct write_point copygc_write_point; s64 copygc_wait; + /* DATA PROGRESS STATS */ + struct list_head data_progress_list; + struct mutex data_progress_lock; + /* STRIPES: */ - GENRADIX(struct stripe) stripes[2]; + GENRADIX(struct stripe) stripes; + GENRADIX(struct gc_stripe) gc_stripes; ec_stripes_heap ec_stripes_heap; spinlock_t ec_stripes_heap_lock; @@ -816,7 +867,6 @@ struct bch_fs { u64 reflink_hint; reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; - size_t reflink_gc_idx; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; @@ -888,10 +938,25 @@ static inline unsigned bucket_bytes(const struct bch_dev *ca) static inline unsigned block_bytes(const struct bch_fs *c) { - return c->opts.block_size << 9; + return c->opts.block_size; +} + +static inline unsigned block_sectors(const struct bch_fs *c) +{ + return c->opts.block_size >> 9; +} + +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> 9; +} + +static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) +{ + return c->btree_key_cache_btrees & (1U << btree); } -static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time) +static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) { struct timespec64 t; s32 rem; @@ -903,13 +968,13 @@ static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time return t; } -static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) +static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) { return (ts.tv_sec * c->sb.time_units_per_sec + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; } -static inline s64 bch2_current_time(struct bch_fs *c) +static inline s64 bch2_current_time(const struct bch_fs *c) { struct timespec64 now; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 98779e4..5153f0e 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -76,6 +76,7 @@ #include #include #include +#include "vstructs.h" #define LE_BITMASK(_bits, name, type, field, offset, end) \ static const unsigned name##_OFFSET = offset; \ @@ 
-323,7 +324,7 @@ static inline void bkey_init(struct bkey *k) */ #define BCH_BKEY_TYPES() \ x(deleted, 0) \ - x(discard, 1) \ + x(whiteout, 1) \ x(error, 2) \ x(cookie, 3) \ x(hash_whiteout, 4) \ @@ -342,7 +343,11 @@ static inline void bkey_init(struct bkey *k) x(inline_data, 17) \ x(btree_ptr_v2, 18) \ x(indirect_inline_data, 19) \ - x(alloc_v2, 20) + x(alloc_v2, 20) \ + x(subvolume, 21) \ + x(snapshot, 22) \ + x(inode_v2, 23) \ + x(alloc_v3, 24) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -355,7 +360,7 @@ struct bch_deleted { struct bch_val v; }; -struct bch_discard { +struct bch_whiteout { struct bch_val v; }; @@ -679,6 +684,16 @@ struct bch_inode { __u8 fields[0]; } __attribute__((packed, aligned(8))); +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[0]; +} __attribute__((packed, aligned(8))); + struct bch_inode_generation { struct bch_val v; @@ -686,6 +701,10 @@ struct bch_inode_generation { __le32 pad; } __attribute__((packed, aligned(8))); +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + #define BCH_INODE_FIELDS() \ x(bi_atime, 96) \ x(bi_ctime, 96) \ @@ -709,7 +728,9 @@ struct bch_inode_generation { x(bi_erasure_code, 16) \ x(bi_fields_set, 16) \ x(bi_dir, 64) \ - x(bi_dir_offset, 64) + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -764,6 +785,9 @@ LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + /* Dirents */ /* @@ -781,7 +805,13 @@ struct bch_dirent { struct bch_val v; /* Target inode number: */ + union { __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; /* * Copy of mode bits 12-15 from the target inode - so userspace can get @@ -792,6 +822,9 @@ struct bch_dirent { __u8 d_name[]; } __attribute__((packed, aligned(8))); +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + #define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ sizeof(struct bkey) - \ offsetof(struct bch_dirent, d_name)) @@ -849,6 +882,17 @@ struct bch_alloc_v2 { x(stripe, 32) \ x(stripe_redundancy, 8) +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __attribute__((packed, aligned(8))); + enum { #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, BCH_ALLOC_FIELDS_V1() @@ -902,18 +946,24 @@ struct bch_stripe { struct bch_reflink_p { struct bch_val v; __le64 idx; - - __le32 reservation_generation; - __u8 nr_replicas; - __u8 pad[3]; -}; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. 
+ */ + __le32 front_pad; + __le32 back_pad; +} __attribute__((packed, aligned(8))); struct bch_reflink_v { struct bch_val v; __le64 refcount; union bch_extent_entry start[0]; __u64 _data[0]; -}; +} __attribute__((packed, aligned(8))); struct bch_indirect_inline_data { struct bch_val v; @@ -928,6 +978,43 @@ struct bch_inline_data { u8 data[0]; }; +/* Subvolumes: */ + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +/* Snapshots */ + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + __le32 pad; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -977,15 +1064,12 @@ struct bch_member { }; LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -/* 4-10 unused, was TIER, HAS_(META)DATA */ -LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) +/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) -#define BCH_TIER_MAX 4U - #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); @@ -1004,18 +1088,6 @@ enum bch_member_state { BCH_MEMBER_STATE_NR }; -#define BCH_CACHE_REPLACEMENT_POLICIES() \ - x(lru, 0) \ - x(fifo, 1) \ - x(random, 2) - -enum bch_cache_replacement_policies { -#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n, - BCH_CACHE_REPLACEMENT_POLICIES() -#undef x - BCH_CACHE_REPLACEMENT_NR -}; - struct bch_sb_field_members { struct bch_sb_field field; struct bch_member members[0]; @@ -1211,7 +1283,11 @@ enum bcachefs_metadata_version { bcachefs_metadata_version_snapshot = 12, bcachefs_metadata_version_inode_backpointers = 13, bcachefs_metadata_version_btree_ptr_sectors_written = 14, - bcachefs_metadata_version_max = 15, + bcachefs_metadata_version_snapshot_2 = 15, + bcachefs_metadata_version_reflink_p_fix = 16, + bcachefs_metadata_version_subvol_dirent = 17, + bcachefs_metadata_version_inode_v2 = 18, + bcachefs_metadata_version_max = 19, }; #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) @@ -1348,6 +1424,10 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); +LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); +LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); 
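(Illustrative aside, not part of the patch: BCH_SB_JOURNAL_FLUSH_DELAY and the other new superblock fields here are carved out of the existing little-endian flags words by LE64_BITMASK(), which expands to a getter/setter pair over the half-open bit range [offset, end). A minimal sketch of the same get/set arithmetic, using a plain uint64_t instead of __le64 and hypothetical helper names:)

/* Sketch of an LE_BITMASK()-style accessor; assumes 0 < end - offset < 64. */
#include <stdint.h>
#include <stdio.h>

static uint64_t bits_get(uint64_t flags, unsigned offset, unsigned end)
{
        return (flags >> offset) & (~0ULL >> (64 - (end - offset)));
}

static uint64_t bits_set(uint64_t flags, unsigned offset, unsigned end, uint64_t v)
{
        uint64_t mask = (~0ULL >> (64 - (end - offset))) << offset;

        return (flags & ~mask) | ((v << offset) & mask);
}

int main(void)
{
        uint64_t flags = 0;

        /* e.g. a journal flush delay stored in bits [30, 62) of flags[3]: */
        flags = bits_set(flags, 30, 62, 1000);
        printf("%llu\n", (unsigned long long) bits_get(flags, 30, 62)); /* 1000 */
        return 0;
}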
+LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); +LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); /* * Features: @@ -1355,7 +1435,7 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist * reflink: gates KEY_TYPE_reflink * inline_data: gates KEY_TYPE_inline_data - * new_siphash: gates BCH_STR_HASH_SIPHASH + * new_siphash: gates BCH_STR_HASH_siphash * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE */ #define BCH_SB_FEATURES() \ @@ -1431,12 +1511,17 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_STR_HASH_TYPES() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash_old, 2) \ + x(siphash, 3) + enum bch_str_hash_type { - BCH_STR_HASH_CRC32C = 0, - BCH_STR_HASH_CRC64 = 1, - BCH_STR_HASH_SIPHASH_OLD = 2, - BCH_STR_HASH_SIPHASH = 3, - BCH_STR_HASH_NR = 4, +#define x(t, n) BCH_STR_HASH_##t = n, + BCH_STR_HASH_TYPES() +#undef x + BCH_STR_HASH_NR }; #define BCH_STR_HASH_OPTS() \ @@ -1451,34 +1536,39 @@ enum bch_str_hash_opts { BCH_STR_HASH_OPT_NR }; +#define BCH_CSUM_TYPES() \ + x(none, 0) \ + x(crc32c_nonzero, 1) \ + x(crc64_nonzero, 2) \ + x(chacha20_poly1305_80, 3) \ + x(chacha20_poly1305_128, 4) \ + x(crc32c, 5) \ + x(crc64, 6) \ + x(xxhash, 7) + enum bch_csum_type { - BCH_CSUM_NONE = 0, - BCH_CSUM_CRC32C_NONZERO = 1, - BCH_CSUM_CRC64_NONZERO = 2, - BCH_CSUM_CHACHA20_POLY1305_80 = 3, - BCH_CSUM_CHACHA20_POLY1305_128 = 4, - BCH_CSUM_CRC32C = 5, - BCH_CSUM_CRC64 = 6, - BCH_CSUM_XXHASH = 7, - BCH_CSUM_NR = 8, +#define x(t, n) BCH_CSUM_##t = n, + BCH_CSUM_TYPES() +#undef x + BCH_CSUM_NR }; static const unsigned bch_crc_bytes[] = { - [BCH_CSUM_NONE] = 0, - [BCH_CSUM_CRC32C_NONZERO] = 4, - [BCH_CSUM_CRC32C] = 4, - [BCH_CSUM_CRC64_NONZERO] = 8, - [BCH_CSUM_CRC64] = 8, - [BCH_CSUM_XXHASH] = 8, - [BCH_CSUM_CHACHA20_POLY1305_80] = 10, - [BCH_CSUM_CHACHA20_POLY1305_128] = 16, + [BCH_CSUM_none] = 0, + [BCH_CSUM_crc32c_nonzero] = 4, + [BCH_CSUM_crc32c] = 4, + [BCH_CSUM_crc64_nonzero] = 8, + [BCH_CSUM_crc64] = 8, + [BCH_CSUM_xxhash] = 8, + [BCH_CSUM_chacha20_poly1305_80] = 10, + [BCH_CSUM_chacha20_poly1305_128] = 16, }; static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) { switch (type) { - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: return true; default: return false; @@ -1572,7 +1662,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(usage, 5) \ x(data_usage, 6) \ x(clock, 7) \ - x(dev_usage, 8) + x(dev_usage, 8) \ + x(log, 9) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1602,11 +1693,16 @@ struct jset_entry_blacklist_v2 { __le64 end; }; +#define BCH_FS_USAGE_TYPES() \ + x(reserved, 0) \ + x(inodes, 1) \ + x(key_version, 2) + enum { - FS_USAGE_RESERVED = 0, - FS_USAGE_INODES = 1, - FS_USAGE_KEY_VERSION = 2, - FS_USAGE_NR = 3 +#define x(f, nr) BCH_FS_USAGE_##f = nr, + BCH_FS_USAGE_TYPES() +#undef x + BCH_FS_USAGE_NR }; struct jset_entry_usage { @@ -1644,6 +1740,17 @@ struct jset_entry_dev_usage { struct jset_entry_dev_usage_type d[]; } __attribute__((packed)); +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ + return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); +} + +struct jset_entry_log { + struct jset_entry entry; + u8 d[]; +} __attribute__((packed)); + /* * On disk format for a journal 
entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1695,7 +1802,9 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); x(alloc, 4) \ x(quotas, 5) \ x(stripes, 6) \ - x(reflink, 7) + x(reflink, 7) \ + x(subvolumes, 8) \ + x(snapshots, 9) enum btree_id { #define x(kwd, val) BTREE_ID_##kwd = val, diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index f679fc2..930981a 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -78,6 +78,9 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) #define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) +#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) +#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) + /* ioctl below act on a particular file, not the filesystem as a whole: */ #define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) @@ -349,4 +352,16 @@ struct bch_ioctl_disk_resize_journal { __u64 nbuckets; }; +struct bch_ioctl_subvolume { + __u32 flags; + __u32 dirfd; + __u16 mode; + __u16 pad[3]; + __u64 dst_ptr; + __u64 src_ptr; +}; + +#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) +#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 2e45d88..7dee3d8 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -55,7 +55,7 @@ static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) #define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) #define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) enum bkey_lr_packed { BKEY_PACKED_BOTH, @@ -163,37 +163,6 @@ static inline struct bpos bpos_max(struct bpos l, struct bpos r) return bpos_cmp(l, r) > 0 ? 
l : r; } -#define sbb(a, b, borrow) \ -do { \ - typeof(a) d1, d2; \ - \ - d1 = a - borrow; \ - borrow = d1 > a; \ - \ - d2 = d1 - b; \ - borrow += d2 > d1; \ - a = d2; \ -} while (0) - -/* returns a - b: */ -static inline struct bpos bpos_sub(struct bpos a, struct bpos b) -{ - int borrow = 0; - - sbb(a.snapshot, b.snapshot, borrow); - sbb(a.offset, b.offset, borrow); - sbb(a.inode, b.inode, borrow); - return a; -} - -static inline struct bpos bpos_diff(struct bpos l, struct bpos r) -{ - if (bpos_cmp(l, r) > 0) - swap(l, r); - - return bpos_sub(r, l); -} - void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index f8adbf4..e83aeb6 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -11,6 +11,7 @@ #include "inode.h" #include "quota.h" #include "reflink.h" +#include "subvolume.h" #include "xattr.h" const char * const bch2_bkey_types[] = { @@ -30,7 +31,7 @@ static const char *deleted_key_invalid(const struct bch_fs *c, .key_invalid = deleted_key_invalid, \ } -#define bch2_bkey_ops_discard (struct bkey_ops) { \ +#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ .key_invalid = deleted_key_invalid, \ } @@ -100,6 +101,8 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) static unsigned bch2_key_types_allowed[] = { [BKEY_TYPE_extents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_error)| (1U << KEY_TYPE_cookie)| (1U << KEY_TYPE_extent)| @@ -107,26 +110,45 @@ static unsigned bch2_key_types_allowed[] = { (1U << KEY_TYPE_reflink_p)| (1U << KEY_TYPE_inline_data), [BKEY_TYPE_inodes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_v2)| (1U << KEY_TYPE_inode_generation), [BKEY_TYPE_dirents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_hash_whiteout)| (1U << KEY_TYPE_dirent), [BKEY_TYPE_xattrs] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| (1U << KEY_TYPE_cookie)| (1U << KEY_TYPE_hash_whiteout)| (1U << KEY_TYPE_xattr), [BKEY_TYPE_alloc] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_alloc)| - (1U << KEY_TYPE_alloc_v2), + (1U << KEY_TYPE_alloc_v2)| + (1U << KEY_TYPE_alloc_v3), [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_quota), [BKEY_TYPE_stripes] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_stripe), [BKEY_TYPE_reflink] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_reflink_v)| (1U << KEY_TYPE_indirect_inline_data), + [BKEY_TYPE_subvolumes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_subvolume), + [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot), [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| (1U << KEY_TYPE_btree_ptr)| (1U << KEY_TYPE_btree_ptr_v2), }; @@ -134,21 +156,18 @@ static unsigned bch2_key_types_allowed[] = { const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type) { - unsigned key_types_allowed = (1U << KEY_TYPE_deleted)| - bch2_key_types_allowed[type] ; - if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if (!(key_types_allowed & (1U << k.k->type))) + if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) return "invalid key type for this btree"; if (type == BKEY_TYPE_btree && bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) return "value too big"; - if (btree_node_type_is_extents(type)) { - if ((k.k->size == 0) != bkey_deleted(k.k)) + if (btree_node_type_is_extents(type) && 
!bkey_whiteout(k.k)) { + if (k.k->size == 0) return "bad size field"; if (k.k->size > k.k->p.offset) @@ -165,7 +184,7 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, if (type != BKEY_TYPE_btree && btree_type_has_snapshots(type) && - k.k->p.snapshot != U32_MAX) + !k.k->p.snapshot) return "invalid snapshot field"; if (type != BKEY_TYPE_btree && @@ -193,28 +212,14 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) return NULL; } -void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -{ - const char *invalid; - - BUG_ON(!k.k->u64s); - - invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: - bch2_bkey_in_btree_node(b, k); - if (invalid) { - char buf[160]; - - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); - } -} - void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { if (!bpos_cmp(pos, POS_MIN)) pr_buf(out, "POS_MIN"); else if (!bpos_cmp(pos, POS_MAX)) pr_buf(out, "POS_MAX"); + else if (!bpos_cmp(pos, SPOS_MAX)) + pr_buf(out, "SPOS_MAX"); else { if (pos.inode == U64_MAX) pr_buf(out, "U64_MAX"); diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 3012035..4fdac54 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -34,8 +34,6 @@ const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type); const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); -void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); - void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); void bch2_val_to_text(struct printbuf *, struct bch_fs *, diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index 537ab79..b1385a7 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -117,23 +117,6 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, return nr; } -static void extent_sort_append(struct bch_fs *c, - struct bkey_format *f, - struct btree_nr_keys *nr, - struct bkey_packed **out, - struct bkey_s k) -{ - if (!bkey_deleted(k.k)) { - if (!bch2_bkey_pack_key(*out, k.k, f)) - memcpy_u64s_small(*out, k.k, BKEY_U64s); - - memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); - - btree_keys_account_key_add(nr, 0, *out); - *out = bkey_next(*out); - } -} - /* Sort + repack in a new format: */ struct btree_nr_keys bch2_sort_repack(struct bset *dst, struct btree *src, @@ -144,6 +127,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, struct bkey_format *in_f = &src->format; struct bkey_packed *in, *out = vstruct_last(dst); struct btree_nr_keys nr; + bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); memset(&nr, 0, sizeof(nr)); @@ -151,8 +135,10 @@ bch2_sort_repack(struct bset *dst, struct btree *src, if (filter_whiteouts && bkey_deleted(in)) continue; - if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? in_f : &bch2_bkey_format_current, in)) + if (!transform) + bkey_copy(out, in); + else if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? 
in_f : &bch2_bkey_format_current, in)) out->format = KEY_FORMAT_LOCAL_BTREE; else bch2_bkey_unpack(src, (void *) out, in); @@ -165,47 +151,6 @@ bch2_sort_repack(struct bset *dst, struct btree *src, return nr; } -/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ -struct btree_nr_keys -bch2_sort_repack_merge(struct bch_fs *c, - struct bset *dst, struct btree *src, - struct btree_node_iter *iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_packed *out = vstruct_last(dst), *k_packed; - struct bkey_buf k; - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - bch2_bkey_buf_init(&k); - - while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_deleted(k_packed)) - continue; - - /* - * NOTE: - * bch2_bkey_normalize may modify the key we pass it (dropping - * stale pointers) and we don't have a write lock on the src - * node; we have to make a copy of the entire key before calling - * normalize - */ - bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s); - bch2_bkey_unpack(src, k.k, k_packed); - - if (filter_whiteouts && - bch2_bkey_normalize(c, bkey_i_to_s(k.k))) - continue; - - extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - bch2_bkey_buf_exit(&k, c); - return nr; -} - static inline int sort_keys_cmp(struct btree *b, struct bkey_packed *l, struct bkey_packed *r) diff --git a/libbcachefs/bkey_sort.h b/libbcachefs/bkey_sort.h index 1059996..79cf11d 100644 --- a/libbcachefs/bkey_sort.h +++ b/libbcachefs/bkey_sort.h @@ -37,11 +37,6 @@ struct btree_nr_keys bch2_sort_repack(struct bset *, struct btree *, struct btree_node_iter *, struct bkey_format *, bool); -struct btree_nr_keys -bch2_sort_repack_merge(struct bch_fs *, - struct bset *, struct btree *, - struct btree_node_iter *, - struct bkey_format *, bool); unsigned bch2_sort_keys(struct bkey_packed *, struct sort_iter *, bool); diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 0eb85ac..6000a87 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -197,9 +197,11 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, return; /* Verify no duplicates: */ - btree_node_iter_for_each(iter, set) + btree_node_iter_for_each(iter, set) { + BUG_ON(set->k > set->end); btree_node_iter_for_each(iter, s2) BUG_ON(set != s2 && set->end == s2->end); + } /* Verify that set->end is correct: */ btree_node_iter_for_each(iter, set) { @@ -471,7 +473,7 @@ static inline struct bkey_packed *tree_to_bkey(const struct btree *b, unsigned j) { return cacheline_to_bkey(b, t, - __eytzinger1_to_inorder(j, t->size, t->extra), + __eytzinger1_to_inorder(j, t->size - 1, t->extra), bkey_float(b, t, j)->key_offset); } @@ -605,10 +607,10 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, } __always_inline -static inline void __make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) +static inline void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); @@ -677,34 +679,6 @@ static inline void __make_bfloat(struct btree *b, struct bset_tree *t, f->mantissa = mantissa; } -static void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) -{ - struct bkey_i *k; - - if 
(is_power_of_2(j) && - !min_key->u64s) { - if (!bkey_pack_pos(min_key, b->data->min_key, b)) { - k = (void *) min_key; - bkey_init(&k->k); - k->k.p = b->data->min_key; - } - } - - if (is_power_of_2(j + 1) && - !max_key->u64s) { - if (!bkey_pack_pos(max_key, b->data->max_key, b)) { - k = (void *) max_key; - bkey_init(&k->k); - k->k.p = b->data->max_key; - } - } - - __make_bfloat(b, t, j, min_key, max_key); -} - /* bytes remaining - only valid for last bset: */ static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) { @@ -761,7 +735,7 @@ retry: t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; /* First we figure out where the first key in each cacheline is */ - eytzinger1_for_each(j, t->size) { + eytzinger1_for_each(j, t->size - 1) { while (bkey_to_cacheline(b, t, k) < cacheline) prev = k, k = bkey_next(k); @@ -793,10 +767,10 @@ retry: } /* Then we build the tree */ - eytzinger1_for_each(j, t->size) - __make_bfloat(b, t, j, - bkey_to_packed(&min_key), - bkey_to_packed(&max_key)); + eytzinger1_for_each(j, t->size - 1) + make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); } static void bset_alloc_tree(struct btree *b, struct bset_tree *t) @@ -895,7 +869,7 @@ static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, do { p = j ? tree_to_bkey(b, t, __inorder_to_eytzinger1(j--, - t->size, t->extra)) + t->size - 1, t->extra)) : btree_bkey_first(b, t); } while (p >= k); break; @@ -941,91 +915,6 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, /* Insert */ -static void rw_aux_tree_fix_invalidated_key(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - unsigned offset = __btree_node_key_to_offset(b, k); - unsigned j = rw_aux_tree_bsearch(b, t, offset); - - if (j < t->size && - rw_aux_tree(b, t)[j].offset == offset) - rw_aux_tree_set(b, t, j, k); - - bch2_bset_verify_rw_aux_tree(b, t); -} - -static void ro_aux_tree_fix_invalidated_key(struct btree *b, - struct bset_tree *t, - struct bkey_packed *k) -{ - struct bkey_packed min_key, max_key; - unsigned inorder, j; - - EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); - - /* signal to make_bfloat() that they're uninitialized: */ - min_key.u64s = max_key.u64s = 0; - - if (bkey_next(k) == btree_bkey_last(b, t)) { - for (j = 1; j < t->size; j = j * 2 + 1) - make_bfloat(b, t, j, &min_key, &max_key); - } - - inorder = bkey_to_cacheline(b, t, k); - - if (inorder && - inorder < t->size) { - j = __inorder_to_eytzinger1(inorder, t->size, t->extra); - - if (k == tree_to_bkey(b, t, j)) { - /* Fix the node this key corresponds to */ - make_bfloat(b, t, j, &min_key, &max_key); - - /* Children for which this key is the right boundary */ - for (j = eytzinger1_left_child(j); - j < t->size; - j = eytzinger1_right_child(j)) - make_bfloat(b, t, j, &min_key, &max_key); - } - } - - if (inorder + 1 < t->size) { - j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); - - if (k == tree_to_prev_bkey(b, t, j)) { - make_bfloat(b, t, j, &min_key, &max_key); - - /* Children for which this key is the left boundary */ - for (j = eytzinger1_right_child(j); - j < t->size; - j = eytzinger1_left_child(j)) - make_bfloat(b, t, j, &min_key, &max_key); - } - } -} - -/** - * bch2_bset_fix_invalidated_key() - given an existing key @k that has been - * modified, fix any auxiliary search tree by remaking all the nodes in the - * auxiliary search tree that @k corresponds to - */ -void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) -{ - struct 
bset_tree *t = bch2_bkey_to_bset(b, k); - - switch (bset_aux_tree_type(t)) { - case BSET_NO_AUX_TREE: - break; - case BSET_RO_AUX_TREE: - ro_aux_tree_fix_invalidated_key(b, t, k); - break; - case BSET_RW_AUX_TREE: - rw_aux_tree_fix_invalidated_key(b, t, k); - break; - } -} - static void bch2_bset_fix_lookup_table(struct btree *b, struct bset_tree *t, struct bkey_packed *_where, @@ -1260,7 +1149,7 @@ slowpath: n = n * 2 + (cmp < 0); } while (n < t->size); - inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); + inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); /* * n would have been the node we recursed to - the low bit tells us if @@ -1271,7 +1160,7 @@ slowpath: if (unlikely(!inorder)) return btree_bkey_first(b, t); - f = &base->f[eytzinger1_prev(n >> 1, t->size)]; + f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; } return cacheline_to_bkey(b, t, inorder, f->key_offset); @@ -1545,10 +1434,6 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, EBUG_ON(iter->data->k > iter->data->end); - while (!__btree_node_iter_set_end(iter, 0) && - !__bch2_btree_node_iter_peek_all(iter, b)->u64s) - iter->data->k++; - if (unlikely(__btree_node_iter_set_end(iter, 0))) { bch2_btree_node_iter_set_drop(iter, iter->data); return; @@ -1692,7 +1577,7 @@ void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, if (!inorder || inorder >= t->size) return; - j = __inorder_to_eytzinger1(inorder, t->size, t->extra); + j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); if (k != tree_to_bkey(b, t, j)) return; diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index e42f866..0d46534 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -361,7 +361,6 @@ void bch2_bset_init_first(struct btree *, struct bset *); void bch2_bset_init_next(struct bch_fs *, struct btree *, struct btree_node_entry *); void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); void bch2_bset_insert(struct btree *, struct btree_node_iter *, struct bkey_packed *, struct bkey_i *, unsigned); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index cd0c500..986d08d 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -83,6 +83,8 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = mmap(NULL, btree_aux_data_bytes(b), PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); + if (b->aux_data == MAP_FAILED) + b->aux_data = NULL; #endif if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); @@ -128,7 +130,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) { - rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + BUG_ON(ret); /* Cause future lookups for this node to fail: */ b->hash_val = 0; @@ -273,6 +276,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long touched = 0; unsigned long freed = 0; unsigned i, flags; + unsigned long ret = SHRINK_STOP; if (bch2_btree_shrinker_disabled) return SHRINK_STOP; @@ -281,7 +285,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (sc->gfp_mask & __GFP_FS) mutex_lock(&bc->lock); else if (!mutex_trylock(&bc->lock)) - return -1; + goto out_norestore; flags = memalloc_nofs_save(); @@ -298,13 +302,19 @@ static unsigned long 
bch2_btree_cache_scan(struct shrinker *shrink, i = 0; list_for_each_entry_safe(b, t, &bc->freeable, list) { + /* + * Leave a few nodes on the freeable list, so that a btree split + * won't have to hit the system allocator: + */ + if (++i <= 3) + continue; + touched++; - if (freed >= nr) + if (touched >= nr) break; - if (++i > 3 && - !btree_node_reclaim(c, b)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -315,7 +325,7 @@ restart: list_for_each_entry_safe(b, t, &bc->live, list) { touched++; - if (freed >= nr) { + if (touched >= nr) { /* Save position */ if (&t->list != &bc->live) list_move_tail(&bc->live, &t->list); @@ -350,8 +360,14 @@ restart: mutex_unlock(&bc->lock); out: + ret = (unsigned long) freed * btree_pages(c); memalloc_nofs_restore(flags); - return (unsigned long) freed * btree_pages(c); +out_norestore: + trace_btree_cache_scan(sc->nr_to_scan, + sc->nr_to_scan / btree_pages(c), + btree_cache_can_free(bc), + ret); + return ret; } static unsigned long bch2_btree_cache_count(struct shrinker *shrink, @@ -632,7 +648,8 @@ err: /* Slowpath, don't want it inlined into btree_iter_traverse() */ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, - struct btree_iter *iter, + struct btree_trans *trans, + struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned level, @@ -648,8 +665,10 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (iter && !bch2_btree_node_relock(iter, level + 1)) { - btree_trans_restart(iter->trans); + if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + trace_trans_restart_relock_parent_for_fill(trans->fn, + _THIS_IP_, btree_id, &path->pos); + btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -680,23 +699,25 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ - if (iter && sync) - bch2_trans_unlock(iter->trans); + if (trans && sync) + bch2_trans_unlock(trans); bch2_btree_node_read(c, b, sync); if (!sync) return NULL; - if (iter && - (!bch2_trans_relock(iter->trans) || - !bch2_btree_iter_relock_intent(iter))) { - BUG_ON(!iter->trans->restarted); + if (trans && + (!bch2_trans_relock(trans) || + !bch2_btree_path_relock_intent(trans, path))) { + BUG_ON(!trans->restarted); return ERR_PTR(-EINTR); } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - btree_trans_restart(iter->trans); + trace_trans_restart_relock_after_fill(trans->fn, _THIS_IP_, + btree_id, &path->pos); + btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -754,7 +775,7 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) * The btree node will have either a read or a write lock held, depending on * the @write parameter. 
*/ -struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter *iter, +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, unsigned level, enum six_lock_type lock_type, unsigned long trace_ip) @@ -766,11 +787,17 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_iter * EBUG_ON(level >= BTREE_MAX_DEPTH); - if (c->opts.btree_node_mem_ptr_optimization) { - b = btree_node_mem_ptr(k); - if (b) + b = btree_node_mem_ptr(k); + + /* + * Check b->hash_val _before_ calling btree_node_lock() - this might not + * be the node we want anymore, and trying to lock the wrong node could + * cause an unneccessary transaction restart: + */ + if (likely(c->opts.btree_node_mem_ptr_optimization && + b && + b->hash_val == btree_ptr_hash_val(k))) goto lock_node; - } retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -779,7 +806,7 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, iter, k, iter->btree_id, + b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, level, lock_type, true); /* We raced and found the btree node in the cache */ @@ -818,10 +845,10 @@ lock_node: * the parent was modified, when the pointer to the node we want * was removed - and we'll bail out: */ - if (btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); - if (!btree_node_lock(b, k->k.p, level, iter, lock_type, + if (!btree_node_lock(trans, path, b, k->k.p, level, lock_type, lock_node_check_fn, (void *) k, trace_ip)) { if (!trans->restarted) goto retry; @@ -832,13 +859,13 @@ lock_node: b->c.level != level || race_fault())) { six_unlock_type(&b->c.lock, lock_type); - if (bch2_btree_node_relock(iter, level + 1)) + if (bch2_btree_node_relock(trans, path, level + 1)) goto retry; - trace_trans_restart_btree_node_reused(trans->ip, + trace_trans_restart_btree_node_reused(trans->fn, trace_ip, - iter->btree_id, - &iter->real_pos); + path->btree_id, + &path->pos); btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -853,12 +880,12 @@ lock_node: bch2_btree_node_wait_on_read(b); /* - * should_be_locked is not set on this iterator yet, so we need - * to relock it specifically: + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: */ - if (iter && + if (trans && (!bch2_trans_relock(trans) || - !bch2_btree_iter_relock_intent(iter))) { + !bch2_btree_path_relock_intent(trans, path))) { BUG_ON(!trans->restarted); return ERR_PTR(-EINTR); } @@ -886,7 +913,7 @@ lock_node: return ERR_PTR(-EIO); } - EBUG_ON(b->c.btree_id != iter->btree_id); + EBUG_ON(b->c.btree_id != path->btree_id); EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); btree_check_header(c, b); @@ -917,7 +944,7 @@ retry: if (nofill) goto out; - b = bch2_btree_node_fill(c, NULL, k, btree_id, + b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, level, SIX_LOCK_read, true); /* We raced and found the btree node in the cache */ @@ -975,21 +1002,24 @@ out: return b; } -int bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, +int bch2_btree_node_prefetch(struct bch_fs *c, + struct btree_trans *trans, + struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned level) { struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(iter && !btree_node_locked(iter, level + 1)); + BUG_ON(trans && !btree_node_locked(path, level + 1)); 
BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); if (b) return 0; - b = bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false); + b = bch2_btree_node_fill(c, trans, path, k, btree_id, + level, SIX_LOCK_read, false); return PTR_ERR_OR_ZERO(b); } diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 5032293..f7e1098 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -22,14 +22,14 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_iter *, +struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, enum six_lock_type, unsigned long); struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, enum btree_id, unsigned, bool); -int bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, +int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, const struct bkey_i *, enum btree_id, unsigned); void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *); @@ -71,7 +71,7 @@ static inline bool btree_node_hashed(struct btree *b) static inline size_t btree_bytes(struct bch_fs *c) { - return c->opts.btree_node_size << 9; + return c->opts.btree_node_size; } static inline size_t btree_max_u64s(struct bch_fs *c) @@ -86,7 +86,7 @@ static inline size_t btree_pages(struct bch_fs *c) static inline unsigned btree_blocks(struct bch_fs *c) { - return c->opts.btree_node_size >> c->block_bits; + return btree_sectors(c) >> c->block_bits; } #define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 3dd1094..648779c 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -9,6 +9,7 @@ #include "alloc_foreground.h" #include "bkey_methods.h" #include "bkey_buf.h" +#include "btree_key_cache.h" #include "btree_locking.h" #include "btree_update_interior.h" #include "btree_io.h" @@ -155,6 +156,34 @@ static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) } } +static void bch2_btree_node_update_key_early(struct bch_fs *c, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(c, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) { struct bkey_i_btree_ptr_v2 *new; @@ -169,7 +198,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) new->v.min_key = new_min; SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); if (ret) { kfree(new); return ret; @@ -198,7 +227,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new->k.p = new_max; 
SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); - ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i); + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); if (ret) { kfree(new); return ret; @@ -498,28 +527,15 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, char buf[200]; int ret = 0; + /* + * XXX + * use check_bucket_ref here + */ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr, true); - struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); - if (fsck_err_on(g->mark.data_type && - g->mark.data_type != data_type, c, - "bucket %u:%zu different types of data in same bucket: %s, %s\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[g->mark.data_type], - bch2_data_types[data_type], - (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { - if (data_type == BCH_DATA_btree) { - g2->_mark.data_type = g->_mark.data_type = data_type; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); - } else { - do_update = true; - } - } - if (fsck_err_on(!g->gen_valid, c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" "while marking %s", @@ -528,9 +544,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; } else { do_update = true; } @@ -544,18 +559,26 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, p.ptr.gen, g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { if (!p.ptr.cached) { - g2->_mark.gen = g->_mark.gen = p.ptr.gen; - g2->gen_valid = g->gen_valid = true; - g2->_mark.data_type = 0; - g2->_mark.dirty_sectors = 0; - g2->_mark.cached_sectors = 0; + g->_mark.gen = p.ptr.gen; + g->gen_valid = true; + g->_mark.data_type = 0; + g->_mark.dirty_sectors = 0; + g->_mark.cached_sectors = 0; set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); } else { do_update = true; } } + if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) + do_update = true; + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->mark.gen) < 0, c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" @@ -566,8 +589,27 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) do_update = true; + if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen) + continue; + + if (fsck_err_on(g->mark.data_type && + g->mark.data_type != data_type, c, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[g->mark.data_type], + bch2_data_types[data_type], + (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) { + if (data_type == BCH_DATA_btree) { + g->_mark.data_type = data_type; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + 
do_update = true; + } + } + if (p.has_ec) { - struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx); + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); if (fsck_err_on(!m || !m->alive, c, "pointer to nonexistent stripe %llu\n" @@ -613,20 +655,21 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id, ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); ptr->gen = g->mark.gen; } } else { bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); (ptr->cached && (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) || (!ptr->cached && gen_cmp(ptr->gen, g->mark.gen) < 0) || + gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX || (g->mark.data_type && g->mark.data_type != data_type); })); @@ -634,7 +677,7 @@ again: ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); bkey_extent_entry_for_each(ptrs, entry) { if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { - struct stripe *m = genradix_ptr(&c->stripes[true], + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, entry->stripe_ptr.idx); union bch_extent_entry *next_ptr; @@ -659,11 +702,20 @@ found: } } - ret = bch2_journal_key_insert(c, btree_id, level, new); - if (ret) + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { kfree(new); - else - *k = bkey_i_to_s_c(new); + return ret; + } + + if (level) + bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new); + + bch2_bkey_val_to_text(&PBUF(buf), c, *k); + bch_info(c, "updated %s", buf); + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf); + *k = bkey_i_to_s_c(new); } fsck_err: return ret; @@ -671,19 +723,21 @@ fsck_err: /* marking of btree keys/nodes: */ -static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, unsigned level, bool is_root, struct bkey_s_c *k, - u8 *max_stale, bool initial) + bool initial) { - struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr; + struct bch_fs *c = trans->c; + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; unsigned flags = - BTREE_TRIGGER_INSERT| BTREE_TRIGGER_GC| (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); int ret = 0; + deleted.p = k->k->p; + if (initial) { BUG_ON(bch2_journal_seq_verify && k->k->version.lo > journal_cur_seq(&c->journal)); @@ -697,31 +751,9 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id, k->k->version.lo, atomic64_read(&c->key_version))) atomic64_set(&c->key_version, k->k->version.lo); - - if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c, - "superblock not marked as containing replicas (type %u)", - k->k->type)) { - ret = bch2_mark_bkey_replicas(c, *k); - if (ret) { - bch_err(c, "error marking bkey replicas: %i", ret); - goto err; - } - } } - ptrs = bch2_bkey_ptrs_c(*k); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, true); - - if (gen_after(g->oldest_gen, ptr->gen)) - g->oldest_gen = ptr->gen; - - *max_stale = max(*max_stale, ptr_stale(ca, ptr)); - } - - bch2_mark_key(c, *k, flags); + ret = bch2_mark_key(trans, old, *k, flags); fsck_err: err: if (ret) @@ -729,17 +761,15 @@ err: return ret; } -static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, - bool initial) +static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) { + struct bch_fs *c = trans->c; struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; struct bkey_buf prev, cur; int ret = 0; - *max_stale = 0; - if (!btree_node_type_needs_gc(btree_node_type(b))) return 0; @@ -749,8 +779,8 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, bkey_init(&prev.k->k); while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, - &k, max_stale, initial); + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, + &k, initial); if (ret) break; @@ -771,52 +801,32 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, return ret; } -static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, +static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, bool initial, bool metadata_only) { - struct btree_trans trans; - struct btree_iter *iter; + struct bch_fs *c = trans->c; + struct btree_iter iter; struct btree *b; unsigned depth = metadata_only ? 1 : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; - u8 max_stale = 0; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); - __for_each_btree_node(&trans, iter, btree_id, POS_MIN, - 0, depth, BTREE_ITER_PREFETCH, b) { + __for_each_btree_node(trans, iter, btree_id, POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { bch2_verify_btree_nr_keys(b); gc_pos_set(c, gc_pos_btree_node(b)); - ret = btree_gc_mark_node(c, b, &max_stale, initial); + ret = btree_gc_mark_node(trans, b, initial); if (ret) break; - - if (!initial) { - if (max_stale > 64) - bch2_btree_node_rewrite(&trans, iter, - b->data->keys.seq, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - else if (!bch2_btree_gc_rewrite_disabled && - (bch2_btree_gc_always_rewrite || max_stale > 16)) - bch2_btree_node_rewrite(&trans, iter, - b->data->keys.seq, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_GC_LOCK_HELD); - } - - bch2_trans_cond_resched(&trans); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; @@ -825,8 +835,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, if (!btree_node_fake(b)) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, - &k, &max_stale, initial); + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + true, &k, initial); } gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); mutex_unlock(&c->btree_root_lock); @@ -834,13 +844,13 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return ret; } -static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, +static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, unsigned target_depth) { + struct bch_fs *c = trans->c; struct btree_and_journal_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - u8 max_stale = 0; char buf[200]; int ret = 0; @@ -853,8 +863,8 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false, - &k, &max_stale, true); + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + false, &k, true); if (ret) { bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret); goto fsck_err; @@ -920,7 +930,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, break; } - ret = bch2_gc_btree_init_recurse(c, child, + ret = bch2_gc_btree_init_recurse(trans, child, target_depth); six_unlock_read(&child->c.lock); @@ -935,16 +945,16 @@ fsck_err: return ret; } -static int bch2_gc_btree_init(struct bch_fs *c, +static int bch2_gc_btree_init(struct btree_trans *trans, enum btree_id btree_id, bool metadata_only) { + struct bch_fs *c = trans->c; struct btree *b; unsigned target_depth = metadata_only ? 1 : bch2_expensive_debug_checks ? 0 : !btree_node_type_needs_gc(btree_id) ? 
1 : 0; - u8 max_stale = 0; char buf[100]; int ret = 0; @@ -971,13 +981,13 @@ static int bch2_gc_btree_init(struct bch_fs *c, } if (b->c.level >= target_depth) - ret = bch2_gc_btree_init_recurse(c, b, target_depth); + ret = bch2_gc_btree_init_recurse(trans, b, target_depth); if (!ret) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true, - &k, &max_stale, true); + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, + &k, true); } fsck_err: six_unlock_read(&b->c.lock); @@ -995,21 +1005,26 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) { + struct btree_trans trans; enum btree_id ids[BTREE_ID_NR]; unsigned i; int ret = 0; + bch2_trans_init(&trans, c, 0, 0); + for (i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); for (i = 0; i < BTREE_ID_NR && !ret; i++) ret = initial - ? bch2_gc_btree_init(c, ids[i], metadata_only) - : bch2_gc_btree(c, ids[i], initial, metadata_only); + ? bch2_gc_btree_init(&trans, ids[i], metadata_only) + : bch2_gc_btree(&trans, ids[i], initial, metadata_only); if (ret < 0) bch_err(c, "%s: ret %i", __func__, ret); + + bch2_trans_exit(&trans); return ret; } @@ -1031,23 +1046,13 @@ static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, } while (start < end); } -void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, - unsigned flags) +static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + unsigned flags) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; unsigned i; u64 b; - /* - * This conditional is kind of gross, but we may be called from the - * device add path, before the new device has actually been added to the - * running filesystem: - */ - if (c) { - lockdep_assert_held(&c->sb_lock); - percpu_down_read(&c->mark_lock); - } - for (i = 0; i < layout->nr_superblocks; i++) { u64 offset = le64_to_cpu(layout->sb_offset[i]); @@ -1066,9 +1071,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), flags); } - - if (c) - percpu_up_read(&c->mark_lock); } static void bch2_mark_superblocks(struct bch_fs *c) @@ -1096,8 +1098,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - bch2_mark_key(c, bkey_i_to_s_c(&d->key), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_GC); + bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); mutex_unlock(&c->btree_interior_update_lock); } @@ -1108,7 +1109,8 @@ static void bch2_gc_free(struct bch_fs *c) struct bch_dev *ca; unsigned i; - genradix_free(&c->stripes[1]); + genradix_free(&c->reflink_gc_table); + genradix_free(&c->gc_stripes); for_each_member_device(ca, c, i) { kvpfree(rcu_dereference_protected(ca->buckets[1], 1), @@ -1133,13 +1135,14 @@ static int bch2_gc_done(struct bch_fs *c, unsigned i, dev; int ret = 0; + percpu_down_write(&c->mark_lock); + #define copy_field(_f, _msg, ...) \ if (dst->_f != src->_f) { \ if (verify) \ fsck_err(c, _msg ": got %llu, should be %llu" \ , ##__VA_ARGS__, dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_stripe_field(_f, _msg, ...) 
\ if (dst->_f != src->_f) { \ @@ -1149,85 +1152,28 @@ static int bch2_gc_done(struct bch_fs *c, iter.pos, ##__VA_ARGS__, \ dst->_f, src->_f); \ dst->_f = src->_f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ - } -#define copy_bucket_field(_f) \ - if (dst->b[b].mark._f != src->b[b].mark._f) { \ - if (verify) \ - fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ - ": got %u, should be %u", dev, b, \ - dst->b[b].mark.gen, \ - bch2_data_types[dst->b[b].mark.data_type],\ - dst->b[b].mark._f, src->b[b].mark._f); \ - dst->b[b]._mark._f = src->b[b].mark._f; \ - set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags); \ } #define copy_dev_field(_f, _msg, ...) \ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) #define copy_fs_field(_f, _msg, ...) \ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - if (!metadata_only) { - struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0); - struct stripe *dst, *src; - - while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) { - dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL); - - if (dst->alive != src->alive || - dst->sectors != src->sectors || - dst->algorithm != src->algorithm || - dst->nr_blocks != src->nr_blocks || - dst->nr_redundant != src->nr_redundant) { - bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused"); - ret = -EINVAL; - goto fsck_err; - } - - for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) - copy_stripe_field(block_sectors[i], - "block_sectors[%u]", i); - - dst->blocks_nonempty = 0; - for (i = 0; i < dst->nr_blocks; i++) - dst->blocks_nonempty += dst->block_sectors[i] != 0; - - genradix_iter_advance(&iter, &c->stripes[1]); - } - } - for (i = 0; i < ARRAY_SIZE(c->usage); i++) bch2_fs_usage_acc_to_base(c, i); for_each_member_device(ca, c, dev) { - struct bucket_array *dst = __bucket_array(ca, 0); - struct bucket_array *src = __bucket_array(ca, 1); - size_t b; - - for (b = 0; b < src->nbuckets; b++) { - copy_bucket_field(gen); - copy_bucket_field(data_type); - copy_bucket_field(stripe); - copy_bucket_field(dirty_sectors); - copy_bucket_field(cached_sectors); - - dst->b[b].oldest_gen = src->b[b].oldest_gen; - } - - { - struct bch_dev_usage *dst = ca->usage_base; - struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage_gc, - dev_usage_u64s()); - - copy_dev_field(buckets_ec, "buckets_ec"); - copy_dev_field(buckets_unavailable, "buckets_unavailable"); - - for (i = 0; i < BCH_DATA_NR; i++) { - copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); - copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); - copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); - } + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((void *) ca->usage_gc, + dev_usage_u64s()); + + copy_dev_field(buckets_ec, "buckets_ec"); + copy_dev_field(buckets_unavailable, "buckets_unavailable"); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); } }; @@ -1269,7 +1215,6 @@ static int bch2_gc_done(struct bch_fs *c, #undef copy_fs_field #undef copy_dev_field -#undef copy_bucket_field #undef copy_stripe_field #undef copy_field fsck_err: @@ -1277,6 +1222,8 @@ fsck_err: percpu_ref_put(&ca->ref); if (ret) bch_err(c, "%s: ret %i", __func__, ret); + + percpu_up_write(&c->mark_lock); return ret; } @@ -1285,7 +1232,6 @@ 
static int bch2_gc_start(struct bch_fs *c, { struct bch_dev *ca = NULL; unsigned i; - int ret; BUG_ON(c->usage_gc); @@ -1300,15 +1246,6 @@ static int bch2_gc_start(struct bch_fs *c, BUG_ON(ca->buckets[1]); BUG_ON(ca->usage_gc); - ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket), - GFP_KERNEL|__GFP_ZERO); - if (!ca->buckets[1]) { - percpu_ref_put(&ca->ref); - bch_err(c, "error allocating ca->buckets[gc]"); - return -ENOMEM; - } - ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); @@ -1317,104 +1254,175 @@ static int bch2_gc_start(struct bch_fs *c, } } - ret = bch2_ec_mem_alloc(c, true); - if (ret) { - bch_err(c, "error allocating ec gc mem"); - return ret; - } + return 0; +} - percpu_down_write(&c->mark_lock); +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + bool initial, bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket *g; + struct bkey_s_c k; + struct bkey_alloc_unpacked old_u, new_u, gc_u; + struct bkey_alloc_buf *a; + int ret; - /* - * indicate to stripe code that we need to allocate for the gc stripes - * radix tree, too - */ - gc_pos_set(c, gc_phase(GC_PHASE_START)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; - for_each_member_device(ca, c, i) { - struct bucket_array *dst = __bucket_array(ca, 1); - struct bucket_array *src = __bucket_array(ca, 0); - size_t b; + old_u = new_u = bch2_alloc_unpack(k); - dst->first_bucket = src->first_bucket; - dst->nbuckets = src->nbuckets; + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, iter->pos.offset); + gc_u = (struct bkey_alloc_unpacked) { + .dev = iter->pos.inode, + .bucket = iter->pos.offset, + .gen = g->mark.gen, + .data_type = g->mark.data_type, + .dirty_sectors = g->mark.dirty_sectors, + .cached_sectors = g->mark.cached_sectors, + .read_time = g->io_time[READ], + .write_time = g->io_time[WRITE], + .stripe = g->stripe, + .stripe_redundancy = g->stripe_redundancy, + }; + percpu_up_read(&c->mark_lock); - for (b = 0; b < src->nbuckets; b++) { - struct bucket *d = &dst->b[b]; - struct bucket *s = &src->b[b]; + if (metadata_only && + gc_u.data_type != BCH_DATA_sb && + gc_u.data_type != BCH_DATA_journal && + gc_u.data_type != BCH_DATA_btree) + return 0; - d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; - d->gen_valid = s->gen_valid; + if (gen_after(old_u.gen, gc_u.gen)) + return 0; - if (metadata_only && - (s->mark.data_type == BCH_DATA_user || - s->mark.data_type == BCH_DATA_cached)) - d->_mark = s->mark; - } - }; +#define copy_bucket_field(_f) \ + if (fsck_err_on(new_u._f != gc_u._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ + new_u.gen, \ + bch2_data_types[new_u.data_type], \ + new_u._f, gc_u._f)) \ + new_u._f = gc_u._f; \ + + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); +#undef copy_bucket_field - percpu_up_write(&c->mark_lock); + if (!bkey_alloc_unpacked_cmp(old_u, new_u)) + return 0; - return 0; + a = bch2_alloc_pack(trans, new_u); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = initial + ? 
bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k) + : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN); +fsck_err: + return ret; } -static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k) +static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only) { - struct reflink_gc *r; - const __le64 *refcount = bkey_refcount_c(k); - char buf[200]; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; int ret = 0; - if (!refcount) - return 0; + bch2_trans_init(&trans, c, 0, 0); - r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++); - if (!r) - return -ENOMEM; + for_each_member_device(ca, c, i) { + for_each_btree_key(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS| + BTREE_ITER_PREFETCH, k, ret) { + if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + break; - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, + initial, metadata_only)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error writing alloc info: %i", ret); + percpu_ref_put(&ca->ref); + break; + } } - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), - r->refcount)) { - struct bkey_i *new; + bch2_trans_exit(&trans); + return ret; +} - new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); - if (!new) { - ret = -ENOMEM; - goto fsck_err; +static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!buckets) { + percpu_ref_put(&ca->ref); + percpu_up_write(&c->mark_lock); + bch_err(c, "error allocating ca->buckets[gc]"); + return -ENOMEM; } - bkey_reassemble(new, k); + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = ca->mi.nbuckets; + rcu_assign_pointer(ca->buckets[1], buckets); + }; - if (!r->refcount) { - new->k.type = KEY_TYPE_deleted; - new->k.size = 0; - } else { - *bkey_refcount(new) = cpu_to_le64(r->refcount); - } + return bch2_alloc_read(c, true, metadata_only); +} - ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new); - if (ret) - kfree(new); - } -fsck_err: - return ret; +static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = __bucket_array(ca, true); + struct bucket *g; + + for_each_bucket(g, buckets) { + if (metadata_only && + (g->mark.data_type == BCH_DATA_user || + g->mark.data_type == BCH_DATA_cached || + g->mark.data_type == BCH_DATA_parity)) + continue; + g->_mark.dirty_sectors = 0; + g->_mark.cached_sectors = 0; + } + }; } static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bool metadata_only) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; size_t idx = 0; @@ -1424,14 +1432,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, if (metadata_only) return 0; - if (initial) { - 
c->reflink_gc_idx = 0; - - ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink, - bch2_gc_reflink_done_initial_fn); - goto out; - } - bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, @@ -1441,7 +1441,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, if (!refcount) continue; - r = genradix_ptr(&c->reflink_gc_table, idx); + r = genradix_ptr(&c->reflink_gc_table, idx++); if (!r || r->offset != k.k->p.offset || r->size != k.k->size) { @@ -1466,12 +1466,22 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, bkey_reassemble(new, k); - if (!r->refcount) + if (!r->refcount) { new->k.type = KEY_TYPE_deleted; - else + /* + * XXX ugly: bch2_journal_key_insert() queues up + * the key for the journal replay code, which + * doesn't run the extent overwrite pass + */ + if (initial) + new->k.size = 0; + } else { *bkey_refcount(new) = cpu_to_le64(r->refcount); + } - ret = __bch2_trans_do(&trans, NULL, NULL, 0, + ret = initial + ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new) + : __bch2_trans_do(&trans, NULL, NULL, 0, __bch2_btree_insert(&trans, BTREE_ID_reflink, new)); kfree(new); @@ -1480,54 +1490,26 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial, } } fsck_err: - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); -out: - genradix_free(&c->reflink_gc_table); + bch2_trans_iter_exit(&trans, &iter); c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); return ret; } -static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k) -{ - - struct reflink_gc *r; - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - return 0; - - r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, - GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - return 0; -} - static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, bool metadata_only) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct reflink_gc *r; - int ret; + int ret = 0; if (metadata_only) return 0; - genradix_free(&c->reflink_gc_table); - c->reflink_gc_nr = 0; - - if (initial) - return bch2_btree_and_journal_walk(c, BTREE_ID_reflink, - bch2_gc_reflink_start_initial_fn); - bch2_trans_init(&trans, c, 0, 0); + c->reflink_gc_nr = 0; for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { @@ -1547,10 +1529,89 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial, r->size = k.k->size; r->refcount = 0; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return 0; + return ret; +} + +static void bch2_gc_reflink_reset(struct bch_fs *c, bool initial, + bool metadata_only) +{ + struct genradix_iter iter; + struct reflink_gc *r; + + genradix_for_each(&c->reflink_gc_table, iter, r) + r->refcount = 0; +} + +static int bch2_gc_stripes_done(struct bch_fs *c, bool initial, + bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct gc_stripe *m; + const struct bch_stripe *s; + char buf[200]; + unsigned i; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) + continue; + + s = bkey_s_c_to_stripe(k).v; + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); + + for (i = 0; i < s->nr_blocks; i++) + if 
(stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0)) + goto inconsistent; + continue; +inconsistent: + if (fsck_err_on(true, c, + "stripe has wrong block sector count %u:\n" + " %s\n" + " should be %u", i, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), + m ? m->block_sectors[i] : 0)) { + struct bkey_i_stripe *new; + + new = kmalloc(bkey_bytes(k.k), GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + break; + } + + bkey_reassemble(&new->k_i, k); + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + + ret = initial + ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i) + : __bch2_trans_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i)); + kfree(new); + } + } +fsck_err: + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + +static void bch2_gc_stripes_reset(struct bch_fs *c, bool initial, + bool metadata_only) +{ + genradix_free(&c->gc_stripes); } /** @@ -1586,15 +1647,18 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) /* flush interior btree updates: */ closure_wait_event(&c->btree_interior_update_wait, !bch2_btree_interior_updates_nr_pending(c)); -again: + ret = bch2_gc_start(c, metadata_only) ?: + bch2_gc_alloc_start(c, initial, metadata_only) ?: bch2_gc_reflink_start(c, initial, metadata_only); if (ret) goto out; +again: + gc_pos_set(c, gc_phase(GC_PHASE_START)); bch2_mark_superblocks(c); - if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) && + if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && c->opts.fix_errors != FSCK_OPT_NO) { bch_info(c, "starting topology repair pass"); @@ -1628,39 +1692,40 @@ again: if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || (!iter && bch2_test_restart_gc)) { + if (iter++ > 2) { + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + goto out; + } + /* * XXX: make sure gens we fixed got saved */ - if (iter++ <= 2) { - bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); - percpu_down_write(&c->mark_lock); - bch2_gc_free(c); - percpu_up_write(&c->mark_lock); - /* flush fsck errors, reset counters */ - bch2_flush_fsck_errs(c); - - goto again; - } + bch2_gc_stripes_reset(c, initial, metadata_only); + bch2_gc_alloc_reset(c, initial, metadata_only); + bch2_gc_reflink_reset(c, initial, metadata_only); - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); + goto again; } out: if (!ret) { bch2_journal_block(&c->journal); - percpu_down_write(&c->mark_lock); - ret = bch2_gc_reflink_done(c, initial, metadata_only) ?: + ret = bch2_gc_stripes_done(c, initial, metadata_only) ?: + bch2_gc_reflink_done(c, initial, metadata_only) ?: + bch2_gc_alloc_done(c, initial, metadata_only) ?: bch2_gc_done(c, initial, metadata_only); bch2_journal_unblock(&c->journal); - } else { - percpu_down_write(&c->mark_lock); } + percpu_down_write(&c->mark_lock); /* Indicates that gc is no longer in progress: */ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); @@ -1695,9 +1760,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) percpu_down_read(&c->mark_lock); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, 
ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, false); - if (gen_after(g->mark.gen, ptr->gen) > 16) { + if (ptr_stale(ca, ptr) > 16) { percpu_up_read(&c->mark_lock); return true; } @@ -1705,10 +1769,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, false); + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(g->gc_gen, ptr->gen)) - g->gc_gen = ptr->gen; + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; } percpu_up_read(&c->mark_lock); @@ -1719,57 +1783,85 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree * node pointers currently never have cached pointers that can become stale: */ -static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) +static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id) { - struct btree_trans trans; - struct btree_iter *iter; + struct bch_fs *c = trans->c; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf sk; int ret = 0, commit_err = 0; bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); + bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { - c->gc_gens_pos = iter->pos; + while ((bch2_trans_begin(trans), + k = bch2_btree_iter_peek(&iter)).k) { + ret = bkey_err(k); + + if (ret == -EINTR) + continue; + if (ret) + break; + + c->gc_gens_pos = iter.pos; if (gc_btree_gens_key(c, k) && !commit_err) { bch2_bkey_buf_reassemble(&sk, c, k); bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - commit_err = - bch2_trans_update(&trans, iter, sk.k, 0) ?: - bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_NOWAIT| - BTREE_INSERT_NOFAIL); + bch2_trans_update(trans, &iter, sk.k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOWAIT| + BTREE_INSERT_NOFAIL); if (commit_err == -EINTR) { commit_err = 0; continue; } } - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(trans, &iter); - bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; } +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + int ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + u = bch2_alloc_unpack(k); + + if (u.oldest_gen == ca->oldest_gen[iter->pos.offset]) + return 0; + + u.oldest_gen = ca->oldest_gen[iter->pos.offset]; + + return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN); +} + int bch2_gc_gens(struct bch_fs *c) { + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket *g; + u64 b, start_time = local_clock(); unsigned i; int ret; @@ -1778,43 +1870,69 @@ int bch2_gc_gens(struct bch_fs *c) * introduces a deadlock in the RO path - we currently take the state * lock at the start of going RO, thus the gc thread may get stuck: */ + if (!mutex_trylock(&c->gc_gens_lock)) + return 0; + down_read(&c->gc_lock); + 
bch2_trans_init(&trans, c, 0, 0); for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); + struct bucket_gens *gens; + + BUG_ON(ca->oldest_gen); - for_each_bucket(g, buckets) - g->gc_gen = g->mark.gen; - up_read(&ca->bucket_lock); + ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); + if (!ca->oldest_gen) { + percpu_ref_put(&ca->ref); + ret = -ENOMEM; + goto err; + } + + gens = bucket_gens(ca); + + for (b = gens->first_bucket; + b < gens->nbuckets; b++) + ca->oldest_gen[b] = gens->b[b]; } for (i = 0; i < BTREE_ID_NR; i++) if ((1 << i) & BTREE_ID_HAS_PTRS) { c->gc_gens_btree = i; c->gc_gens_pos = POS_MIN; - ret = bch2_gc_btree_gens(c, i); + ret = bch2_gc_btree_gens(&trans, i); if (ret) { bch_err(c, "error recalculating oldest_gen: %i", ret); goto err; } } - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->oldest_gen = g->gc_gen; - up_read(&ca->bucket_lock); + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter)); + if (ret) { + bch_err(c, "error writing oldest_gen: %i", ret); + break; + } } + bch2_trans_iter_exit(&trans, &iter); c->gc_gens_btree = 0; c->gc_gens_pos = POS_MIN; c->gc_count++; + + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); err: + for_each_member_device(ca, c, i) { + kvfree(ca->oldest_gen); + ca->oldest_gen = NULL; + } + + bch2_trans_exit(&trans); up_read(&c->gc_lock); + mutex_unlock(&c->gc_gens_lock); return ret; } diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 59dfb06..0665f59 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -8,7 +8,6 @@ int bch2_gc(struct bch_fs *, bool, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); -void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); /* * For concurrent mark and sweep (with other index updates), we define a total diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 40fa011..a365132 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -391,16 +391,10 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_btree_node_iter_init_from_start(&src_iter, src); - if (btree_node_is_extents(src)) - nr = bch2_sort_repack_merge(c, btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); - else - nr = bch2_sort_repack(btree_bset_first(dst), - src, &src_iter, - &dst->format, - true); + nr = bch2_sort_repack(btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], start_time); @@ -465,16 +459,13 @@ void bch2_btree_build_aux_trees(struct btree *b) * * Returns true if we sorted (i.e. 
invalidated iterators */ -void bch2_btree_init_next(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b) +void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) { struct bch_fs *c = trans->c; struct btree_node_entry *bne; bool reinit_iter = false; EBUG_ON(!(b->c.lock.state.seq & 1)); - EBUG_ON(iter && iter->l[b->c.level].b != b); BUG_ON(bset_written(b, bset(b, &b->set[1]))); if (b->nsets == MAX_BSETS && @@ -503,8 +494,8 @@ void bch2_btree_init_next(struct btree_trans *trans, bch2_btree_build_aux_trees(b); - if (iter && reinit_iter) - bch2_btree_iter_reinit_node(iter, b); + if (reinit_iter) + bch2_trans_node_reinit_iter(trans, b); } static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, @@ -569,7 +560,8 @@ enum btree_validate_ret { \ switch (write) { \ case READ: \ - bch_err(c, "%s", _buf2); \ + if (_buf2) \ + bch_err(c, "%s", _buf2); \ \ switch (type) { \ case BTREE_ERR_FIXABLE: \ @@ -695,7 +687,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, BTREE_ERR_FATAL, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); - if (btree_err_on(offset + sectors > c->opts.btree_node_size, + if (btree_err_on(offset + sectors > btree_sectors(c), BTREE_ERR_FIXABLE, c, ca, b, i, "bset past end of btree node")) { i->u64s = 0; @@ -909,7 +901,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->data->keys.seq, bp->seq); } - while (b->written < (ptr_written ?: c->opts.btree_node_size)) { + while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors, whiteout_u64s = 0; struct nonce nonce; struct bch_csum csum; @@ -980,19 +972,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - b->written += sectors; - blacklisted = bch2_journal_seq_is_blacklisted(c, le64_to_cpu(i->journal_seq), true); btree_err_on(blacklisted && first, BTREE_ERR_FIXABLE, c, ca, b, i, - "first btree node bset has blacklisted journal seq"); + "first btree node bset has blacklisted journal seq (%llu)", + le64_to_cpu(i->journal_seq)); btree_err_on(blacklisted && ptr_written, BTREE_ERR_FIXABLE, c, ca, b, i, - "found blacklisted bset in btree node with sectors_written"); + "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", + le64_to_cpu(i->journal_seq), + b->written, b->written + sectors, ptr_written); + + b->written += sectors; + if (blacklisted && !first) continue; @@ -1218,7 +1214,7 @@ static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) if (le64_to_cpu(bn->magic) != bset_magic(c)) return 0; - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { if (!offset) { offset += vstruct_sectors(bn, c->block_bits); } else { @@ -1240,7 +1236,7 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void * if (!offset) return false; - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { bne = data + (offset << 9); if (bne->keys.seq == bn->keys.seq) return true; @@ -1260,7 +1256,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) bool dump_bset_maps = false; bool have_retry = false; int ret = 0, best = -1, write = READ; - unsigned i, written, written2; + unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; @@ -1310,7 +1306,7 @@ fsck_err: if (ra->err[i]) continue; - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { if (!offset) { sectors = vstruct_sectors(bn, c->block_bits); } else { @@ -1327,7 +1323,7 @@ fsck_err: offset += sectors; } - while (offset < c->opts.btree_node_size) { + while (offset < btree_sectors(c)) { bne = ra->buf[i] + (offset << 9); if (bne->keys.seq == bn->keys.seq) { if (!gap) @@ -1805,8 +1801,8 @@ do_write: BUG_ON(btree_node_fake(b)); BUG_ON((b->will_make_reachable != 0) != !b->written); - BUG_ON(b->written >= c->opts.btree_node_size); - BUG_ON(b->written & (c->opts.block_size - 1)); + BUG_ON(b->written >= btree_sectors(c)); + BUG_ON(b->written & (block_sectors(c) - 1)); BUG_ON(bset_written(b, btree_bset_last(b))); BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); @@ -1879,7 +1875,7 @@ do_write: memset(data + bytes_to_write, 0, (sectors_to_write << 9) - bytes_to_write); - BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); + BUG_ON(b->written + sectors_to_write > btree_sectors(c)); BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 7fdcf87..0f20224 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -134,8 +134,7 @@ void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); void bch2_btree_node_drop_keys_outside_node(struct btree *); void bch2_btree_build_aux_trees(struct btree *); -void bch2_btree_init_next(struct btree_trans *, struct btree_iter *, - struct btree *); +void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, struct btree *, bool); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index fe710d1..ae63ecb 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -12,31 +12,69 @@ #include "error.h" #include "extents.h" #include "journal.h" +#include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include #include -static void btree_iter_set_search_pos(struct btree_iter *, struct bpos); -static void btree_trans_sort_iters(struct btree_trans *); -static void btree_iter_check_sort(struct btree_trans *, struct btree_iter *); -static struct btree_iter *btree_iter_child_alloc(struct btree_iter *, unsigned long); -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *, - struct btree_iter *); -static void btree_iter_copy(struct btree_iter *, struct btree_iter *); +static void btree_trans_verify_sorted(struct btree_trans *); +static void btree_path_check_sort(struct btree_trans *, struct btree_path *, int); -static inline int btree_iter_cmp(const struct btree_iter *l, - const struct btree_iter *r) +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, + struct btree_path *); + +static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) { - return cmp_int(l->btree_id, r->btree_id) ?: - -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: - bkey_cmp(l->real_pos, r->real_pos); +#ifdef CONFIG_BCACHEFS_DEBUG + return iter->ip_allocated; +#else + return 0; +#endif } -static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +static struct btree_path *btree_path_alloc(struct btree_trans *, struct 
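Several of the btree_io.c loops above walk a node in bset-sized steps and stop as soon as an entry's sequence number no longer matches the node's, i.e. when they hit stale data left over from a previous node in the same bucket. A toy model of that scan, with made-up types and sizes (not the on-disk layout):

#include <stdio.h>

struct toy_bset { unsigned long seq; unsigned sectors; };

static unsigned sectors_written(const struct toy_bset *sets, unsigned nr,
				unsigned long node_seq)
{
	unsigned i, offset = 0;

	for (i = 0; i < nr; i++) {
		if (sets[i].seq != node_seq)
			break;		/* left-over data from an old node */
		offset += sets[i].sectors;
	}
	return offset;
}

int main(void)
{
	struct toy_bset sets[] = { { 42, 8 }, { 42, 4 }, { 17, 8 } };

	printf("%u sectors written\n", sectors_written(sets, 3, 42));
	return 0;
}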
btree_path *); + +/* + * Unlocks before scheduling + * Note: does not revalidate iterator + */ +static inline int bch2_trans_cond_resched(struct btree_trans *trans) +{ + if (need_resched() || race_fault()) { + bch2_trans_unlock(trans); + schedule(); + return bch2_trans_relock(trans) ? 0 : -EINTR; + } else { + return 0; + } +} + +static inline int __btree_path_cmp(const struct btree_path *l, + enum btree_id r_btree_id, + bool r_cached, + struct bpos r_pos, + unsigned r_level) +{ + /* + * Must match lock ordering as defined by __bch2_btree_node_lock: + */ + return cmp_int(l->btree_id, r_btree_id) ?: + cmp_int((int) l->cached, (int) r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: + -cmp_int(l->level, r_level); +} + +static inline int btree_path_cmp(const struct btree_path *l, + const struct btree_path *r) { - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); + return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); +} +static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +{ /* Are we iterating over keys in all snapshots? */ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { p = bpos_successor(p); @@ -50,8 +88,6 @@ static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) { - EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES); - /* Are we iterating over keys in all snapshots? */ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { p = bpos_predecessor(p); @@ -63,10 +99,10 @@ static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos return p; } -static inline bool is_btree_node(struct btree_iter *iter, unsigned l) +static inline bool is_btree_node(struct btree_path *path, unsigned l) { return l < BTREE_MAX_DEPTH && - (unsigned long) iter->l[l].b >= 128; + (unsigned long) path->l[l].b >= 128; } static inline struct bpos btree_iter_search_key(struct btree_iter *iter) @@ -79,41 +115,40 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) return pos; } -static inline bool btree_iter_pos_before_node(struct btree_iter *iter, +static inline bool btree_path_pos_before_node(struct btree_path *path, struct btree *b) { - return bpos_cmp(iter->real_pos, b->data->min_key) < 0; + return bpos_cmp(path->pos, b->data->min_key) < 0; } -static inline bool btree_iter_pos_after_node(struct btree_iter *iter, +static inline bool btree_path_pos_after_node(struct btree_path *path, struct btree *b) { - return bpos_cmp(b->key.k.p, iter->real_pos) < 0; + return bpos_cmp(b->key.k.p, path->pos) < 0; } -static inline bool btree_iter_pos_in_node(struct btree_iter *iter, +static inline bool btree_path_pos_in_node(struct btree_path *path, struct btree *b) { - return iter->btree_id == b->c.btree_id && - !btree_iter_pos_before_node(iter, b) && - !btree_iter_pos_after_node(iter, b); + return path->btree_id == b->c.btree_id && + !btree_path_pos_before_node(path, b) && + !btree_path_pos_after_node(path, b); } /* Btree node locking: */ -void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) +void bch2_btree_node_unlock_write(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { - bch2_btree_node_unlock_write_inlined(b, iter); + bch2_btree_node_unlock_write_inlined(trans, path, b); } -void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +void __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree *b) { - struct btree_iter *linked; + struct btree_path *linked; unsigned readers = 0; - 
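The new __btree_path_cmp() above defines the total order that paths (and therefore node locks) follow: btree id first, then non-cached before cached, then key position, then descending level so interior nodes sort ahead of their descendants. The standalone comparator below mirrors that shape with invented fields and plain integers instead of struct bpos; like the kernel source it uses the GNU "a ?: b" extension.

#include <stdio.h>
#include <stdlib.h>

#define cmp_int(a, b)	((a) > (b) ? 1 : (a) < (b) ? -1 : 0)

/* Toy stand-in for a btree path: just the fields the ordering looks at. */
struct toy_path {
	int		btree_id;
	int		cached;
	unsigned	pos;
	unsigned	level;
};

static int toy_path_cmp(const void *l_, const void *r_)
{
	const struct toy_path *l = l_, *r = r_;

	return  cmp_int(l->btree_id, r->btree_id) ?:
		cmp_int(l->cached, r->cached) ?:
		cmp_int(l->pos, r->pos) ?:
		-cmp_int(l->level, r->level);
}

int main(void)
{
	struct toy_path paths[] = {
		{ 0, 0, 16, 0 }, { 0, 0, 16, 1 }, { 0, 1, 16, 0 }, { 1, 0, 4, 0 },
	};
	unsigned i;

	qsort(paths, 4, sizeof(paths[0]), toy_path_cmp);

	for (i = 0; i < 4; i++)
		printf("id %d cached %d pos %u level %u\n",
		       paths[i].btree_id, paths[i].cached,
		       paths[i].pos, paths[i].level);
	return 0;
}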
EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); - - trans_for_each_iter(iter->trans, linked) + trans_for_each_path(trans, linked) if (linked->l[b->c.level].b == b && btree_node_read_locked(linked, b->c.level)) readers++; @@ -124,140 +159,155 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) * goes to 0, and it's safe because we have the node intent * locked: */ - atomic64_sub(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); - btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); - atomic64_add(__SIX_VAL(read_lock, readers), - &b->c.lock.state.counter); + if (!b->c.lock.readers) + atomic64_sub(__SIX_VAL(read_lock, readers), + &b->c.lock.state.counter); + else + this_cpu_sub(*b->c.lock.readers, readers); + + six_lock_write(&b->c.lock, NULL, NULL); + + if (!b->c.lock.readers) + atomic64_add(__SIX_VAL(read_lock, readers), + &b->c.lock.state.counter); + else + this_cpu_add(*b->c.lock.readers, readers); } -bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) +bool __bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - struct btree *b = btree_iter_node(iter, level); - int want = __btree_lock_want(iter, level); + struct btree *b = btree_path_node(path, level); + int want = __btree_lock_want(path, level); - if (!is_btree_node(iter, level)) - return false; + if (!is_btree_node(path, level)) + goto fail; if (race_fault()) - return false; + goto fail; - if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || - (btree_node_lock_seq_matches(iter, b, level) && - btree_node_lock_increment(iter->trans, b, level, want))) { - mark_btree_node_locked(iter, level, want); + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, want))) { + mark_btree_node_locked(path, level, want); return true; - } else { - return false; } +fail: + trace_btree_node_relock_fail(trans->fn, _RET_IP_, + path->btree_id, + &path->pos, + (unsigned long) b, + path->l[level].lock_seq, + is_btree_node(path, level) ? b->c.lock.state.seq : 0); + return false; } -static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - struct btree *b = iter->l[level].b; + struct btree *b = path->l[level].b; - EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); - - if (!is_btree_node(iter, level)) + if (!is_btree_node(path, level)) return false; - if (btree_node_intent_locked(iter, level)) + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); + case BTREE_NODE_INTENT_LOCKED: + break; + } + + if (btree_node_intent_locked(path, level)) return true; if (race_fault()) return false; - if (btree_node_locked(iter, level) + if (btree_node_locked(path, level) ? 
six_lock_tryupgrade(&b->c.lock) - : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) + : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) goto success; - if (btree_node_lock_seq_matches(iter, b, level) && - btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { - btree_node_unlock(iter, level); + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, b, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(path, level); goto success; } return false; success: - mark_btree_node_intent_locked(iter, level); + mark_btree_node_intent_locked(path, level); return true; } -static inline bool btree_iter_get_locks(struct btree_iter *iter, bool upgrade, - unsigned long trace_ip) +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade) { - unsigned l = iter->level; + unsigned l = path->level; int fail_idx = -1; do { - if (!btree_iter_node(iter, l)) + if (!btree_path_node(path, l)) break; if (!(upgrade - ? bch2_btree_node_upgrade(iter, l) - : bch2_btree_node_relock(iter, l))) { - (upgrade - ? trace_node_upgrade_fail - : trace_node_relock_fail)(iter->trans->ip, trace_ip, - btree_iter_type(iter) == BTREE_ITER_CACHED, - iter->btree_id, &iter->real_pos, - l, iter->l[l].lock_seq, - is_btree_node(iter, l) - ? 0 - : (unsigned long) iter->l[l].b, - is_btree_node(iter, l) - ? iter->l[l].b->c.lock.state.seq - : 0); + ? bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) fail_idx = l; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - } l++; - } while (l < iter->locks_want); + } while (l < path->locks_want); /* * When we fail to get a lock, we have to ensure that any child nodes - * can't be relocked so bch2_btree_iter_traverse has to walk back up to + * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ - while (fail_idx >= 0) { - btree_node_unlock(iter, fail_idx); - iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; - --fail_idx; + if (fail_idx >= 0) { + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; + --fail_idx; + } while (fail_idx >= 0); } - if (iter->uptodate == BTREE_ITER_NEED_RELOCK) - iter->uptodate = BTREE_ITER_NEED_PEEK; + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; - bch2_btree_trans_verify_locks(iter->trans); + bch2_trans_verify_locks(trans); - return iter->uptodate < BTREE_ITER_NEED_RELOCK; + return path->uptodate < BTREE_ITER_NEED_RELOCK; } static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, - enum btree_iter_type type) + bool cached) { - return type != BTREE_ITER_CACHED + return !cached ? 
container_of(_b, struct btree, c)->key.k.p : container_of(_b, struct bkey_cached, c)->key.pos; } /* Slowpath: */ -bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, - unsigned level, struct btree_iter *iter, +bool __bch2_btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_trans *trans = iter->trans; - struct btree_iter *linked, *deadlock_iter = NULL; - u64 start_time = local_clock(); - unsigned reason = 9; - bool ret; + struct btree_path *linked; + unsigned reason; /* Check if it's safe to block: */ - trans_for_each_iter(trans, linked) { + trans_for_each_path(trans, linked) { if (!linked->nodes_locked) continue; @@ -275,141 +325,114 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, */ if (type == SIX_LOCK_intent && linked->nodes_locked != linked->nodes_intent_locked) { - deadlock_iter = linked; reason = 1; + goto deadlock; } - if (linked->btree_id != iter->btree_id) { - if (linked->btree_id > iter->btree_id) { - deadlock_iter = linked; - reason = 3; - } - continue; + if (linked->btree_id != path->btree_id) { + if (linked->btree_id < path->btree_id) + continue; + + reason = 3; + goto deadlock; } /* - * Within the same btree, cached iterators come before non - * cached iterators: + * Within the same btree, non-cached paths come before cached + * paths: */ - if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { - if (btree_iter_is_cached(iter)) { - deadlock_iter = linked; - reason = 4; - } - continue; + if (linked->cached != path->cached) { + if (!linked->cached) + continue; + + reason = 4; + goto deadlock; } /* * Interior nodes must be locked before their descendants: if - * another iterator has possible descendants locked of the node + * another path has possible descendants locked of the node * we're about to lock, it must have the ancestors locked too: */ if (level > __fls(linked->nodes_locked)) { - deadlock_iter = linked; reason = 5; + goto deadlock; } /* Must lock btree nodes in key order: */ if (btree_node_locked(linked, level) && bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b, - btree_iter_type(linked))) <= 0) { - deadlock_iter = linked; - reason = 7; + linked->cached)) <= 0) { BUG_ON(trans->in_traverse_all); + reason = 7; + goto deadlock; } } - if (unlikely(deadlock_iter)) { - trace_trans_restart_would_deadlock(trans->ip, ip, - trans->in_traverse_all, reason, - deadlock_iter->btree_id, - btree_iter_type(deadlock_iter), - &deadlock_iter->real_pos, - iter->btree_id, - btree_iter_type(iter), - &pos); - btree_trans_restart(trans); - return false; - } - - if (six_trylock_type(&b->c.lock, type)) - return true; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans->locking_iter_idx = iter->idx; - trans->locking_pos = pos; - trans->locking_btree_id = iter->btree_id; - trans->locking_level = level; - trans->locking = b; -#endif - - ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; - -#ifdef CONFIG_BCACHEFS_DEBUG - trans->locking = NULL; -#endif - if (ret) - bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], - start_time); - return ret; + return btree_node_lock_type(trans, path, b, pos, level, + type, should_sleep_fn, p); +deadlock: + trace_trans_restart_would_deadlock(trans->fn, ip, + trans->in_traverse_all, reason, + linked->btree_id, + linked->cached, + &linked->pos, + path->btree_id, + path->cached, + &pos); + btree_trans_restart(trans); + return 
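The slowpath lock routine above scans every path that already holds locks and, if taking the requested lock would violate the global order, traces the reason and restarts the transaction rather than blocking. A toy model of that "check before blocking" idea, with each held lock reduced to a single ordering key (the real check compares btree id, cached flag, position and level):

#include <errno.h>
#include <stdio.h>

struct toy_held { unsigned long order_key; };

static int toy_node_lock(const struct toy_held *held, unsigned nr_held,
			 unsigned long want_key)
{
	unsigned i;

	for (i = 0; i < nr_held; i++)
		if (held[i].order_key >= want_key)
			return -EINTR;	/* caller restarts the transaction */

	/* safe to block here and take the lock for real */
	return 0;
}

int main(void)
{
	struct toy_held held[] = { { 10 }, { 25 } };

	printf("lock 30: %d\n", toy_node_lock(held, 2, 30));	/* ok */
	printf("lock 20: %d\n", toy_node_lock(held, 2, 20));	/* refused */
	return 0;
}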
false; } /* Btree iterator locking: */ #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_locks(struct btree_iter *iter) + +static void bch2_btree_path_verify_locks(struct btree_path *path) { unsigned l; - if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { - BUG_ON(iter->nodes_locked); + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); return; } - for (l = 0; btree_iter_node(iter, l); l++) { - if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && - !btree_node_locked(iter, l)) - continue; - - BUG_ON(btree_lock_want(iter, l) != - btree_node_locked_type(iter, l)); - } + for (l = 0; btree_path_node(path, l); l++) + BUG_ON(btree_lock_want(path, l) != + btree_node_locked_type(path, l)); } -void bch2_btree_trans_verify_locks(struct btree_trans *trans) +void bch2_trans_verify_locks(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - bch2_btree_iter_verify_locks(iter); + trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); } #else -static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} +static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} #endif +/* Btree path locking: */ + /* * Only for btree_cache.c - only relocks intent locks */ -bool bch2_btree_iter_relock_intent(struct btree_iter *iter) +bool bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) { unsigned l; - for (l = iter->level; - l < iter->locks_want && btree_iter_node(iter, l); + for (l = path->level; + l < path->locks_want && btree_path_node(path, l); l++) { - if (!bch2_btree_node_relock(iter, l)) { - trace_node_relock_fail(iter->trans->ip, _RET_IP_, - btree_iter_type(iter) == BTREE_ITER_CACHED, - iter->btree_id, &iter->real_pos, - l, iter->l[l].lock_seq, - is_btree_node(iter, l) - ? 0 - : (unsigned long) iter->l[l].b, - is_btree_node(iter, l) - ? 
iter->l[l].b->c.lock.state.seq - : 0); - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - btree_trans_restart(iter->trans); + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_trans_restart_relock_path_intent(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + btree_trans_restart(trans); return false; } } @@ -418,25 +441,30 @@ bool bch2_btree_iter_relock_intent(struct btree_iter *iter) } __flatten -bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip) +static bool bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) { - bool ret = btree_iter_get_locks(iter, false, trace_ip); + bool ret = btree_path_get_locks(trans, path, false); - if (!ret) - btree_trans_restart(iter->trans); + if (!ret) { + trace_trans_restart_relock_path(trans->fn, trace_ip, + path->btree_id, &path->pos); + btree_trans_restart(trans); + } return ret; } -bool __bch2_btree_iter_upgrade(struct btree_iter *iter, +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned new_locks_want) { - struct btree_iter *linked; + struct btree_path *linked; - EBUG_ON(iter->locks_want >= new_locks_want); + EBUG_ON(path->locks_want >= new_locks_want); - iter->locks_want = new_locks_want; + path->locks_want = new_locks_want; - if (btree_iter_get_locks(iter, true, _THIS_IP_)) + if (btree_path_get_locks(trans, path, true)) return true; /* @@ -444,7 +472,7 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, * iterators in the btree_trans here. * * On failure to upgrade the iterator, setting iter->locks_want and - * calling get_locks() is sufficient to make bch2_btree_iter_traverse() + * calling get_locks() is sufficient to make bch2_btree_path_traverse() * get the locks we want on transaction restart. * * But if this iterator was a clone, on transaction restart what we did @@ -456,75 +484,67 @@ bool __bch2_btree_iter_upgrade(struct btree_iter *iter, * * The code below used to be needed to ensure ancestor nodes get locked * before interior nodes - now that's handled by - * bch2_btree_iter_traverse_all(). + * bch2_btree_path_traverse_all(). 
*/ - trans_for_each_iter(iter->trans, linked) - if (linked != iter && - btree_iter_type(linked) == btree_iter_type(iter) && - linked->btree_id == iter->btree_id && + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_iter_get_locks(linked, true, _THIS_IP_); + btree_path_get_locks(trans, linked, true); } - if (iter->should_be_locked) - btree_trans_restart(iter->trans); return false; } -void __bch2_btree_iter_downgrade(struct btree_iter *iter, +void __bch2_btree_path_downgrade(struct btree_path *path, unsigned new_locks_want) { unsigned l; - EBUG_ON(iter->locks_want < new_locks_want); + EBUG_ON(path->locks_want < new_locks_want); - iter->locks_want = new_locks_want; + path->locks_want = new_locks_want; - while (iter->nodes_locked && - (l = __fls(iter->nodes_locked)) >= iter->locks_want) { - if (l > iter->level) { - btree_node_unlock(iter, l); + while (path->nodes_locked && + (l = __fls(path->nodes_locked)) >= path->locks_want) { + if (l > path->level) { + btree_node_unlock(path, l); } else { - if (btree_node_intent_locked(iter, l)) { - six_lock_downgrade(&iter->l[l].b->c.lock); - iter->nodes_intent_locked ^= 1 << l; + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); + path->nodes_intent_locked ^= 1 << l; } break; } } - bch2_btree_trans_verify_locks(iter->trans); + bch2_btree_path_verify_locks(path); } void bch2_trans_downgrade(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - bch2_btree_iter_downgrade(iter); + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(path); } /* Btree transaction locking: */ -static inline bool btree_iter_should_be_locked(struct btree_iter *iter) -{ - return (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) || - iter->should_be_locked; -} - bool bch2_trans_relock(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; if (unlikely(trans->restarted)) return false; - trans_for_each_iter(trans, iter) - if (btree_iter_should_be_locked(iter) && - !bch2_btree_iter_relock(iter, _RET_IP_)) { - trace_trans_restart_relock(trans->ip, _RET_IP_, - iter->btree_id, &iter->real_pos); + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock(trans, path, _RET_IP_)) { + trace_trans_restart_relock(trans->fn, _RET_IP_, + path->btree_id, &path->pos); BUG_ON(!trans->restarted); return false; } @@ -533,10 +553,10 @@ bool bch2_trans_relock(struct btree_trans *trans) void bch2_trans_unlock(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - __bch2_btree_iter_unlock(iter); + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(path); BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); } @@ -545,26 +565,27 @@ void bch2_trans_unlock(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG -static void bch2_btree_iter_verify_cached(struct btree_iter *iter) +static void bch2_btree_path_verify_cached(struct btree_trans *trans, + struct btree_path *path) { struct bkey_cached *ck; - bool locked = btree_node_locked(iter, 0); + bool locked = btree_node_locked(path, 0); - if (!bch2_btree_node_relock(iter, 0)) + if (!bch2_btree_node_relock(trans, path, 0)) return; - ck = (void *) iter->l[0].b; - BUG_ON(ck->key.btree_id != iter->btree_id || - bkey_cmp(ck->key.pos, iter->pos)); + ck = (void 
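__bch2_btree_path_downgrade() above tracks per-level locks in a bitmask, repeatedly finds the highest locked level and drops everything above the new locks_want. The sketch below models only that bitmask walk; in the real code, when the loop reaches the path's own level it downgrades that lock from intent to read instead of dropping it.

#include <stdio.h>

static int fls_(unsigned v)		/* 1-based find-last-set, 0 if empty */
{
	int n = 0;

	while (v) {
		v >>= 1;
		n++;
	}
	return n;
}

int main(void)
{
	unsigned nodes_locked = 0x0f;	/* levels 0-3 locked */
	unsigned locks_want = 2;	/* keep intent locks on levels 0-1 */
	int l;

	while (nodes_locked &&
	       (l = fls_(nodes_locked) - 1) >= (int)locks_want) {
		nodes_locked &= ~(1U << l);	/* unlock this level */
		printf("unlocked level %d\n", l);
	}
	printf("still locked: 0x%x\n", nodes_locked);
	return 0;
}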
*) path->l[0].b; + BUG_ON(ck->key.btree_id != path->btree_id || + bkey_cmp(ck->key.pos, path->pos)); if (!locked) - btree_node_unlock(iter, 0); + btree_node_unlock(path, 0); } -static void bch2_btree_iter_verify_level(struct btree_iter *iter, - unsigned level) +static void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - struct btree_iter_level *l; + struct btree_path_level *l; struct btree_node_iter tmp; bool locked; struct bkey_packed *p, *k; @@ -574,65 +595,52 @@ static void bch2_btree_iter_verify_level(struct btree_iter *iter, if (!bch2_debug_check_iterators) return; - l = &iter->l[level]; + l = &path->l[level]; tmp = l->iter; - locked = btree_node_locked(iter, level); + locked = btree_node_locked(path, level); - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { + if (path->cached) { if (!level) - bch2_btree_iter_verify_cached(iter); + bch2_btree_path_verify_cached(trans, path); return; } - BUG_ON(iter->level < iter->min_depth); - - if (!btree_iter_node(iter, level)) + if (!btree_path_node(path, level)) return; - if (!bch2_btree_node_relock(iter, level)) + if (!bch2_btree_node_relock(trans, path, level)) return; - BUG_ON(!btree_iter_pos_in_node(iter, l->b)); - - /* - * node iterators don't use leaf node iterator: - */ - if (btree_iter_type(iter) == BTREE_ITER_NODES && - level <= iter->min_depth) - goto unlock; + BUG_ON(!btree_path_pos_in_node(path, l->b)); bch2_btree_node_iter_verify(&l->iter, l->b); /* - * For interior nodes, the iterator will have skipped past - * deleted keys: - * - * For extents, the iterator may have skipped past deleted keys (but not - * whiteouts) + * For interior nodes, the iterator will have skipped past deleted keys: */ - p = level || btree_node_type_is_extents(iter->btree_id) + p = level ? bch2_btree_node_iter_prev(&tmp, l->b) : bch2_btree_node_iter_prev_all(&tmp, l->b); k = bch2_btree_node_iter_peek_all(&l->iter, l->b); - if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) { + if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { msg = "before"; goto err; } - if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { + if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { msg = "after"; goto err; } -unlock: + if (!locked) - btree_node_unlock(iter, level); + btree_node_unlock(path, level); return; err: strcpy(buf2, "(none)"); strcpy(buf3, "(none)"); - bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_bpos_to_text(&PBUF(buf1), path->pos); if (p) { struct bkey uk = bkey_unpack_key(l->b, p); @@ -644,79 +652,175 @@ err: bch2_bkey_to_text(&PBUF(buf3), &uk); } - panic("iterator should be %s key at level %u:\n" - "iter pos %s\n" + panic("path should be %s key at level %u:\n" + "path pos %s\n" "prev key %s\n" "cur key %s\n", msg, level, buf1, buf2, buf3); } -static void bch2_btree_iter_verify(struct btree_iter *iter) +static void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; - enum btree_iter_type type = btree_iter_type(iter); unsigned i; - EBUG_ON(iter->btree_id >= BTREE_ID_NR); + EBUG_ON(path->btree_id >= BTREE_ID_NR); - BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && - iter->pos.snapshot != iter->snapshot); + for (i = 0; i < (!path->cached ? 
BTREE_MAX_DEPTH : 1); i++) { + if (!path->l[i].b) { + BUG_ON(!path->cached && + c->btree_roots[path->btree_id].b->c.level > i); + break; + } + + bch2_btree_path_verify_level(trans, path, i); + } + + bch2_btree_path_verify_locks(path); +} + +void bch2_trans_verify_paths(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify(trans, path); +} + +static void bch2_btree_iter_verify(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + BUG_ON(iter->btree_id >= BTREE_ID_NR); + + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - BUG_ON(type == BTREE_ITER_NODES && - !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); - - BUG_ON(type != BTREE_ITER_NODES && + BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(iter->btree_id)); - for (i = 0; i < (type != BTREE_ITER_CACHED ? BTREE_MAX_DEPTH : 1); i++) { - if (!iter->l[i].b) { - BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i); - break; - } - - bch2_btree_iter_verify_level(iter, i); - } - - bch2_btree_iter_verify_locks(iter); + if (iter->update_path) + bch2_btree_path_verify(trans, iter->update_path); + bch2_btree_path_verify(trans, iter->path); } static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { - enum btree_iter_type type = btree_iter_type(iter); + BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !iter->pos.snapshot); BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); - BUG_ON((type == BTREE_ITER_KEYS || - type == BTREE_ITER_CACHED) && - (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || - bkey_cmp(iter->pos, iter->k.p) > 0)); + BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || + bkey_cmp(iter->pos, iter->k.p) > 0); } -void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) +static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { - struct btree_iter *iter; + struct btree_trans *trans = iter->trans; + struct btree_iter copy; + struct bkey_s_c prev; + int ret = 0; if (!bch2_debug_check_iterators) - return; + return 0; + + if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + return 0; + + if (bkey_err(k) || !k.k) + return 0; + + BUG_ON(!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, + BTREE_ITER_NOPRESERVE| + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) + goto out; + + ret = bkey_err(prev); + if (ret) + goto out; + + if (!bkey_cmp(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { + char buf1[100], buf2[200]; + + bch2_bkey_to_text(&PBUF(buf1), k.k); + bch2_bkey_to_text(&PBUF(buf2), prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, + buf1, buf2); + } +out: + bch2_trans_iter_exit(trans, ©); + return ret; +} + +void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) +{ + struct btree_path *path; + unsigned idx; + char buf[100]; + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: + cmp_int(path->cached, key_cache); + + if (cmp > 0) + break; + if (cmp < 0) + continue; + + if (!(path->nodes_locked & 1) || + !path->should_be_locked) + continue; + + if (!key_cache) { + if 
(bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && + bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) + return; + } else { + if (!bkey_cmp(pos, path->pos)) + return; + } + } - trans_for_each_iter_with_node(trans, b, iter) - bch2_btree_iter_verify_level(iter, b->c.level); + bch2_dump_trans_paths_updates(trans); + panic("not locked: %s %s%s\n", + bch2_btree_ids[id], + (bch2_bpos_to_text(&PBUF(buf), pos), buf), + key_cache ? " cached" : ""); } #else -static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} +static inline void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned l) {} +static inline void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) {} static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} +static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } #endif +/* Btree path: fixups after btree updates */ + static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, struct btree *b, struct bset_tree *t, @@ -734,40 +838,38 @@ static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); } -static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, +static void __bch2_btree_path_fix_key_modified(struct btree_path *path, struct btree *b, struct bkey_packed *where) { - struct btree_iter_level *l = &iter->l[b->c.level]; + struct btree_path_level *l = &path->l[b->c.level]; if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) return; - if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0) + if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) bch2_btree_node_iter_advance(&l->iter, l->b); - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct btree *b, struct bkey_packed *where) { - struct btree_iter *linked; + struct btree_path *path; - trans_for_each_iter_with_node(iter->trans, b, linked) { - __bch2_btree_iter_fix_key_modified(linked, b, where); - bch2_btree_iter_verify_level(linked, b->c.level); + trans_for_each_path_with_node(trans, b, path) { + __bch2_btree_path_fix_key_modified(path, b, where); + bch2_btree_path_verify_level(trans, path, b->c.level); } } -static void __bch2_btree_node_iter_fix(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bset_tree *t, - struct bkey_packed *where, - unsigned clobber_u64s, - unsigned new_u64s) +static void __bch2_btree_node_iter_fix(struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) { const struct bkey_packed *end = btree_bkey_last(b, t); struct btree_node_iter_set *set; @@ -785,7 +887,7 @@ static void __bch2_btree_node_iter_fix(struct btree_iter *iter, /* didn't find the bset in the iterator - might have to readd it: */ if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { bch2_btree_node_iter_push(node_iter, b, where, end); goto fixup_done; } else { @@ -800,7 +902,7 @@ found: return; if (new_u64s && - bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) { + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { set->k = offset; } else if (set->k < offset + 
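bch2_assert_pos_locked() above accepts either a locked leaf whose key range covers the position or a locked key cache entry at exactly that position. A toy version of those two coverage tests, with made-up types and plain integers in place of struct bpos:

#include <stdio.h>

struct toy_node { unsigned min_key, max_key; };	/* a leaf covers a range */

static int node_covers(struct toy_node n, unsigned pos)
{
	return pos >= n.min_key && pos <= n.max_key;
}

static int key_cache_covers(unsigned entry_pos, unsigned pos)
{
	return entry_pos == pos;	/* cache entries cover one key only */
}

int main(void)
{
	struct toy_node leaf = { 100, 199 };

	printf("leaf covers 150: %d\n", node_covers(leaf, 150));
	printf("leaf covers 200: %d\n", node_covers(leaf, 200));
	printf("cache 150 covers 150: %d\n", key_cache_covers(150, 150));
	return 0;
}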
clobber_u64s) { set->k = offset + new_u64s; @@ -826,8 +928,7 @@ fixup_done: */ if (!bch2_btree_node_iter_end(node_iter) && iter_current_key_modified && - (b->c.level || - btree_node_type_is_extents(iter->btree_id))) { + b->c.level) { struct bset_tree *t; struct bkey_packed *k, *k2, *p; @@ -852,14 +953,10 @@ fixup_done: b, t, k2); } } - - if (!b->c.level && - node_iter == &iter->l[0].iter && - iter_current_key_modified) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -void bch2_btree_node_iter_fix(struct btree_iter *iter, +void bch2_btree_node_iter_fix(struct btree_trans *trans, + struct btree_path *path, struct btree *b, struct btree_node_iter *node_iter, struct bkey_packed *where, @@ -867,31 +964,31 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter, unsigned new_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); - struct btree_iter *linked; + struct btree_path *linked; - if (node_iter != &iter->l[b->c.level].iter) { - __bch2_btree_node_iter_fix(iter, b, node_iter, t, + if (node_iter != &path->l[b->c.level].iter) { + __bch2_btree_node_iter_fix(path, b, node_iter, t, where, clobber_u64s, new_u64s); if (bch2_debug_check_iterators) bch2_btree_node_iter_verify(node_iter, b); } - trans_for_each_iter_with_node(iter->trans, b, linked) { + trans_for_each_path_with_node(trans, b, linked) { __bch2_btree_node_iter_fix(linked, b, &linked->l[b->c.level].iter, t, where, clobber_u64s, new_u64s); - bch2_btree_iter_verify_level(linked, b->c.level); + bch2_btree_path_verify_level(trans, linked, b->c.level); } } -static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, - struct btree_iter_level *l, +/* Btree path level: pointer to a particular btree node and node iter */ + +static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, + struct btree_path_level *l, struct bkey *u, struct bkey_packed *k) { - struct bkey_s_c ret; - if (unlikely(!k)) { /* * signal to bch2_btree_iter_peek_slot() that we're currently at @@ -901,58 +998,50 @@ static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, return bkey_s_c_null; } - ret = bkey_disassemble(l->b, k, u); - - /* - * XXX: bch2_btree_bset_insert_key() generates invalid keys when we - * overwrite extents - it sets k->type = KEY_TYPE_deleted on the key - * being overwritten but doesn't change k->size. But this is ok, because - * those keys are never written out, we just have to avoid a spurious - * assertion here: - */ - if (bch2_debug_check_bkeys && !bkey_deleted(ret.k)) - bch2_bkey_debugcheck(iter->trans->c, l->b, ret); - - return ret; + return bkey_disassemble(l->b, k, u); } -/* peek_all() doesn't skip deleted keys */ -static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, + struct btree_path_level *l, + struct bkey *u) { - return __btree_iter_unpack(iter, l, &iter->k, + return __btree_iter_unpack(c, l, u, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_path_level_peek(struct bch_fs *c, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(c, l, u, bch2_btree_node_iter_peek(&l->iter, l->b)); - iter->real_pos = k.k ? k.k->p : l->b->key.k.p; + path->pos = k.k ? 
k.k->p : l->b->key.k.p; return k; } -static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter, - struct btree_iter_level *l) +static inline struct bkey_s_c btree_path_level_prev(struct bch_fs *c, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) { - struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k, + struct bkey_s_c k = __btree_iter_unpack(c, l, u, bch2_btree_node_iter_prev(&l->iter, l->b)); - iter->real_pos = k.k ? k.k->p : l->b->data->min_key; + path->pos = k.k ? k.k->p : l->b->data->min_key; return k; } -static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, - struct btree_iter_level *l, +static inline bool btree_path_advance_to_pos(struct btree_path *path, + struct btree_path_level *l, int max_advance) { struct bkey_packed *k; int nr_advanced = 0; while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && - bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) { + bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { if (max_advance > 0 && nr_advanced >= max_advance) return false; @@ -966,9 +1055,11 @@ static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, /* * Verify that iterator for parent node points to child node: */ -static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) +static void btree_path_verify_new_node(struct btree_trans *trans, + struct btree_path *path, struct btree *b) { - struct btree_iter_level *l; + struct bch_fs *c = trans->c; + struct btree_path_level *l; unsigned plevel; bool parent_locked; struct bkey_packed *k; @@ -976,16 +1067,19 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) return; + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + plevel = b->c.level + 1; - if (!btree_iter_node(iter, plevel)) + if (!btree_path_node(path, plevel)) return; - parent_locked = btree_node_locked(iter, plevel); + parent_locked = btree_node_locked(path, plevel); - if (!bch2_btree_node_relock(iter, plevel)) + if (!bch2_btree_node_relock(trans, path, plevel)) return; - l = &iter->l[plevel]; + l = &path->l[plevel]; k = bch2_btree_node_iter_peek_all(&l->iter, l->b); if (!k || bkey_deleted(k) || @@ -996,8 +1090,8 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) char buf4[100]; struct bkey uk = bkey_unpack_key(b, k); - bch2_dump_btree_node(iter->trans->c, l->b); - bch2_bpos_to_text(&PBUF(buf1), iter->real_pos); + bch2_dump_btree_node(c, l->b); + bch2_bpos_to_text(&PBUF(buf1), path->pos); bch2_bkey_to_text(&PBUF(buf2), &uk); bch2_bpos_to_text(&PBUF(buf3), b->data->min_key); bch2_bpos_to_text(&PBUF(buf3), b->data->max_key); @@ -1005,20 +1099,20 @@ static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) "iter pos %s %s\n" "iter key %s\n" "new node %s-%s\n", - bch2_btree_ids[iter->btree_id], buf1, + bch2_btree_ids[path->btree_id], buf1, buf2, buf3, buf4); } if (!parent_locked) - btree_node_unlock(iter, b->c.level + 1); + btree_node_unlock(path, plevel); } -static inline void __btree_iter_init(struct btree_iter *iter, - unsigned level) +static inline void __btree_path_level_init(struct btree_path *path, + unsigned level) { - struct btree_iter_level *l = &iter->l[level]; + struct btree_path_level *l = &path->l[level]; - bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos); + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); /* * Iterators to interior nodes should always be pointed at the first non @@ -1026,63 +1120,48 @@ static 
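btree_path_advance_to_pos() above steps the node iterator forward at most max_advance times; if the target position is further away than that, it reports failure so the caller does a full re-search instead of walking the whole node. A self-contained sketch of the same bounded-advance idea over a plain sorted array:

#include <stdbool.h>
#include <stdio.h>

static bool advance_to(const unsigned *keys, unsigned nr, unsigned *idx,
		       unsigned target, int max_advance)
{
	int nr_advanced = 0;

	while (*idx < nr && keys[*idx] < target) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;	/* too far: caller re-seeks */
		(*idx)++;
		nr_advanced++;
	}
	return true;
}

int main(void)
{
	unsigned keys[] = { 1, 3, 5, 7, 9, 11 };
	unsigned idx = 0;
	bool ok;

	ok = advance_to(keys, 6, &idx, 6, 8);	/* nearby: advances */
	printf("%d (idx %u)\n", ok, idx);
	ok = advance_to(keys, 6, &idx, 100, 2);	/* too far: bails out */
	printf("%d (idx %u)\n", ok, idx);
	return 0;
}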
inline void __btree_iter_init(struct btree_iter *iter, */ if (level) bch2_btree_node_iter_peek(&l->iter, l->b); - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); } -static inline void btree_iter_node_set(struct btree_iter *iter, - struct btree *b) +static inline void btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { - BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + BUG_ON(path->cached); - btree_iter_verify_new_node(iter, b); + btree_path_verify_new_node(trans, path, b); - EBUG_ON(!btree_iter_pos_in_node(iter, b)); + EBUG_ON(!btree_path_pos_in_node(path, b)); EBUG_ON(b->c.lock.state.seq & 1); - iter->l[b->c.level].lock_seq = b->c.lock.state.seq; - iter->l[b->c.level].b = b; - __btree_iter_init(iter, b->c.level); + path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].b = b; + __btree_path_level_init(path, b->c.level); } +/* Btree path: fixups after btree node updates: */ + /* * A btree node is being replaced - update the iterator to point to the new * node: */ -void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) +void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) { - enum btree_node_locked_type t; - struct btree_iter *linked; + struct btree_path *path; - trans_for_each_iter(iter->trans, linked) - if (btree_iter_type(linked) != BTREE_ITER_CACHED && - btree_iter_pos_in_node(linked, b)) { - /* - * bch2_btree_iter_node_drop() has already been called - - * the old node we're replacing has already been - * unlocked and the pointer invalidated - */ - BUG_ON(btree_node_locked(linked, b->c.level)); + trans_for_each_path(trans, path) + if (!path->cached && + btree_path_pos_in_node(path, b)) { + enum btree_node_locked_type t = + btree_lock_want(path, b->c.level); - t = btree_lock_want(linked, b->c.level); - if (t != BTREE_NODE_UNLOCKED) { + if (path->nodes_locked && + t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(path, b->c.level); six_lock_increment(&b->c.lock, t); - mark_btree_node_locked(linked, b->c.level, t); + mark_btree_node_locked(path, b->c.level, t); } - btree_iter_node_set(linked, b); - } -} - -void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) -{ - struct btree_iter *linked; - unsigned level = b->c.level; - - trans_for_each_iter(iter->trans, linked) - if (linked->l[level].b == b) { - btree_node_unlock(linked, level); - linked->l[level].b = BTREE_ITER_NO_NODE_DROP; + btree_path_level_init(trans, path, b); } } @@ -1090,14 +1169,16 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) * A btree node has been modified in such a way as to invalidate iterators - fix * them: */ -void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) +void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) { - struct btree_iter *linked; + struct btree_path *path; - trans_for_each_iter_with_node(iter->trans, b, linked) - __btree_iter_init(linked, b->c.level); + trans_for_each_path_with_node(trans, b, path) + __btree_path_level_init(path, b->c.level); } +/* Btree path: traverse, set_pos: */ + static int lock_root_check_fn(struct six_lock *lock, void *p) { struct btree *b = container_of(lock, struct btree, c.lock); @@ -1106,38 +1187,38 @@ static int lock_root_check_fn(struct six_lock *lock, void *p) return b == *rootp ? 
0 : -1; } -static inline int btree_iter_lock_root(struct btree_trans *trans, - struct btree_iter *iter, +static inline int btree_path_lock_root(struct btree_trans *trans, + struct btree_path *path, unsigned depth_want, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; + struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; enum six_lock_type lock_type; unsigned i; - EBUG_ON(iter->nodes_locked); + EBUG_ON(path->nodes_locked); while (1) { b = READ_ONCE(*rootp); - iter->level = READ_ONCE(b->c.level); + path->level = READ_ONCE(b->c.level); - if (unlikely(iter->level < depth_want)) { + if (unlikely(path->level < depth_want)) { /* * the root is at a lower depth than the depth we want: * got to the end of the btree, or we're walking nodes * greater than some depth and there are no nodes >= * that depth */ - iter->level = depth_want; - for (i = iter->level; i < BTREE_MAX_DEPTH; i++) - iter->l[i].b = NULL; + path->level = depth_want; + for (i = path->level; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; return 1; } - lock_type = __btree_lock_want(iter, iter->level); - if (unlikely(!btree_node_lock(b, SPOS_MAX, iter->level, - iter, lock_type, + lock_type = __btree_lock_want(path, path->level); + if (unlikely(!btree_node_lock(trans, path, b, SPOS_MAX, + path->level, lock_type, lock_root_check_fn, rootp, trace_ip))) { if (trans->restarted) @@ -1146,16 +1227,16 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, } if (likely(b == READ_ONCE(*rootp) && - b->c.level == iter->level && + b->c.level == path->level && !race_fault())) { - for (i = 0; i < iter->level; i++) - iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; - iter->l[iter->level].b = b; - for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) - iter->l[i].b = NULL; - - mark_btree_node_locked(iter, iter->level, lock_type); - btree_iter_node_set(iter, b); + for (i = 0; i < path->level; i++) + path->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; + path->l[path->level].b = b; + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + + mark_btree_node_locked(path, path->level, lock_type); + btree_path_level_init(trans, path, b); return 0; } @@ -1164,23 +1245,23 @@ static inline int btree_iter_lock_root(struct btree_trans *trans, } noinline -static int btree_iter_prefetch(struct btree_iter *iter) +static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) { - struct bch_fs *c = iter->trans->c; - struct btree_iter_level *l = &iter->l[iter->level]; + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; struct bkey_buf tmp; unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) - ? (iter->level > 1 ? 0 : 2) - : (iter->level > 1 ? 1 : 16); - bool was_locked = btree_node_locked(iter, iter->level); + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 
1 : 16); + bool was_locked = btree_node_locked(path, path->level); int ret = 0; bch2_bkey_buf_init(&tmp); while (nr && !ret) { - if (!bch2_btree_node_relock(iter, iter->level)) + if (!bch2_btree_node_relock(trans, path, path->level)) break; bch2_btree_node_iter_advance(&node_iter, l->b); @@ -1189,26 +1270,62 @@ static int btree_iter_prefetch(struct btree_iter *iter) break; bch2_bkey_buf_unpack(&tmp, c, l->b, k); - ret = bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id, - iter->level - 1); + ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + path->level - 1); + } + + if (!was_locked) + btree_node_unlock(path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + +static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, + struct btree_and_journal_iter *jiter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_and_journal_iter_advance(jiter); + k = bch2_btree_and_journal_iter_peek(jiter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + path->level - 1); } if (!was_locked) - btree_node_unlock(iter, iter->level); + btree_node_unlock(path, path->level); bch2_bkey_buf_exit(&tmp, c); return ret; } -static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, +static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + struct btree_path *path, unsigned plevel, struct btree *b) { - struct btree_iter_level *l = &iter->l[plevel]; - bool locked = btree_node_locked(iter, plevel); + struct btree_path_level *l = &path->l[plevel]; + bool locked = btree_node_locked(path, plevel); struct bkey_packed *k; struct bch_btree_ptr_v2 *bp; - if (!bch2_btree_node_relock(iter, plevel)) + if (!bch2_btree_node_relock(trans, path, plevel)) return; k = bch2_btree_node_iter_peek_all(&l->iter, l->b); @@ -1218,59 +1335,96 @@ static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, bp->mem_ptr = (unsigned long)b; if (!locked) - btree_node_unlock(iter, plevel); + btree_node_unlock(path, plevel); +} + +static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + struct bkey_buf *out) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree_and_journal_iter jiter; + struct bkey_s_c k; + int ret = 0; + + __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + + k = bch2_btree_and_journal_iter_peek(&jiter); + + bch2_bkey_buf_reassemble(out, c, k); + + if (flags & BTREE_ITER_PREFETCH) + ret = btree_path_prefetch_j(trans, path, &jiter); + + bch2_btree_and_journal_iter_exit(&jiter); + return ret; } -static __always_inline int btree_iter_down(struct btree_trans *trans, - struct btree_iter *iter, +static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_iter_level *l = &iter->l[iter->level]; + struct btree_path_level *l = path_l(path); struct btree *b; - unsigned level = iter->level - 1; - enum six_lock_type lock_type = __btree_lock_want(iter, level); 
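The prefetch helpers above choose their readahead budget from whether the filesystem has finished starting up and from the level being descended: large (16 nodes) at the leaf-parent level during startup, small or zero once the filesystem is running. The helper below just restates that expression so the budget is easy to see; it is written for this example, not a kernel function.

#include <stdio.h>

static unsigned prefetch_budget(int fs_started, unsigned level)
{
	return fs_started ? (level > 1 ? 0 : 2)
			  : (level > 1 ? 1 : 16);
}

int main(void)
{
	printf("startup, level 1: %u nodes\n", prefetch_budget(0, 1));
	printf("running, level 1: %u nodes\n", prefetch_budget(1, 1));
	printf("running, level 3: %u nodes\n", prefetch_budget(1, 3));
	return 0;
}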
+ unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); + bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); struct bkey_buf tmp; int ret; - EBUG_ON(!btree_node_locked(iter, iter->level)); + EBUG_ON(!btree_node_locked(path, path->level)); bch2_bkey_buf_init(&tmp); - bch2_bkey_buf_unpack(&tmp, c, l->b, - bch2_btree_node_iter_peek(&l->iter, l->b)); - b = bch2_btree_node_get(trans, iter, tmp.k, level, lock_type, trace_ip); + if (unlikely(!replay_done)) { + ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + if (ret) + goto err; + } else { + bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (flags & BTREE_ITER_PREFETCH) { + ret = btree_path_prefetch(trans, path); + if (ret) + goto err; + } + } + + b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) goto err; - mark_btree_node_locked(iter, level, lock_type); - btree_iter_node_set(iter, b); + mark_btree_node_locked(path, level, lock_type); + btree_path_level_init(trans, path, b); - if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 && + if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && unlikely(b != btree_node_mem_ptr(tmp.k))) - btree_node_mem_ptr_set(iter, level + 1, b); - - if (iter->flags & BTREE_ITER_PREFETCH) - ret = btree_iter_prefetch(iter); + btree_node_mem_ptr_set(trans, path, level + 1, b); - if (btree_node_read_locked(iter, level + 1)) - btree_node_unlock(iter, level + 1); - iter->level = level; + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(path, level + 1); + path->level = level; - bch2_btree_iter_verify_locks(iter); + bch2_btree_path_verify_locks(path); err: bch2_bkey_buf_exit(&tmp, c); return ret; } -static int btree_iter_traverse_one(struct btree_iter *, unsigned long); +static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); -static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, +static int __btree_path_traverse_all(struct btree_trans *trans, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_path *path; int i; if (trans->in_traverse_all) @@ -1280,20 +1434,20 @@ static int __btree_iter_traverse_all(struct btree_trans *trans, int ret, retry_all: trans->restarted = false; - trans_for_each_iter(trans, iter) - iter->should_be_locked = false; + trans_for_each_path(trans, path) + path->should_be_locked = false; - btree_trans_sort_iters(trans); + btree_trans_verify_sorted(trans); for (i = trans->nr_sorted - 2; i >= 0; --i) { - struct btree_iter *iter1 = trans->iters + trans->sorted[i]; - struct btree_iter *iter2 = trans->iters + trans->sorted[i + 1]; - - if (iter1->btree_id == iter2->btree_id && - iter1->locks_want < iter2->locks_want) - __bch2_btree_iter_upgrade(iter1, iter2->locks_want); - else if (!iter1->locks_want && iter2->locks_want) - __bch2_btree_iter_upgrade(iter1, 1); + struct btree_path *path1 = trans->paths + trans->sorted[i]; + struct btree_path *path2 = trans->paths + trans->sorted[i + 1]; + + if (path1->btree_id == path2->btree_id && + path1->locks_want < path2->locks_want) + __bch2_btree_path_upgrade(trans, path1, path2->locks_want); + else if (!path1->locks_want && path2->locks_want) + __bch2_btree_path_upgrade(trans, path1, 1); } bch2_trans_unlock(trans); @@ -1310,66 +1464,89 @@ retry_all: } while (ret); } - if (unlikely(ret == -EIO)) { - trans->error = true; + if (unlikely(ret == 
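Before journal replay is done, btree_path_down() above looks up child nodes through a combined btree-plus-journal iterator instead of the node iterator alone. The sketch below models that overlay as a merged peek over two sorted streams, returning the smaller key and assuming the journal copy wins a tie because it is newer; all names and the tie-break detail are illustrative assumptions, not the bcachefs implementation.

#include <stddef.h>
#include <stdio.h>

struct toy_stream { const unsigned *keys; unsigned nr, idx; };

static const unsigned *toy_peek(const struct toy_stream *s)
{
	return s->idx < s->nr ? &s->keys[s->idx] : NULL;
}

static const unsigned *overlay_peek(const struct toy_stream *btree,
				    const struct toy_stream *journal)
{
	const unsigned *b = toy_peek(btree), *j = toy_peek(journal);

	if (b && j)
		return *j <= *b ? j : b;	/* journal copy wins a tie */
	return b ? b : j;
}

int main(void)
{
	unsigned bkeys[] = { 1, 4, 9 }, jkeys[] = { 4, 7 };
	struct toy_stream b = { bkeys, 3, 0 }, j = { jkeys, 2, 0 };

	printf("%u\n", *overlay_peek(&b, &j));	/* 1, from the btree */
	b.idx = 1;
	printf("%u\n", *overlay_peek(&b, &j));	/* 4, from the journal */
	return 0;
}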
-EIO)) goto out; - } BUG_ON(ret && ret != -EINTR); /* Now, redo traversals in correct order: */ - trans_for_each_iter_inorder(trans, iter) { - EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); - - ret = btree_iter_traverse_one(iter, _THIS_IP_); - if (ret) - goto retry_all; + i = 0; + while (i < trans->nr_sorted) { + path = trans->paths + trans->sorted[i]; - EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + /* + * Traversing a path can cause another path to be added at about + * the same position: + */ + if (path->uptodate) { + ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + if (ret) + goto retry_all; + } else { + i++; + } } - trans_for_each_iter(trans, iter) - BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + /* + * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() + * and relock(), relock() won't relock since path->should_be_locked + * isn't set yet, which is all fine + */ + trans_for_each_path(trans, path) + BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); out: bch2_btree_cache_cannibalize_unlock(c); trans->in_traverse_all = false; - trace_trans_traverse_all(trans->ip, trace_ip); + trace_trans_traverse_all(trans->fn, trace_ip); return ret; } -static int bch2_btree_iter_traverse_all(struct btree_trans *trans) +static int bch2_btree_path_traverse_all(struct btree_trans *trans) { - return __btree_iter_traverse_all(trans, 0, _RET_IP_); + return __btree_path_traverse_all(trans, 0, _RET_IP_); } -static inline bool btree_iter_good_node(struct btree_iter *iter, +static inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, unsigned l, int check_pos) { - if (!is_btree_node(iter, l) || - !bch2_btree_node_relock(iter, l)) + if (!is_btree_node(path, l) || + !bch2_btree_node_relock(trans, path, l)) return false; - if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) + if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) return false; - if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) + if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) return false; return true; } -static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, +static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, int check_pos) { - unsigned l = iter->level; + unsigned i, l = path->level; - while (btree_iter_node(iter, l) && - !btree_iter_good_node(iter, l, check_pos)) { - btree_node_unlock(iter, l); - iter->l[l].b = BTREE_ITER_NO_NODE_UP; + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) { + btree_node_unlock(path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; l++; } + /* If we need intent locks, take them too: */ + for (i = l + 1; + i < path->locks_want && btree_path_node(path, i); + i++) + if (!bch2_btree_node_relock(trans, path, i)) + while (l <= i) { + btree_node_unlock(path, l); + path->l[l].b = BTREE_ITER_NO_NODE_UP; + l++; + } + return l; } @@ -1382,131 +1559,459 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). 
*/ -static int btree_iter_traverse_one(struct btree_iter *iter, +static int btree_path_traverse_one(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, unsigned long trace_ip) { - struct btree_trans *trans = iter->trans; - unsigned l, depth_want = iter->level; + unsigned depth_want = path->level; int ret = 0; + if (unlikely(trans->restarted)) { + ret = -EINTR; + goto out; + } + /* - * Ensure we obey iter->should_be_locked: if it's set, we can't unlock - * and re-traverse the iterator without a transaction restart: + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: */ - if (iter->should_be_locked) { - ret = bch2_btree_iter_relock(iter, trace_ip) ? 0 : -EINTR; + if (path->should_be_locked) { + ret = bch2_btree_path_relock(trans, path, trace_ip) ? 0 : -EINTR; goto out; } - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { - ret = bch2_btree_iter_traverse_cached(iter); + if (path->cached) { + ret = bch2_btree_path_traverse_cached(trans, path, flags); goto out; } - if (unlikely(iter->level >= BTREE_MAX_DEPTH)) + if (unlikely(path->level >= BTREE_MAX_DEPTH)) goto out; - iter->level = btree_iter_up_until_good_node(iter, 0); - - /* If we need intent locks, take them too: */ - for (l = iter->level + 1; - l < iter->locks_want && btree_iter_node(iter, l); - l++) - if (!bch2_btree_node_relock(iter, l)) - while (iter->level <= l) { - btree_node_unlock(iter, iter->level); - iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; - iter->level++; - } + path->level = btree_path_up_until_good_node(trans, path, 0); /* - * Note: iter->nodes[iter->level] may be temporarily NULL here - that + * Note: path->nodes[path->level] may be temporarily NULL here - that * would indicate to other code that we got to the end of the btree, * here it indicates that relocking the root failed - it's critical that - * btree_iter_lock_root() comes next and that it can't fail + * btree_path_lock_root() comes next and that it can't fail */ - while (iter->level > depth_want) { - ret = btree_iter_node(iter, iter->level) - ? btree_iter_down(trans, iter, trace_ip) - : btree_iter_lock_root(trans, iter, depth_want, trace_ip); + while (path->level > depth_want) { + ret = btree_path_node(path, path->level) + ? 
btree_path_down(trans, path, flags, trace_ip) + : btree_path_lock_root(trans, path, depth_want, trace_ip); if (unlikely(ret)) { if (ret == 1) { /* - * Got to the end of the btree (in - * BTREE_ITER_NODES mode) + * No nodes at this level - got to the end of + * the btree: */ ret = 0; goto out; } - __bch2_btree_iter_unlock(iter); - iter->level = depth_want; + __bch2_btree_path_unlock(path); + path->level = depth_want; - if (ret == -EIO) { - iter->flags |= BTREE_ITER_ERROR; - iter->l[iter->level].b = + if (ret == -EIO) + path->l[path->level].b = BTREE_ITER_NO_NODE_ERROR; - } else { - iter->l[iter->level].b = + else + path->l[path->level].b = BTREE_ITER_NO_NODE_DOWN; - } goto out; } } - iter->uptodate = BTREE_ITER_NEED_PEEK; + path->uptodate = BTREE_ITER_UPTODATE; out: BUG_ON((ret == -EINTR) != !!trans->restarted); - trace_iter_traverse(trans->ip, trace_ip, - btree_iter_type(iter) == BTREE_ITER_CACHED, - iter->btree_id, &iter->real_pos, ret); - bch2_btree_iter_verify(iter); + bch2_btree_path_verify(trans, path); return ret; } -static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) -{ - struct btree_trans *trans = iter->trans; - int ret; +static int __btree_path_traverse_all(struct btree_trans *, int, unsigned long); - ret = bch2_trans_cond_resched(trans) ?: - btree_iter_traverse_one(iter, _RET_IP_); - if (unlikely(ret) && hweight64(trans->iters_linked) == 1) { - ret = __btree_iter_traverse_all(trans, ret, _RET_IP_); - BUG_ON(ret == -EINTR); - } +int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; - return ret; + return bch2_trans_cond_resched(trans) ?: + btree_path_traverse_one(trans, path, flags, _RET_IP_); } -/* - * Note: - * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is - * for internal btree iterator users - * - * bch2_btree_iter_traverse sets iter->real_pos to iter->pos, - * btree_iter_traverse() does not: - */ -static inline int __must_check -btree_iter_traverse(struct btree_iter *iter) +static void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + struct btree_path *src) { - return iter->uptodate >= BTREE_ITER_NEED_RELOCK - ? 
__bch2_btree_iter_traverse(iter) - : 0; + unsigned i; + + memcpy(&dst->pos, &src->pos, + sizeof(struct btree_path) - offsetof(struct btree_path, pos)); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_locked(dst, i)) + six_lock_increment(&dst->l[i].b->c.lock, + __btree_lock_want(dst, i)); + + btree_path_check_sort(trans, dst, 0); } -int __must_check -bch2_btree_iter_traverse(struct btree_iter *iter) +static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, + bool intent) { - int ret; + struct btree_path *new = btree_path_alloc(trans, src); - btree_iter_set_search_pos(iter, btree_iter_search_key(iter)); + btree_path_copy(trans, new, src); + __btree_path_get(new, intent); + return new; +} + +inline struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ + if (path->ref > 1 || path->preserve) { + __btree_path_put(path, intent); + path = btree_path_clone(trans, path, intent); + path->preserve = false; +#ifdef CONFIG_BCACHEFS_DEBUG + path->ip_allocated = ip; +#endif + btree_trans_verify_sorted(trans); + } + + return path; +} + +struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent, unsigned long ip) +{ + int cmp = bpos_cmp(new_pos, path->pos); + unsigned l = path->level; + + EBUG_ON(trans->restarted); + EBUG_ON(!path->ref); + + if (!cmp) + return path; + + path = bch2_btree_path_make_mut(trans, path, intent, ip); + + path->pos = new_pos; + path->should_be_locked = false; + + btree_path_check_sort(trans, path, cmp); + + if (unlikely(path->cached)) { + btree_node_unlock(path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_CACHED; + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + goto out; + } - ret = btree_iter_traverse(iter); + l = btree_path_up_until_good_node(trans, path, cmp); + + if (btree_path_node(path, l)) { + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too + * many keys just reinit it (or if we're rewinding, since that + * is expensive). 
+ */ + if (cmp < 0 || + !btree_path_advance_to_pos(path, &path->l[l], 8)) + __btree_path_level_init(path, l); + } + + if (l != path->level) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + __bch2_btree_path_unlock(path); + } +out: + bch2_btree_path_verify(trans, path); + return path; +} + +/* Btree path: main interface: */ + +static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *next; + + next = prev_btree_path(trans, path); + if (next && !btree_path_cmp(next, path)) + return next; + + next = next_btree_path(trans, path); + if (next && !btree_path_cmp(next, path)) + return next; + + return NULL; +} + +static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *next; + + next = prev_btree_path(trans, path); + if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) + return next; + + next = next_btree_path(trans, path); + if (next && next->level == path->level && path_l(next)->b == path_l(path)->b) + return next; + + return NULL; +} + +static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +{ + __bch2_btree_path_unlock(path); + btree_path_list_remove(trans, path); + trans->paths_allocated &= ~(1ULL << path->idx); +} + +void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +{ + struct btree_path *dup; + + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + /* + * Perhaps instead we should check for duplicate paths in traverse_all: + */ + if (path->preserve && + (dup = have_path_at_pos(trans, path))) { + dup->preserve = true; + path->preserve = false; + goto free; + } + + if (!path->preserve && + (dup = have_node_at_pos(trans, path))) + goto free; + return; +free: + if (path->should_be_locked && + !btree_node_locked(dup, path->level)) + return; + + dup->should_be_locked |= path->should_be_locked; + __bch2_path_free(trans, path); +} + +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + struct btree_path *path; + struct btree_insert_entry *i; + unsigned idx; + char buf1[300], buf2[300]; + + btree_trans_verify_sorted(trans); + + trans_for_each_path_inorder(trans, path, idx) + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", + path->idx, path->ref, path->intent_ref, + path->should_be_locked ? " S" : "", + path->preserve ? 
" P" : "", + bch2_btree_ids[path->btree_id], + (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), + path->nodes_locked, +#ifdef CONFIG_BCACHEFS_DEBUG + (void *) path->ip_allocated +#else + NULL +#endif + ); + + trans_for_each_update(trans, i) { + struct bkey u; + struct bkey_s_c old = bch2_btree_path_peek_slot(i->path, &u); + + printk(KERN_ERR "update: btree %s %pS\n old %s\n new %s", + bch2_btree_ids[i->btree_id], + (void *) i->ip_allocated, + (bch2_bkey_val_to_text(&PBUF(buf1), trans->c, old), buf1), + (bch2_bkey_val_to_text(&PBUF(buf2), trans->c, bkey_i_to_s_c(i->k)), buf2)); + } +} + +static struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) +{ + struct btree_path *path; + unsigned idx; + + if (unlikely(trans->paths_allocated == + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) { + bch2_dump_trans_paths_updates(trans); + panic("trans path oveflow\n"); + } + + idx = __ffs64(~trans->paths_allocated); + trans->paths_allocated |= 1ULL << idx; + + path = &trans->paths[idx]; + + path->idx = idx; + path->ref = 0; + path->intent_ref = 0; + path->nodes_locked = 0; + path->nodes_intent_locked = 0; + + btree_path_list_add(trans, pos, path); + return path; +} + +struct btree_path *bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + unsigned flags, unsigned long ip) +{ + struct btree_path *path, *path_pos = NULL; + bool cached = flags & BTREE_ITER_CACHED; + bool intent = flags & BTREE_ITER_INTENT; + int i; + + BUG_ON(trans->restarted); + + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, + btree_id, + cached, + pos, + level) > 0) + break; + + path_pos = path; + } + + if (path_pos && + path_pos->cached == cached && + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); + path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + } else { + path = btree_path_alloc(trans, path_pos); + path_pos = NULL; + + __btree_path_get(path, intent); + path->pos = pos; + path->btree_id = btree_id; + path->cached = cached; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + path->should_be_locked = false; + path->level = level; + path->locks_want = locks_want; + path->nodes_locked = 0; + path->nodes_intent_locked = 0; + for (i = 0; i < ARRAY_SIZE(path->l); i++) + path->l[i].b = BTREE_ITER_NO_NODE_INIT; +#ifdef CONFIG_BCACHEFS_DEBUG + path->ip_allocated = ip; +#endif + btree_trans_verify_sorted(trans); + } + + if (!(flags & BTREE_ITER_NOPRESERVE)) + path->preserve = true; + + if (path->intent_ref) + locks_want = max(locks_want, level + 1); + + /* + * If the path has locks_want greater than requested, we don't downgrade + * it here - on transaction restart because btree node split needs to + * upgrade locks, we might be putting/getting the iterator again. + * Downgrading iterators only happens via bch2_trans_downgrade(), after + * a successful transaction commit. + */ + + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) { + path->locks_want = locks_want; + btree_path_get_locks(trans, path, true); + } + + return path; +} + +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +{ + + struct bkey_s_c k; + + if (!path->cached) { + struct btree_path_level *l = path_l(path); + struct bkey_packed *_k; + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + + _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + k = _k ? 
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); + + if (!k.k || bpos_cmp(path->pos, k.k->p)) + goto hole; + } else { + struct bkey_cached *ck = (void *) path->l[0].b; + + EBUG_ON(ck && + (path->btree_id != ck->key.btree_id || + bkey_cmp(path->pos, ck->key.pos))); + + /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */ + if (unlikely(!ck || !ck->valid)) + return bkey_s_c_null; + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + + k = bkey_i_to_s_c(ck->k); + } + + return k; +hole: + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + +/* Btree iterators: */ + +int __must_check +__bch2_btree_iter_traverse(struct btree_iter *iter) +{ + return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); +} + +int __must_check +bch2_btree_iter_traverse(struct btree_iter *iter) +{ + int ret; + + iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); if (ret) return ret; - iter->should_be_locked = true; + iter->path->should_be_locked = true; return 0; } @@ -1514,149 +2019,134 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) { - struct btree *b; + struct btree_trans *trans = iter->trans; + struct btree *b = NULL; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) - return NULL; + goto err; - b = btree_iter_node(iter, iter->level); + b = btree_path_node(iter->path, iter->path->level); if (!b) - return NULL; + goto out; BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); - iter->pos = iter->real_pos = b->key.k.p; + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); +out: + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; return b; +err: + b = ERR_PTR(ret); + goto out; } struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { - struct btree *b; + struct btree_trans *trans = iter->trans; + struct btree_path *path = iter->path; + struct btree *b = NULL; + unsigned l; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); + BUG_ON(trans->restarted); + EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); - /* already got to end? */ - if (!btree_iter_node(iter, iter->level)) - return NULL; - - bch2_trans_cond_resched(iter->trans); - - btree_node_unlock(iter, iter->level); - iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP; - iter->level++; - - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - ret = btree_iter_traverse(iter); - if (ret) + /* already at end? */ + if (!btree_path_node(path, path->level)) return NULL; /* got to end? 
*/ - b = btree_iter_node(iter, iter->level); - if (!b) + if (!btree_path_node(path, path->level + 1)) { + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; return NULL; + } + + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + __bch2_btree_path_unlock(path); + path->l[path->level].b = BTREE_ITER_NO_NODE_GET_LOCKS; + path->l[path->level + 1].b = BTREE_ITER_NO_NODE_GET_LOCKS; + trace_trans_restart_relock_next_node(trans->fn, _THIS_IP_, + path->btree_id, &path->pos); + btree_trans_restart(trans); + ret = -EINTR; + goto err; + } + + b = btree_path_node(path, path->level + 1); - if (bpos_cmp(iter->pos, b->key.k.p) < 0) { + if (!bpos_cmp(iter->pos, b->key.k.p)) { + btree_node_unlock(path, path->level); + path->l[path->level].b = BTREE_ITER_NO_NODE_UP; + path->level++; + } else { /* * Haven't gotten to the end of the parent node: go back down to * the next child node */ - btree_iter_set_search_pos(iter, bpos_successor(iter->pos)); + path = iter->path = + bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + path->level = iter->min_depth; - /* Unlock to avoid screwing up our lock invariants: */ - btree_node_unlock(iter, iter->level); + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(path, l); - iter->level = iter->min_depth; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_iter_verify(iter); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, path, iter->flags); if (ret) - return NULL; + goto err; - b = iter->l[iter->level].b; + b = path->l[path->level].b; } - iter->pos = iter->real_pos = b->key.k.p; + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + iter->path->should_be_locked = true; + BUG_ON(iter->path->uptodate); +out: + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; return b; +err: + b = ERR_PTR(ret); + goto out; } /* Iterate across keys (in leaf nodes only) */ -static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bpos old_pos = iter->real_pos; -#endif - int cmp = bpos_cmp(new_pos, iter->real_pos); - unsigned l = iter->level; - - EBUG_ON(iter->trans->restarted); - - if (!cmp) - goto out; - - iter->real_pos = new_pos; - iter->should_be_locked = false; - - btree_iter_check_sort(iter->trans, iter); - - if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { - btree_node_unlock(iter, 0); - iter->l[0].b = BTREE_ITER_NO_NODE_CACHED; - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - return; - } - - l = btree_iter_up_until_good_node(iter, cmp); - - if (btree_iter_node(iter, l)) { - /* - * We might have to skip over many keys, or just a few: try - * advancing the node iterator, and if we have to skip over too - * many keys just reinit it (or if we're rewinding, since that - * is expensive). 
- */ - if (cmp < 0 || - !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) - __btree_iter_init(iter, l); - - /* Don't leave it locked if we're not supposed to: */ - if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) - btree_node_unlock(iter, l); - } -out: - if (l != iter->level) - btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); - else - btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); - - bch2_btree_iter_verify(iter); -#ifdef CONFIG_BCACHEFS_DEBUG - trace_iter_set_search_pos(iter->trans->ip, _RET_IP_, - iter->btree_id, - &old_pos, &new_pos, l); -#endif -} - inline bool bch2_btree_iter_advance(struct btree_iter *iter) { struct bpos pos = iter->k.p; - bool ret = bpos_cmp(pos, SPOS_MAX) != 0; + bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_cmp(pos, SPOS_MAX) + : bkey_cmp(pos, SPOS_MAX)) != 0; if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_successor(iter, pos); @@ -1677,54 +2167,177 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } -static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) +static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos pos) { - struct bpos next_pos = iter->l[0].b->key.k.p; - bool ret = bpos_cmp(next_pos, SPOS_MAX) != 0; + struct btree_insert_entry *i; - /* - * Typically, we don't want to modify iter->pos here, since that - * indicates where we searched from - unless we got to the end of the - * btree, in that case we want iter->pos to reflect that: - */ - if (ret) - btree_iter_set_search_pos(iter, bpos_successor(next_pos)); - else - bch2_btree_iter_set_pos(iter, SPOS_MAX); + trans_for_each_update(trans, i) + if ((cmp_int(btree_id, i->btree_id) ?: + bpos_cmp(pos, i->k->k.p)) <= 0) { + if (btree_id == i->btree_id) + return i->k; + break; + } - return ret; + return NULL; } -static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) +static noinline +struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans, + struct btree_path *path) { - struct bpos next_pos = iter->l[0].b->data->min_key; - bool ret = bpos_cmp(next_pos, POS_MIN) != 0; + struct journal_keys *keys = &trans->c->journal_keys; + size_t idx = bch2_journal_key_search(keys, path->btree_id, + path->level, path->pos); - if (ret) - btree_iter_set_search_pos(iter, bpos_predecessor(next_pos)); - else - bch2_btree_iter_set_pos(iter, POS_MIN); + while (idx < keys->nr && keys->d[idx].overwritten) + idx++; - return ret; + return (idx < keys->nr && + keys->d[idx].btree_id == path->btree_id && + keys->d[idx].level == path->level) + ? keys->d[idx].k + : NULL; } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, - struct bpos pos) +static noinline +struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { - struct btree_insert_entry *i; + struct bkey_i *next_journal = + __btree_trans_peek_journal(trans, iter->path); - if (!(iter->flags & BTREE_ITER_WITH_UPDATES)) - return NULL; + if (next_journal && + bpos_cmp(next_journal->k.p, + k.k ? 
k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + iter->k = next_journal->k; + k = bkey_i_to_s_c(next_journal); + } - trans_for_each_update(iter->trans, i) - if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: - bkey_cmp(pos, i->k->k.p)) <= 0) { - if (iter->btree_id == i->iter->btree_id) - return i->k; + return k; +} + +/* + * Checks btree key cache for key at iter->pos and returns it if present, or + * bkey_s_c_null: + */ +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey u; + int ret; + + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) + return bkey_s_c_null; + + if (!iter->key_cache_path) + iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, + iter->flags & BTREE_ITER_INTENT, 0, + iter->flags|BTREE_ITER_CACHED, + _THIS_IP_); + + iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + iter->key_cache_path->should_be_locked = true; + + return bch2_btree_path_peek_slot(iter->key_cache_path, &u); +} + +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) +{ + struct btree_trans *trans = iter->trans; + struct bkey_i *next_update; + struct bkey_s_c k, k2; + int ret; + + EBUG_ON(iter->path->cached || iter->path->level); + bch2_btree_iter_verify(iter); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out; + } + + iter->path->should_be_locked = true; + + k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k); + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + ret = bkey_err(k2); + if (ret) { + k = k2; + bch2_btree_iter_set_pos(iter, iter->pos); + goto out; + } + + k = k2; + iter->k = *k.k; + } + + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + k = btree_trans_peek_journal(trans, iter, k); + + next_update = iter->flags & BTREE_ITER_WITH_UPDATES + ? btree_trans_peek_updates(trans, iter->btree_id, search_key) + : NULL; + if (next_update && + bpos_cmp(next_update->k.p, + k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } + + if (k.k && bkey_deleted(k.k)) { + /* + * If we've got a whiteout, and it's after the search + * key, advance the search key to the whiteout instead + * of just after the whiteout - it might be a btree + * whiteout, with a real key at the same position, since + * in the btree deleted keys sort before non deleted. + */ + search_key = bpos_cmp(search_key, k.k->p) + ? 
k.k->p + : bpos_successor(k.k->p); + continue; + } + + if (likely(k.k)) { break; + } else if (likely(bpos_cmp(iter->path->l[0].b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ + search_key = bpos_successor(iter->path->l[0].b->key.k.p); + } else { + /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; } + } +out: + bch2_btree_iter_verify(iter); - return NULL; + return k; } /** @@ -1733,42 +2346,79 @@ static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter, */ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); - struct bkey_i *next_update; struct bkey_s_c k; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); - bch2_btree_iter_verify(iter); + if (iter->update_path) { + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + bch2_btree_iter_verify_entry_exit(iter); -start: - next_update = btree_trans_peek_updates(iter, search_key); - btree_iter_set_search_pos(iter, search_key); while (1) { - ret = btree_iter_traverse(iter); - if (unlikely(ret)) - return bkey_s_c_err(ret); + k = __bch2_btree_iter_peek(iter, search_key); + if (!k.k || bkey_err(k)) + goto out; - k = btree_iter_level_peek(iter, &iter->l[0]); + if (iter->update_path && + bkey_cmp(iter->update_path->pos, k.k->p)) { + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } - if (next_update && - bpos_cmp(next_update->k.p, iter->real_pos) <= 0) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + (iter->flags & BTREE_ITER_INTENT) && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && + !iter->update_path) { + struct bpos pos = k.k->p; + + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } + + pos.snapshot = iter->snapshot; + + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + iter->update_path = iter->path; + + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + BUG_ON(!(iter->update_path->nodes_locked & 1)); + iter->update_path->should_be_locked = true; + } + + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; } - if (likely(k.k)) { - if (bkey_deleted(k.k)) { - search_key = bkey_successor(iter, k.k->p); - goto start; - } - - break; + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_successor(iter, k.k->p); + continue; } - if (!btree_iter_set_pos_to_next_leaf(iter)) - return bkey_s_c_null; + break; } /* @@ -1780,9 +2430,28 @@ start: else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) iter->pos = bkey_start_pos(k.k); + iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + BUG_ON(!iter->path->nodes_locked); +out: + if (iter->update_path) { + BUG_ON(!(iter->update_path->nodes_locked & 1)); + iter->update_path->should_be_locked = true; + } + iter->path->should_be_locked = true; + 
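The bkey_deleted() branch in __bch2_btree_iter_peek() above re-searches at the whiteout's own position and only steps past it when the search key was already there, because deleted keys sort before non-deleted keys at the same position. A toy sketch of that rule using plain 64-bit positions (an illustration-only simplification; the real code uses struct bpos and bpos_successor()):

#include <stdint.h>
#include <stdio.h>

/*
 * Illustration only: where to search next after hitting a deleted key,
 * using plain 64-bit positions instead of struct bpos.
 */
static uint64_t next_search_pos(uint64_t search_key, uint64_t whiteout_pos)
{
	/*
	 * If the whiteout lies ahead of where we searched from, search again
	 * at the whiteout itself -- a live key may sort immediately after it
	 * at the same position.  Only step past it once we are already there.
	 */
	return search_key != whiteout_pos ? whiteout_pos : whiteout_pos + 1;
}

int main(void)
{
	printf("%llu\n", (unsigned long long) next_search_pos(10, 17)); /* 17 */
	printf("%llu\n", (unsigned long long) next_search_pos(17, 17)); /* 18 */
	return 0;
}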
+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + iter->pos.snapshot = iter->snapshot; + + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) { + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + } + bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); - iter->should_be_locked = true; + return k; } @@ -1804,37 +2473,103 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) */ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) { - struct btree_iter_level *l = &iter->l[0]; + struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; + struct btree_path *saved_path = NULL; struct bkey_s_c k; + struct bkey saved_k; + const struct bch_val *saved_v; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); + EBUG_ON(iter->path->cached || iter->path->level); EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + + if (iter->flags & BTREE_ITER_WITH_JOURNAL) + return bkey_s_c_err(-EIO); + bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); - btree_iter_set_search_pos(iter, iter->pos); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; while (1) { - ret = btree_iter_traverse(iter); + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto no_key; + goto out; } - k = btree_iter_level_peek(iter, l); + k = btree_path_level_peek(trans->c, iter->path, + &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) - ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0 - : bkey_cmp(k.k->p, iter->pos) > 0)) - k = btree_iter_level_prev(iter, l); + ? 
bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 + : bpos_cmp(k.k->p, search_key) > 0)) + k = btree_path_level_prev(trans->c, iter->path, + &iter->path->l[0], &iter->k); - if (likely(k.k)) - break; + btree_path_check_sort(trans, iter->path, 0); + + if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (k.k->p.snapshot == iter->snapshot) + goto got_key; + + /* + * If we have a saved candidate, and we're no + * longer at the same _key_ (not pos), return + * that candidate + */ + if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = saved_path; + saved_path = NULL; + iter->k = saved_k; + k.v = saved_v; + goto got_key; + } + + if (bch2_snapshot_is_ancestor(iter->trans->c, + iter->snapshot, + k.k->p.snapshot)) { + if (saved_path) + bch2_path_put(trans, saved_path, + iter->flags & BTREE_ITER_INTENT); + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + saved_k = *k.k; + saved_v = k.v; + } + + search_key = bpos_predecessor(k.k->p); + continue; + } +got_key: + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_predecessor(iter, k.k->p); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + continue; + } - if (!btree_iter_set_pos_to_prev_leaf(iter)) { + break; + } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ + search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + } else { + /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); k = bkey_s_c_null; - goto no_key; + goto out; } } @@ -1843,20 +2578,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) /* Extents can straddle iter->pos: */ if (bkey_cmp(k.k->p, iter->pos) < 0) iter->pos = k.k->p; + + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; out: + if (saved_path) + bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + iter->path->should_be_locked = true; + bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; + return k; -no_key: - /* - * btree_iter_level_peek() may have set iter->k to a key we didn't want, and - * then we errored going to the previous leaf - make sure it's - * consistent with iter->pos: - */ - bkey_init(&iter->k); - iter->k.p = iter->pos; - goto out; } /** @@ -1873,12 +2606,12 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) { + struct btree_trans *trans = iter->trans; struct bpos search_key; struct bkey_s_c k; int ret; - EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS && - btree_iter_type(iter) != BTREE_ITER_CACHED); + EBUG_ON(iter->path->level); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -1892,50 +2625,57 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } search_key = btree_iter_search_key(iter); - btree_iter_set_search_pos(iter, search_key); + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); - ret = btree_iter_traverse(iter); + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) return bkey_s_c_err(ret); - if (btree_iter_type(iter) == BTREE_ITER_CACHED || - !(iter->flags & BTREE_ITER_IS_EXTENTS)) { + if ((iter->flags & BTREE_ITER_CACHED) || + !(iter->flags & 
(BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { struct bkey_i *next_update; - struct bkey_cached *ck; - switch (btree_iter_type(iter)) { - case BTREE_ITER_KEYS: - k = btree_iter_level_peek_all(iter, &iter->l[0]); - EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, iter->pos) == 0); - break; - case BTREE_ITER_CACHED: - ck = (void *) iter->l[0].b; - EBUG_ON(iter->btree_id != ck->key.btree_id || - bkey_cmp(iter->pos, ck->key.pos)); - BUG_ON(!ck->valid); - - k = bkey_i_to_s_c(ck->k); - break; - case BTREE_ITER_NODES: - BUG(); + if ((iter->flags & BTREE_ITER_WITH_UPDATES) && + (next_update = btree_trans_peek_updates(trans, + iter->btree_id, search_key)) && + !bpos_cmp(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + goto out; } - next_update = btree_trans_peek_updates(iter, search_key); - if (next_update && - (!k.k || bpos_cmp(next_update->k.p, k.k->p) <= 0)) { + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + (next_update = __btree_trans_peek_journal(trans, iter->path)) && + !bpos_cmp(next_update->k.p, iter->pos)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); + goto out; } + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + if (!bkey_err(k)) + iter->k = *k.k; + goto out; + } + + k = bch2_btree_path_peek_slot(iter->path, &iter->k); } else { - if ((iter->flags & BTREE_ITER_INTENT)) { - struct btree_iter *child = - btree_iter_child_alloc(iter, _THIS_IP_); + struct bpos next; - btree_iter_copy(child, iter); - k = bch2_btree_iter_peek(child); + if (iter->flags & BTREE_ITER_INTENT) { + struct btree_iter iter2; - if (k.k && !bkey_err(k)) - iter->k = child->k; + bch2_trans_copy_iter(&iter2, iter); + k = bch2_btree_iter_peek(&iter2); + + if (k.k && !bkey_err(k)) { + iter->k = iter2.k; + k.k = &iter->k; + } + bch2_trans_iter_exit(trans, &iter2); } else { struct bpos pos = iter->pos; @@ -1945,38 +2685,34 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (unlikely(bkey_err(k))) return k; - } - if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) { - if (!k.k || - ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS) - ? bpos_cmp(iter->pos, k.k->p) - : bkey_cmp(iter->pos, k.k->p))) { - bkey_init(&iter->k); - iter->k.p = iter->pos; - k = (struct bkey_s_c) { &iter->k, NULL }; - } - } else { - struct bpos next = k.k ? bkey_start_pos(k.k) : POS_MAX; + next = k.k ? bkey_start_pos(k.k) : POS_MAX; if (bkey_cmp(iter->pos, next) < 0) { bkey_init(&iter->k); iter->k.p = iter->pos; - bch2_key_resize(&iter->k, - min_t(u64, KEY_SIZE_MAX, - (next.inode == iter->pos.inode - ? next.offset - : KEY_OFFSET_MAX) - - iter->pos.offset)); + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, + (next.inode == iter->pos.inode + ? 
next.offset + : KEY_OFFSET_MAX) - + iter->pos.offset)); + EBUG_ON(!iter->k.size); + } k = (struct bkey_s_c) { &iter->k, NULL }; - EBUG_ON(!k.k->size); } } +out: + iter->path->should_be_locked = true; bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - iter->should_be_locked = true; + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); return k; } @@ -1997,35 +2733,14 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } -static inline void bch2_btree_iter_init(struct btree_trans *trans, - struct btree_iter *iter, enum btree_id btree_id) -{ - struct bch_fs *c = trans->c; - unsigned i; - - iter->trans = trans; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - iter->btree_id = btree_id; - iter->real_pos = POS_MIN; - iter->level = 0; - iter->min_depth = 0; - iter->locks_want = 0; - iter->nodes_locked = 0; - iter->nodes_intent_locked = 0; - for (i = 0; i < ARRAY_SIZE(iter->l); i++) - iter->l[i].b = BTREE_ITER_NO_NODE_INIT; - - prefetch(c->btree_roots[btree_id].b); -} - /* new transactional stuff: */ -static inline void btree_iter_verify_sorted_ref(struct btree_trans *trans, - struct btree_iter *iter) +static inline void btree_path_verify_sorted_ref(struct btree_trans *trans, + struct btree_path *path) { - EBUG_ON(iter->sorted_idx >= trans->nr_sorted); - EBUG_ON(trans->sorted[iter->sorted_idx] != iter->idx); - EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + EBUG_ON(path->sorted_idx >= trans->nr_sorted); + EBUG_ON(trans->sorted[path->sorted_idx] != path->idx); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); } static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) @@ -2034,432 +2749,201 @@ static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) unsigned i; for (i = 0; i < trans->nr_sorted; i++) - btree_iter_verify_sorted_ref(trans, trans->iters + trans->sorted[i]); + btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]); #endif } -static inline void btree_trans_verify_sorted(struct btree_trans *trans) +static void btree_trans_verify_sorted(struct btree_trans *trans) { #ifdef CONFIG_BCACHEFS_DEBUG - struct btree_iter *iter, *prev = NULL; + struct btree_path *path, *prev = NULL; + unsigned i; - trans_for_each_iter_inorder(trans, iter) - BUG_ON(prev && btree_iter_cmp(prev, iter) > 0); + trans_for_each_path_inorder(trans, path, i) { + BUG_ON(prev && btree_path_cmp(prev, path) > 0); + prev = path; + } #endif } -static inline void btree_iter_swap(struct btree_trans *trans, - struct btree_iter *l, struct btree_iter *r) +static inline void btree_path_swap(struct btree_trans *trans, + struct btree_path *l, struct btree_path *r) { swap(l->sorted_idx, r->sorted_idx); swap(trans->sorted[l->sorted_idx], trans->sorted[r->sorted_idx]); - btree_iter_verify_sorted_ref(trans, l); - btree_iter_verify_sorted_ref(trans, r); + btree_path_verify_sorted_ref(trans, l); + btree_path_verify_sorted_ref(trans, r); } -static void btree_trans_sort_iters(struct btree_trans *trans) +static void btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, + int cmp) { - bool swapped = false; - int i, l = 0, r = trans->nr_sorted; - - while (1) { - for (i = l; i + 1 < r; i++) { - if (btree_iter_cmp(trans->iters + trans->sorted[i], - trans->iters + trans->sorted[i + 1]) > 0) { - swap(trans->sorted[i], trans->sorted[i + 1]); - trans->iters[trans->sorted[i]].sorted_idx = i; - trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; - 
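The btree_trans_sort_iters() pass being removed in this hunk re-sorted every iterator; its replacement, btree_path_check_sort() just below, only bubbles the one path whose position changed past its neighbours. A standalone sketch of that single-element fixup over a plain int array, helper names invented for illustration:

#include <stdio.h>

static void swap_int(int *x, int *y) { int t = *x; *x = *y; *y = t; }

/*
 * Illustration only (names invented): after a[i] changes, restore sorted
 * order by swapping it past its neighbours, the way btree_path_check_sort()
 * re-sorts a single btree_path after its position moves.
 */
static void check_sort(int *a, unsigned n, unsigned i)
{
	while (i > 0 && a[i - 1] > a[i]) {	/* bubble towards the front */
		swap_int(&a[i - 1], &a[i]);
		i--;
	}
	while (i + 1 < n && a[i] > a[i + 1]) {	/* or towards the back */
		swap_int(&a[i], &a[i + 1]);
		i++;
	}
}

int main(void)
{
	int a[] = { 1, 3, 5, 7, 9 };
	unsigned i;

	a[3] = 2;		/* one element's position changes... */
	check_sort(a, 5, 3);	/* ...and only that element is moved */

	for (i = 0; i < 5; i++)
		printf("%d ", a[i]);
	printf("\n");		/* prints: 1 2 3 5 9 */
	return 0;
}

This keeps trans->sorted ordered after a position change without touching the other paths.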
swapped = true; - } - } + struct btree_path *n; - if (!swapped) - break; + if (cmp <= 0) { + n = prev_btree_path(trans, path); + if (n && btree_path_cmp(n, path) > 0) { + do { + btree_path_swap(trans, n, path); + n = prev_btree_path(trans, path); + } while (n && btree_path_cmp(n, path) > 0); - r--; - swapped = false; - - for (i = r - 2; i >= l; --i) { - if (btree_iter_cmp(trans->iters + trans->sorted[i], - trans->iters + trans->sorted[i + 1]) > 0) { - swap(trans->sorted[i], - trans->sorted[i + 1]); - trans->iters[trans->sorted[i]].sorted_idx = i; - trans->iters[trans->sorted[i + 1]].sorted_idx = i + 1; - swapped = true; - } + goto out; } - - if (!swapped) - break; - - l++; - swapped = false; - } - - btree_trans_verify_sorted_refs(trans); - btree_trans_verify_sorted(trans); -} - -static void btree_iter_check_sort(struct btree_trans *trans, struct btree_iter *iter) -{ - struct btree_iter *n; - - EBUG_ON(iter->sorted_idx == U8_MAX); - - n = next_btree_iter(trans, iter); - if (n && btree_iter_cmp(iter, n) > 0) { - do { - btree_iter_swap(trans, iter, n); - n = next_btree_iter(trans, iter); - } while (n && btree_iter_cmp(iter, n) > 0); - - return; } - n = prev_btree_iter(trans, iter); - if (n && btree_iter_cmp(n, iter) > 0) { - do { - btree_iter_swap(trans, n, iter); - n = prev_btree_iter(trans, iter); - } while (n && btree_iter_cmp(n, iter) > 0); + if (cmp >= 0) { + n = next_btree_path(trans, path); + if (n && btree_path_cmp(path, n) > 0) { + do { + btree_path_swap(trans, path, n); + n = next_btree_path(trans, path); + } while (n && btree_path_cmp(path, n) > 0); + } } - +out: btree_trans_verify_sorted(trans); } -static inline void btree_iter_list_remove(struct btree_trans *trans, - struct btree_iter *iter) +static inline void btree_path_list_remove(struct btree_trans *trans, + struct btree_path *path) { unsigned i; - EBUG_ON(iter->sorted_idx >= trans->nr_sorted); + EBUG_ON(path->sorted_idx >= trans->nr_sorted); - array_remove_item(trans->sorted, trans->nr_sorted, iter->sorted_idx); + array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); - for (i = iter->sorted_idx; i < trans->nr_sorted; i++) - trans->iters[trans->sorted[i]].sorted_idx = i; + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; - iter->sorted_idx = U8_MAX; + path->sorted_idx = U8_MAX; btree_trans_verify_sorted_refs(trans); } -static inline void btree_iter_list_add(struct btree_trans *trans, - struct btree_iter *pos, - struct btree_iter *iter) +static inline void btree_path_list_add(struct btree_trans *trans, + struct btree_path *pos, + struct btree_path *path) { unsigned i; btree_trans_verify_sorted_refs(trans); - iter->sorted_idx = pos ? pos->sorted_idx : trans->nr_sorted; + path->sorted_idx = pos ? 
pos->sorted_idx + 1 : 0; - array_insert_item(trans->sorted, trans->nr_sorted, iter->sorted_idx, iter->idx); + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); - for (i = iter->sorted_idx; i < trans->nr_sorted; i++) - trans->iters[trans->sorted[i]].sorted_idx = i; + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; btree_trans_verify_sorted_refs(trans); } -static void btree_iter_child_free(struct btree_iter *iter) -{ - struct btree_iter *child = btree_iter_child(iter); - - if (child) { - bch2_trans_iter_free(iter->trans, child); - iter->child_idx = U8_MAX; - } -} - -static struct btree_iter *btree_iter_child_alloc(struct btree_iter *iter, - unsigned long ip) -{ - struct btree_trans *trans = iter->trans; - struct btree_iter *child = btree_iter_child(iter); - - if (!child) { - child = btree_trans_iter_alloc(trans, iter); - child->ip_allocated = ip; - iter->child_idx = child->idx; - - trans->iters_live |= 1ULL << child->idx; - trans->iters_touched |= 1ULL << child->idx; - } - - return child; -} - -static inline void __bch2_trans_iter_free(struct btree_trans *trans, - unsigned idx) -{ - btree_iter_child_free(&trans->iters[idx]); - - btree_iter_list_remove(trans, &trans->iters[idx]); - - __bch2_btree_iter_unlock(&trans->iters[idx]); - trans->iters_linked &= ~(1ULL << idx); - trans->iters_live &= ~(1ULL << idx); - trans->iters_touched &= ~(1ULL << idx); -} - -int bch2_trans_iter_put(struct btree_trans *trans, - struct btree_iter *iter) -{ - int ret; - - if (IS_ERR_OR_NULL(iter)) - return 0; - - BUG_ON(trans->iters + iter->idx != iter); - BUG_ON(!btree_iter_live(trans, iter)); - - ret = btree_iter_err(iter); - - if (!(trans->iters_touched & (1ULL << iter->idx)) && - !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) - __bch2_trans_iter_free(trans, iter->idx); - - trans->iters_live &= ~(1ULL << iter->idx); - return ret; -} - -int bch2_trans_iter_free(struct btree_trans *trans, - struct btree_iter *iter) -{ - if (IS_ERR_OR_NULL(iter)) - return 0; - - set_btree_iter_dontneed(trans, iter); - - return bch2_trans_iter_put(trans, iter); -} - -noinline __cold -static void btree_trans_iter_alloc_fail(struct btree_trans *trans) -{ - - struct btree_iter *iter; - struct btree_insert_entry *i; - char buf[100]; - - btree_trans_sort_iters(trans); - - trans_for_each_iter_inorder(trans, iter) - printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n", - bch2_btree_ids[iter->btree_id], - (bch2_bpos_to_text(&PBUF(buf), iter->real_pos), buf), - btree_iter_live(trans, iter) ? " live" : "", - (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", - iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", - (void *) iter->ip_allocated); - - trans_for_each_update(trans, i) { - char buf[300]; - - bch2_bkey_val_to_text(&PBUF(buf), trans->c, bkey_i_to_s_c(i->k)); - printk(KERN_ERR "update: btree %s %s\n", - bch2_btree_ids[i->iter->btree_id], buf); - } - panic("trans iter oveflow\n"); -} - -static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans, - struct btree_iter *pos) -{ - struct btree_iter *iter; - unsigned idx; - - if (unlikely(trans->iters_linked == - ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) - btree_trans_iter_alloc_fail(trans); - - idx = __ffs64(~trans->iters_linked); - iter = &trans->iters[idx]; - - iter->trans = trans; - iter->idx = idx; - iter->child_idx = U8_MAX; - iter->sorted_idx = U8_MAX; - iter->flags = 0; - iter->nodes_locked = 0; - iter->nodes_intent_locked = 0; - trans->iters_linked |= 1ULL << idx; - - btree_iter_list_add(trans, pos, iter); - return iter; -} - -static void btree_iter_copy(struct btree_iter *dst, struct btree_iter *src) +void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { - unsigned i; - - __bch2_btree_iter_unlock(dst); - btree_iter_child_free(dst); - - memcpy(&dst->flags, &src->flags, - sizeof(struct btree_iter) - offsetof(struct btree_iter, flags)); - - for (i = 0; i < BTREE_MAX_DEPTH; i++) - if (btree_node_locked(dst, i)) - six_lock_increment(&dst->l[i].b->c.lock, - __btree_lock_want(dst, i)); - - dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; - dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; - - btree_iter_check_sort(dst->trans, dst); + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + if (iter->update_path) + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + if (iter->key_cache_path) + bch2_path_put(trans, iter->key_cache_path, + iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; + iter->update_path = NULL; + iter->key_cache_path = NULL; } -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, - unsigned btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) +static void __bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags, + unsigned long ip) { - struct btree_iter *iter, *best = NULL; - struct bpos real_pos, pos_min = POS_MIN; - EBUG_ON(trans->restarted); - if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && - btree_node_type_is_extents(btree_id) && - !(flags & BTREE_ITER_NOT_EXTENTS) && - !(flags & BTREE_ITER_ALL_SNAPSHOTS)) + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + btree_node_type_is_extents(btree_id)) flags |= BTREE_ITER_IS_EXTENTS; - if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES && + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && !btree_type_has_snapshots(btree_id)) flags &= ~BTREE_ITER_ALL_SNAPSHOTS; - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) - pos.snapshot = btree_type_has_snapshots(btree_id) - ? 
U32_MAX : 0; - - real_pos = pos; - - if ((flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(pos, POS_MAX)) - real_pos = bpos_nosnap_successor(pos); - - trans_for_each_iter(trans, iter) { - if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) - continue; - - if (iter->btree_id != btree_id) - continue; - - if (best) { - int cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos), - bpos_diff(iter->real_pos, real_pos)); - - if (cmp < 0 || - ((cmp == 0 && btree_iter_keep(trans, iter)))) - continue; - } - - best = iter; - } - - if (!best) { - iter = btree_trans_iter_alloc(trans, NULL); - bch2_btree_iter_init(trans, iter, btree_id); - } else if (btree_iter_keep(trans, best)) { - iter = btree_trans_iter_alloc(trans, best); - btree_iter_copy(iter, best); - } else { - iter = best; - } - - trans->iters_live |= 1ULL << iter->idx; - trans->iters_touched |= 1ULL << iter->idx; - - iter->flags = flags; - - iter->snapshot = pos.snapshot; + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; - /* - * If the iterator has locks_want greater than requested, we explicitly - * do not downgrade it here - on transaction restart because btree node - * split needs to upgrade locks, we might be putting/getting the - * iterator again. Downgrading iterators only happens via an explicit - * bch2_trans_downgrade(). - */ - - locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > iter->locks_want) { - iter->locks_want = locks_want; - btree_iter_get_locks(iter, true, _THIS_IP_); - } + if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags)) + flags |= BTREE_ITER_WITH_JOURNAL; - while (iter->level != depth) { - btree_node_unlock(iter, iter->level); - iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; - iter->uptodate = BTREE_ITER_NEED_TRAVERSE; - if (iter->level < depth) - iter->level++; - else - iter->level--; - } + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + iter->trans = trans; + iter->path = NULL; + iter->update_path = NULL; + iter->key_cache_path = NULL; + iter->btree_id = btree_id; iter->min_depth = depth; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k.type = KEY_TYPE_deleted; + iter->k.p = pos; + iter->k.size = 0; +#ifdef CONFIG_BCACHEFS_DEBUG + iter->ip_allocated = ip; +#endif - bch2_btree_iter_set_pos(iter, pos); - btree_iter_set_search_pos(iter, real_pos); - - trace_trans_get_iter(_RET_IP_, trans->ip, - btree_id, - &real_pos, locks_want, iter->uptodate, - best ? &best->real_pos : &pos_min, - best ? best->locks_want : U8_MAX, - best ? 
best->uptodate : U8_MAX); - - return iter; + iter->path = bch2_path_get(trans, btree_id, iter->pos, + locks_want, depth, flags, ip); } -struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags) +void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) { - struct btree_iter *iter = - __bch2_trans_get_iter(trans, btree_id, pos, - locks_want, depth, - BTREE_ITER_NODES| - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - flags); - - BUG_ON(bkey_cmp(iter->pos, pos)); - BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH)); - BUG_ON(iter->level != depth); - BUG_ON(iter->min_depth != depth); - iter->ip_allocated = _RET_IP_; - - return iter; + __bch2_trans_iter_init(trans, iter, btree_id, pos, + 0, 0, flags, _RET_IP_); } -struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, - struct btree_iter *src) +void bch2_trans_node_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id btree_id, + struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) { - struct btree_iter *iter; - - iter = btree_trans_iter_alloc(trans, src); - btree_iter_copy(iter, src); - - trans->iters_live |= 1ULL << iter->idx; - /* - * We don't need to preserve this iter since it's cheap to copy it - * again - this will cause trans_iter_put() to free it right away: - */ - set_btree_iter_dontneed(trans, iter); + __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, + BTREE_ITER_NOT_EXTENTS| + __BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_ALL_SNAPSHOTS| + flags, _RET_IP_); + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->path->level != depth); + BUG_ON(iter->min_depth != depth); +} - return iter; +void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) +{ + *dst = *src; + if (src->path) + __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); + if (src->update_path) + __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); + dst->key_cache_path = NULL; } void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) @@ -2488,7 +2972,7 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) trans->mem_bytes = new_bytes; if (old_bytes) { - trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, new_bytes); + trace_trans_restart_mem_realloced(trans->fn, _RET_IP_, new_bytes); btree_trans_restart(trans); return ERR_PTR(-EINTR); } @@ -2500,20 +2984,6 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } -inline void bch2_trans_unlink_iters(struct btree_trans *trans) -{ - u64 iters = trans->iters_linked & - ~trans->iters_touched & - ~trans->iters_live; - - while (iters) { - unsigned idx = __ffs64(iters); - - iters &= ~(1ULL << idx); - __bch2_trans_iter_free(trans, idx); - } -} - /** * bch2_trans_begin() - reset a transaction after a interrupted attempt * @trans: transaction to reset @@ -2524,19 +2994,13 @@ inline void bch2_trans_unlink_iters(struct btree_trans *trans) */ void bch2_trans_begin(struct btree_trans *trans) { - struct btree_iter *iter; - - trans_for_each_iter(trans, iter) - iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| - BTREE_ITER_SET_POS_AFTER_COMMIT); + struct btree_insert_entry *i; + struct btree_path *path; - /* - * XXX: we shouldn't be doing this if the transaction was restarted, but - * currently we still overflow transaction iterators if we 
do that - * */ - bch2_trans_unlink_iters(trans); - trans->iters_touched &= trans->iters_live; + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); trans->extra_journal_res = 0; trans->nr_updates = 0; trans->mem_top = 0; @@ -2552,48 +3016,59 @@ void bch2_trans_begin(struct btree_trans *trans) (void *) &trans->fs_usage_deltas->memset_start); } + trans_for_each_path(trans, path) { + path->should_be_locked = false; + + /* + * XXX: we probably shouldn't be doing this if the transaction + * was restarted, but currently we still overflow transaction + * iterators if we do that + */ + if (!path->ref && !path->preserve) + __bch2_path_free(trans, path); + else if (!path->ref) + path->preserve = false; + } + bch2_trans_cond_resched(trans); if (trans->restarted) - bch2_btree_iter_traverse_all(trans); + bch2_btree_path_traverse_all(trans); trans->restarted = false; } -static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) +static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) { - size_t iters_bytes = sizeof(struct btree_iter) * BTREE_ITER_MAX; + size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; - size_t sorted_bytes = sizeof(u8) * BTREE_ITER_MAX; void *p = NULL; BUG_ON(trans->used_mempool); #ifdef __KERNEL__ - p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); + p = this_cpu_xchg(c->btree_paths_bufs->path , NULL); #endif if (!p) - p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); + p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); - trans->iters = p; p += iters_bytes; + trans->paths = p; p += paths_bytes; trans->updates = p; p += updates_bytes; - trans->sorted = p; p += sorted_bytes; } -void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, - unsigned expected_nr_iters, - size_t expected_mem_bytes) +void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes, + const char *fn) __acquires(&c->btree_trans_barrier) { + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + memset(trans, 0, sizeof(*trans)); trans->c = c; - trans->ip = _RET_IP_; + trans->fn = fn; - /* - * reallocating iterators currently completely breaks - * bch2_trans_iter_put(), we always allocate the max: - */ - bch2_trans_alloc_iters(trans, c); + bch2_trans_alloc_paths(trans, c); if (expected_mem_bytes) { trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); @@ -2607,62 +3082,67 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -#ifdef CONFIG_BCACHEFS_DEBUG trans->pid = current->pid; mutex_lock(&c->btree_trans_lock); list_add(&trans->list, &c->btree_trans_list); mutex_unlock(&c->btree_trans_lock); +} + +static void check_btree_paths_leaked(struct btree_trans *trans) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->ref) + goto leaked; + return; +leaked: + bch_err(c, "btree paths leaked from %s!", trans->fn); + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", + bch2_btree_ids[path->btree_id], + (void *) path->ip_allocated); + /* Be noisy about this: */ + bch2_fatal_error(c); #endif } -int bch2_trans_exit(struct btree_trans *trans) +void bch2_trans_exit(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { + 
struct btree_insert_entry *i; struct bch_fs *c = trans->c; bch2_trans_unlock(trans); -#ifdef CONFIG_BCACHEFS_DEBUG - if (trans->iters_live) { - struct btree_iter *iter; - - trans_for_each_iter(trans, iter) - btree_iter_child_free(iter); - } - - if (trans->iters_live) { - struct btree_iter *iter; + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); + trans->nr_updates = 0; - bch_err(c, "btree iterators leaked!"); - trans_for_each_iter(trans, iter) - if (btree_iter_live(trans, iter)) - printk(KERN_ERR " btree %s allocated at %pS\n", - bch2_btree_ids[iter->btree_id], - (void *) iter->ip_allocated); - /* Be noisy about this: */ - bch2_fatal_error(c); - } + check_btree_paths_leaked(trans); - mutex_lock(&trans->c->btree_trans_lock); + mutex_lock(&c->btree_trans_lock); list_del(&trans->list); - mutex_unlock(&trans->c->btree_trans_lock); -#endif + mutex_unlock(&c->btree_trans_lock); srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); - bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (trans->fs_usage_deltas) { if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == REPLICAS_DELTA_LIST_MAX) mempool_free(trans->fs_usage_deltas, - &trans->c->replicas_delta_pool); + &c->replicas_delta_pool); else kfree(trans->fs_usage_deltas); } if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) - mempool_free(trans->mem, &trans->c->btree_trans_mem_pool); + mempool_free(trans->mem, &c->btree_trans_mem_pool); else kfree(trans->mem); @@ -2670,74 +3150,70 @@ int bch2_trans_exit(struct btree_trans *trans) /* * Userspace doesn't have a real percpu implementation: */ - trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); + trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); #endif - if (trans->iters) - mempool_free(trans->iters, &trans->c->btree_iters_pool); + if (trans->paths) + mempool_free(trans->paths, &c->btree_paths_pool); trans->mem = (void *) 0x1; - trans->iters = (void *) 0x1; - - return trans->error ? 
-EIO : 0; + trans->paths = (void *) 0x1; } static void __maybe_unused -bch2_btree_iter_node_to_text(struct printbuf *out, +bch2_btree_path_node_to_text(struct printbuf *out, struct btree_bkey_cached_common *_b, - enum btree_iter_type type) + bool cached) { pr_buf(out, " l=%u %s:", _b->level, bch2_btree_ids[_b->btree_id]); - bch2_bpos_to_text(out, btree_node_pos(_b, type)); + bch2_bpos_to_text(out, btree_node_pos(_b, cached)); } -#ifdef CONFIG_BCACHEFS_DEBUG -static bool trans_has_btree_nodes_locked(struct btree_trans *trans) +static bool trans_has_locks(struct btree_trans *trans) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - if (btree_iter_type(iter) != BTREE_ITER_CACHED && - iter->nodes_locked) + trans_for_each_path(trans, path) + if (path->nodes_locked) return true; return false; } -#endif void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) { -#ifdef CONFIG_BCACHEFS_DEBUG struct btree_trans *trans; - struct btree_iter *iter; + struct btree_path *path; struct btree *b; + static char lock_types[] = { 'r', 'i', 'w' }; unsigned l; mutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { - if (!trans_has_btree_nodes_locked(trans)) + if (!trans_has_locks(trans)) continue; - pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip); + pr_buf(out, "%i %s\n", trans->pid, trans->fn); - trans_for_each_iter(trans, iter) { - if (!iter->nodes_locked) + trans_for_each_path(trans, path) { + if (!path->nodes_locked) continue; - pr_buf(out, " iter %u %c %s:", - iter->idx, - btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', - bch2_btree_ids[iter->btree_id]); - bch2_bpos_to_text(out, iter->pos); + pr_buf(out, " path %u %c l=%u %s:", + path->idx, + path->cached ? 'c' : 'b', + path->level, + bch2_btree_ids[path->btree_id]); + bch2_bpos_to_text(out, path->pos); pr_buf(out, "\n"); for (l = 0; l < BTREE_MAX_DEPTH; l++) { - if (btree_node_locked(iter, l)) { + if (btree_node_locked(path, l)) { pr_buf(out, " %s l=%u ", - btree_node_intent_locked(iter, l) ? "i" : "r", l); - bch2_btree_iter_node_to_text(out, - (void *) iter->l[l].b, - btree_iter_type(iter)); + btree_node_intent_locked(path, l) ? "i" : "r", l); + bch2_btree_path_node_to_text(out, + (void *) path->l[l].b, + path->cached); pr_buf(out, "\n"); } } @@ -2745,44 +3221,47 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) b = READ_ONCE(trans->locking); if (b) { - iter = &trans->iters[trans->locking_iter_idx]; - pr_buf(out, " locking iter %u %c l=%u %s:", - trans->locking_iter_idx, - btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b', + path = &trans->paths[trans->locking_path_idx]; + pr_buf(out, " locking path %u %c l=%u %c %s:", + trans->locking_path_idx, + path->cached ? 
'c' : 'b', trans->locking_level, + lock_types[trans->locking_lock_type], bch2_btree_ids[trans->locking_btree_id]); bch2_bpos_to_text(out, trans->locking_pos); pr_buf(out, " node "); - bch2_btree_iter_node_to_text(out, - (void *) b, - btree_iter_type(iter)); + bch2_btree_path_node_to_text(out, + (void *) b, path->cached); pr_buf(out, "\n"); } } mutex_unlock(&c->btree_trans_lock); -#endif } void bch2_fs_btree_iter_exit(struct bch_fs *c) { + if (c->btree_trans_barrier_initialized) + cleanup_srcu_struct(&c->btree_trans_barrier); mempool_exit(&c->btree_trans_mem_pool); - mempool_exit(&c->btree_iters_pool); - cleanup_srcu_struct(&c->btree_trans_barrier); + mempool_exit(&c->btree_paths_pool); } int bch2_fs_btree_iter_init(struct bch_fs *c) { unsigned nr = BTREE_ITER_MAX; + int ret; INIT_LIST_HEAD(&c->btree_trans_list); mutex_init(&c->btree_trans_lock); - return init_srcu_struct(&c->btree_trans_barrier) ?: - mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, - sizeof(u8) * nr + - sizeof(struct btree_iter) * nr + + ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + sizeof(struct btree_path) * nr + sizeof(struct btree_insert_entry) * nr) ?: mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, - BTREE_TRANS_MEM_MAX); + BTREE_TRANS_MEM_MAX) ?: + init_srcu_struct(&c->btree_trans_barrier); + if (!ret) + c->btree_trans_barrier_initialized = true; + return ret; } diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 39124e6..759c7b5 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -5,138 +5,158 @@ #include "bset.h" #include "btree_types.h" -static inline void btree_iter_set_dirty(struct btree_iter *iter, - enum btree_iter_uptodate u) +static inline void __btree_path_get(struct btree_path *path, bool intent) { - iter->uptodate = max_t(unsigned, iter->uptodate, u); + path->ref++; + path->intent_ref += intent; } -static inline struct btree *btree_iter_node(struct btree_iter *iter, - unsigned level) +static inline bool __btree_path_put(struct btree_path *path, bool intent) { - return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL; + EBUG_ON(!path->ref); + EBUG_ON(!path->intent_ref && intent); + path->intent_ref -= intent; + return --path->ref == 0; } -static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, - const struct btree *b, unsigned level) +static inline void btree_path_set_dirty(struct btree_path *path, + enum btree_path_uptodate u) { - /* - * We don't compare the low bits of the lock sequence numbers because - * @iter might have taken a write lock on @b, and we don't want to skip - * the linked iterator if the sequence numbers were equal before taking - * that write lock. The lock sequence number is incremented by taking - * and releasing write locks and is even when unlocked: - */ - return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; + path->uptodate = max_t(unsigned, path->uptodate, u); } -static inline struct btree *btree_node_parent(struct btree_iter *iter, - struct btree *b) +static inline struct btree *btree_path_node(struct btree_path *path, + unsigned level) { - return btree_iter_node(iter, b->c.level + 1); + return level < BTREE_MAX_DEPTH ? 
path->l[level].b : NULL; } -static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) +static inline bool btree_node_lock_seq_matches(const struct btree_path *path, + const struct btree *b, unsigned level) { - return hweight64(trans->iters_linked) > 1; + /* + * We don't compare the low bits of the lock sequence numbers because + * @path might have taken a write lock on @b, and we don't want to skip + * the linked path if the sequence numbers were equal before taking that + * write lock. The lock sequence number is incremented by taking and + * releasing write locks and is even when unlocked: + */ + return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; } -static inline int btree_iter_err(const struct btree_iter *iter) +static inline struct btree *btree_node_parent(struct btree_path *path, + struct btree *b) { - return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; + return btree_path_node(path, b->c.level + 1); } -/* Iterate over iters within a transaction: */ +/* Iterate over paths within a transaction: */ -static inline struct btree_iter * -__trans_next_iter(struct btree_trans *trans, unsigned idx) +static inline struct btree_path * +__trans_next_path(struct btree_trans *trans, unsigned idx) { u64 l; if (idx == BTREE_ITER_MAX) return NULL; - l = trans->iters_linked >> idx; + l = trans->paths_allocated >> idx; if (!l) return NULL; idx += __ffs64(l); EBUG_ON(idx >= BTREE_ITER_MAX); - EBUG_ON(trans->iters[idx].idx != idx); - return &trans->iters[idx]; + EBUG_ON(trans->paths[idx].idx != idx); + return &trans->paths[idx]; } -#define trans_for_each_iter(_trans, _iter) \ - for (_iter = __trans_next_iter((_trans), 0); \ - (_iter); \ - _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) +#define trans_for_each_path(_trans, _path) \ + for (_path = __trans_next_path((_trans), 0); \ + (_path); \ + _path = __trans_next_path((_trans), (_path)->idx + 1)) -static inline struct btree_iter *next_btree_iter(struct btree_trans *trans, struct btree_iter *iter) +static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { - unsigned idx = iter ? iter->sorted_idx + 1 : 0; + unsigned idx = path ? path->sorted_idx + 1 : 0; EBUG_ON(idx > trans->nr_sorted); return idx < trans->nr_sorted - ? trans->iters + trans->sorted[idx] + ? trans->paths + trans->sorted[idx] : NULL; } -static inline struct btree_iter *prev_btree_iter(struct btree_trans *trans, struct btree_iter *iter) +static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) { - EBUG_ON(iter->sorted_idx >= trans->nr_sorted); - return iter->sorted_idx - ? trans->iters + trans->sorted[iter->sorted_idx - 1] + EBUG_ON(path->sorted_idx >= trans->nr_sorted); + return path->sorted_idx + ? 
trans->paths + trans->sorted[path->sorted_idx - 1] : NULL; } -#define trans_for_each_iter_inorder(_trans, _iter) \ - for (_iter = next_btree_iter(trans, NULL); \ - (_iter); \ - _iter = next_btree_iter((_trans), (_iter))) +#define trans_for_each_path_inorder(_trans, _path, _i) \ + for (_i = 0; \ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ + _i++) -static inline bool __iter_has_node(const struct btree_iter *iter, +static inline bool __path_has_node(const struct btree_path *path, const struct btree *b) { - return iter->l[b->c.level].b == b && - btree_node_lock_seq_matches(iter, b, b->c.level); + return path->l[b->c.level].b == b && + btree_node_lock_seq_matches(path, b, b->c.level); } -static inline struct btree_iter * -__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, +static inline struct btree_path * +__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, unsigned idx) { - struct btree_iter *iter = __trans_next_iter(trans, idx); + struct btree_path *path = __trans_next_path(trans, idx); - while (iter && !__iter_has_node(iter, b)) - iter = __trans_next_iter(trans, iter->idx + 1); + while (path && !__path_has_node(path, b)) + path = __trans_next_path(trans, path->idx + 1); - return iter; + return path; } -#define trans_for_each_iter_with_node(_trans, _b, _iter) \ - for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ - (_iter); \ - _iter = __trans_next_iter_with_node((_trans), (_b), \ - (_iter)->idx + 1)) +#define trans_for_each_path_with_node(_trans, _b, _path) \ + for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ + (_path); \ + _path = __trans_next_path_with_node((_trans), (_b), \ + (_path)->idx + 1)) + +struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); +struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, + struct bpos, bool, unsigned long); +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); +struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, + unsigned, unsigned, unsigned, unsigned long); +inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); #ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); -void bch2_btree_trans_verify_locks(struct btree_trans *); +void bch2_trans_verify_paths(struct btree_trans *); +void bch2_trans_verify_locks(struct btree_trans *); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, + struct bpos, bool); #else -static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, - struct btree *b) {} -static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} +static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} +static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) {} #endif -void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, - struct bkey_packed *); -void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bkey_packed *, - unsigned, unsigned); +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *, struct bkey_packed *); +void bch2_btree_node_iter_fix(struct btree_trans *trans, struct 
btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); -bool bch2_btree_iter_relock_intent(struct btree_iter *); -bool bch2_btree_iter_relock(struct btree_iter *, unsigned long); +bool bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); + +void bch2_path_put(struct btree_trans *, struct btree_path *, bool); bool bch2_trans_relock(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); @@ -149,35 +169,39 @@ static inline int btree_trans_restart(struct btree_trans *trans) return -EINTR; } -bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); +bool bch2_btree_node_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + +bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); -static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, +static inline bool bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned new_locks_want) { new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - return iter->locks_want < new_locks_want - ? __bch2_btree_iter_upgrade(iter, new_locks_want) - : iter->uptodate <= BTREE_ITER_NEED_PEEK; + return path->locks_want < new_locks_want + ? __bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE; } -void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); +void __bch2_btree_path_downgrade(struct btree_path *, unsigned); -static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) +static inline void bch2_btree_path_downgrade(struct btree_path *path) { - unsigned new_locks_want = iter->level + !!(iter->flags & BTREE_ITER_INTENT); + unsigned new_locks_want = path->level + !!path->intent_ref; - if (iter->locks_want > new_locks_want) - __bch2_btree_iter_downgrade(iter, new_locks_want); + if (path->locks_want > new_locks_want) + __bch2_btree_path_downgrade(path, new_locks_want); } void bch2_trans_downgrade(struct btree_trans *); -void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); -void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); - -void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); +void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); int __must_check bch2_btree_iter_traverse(struct btree_iter *); struct btree *bch2_btree_iter_peek_node(struct btree_iter *); @@ -196,17 +220,26 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); bool bch2_btree_iter_advance(struct btree_iter *); bool bch2_btree_iter_rewind(struct btree_iter *); -static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { - if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) - new_pos.snapshot = iter->snapshot; - iter->k.type = KEY_TYPE_deleted; iter->k.p.inode = iter->pos.inode = new_pos.inode; iter->k.p.offset = iter->pos.offset = new_pos.offset; iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; iter->k.size = 0; - iter->should_be_locked = false; +} + +static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + if (unlikely(iter->update_path)) + bch2_path_put(iter->trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + + if (!(iter->flags & 
BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + + __bch2_btree_iter_set_pos(iter, new_pos); } static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) @@ -215,45 +248,62 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } -static inline struct btree_iter *idx_to_btree_iter(struct btree_trans *trans, unsigned idx) +static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) { - return idx != U8_MAX ? trans->iters + idx : NULL; + struct bpos pos = iter->pos; + + iter->snapshot = snapshot; + pos.snapshot = snapshot; + bch2_btree_iter_set_pos(iter, pos); } -static inline struct btree_iter *btree_iter_child(struct btree_iter *iter) +void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); +void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, + unsigned, struct bpos, unsigned); +void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); +void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); + +static inline void set_btree_iter_dontneed(struct btree_iter *iter) { - return idx_to_btree_iter(iter->trans, iter->child_idx); + iter->path->preserve = false; } -/* - * Unlocks before scheduling - * Note: does not revalidate iterator - */ -static inline int bch2_trans_cond_resched(struct btree_trans *trans) +void *bch2_trans_kmalloc(struct btree_trans *, size_t); +void bch2_trans_begin(struct btree_trans *); + +static inline struct btree * +__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) { - if (need_resched() || race_fault()) { - bch2_trans_unlock(trans); - schedule(); - return bch2_trans_relock(trans) ? 0 : -EINTR; - } else { - return 0; - } + struct btree *b; + + while (b = bch2_btree_iter_peek_node(iter), + PTR_ERR_OR_ZERO(b) == -EINTR) + bch2_trans_begin(trans); + + return b; } -#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _locks_want, _depth, _flags, _b) \ - for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ - _start, _locks_want, _depth, _flags), \ - _b = bch2_btree_iter_peek_node(_iter); \ - (_b); \ - (_b) = bch2_btree_iter_next_node(_iter)) +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _ret) \ + for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\ + !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ + (_b) = bch2_btree_iter_next_node(&(_iter))) #define for_each_btree_node(_trans, _iter, _btree_id, _start, \ - _flags, _b) \ + _flags, _b, _ret) \ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ - 0, 0, _flags, _b) + 0, 0, _flags, _b, _ret) + +static inline int bkey_err(struct bkey_s_c k) +{ + return PTR_ERR_OR_ZERO(k.k); +} -static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, unsigned flags) { return flags & BTREE_ITER_SLOTS @@ -261,92 +311,62 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, : bch2_btree_iter_peek(iter); } -static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, - unsigned flags) +static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - return flags & BTREE_ITER_SLOTS - ? 
bch2_btree_iter_next_slot(iter) - : bch2_btree_iter_next(iter); + return hweight64(trans->paths_allocated) > BTREE_ITER_MAX / 2 + ? -EINTR : 0; } -static inline int bkey_err(struct bkey_s_c k) +static inline struct bkey_s_c +__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return PTR_ERR_OR_ZERO(k.k); + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(iter, flags), + bkey_err(k) == -EINTR)) + bch2_trans_begin(trans); + + return k; } #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id), \ - (_start), (_flags)), \ - (_k) = __bch2_btree_iter_peek(_iter, _flags); \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) + bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ - for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ - (_k) = __bch2_btree_iter_next(_iter, _flags)) - -/* new multiple iterator interface: */ - -int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); -int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); + bch2_btree_iter_advance(&(_iter))) -void bch2_trans_unlink_iters(struct btree_trans *); - -struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, - struct bpos, unsigned, - unsigned, unsigned); - -static inline struct btree_iter * -bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, - struct bpos pos, unsigned flags) -{ - struct btree_iter *iter = - __bch2_trans_get_iter(trans, btree_id, pos, - (flags & BTREE_ITER_INTENT) != 0, 0, - flags); - iter->ip_allocated = _THIS_IP_; - return iter; -} - -struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, - struct btree_iter *); -static inline struct btree_iter * -bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) -{ - struct btree_iter *iter = - __bch2_trans_copy_iter(trans, src); - - iter->ip_allocated = _THIS_IP_; - return iter; -} - -struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, - enum btree_id, struct bpos, - unsigned, unsigned, unsigned); - -static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter) -{ - return (trans->iters_live & (1ULL << iter->idx)) != 0; -} +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ + for (; \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) -static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter) -{ - return btree_iter_live(trans, iter) || - (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -} +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for (; \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) -static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter 
*iter) -{ - trans->iters_touched &= ~(1ULL << iter->idx); -} +/* new multiple iterator interface: */ -void bch2_trans_begin(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); +void __bch2_trans_init(struct btree_trans *, struct bch_fs *, + unsigned, size_t, const char *); +void bch2_trans_exit(struct btree_trans *); -void *bch2_trans_kmalloc(struct btree_trans *, size_t); -void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); -int bch2_trans_exit(struct btree_trans *); +#define bch2_trans_init(...) __bch2_trans_init(__VA_ARGS__, __func__) void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index e327ef3..928aab6 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -146,23 +146,32 @@ bkey_cached_reuse(struct btree_key_cache *c) } static struct bkey_cached * -btree_key_cache_create(struct btree_key_cache *c, +btree_key_cache_create(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) { + struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck; bool was_new = true; - ck = bkey_cached_alloc(c); + ck = bkey_cached_alloc(bc); if (unlikely(!ck)) { - ck = bkey_cached_reuse(c); - if (unlikely(!ck)) + ck = bkey_cached_reuse(bc); + if (unlikely(!ck)) { + bch_err(c, "error allocating memory for key cache item, btree %s", + bch2_btree_ids[btree_id]); return ERR_PTR(-ENOMEM); + } was_new = false; } + if (btree_id == BTREE_ID_subvolumes) + six_lock_pcpu_alloc(&ck->c.lock); + else + six_lock_pcpu_free(&ck->c.lock); + ck->c.level = 0; ck->c.btree_id = btree_id; ck->key.btree_id = btree_id; @@ -170,7 +179,7 @@ btree_key_cache_create(struct btree_key_cache *c, ck->valid = false; ck->flags = 1U << BKEY_CACHED_ACCESSED; - if (unlikely(rhashtable_lookup_insert_fast(&c->table, + if (unlikely(rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params))) { /* We raced with another fill: */ @@ -180,15 +189,15 @@ btree_key_cache_create(struct btree_key_cache *c, six_unlock_intent(&ck->c.lock); kfree(ck); } else { - mutex_lock(&c->lock); - bkey_cached_free(c, ck); - mutex_unlock(&c->lock); + mutex_lock(&bc->lock); + bkey_cached_free(bc, ck); + mutex_unlock(&bc->lock); } return NULL; } - atomic_long_inc(&c->nr_keys); + atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); @@ -196,24 +205,27 @@ btree_key_cache_create(struct btree_key_cache *c, } static int btree_key_cache_fill(struct btree_trans *trans, - struct btree_iter *ck_iter, + struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_iter *iter; + struct btree_path *path; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; + struct bkey u; int ret; - iter = bch2_trans_get_iter(trans, ck->key.btree_id, - ck->key.pos, BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + path = bch2_path_get(trans, ck->key.btree_id, + ck->key.pos, 0, 0, 0, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, path, 0); if (ret) goto err; - if (!bch2_btree_node_relock(ck_iter, 0)) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); + k = bch2_btree_path_peek_slot(path, &u); + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + trace_trans_restart_relock_key_cache_fill(trans->fn, + _THIS_IP_, ck_path->btree_id, &ck_path->pos); ret = btree_trans_restart(trans); goto err; } @@ -228,6 +240,8 @@ static int btree_key_cache_fill(struct btree_trans *trans, new_u64s = roundup_pow_of_two(new_u64s); new_k = 
kmalloc(new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[ck->key.btree_id], new_u64s); ret = -ENOMEM; goto err; } @@ -237,7 +251,7 @@ static int btree_key_cache_fill(struct btree_trans *trans, * XXX: not allowed to be holding read locks when we take a write lock, * currently */ - bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); + bch2_btree_node_lock_write(trans, ck_path, ck_path->l[0].b); if (new_k) { kfree(ck->k); ck->u64s = new_u64s; @@ -246,93 +260,91 @@ static int btree_key_cache_fill(struct btree_trans *trans, bkey_reassemble(ck->k, k); ck->valid = true; - bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); + bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - set_btree_iter_dontneed(trans, iter); + path->preserve = false; err: - bch2_trans_iter_put(trans, iter); + bch2_path_put(trans, path, 0); return ret; } static int bkey_cached_check_fn(struct six_lock *lock, void *p) { struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); - const struct btree_iter *iter = p; + const struct btree_path *path = p; - return ck->key.btree_id == iter->btree_id && - !bpos_cmp(ck->key.pos, iter->pos) ? 0 : -1; + return ck->key.btree_id == path->btree_id && + !bpos_cmp(ck->key.pos, path->pos) ? 0 : -1; } __flatten -int bch2_btree_iter_traverse_cached(struct btree_iter *iter) +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey_cached *ck; int ret = 0; - BUG_ON(iter->level); + BUG_ON(path->level); - iter->l[1].b = NULL; + path->l[1].b = NULL; - if (bch2_btree_node_relock(iter, 0)) { - ck = (void *) iter->l[0].b; + if (bch2_btree_node_relock(trans, path, 0)) { + ck = (void *) path->l[0].b; goto fill; } retry: - ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) { - if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { - iter->l[0].b = NULL; + if (flags & BTREE_ITER_CACHED_NOCREATE) { + path->l[0].b = NULL; return 0; } - ck = btree_key_cache_create(&c->btree_key_cache, - iter->btree_id, iter->pos); + ck = btree_key_cache_create(c, path->btree_id, path->pos); ret = PTR_ERR_OR_ZERO(ck); if (ret) goto err; if (!ck) goto retry; - mark_btree_node_locked(iter, 0, SIX_LOCK_intent); - iter->locks_want = 1; + mark_btree_node_locked(path, 0, SIX_LOCK_intent); + path->locks_want = 1; } else { - enum six_lock_type lock_want = __btree_lock_want(iter, 0); + enum six_lock_type lock_want = __btree_lock_want(path, 0); - if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, - bkey_cached_check_fn, iter, _THIS_IP_)) { + if (!btree_node_lock(trans, path, (void *) ck, path->pos, 0, + lock_want, + bkey_cached_check_fn, path, _THIS_IP_)) { if (!trans->restarted) goto retry; - trace_transaction_restart_ip(trans->ip, _THIS_IP_); ret = -EINTR; goto err; } - if (ck->key.btree_id != iter->btree_id || - bpos_cmp(ck->key.pos, iter->pos)) { + if (ck->key.btree_id != path->btree_id || + bpos_cmp(ck->key.pos, path->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } - mark_btree_node_locked(iter, 0, lock_want); + mark_btree_node_locked(path, 0, lock_want); } - iter->l[0].lock_seq = ck->c.lock.state.seq; - iter->l[0].b = (void *) ck; + path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].b = (void *) ck; fill: - 
if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { - if (!iter->locks_want && - !!__bch2_btree_iter_upgrade(iter, 1)) { - trace_transaction_restart_ip(trans->ip, _THIS_IP_); - BUG_ON(!trans->restarted); - ret = -EINTR; + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { + trace_transaction_restart_ip(trans->fn, _THIS_IP_); + ret = btree_trans_restart(trans); goto err; } - ret = btree_key_cache_fill(trans, iter, ck); + ret = btree_key_cache_fill(trans, path, ck); if (ret) goto err; } @@ -340,22 +352,14 @@ fill: if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - iter->uptodate = BTREE_ITER_NEED_PEEK; - - if ((iter->flags & BTREE_ITER_INTENT) && - !bch2_btree_iter_upgrade(iter, 1)) { - BUG_ON(!trans->restarted); - ret = -EINTR; - } - - BUG_ON(!ret && !btree_node_locked(iter, 0)); + path->uptodate = BTREE_ITER_UPTODATE; + BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); return ret; err: if (ret != -EINTR) { - btree_node_unlock(iter, 0); - iter->flags |= BTREE_ITER_ERROR; - iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; + btree_node_unlock(path, 0); + path->l[0].b = BTREE_ITER_NO_NODE_ERROR; } return ret; } @@ -368,40 +372,48 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct journal *j = &c->journal; - struct btree_iter *c_iter = NULL, *b_iter = NULL; + struct btree_iter c_iter, b_iter; struct bkey_cached *ck = NULL; int ret; - b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT); - c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_CACHED_NOCREATE| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(c_iter); + bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT| + BTREE_ITER_ALL_SNAPSHOTS); + bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_CACHED_NOCREATE| + BTREE_ITER_INTENT); + b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + + ret = bch2_btree_iter_traverse(&c_iter); if (ret) goto out; - ck = (void *) c_iter->l[0].b; - if (!ck || - (journal_seq && ck->journal.seq != journal_seq)) + ck = (void *) c_iter.path->l[0].b; + if (!ck) goto out; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - if (!evict) - goto out; - goto evict; + if (evict) + goto evict; + goto out; } + BUG_ON(!ck->valid); + + if (journal_seq && ck->journal.seq != journal_seq) + goto out; + /* * Since journal reclaim depends on us making progress here, and the * allocator/copygc depend on journal reclaim making progress, we need * to be using alloc reserves: * */ - ret = bch2_btree_iter_traverse(b_iter) ?: - bch2_trans_update(trans, b_iter, ck->k, + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_KEY_CACHE_RECLAIM| BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, @@ -423,7 +435,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_journal_pin_drop(j, &ck->journal); bch2_journal_preres_put(j, &ck->res); - BUG_ON(!btree_node_locked(c_iter, 0)); + BUG_ON(!btree_node_locked(c_iter.path, 0)); if (!evict) { if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { @@ -432,10 +444,10 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, } } else { evict: - 
BUG_ON(!btree_node_intent_locked(c_iter, 0)); + BUG_ON(!btree_node_intent_locked(c_iter.path, 0)); - mark_btree_node_unlocked(c_iter, 0); - c_iter->l[0].b = NULL; + mark_btree_node_unlocked(c_iter.path, 0); + c_iter.path->l[0].b = NULL; six_lock_write(&ck->c.lock, NULL, NULL); @@ -451,8 +463,8 @@ evict: mutex_unlock(&c->btree_key_cache.lock); } out: - bch2_trans_iter_put(trans, b_iter); - bch2_trans_iter_put(trans, c_iter); + bch2_trans_iter_exit(trans, &b_iter); + bch2_trans_iter_exit(trans, &c_iter); return ret; } @@ -503,11 +515,11 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, } bool bch2_btree_insert_key_cached(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, struct bkey_i *insert) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) iter->l[0].b; + struct bkey_cached *ck = (void *) path->l[0].b; bool kick_reclaim = false; BUG_ON(insert->u64s > ck->u64s); @@ -664,11 +676,12 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) rcu_read_lock(); tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); - for (i = 0; i < tbl->size; i++) - rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { - bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed); - } + if (tbl) + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &bc->freed); + } rcu_read_unlock(); list_for_each_entry_safe(ck, n, &bc->freed, list) { diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index 7e2b0a0..b3d241b 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); size_t max_dirty = 4096 + (nr_keys * 3) / 4; - return nr_dirty > max_dirty && - test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); + return nr_dirty > max_dirty; } int bch2_btree_key_cache_journal_flush(struct journal *, @@ -26,10 +25,11 @@ int bch2_btree_key_cache_journal_flush(struct journal *, struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); -int bch2_btree_iter_traverse_cached(struct btree_iter *); +int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, + unsigned); bool bch2_btree_insert_key_cached(struct btree_trans *, - struct btree_iter *, struct bkey_i *); + struct btree_path *, struct bkey_i *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 7532bcd..b4434ec 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -21,7 +21,7 @@ enum btree_node_locked_type { BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, }; -static inline int btree_node_locked_type(struct btree_iter *iter, +static inline int btree_node_locked_type(struct btree_path *path, unsigned level) { /* @@ -30,35 +30,35 @@ static inline int btree_node_locked_type(struct btree_iter *iter, * branches: */ return BTREE_NODE_UNLOCKED + - ((iter->nodes_locked >> level) & 1) + - ((iter->nodes_intent_locked >> level) & 1); + ((path->nodes_locked >> level) & 1) + + ((path->nodes_intent_locked >> level) & 1); } -static inline bool btree_node_intent_locked(struct btree_iter *iter, +static inline bool btree_node_intent_locked(struct btree_path *path, unsigned level) { - return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; + 
return btree_node_locked_type(path, level) == BTREE_NODE_INTENT_LOCKED; } -static inline bool btree_node_read_locked(struct btree_iter *iter, +static inline bool btree_node_read_locked(struct btree_path *path, unsigned level) { - return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; + return btree_node_locked_type(path, level) == BTREE_NODE_READ_LOCKED; } -static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) +static inline bool btree_node_locked(struct btree_path *path, unsigned level) { - return iter->nodes_locked & (1 << level); + return path->nodes_locked & (1 << level); } -static inline void mark_btree_node_unlocked(struct btree_iter *iter, +static inline void mark_btree_node_unlocked(struct btree_path *path, unsigned level) { - iter->nodes_locked &= ~(1 << level); - iter->nodes_intent_locked &= ~(1 << level); + path->nodes_locked &= ~(1 << level); + path->nodes_intent_locked &= ~(1 << level); } -static inline void mark_btree_node_locked(struct btree_iter *iter, +static inline void mark_btree_node_locked(struct btree_path *path, unsigned level, enum six_lock_type type) { @@ -66,52 +66,52 @@ static inline void mark_btree_node_locked(struct btree_iter *iter, BUILD_BUG_ON(SIX_LOCK_read != 0); BUILD_BUG_ON(SIX_LOCK_intent != 1); - iter->nodes_locked |= 1 << level; - iter->nodes_intent_locked |= type << level; + path->nodes_locked |= 1 << level; + path->nodes_intent_locked |= type << level; } -static inline void mark_btree_node_intent_locked(struct btree_iter *iter, +static inline void mark_btree_node_intent_locked(struct btree_path *path, unsigned level) { - mark_btree_node_locked(iter, level, SIX_LOCK_intent); + mark_btree_node_locked(path, level, SIX_LOCK_intent); } -static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) { - return level < iter->locks_want + return level < path->locks_want ? 
SIX_LOCK_intent : SIX_LOCK_read; } static inline enum btree_node_locked_type -btree_lock_want(struct btree_iter *iter, int level) +btree_lock_want(struct btree_path *path, int level) { - if (level < iter->level) + if (level < path->level) return BTREE_NODE_UNLOCKED; - if (level < iter->locks_want) + if (level < path->locks_want) return BTREE_NODE_INTENT_LOCKED; - if (level == iter->level) + if (level == path->level) return BTREE_NODE_READ_LOCKED; return BTREE_NODE_UNLOCKED; } -static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) +static inline void btree_node_unlock(struct btree_path *path, unsigned level) { - int lock_type = btree_node_locked_type(iter, level); + int lock_type = btree_node_locked_type(path, level); EBUG_ON(level >= BTREE_MAX_DEPTH); if (lock_type != BTREE_NODE_UNLOCKED) - six_unlock_type(&iter->l[level].b->c.lock, lock_type); - mark_btree_node_unlocked(iter, level); + six_unlock_type(&path->l[level].b->c.lock, lock_type); + mark_btree_node_unlocked(path, level); } -static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) +static inline void __bch2_btree_path_unlock(struct btree_path *path) { - btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); + btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); - while (iter->nodes_locked) - btree_node_unlock(iter, __ffs(iter->nodes_locked)); + while (path->nodes_locked) + btree_node_unlock(path, __ffs(path->nodes_locked)); } static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) @@ -128,23 +128,35 @@ static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) } } -/* - * wrapper around six locks that just traces lock contended time - */ -static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, - enum six_lock_type type) +static inline bool btree_node_lock_type(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct bpos pos, unsigned level, + enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) { - u64 start_time = local_clock(); + struct bch_fs *c = trans->c; + u64 start_time; + bool ret; - six_lock_type(&b->c.lock, type, NULL, NULL); - bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); -} + if (six_trylock_type(&b->c.lock, type)) + return true; -static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, - enum six_lock_type type) -{ - if (!six_trylock_type(&b->c.lock, type)) - __btree_node_lock_type(c, b, type); + start_time = local_clock(); + + trans->locking_path_idx = path->idx; + trans->locking_pos = pos; + trans->locking_btree_id = path->btree_id; + trans->locking_level = level; + trans->locking_lock_type = type; + trans->locking = b; + ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0; + trans->locking = NULL; + + if (ret) + bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); + + return ret; } /* @@ -155,11 +167,11 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, struct btree *b, unsigned level, enum btree_node_locked_type want) { - struct btree_iter *iter; + struct btree_path *path; - trans_for_each_iter(trans, iter) - if (iter->l[level].b == b && - btree_node_locked_type(iter, level) >= want) { + trans_for_each_path(trans, path) + if (path->l[level].b == b && + btree_node_locked_type(path, level) >= want) { six_lock_increment(&b->c.lock, want); return true; } @@ -167,40 +179,39 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, return false; } -bool 
__bch2_btree_node_lock(struct btree *, struct bpos, unsigned, - struct btree_iter *, enum six_lock_type, +bool __bch2_btree_node_lock(struct btree_trans *, struct btree_path *, + struct btree *, struct bpos, unsigned, + enum six_lock_type, six_lock_should_sleep_fn, void *, unsigned long); -static inline bool btree_node_lock(struct btree *b, - struct bpos pos, unsigned level, - struct btree_iter *iter, +static inline bool btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, struct bpos pos, unsigned level, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p, unsigned long ip) { - struct btree_trans *trans = iter->trans; - EBUG_ON(level >= BTREE_MAX_DEPTH); - EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); return likely(six_trylock_type(&b->c.lock, type)) || btree_node_lock_increment(trans, b, level, type) || - __bch2_btree_node_lock(b, pos, level, iter, type, + __bch2_btree_node_lock(trans, path, b, pos, level, type, should_sleep_fn, p, ip); } -bool __bch2_btree_node_relock(struct btree_iter *, unsigned); +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned); -static inline bool bch2_btree_node_relock(struct btree_iter *iter, - unsigned level) +static inline bool bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level) { - EBUG_ON(btree_node_locked(iter, level) && - btree_node_locked_type(iter, level) != - __btree_lock_want(iter, level)); + EBUG_ON(btree_node_locked(path, level) && + btree_node_locked_type(path, level) != + __btree_lock_want(path, level)); - return likely(btree_node_locked(iter, level)) || - __bch2_btree_node_relock(iter, level); + return likely(btree_node_locked(path, level)) || + __bch2_btree_node_relock(trans, path, level); } /* @@ -208,30 +219,35 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, * succeed: */ static inline void -bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) { - struct btree_iter *linked; + struct btree_path *linked; - EBUG_ON(iter->l[b->c.level].b != b); - EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_path_with_node(trans, b, linked) linked->l[b->c.level].lock_seq += 2; six_unlock_write(&b->c.lock); } -void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); -void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); +void __bch2_btree_node_lock_write(struct btree_trans *, struct btree *); -static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) +static inline void bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { - EBUG_ON(iter->l[b->c.level].b != b); - EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq != b->c.lock.state.seq); + EBUG_ON(!btree_node_intent_locked(path, b->c.level)); if (unlikely(!six_trylock_write(&b->c.lock))) - __bch2_btree_node_lock_write(b, iter); + __bch2_btree_node_lock_write(trans, b); } 
#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index a1e5deb..68272f2 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -176,52 +176,47 @@ struct btree_node_iter { } data[MAX_BSETS]; }; -enum btree_iter_type { - BTREE_ITER_KEYS, - BTREE_ITER_NODES, - BTREE_ITER_CACHED, -}; - -#define BTREE_ITER_TYPE ((1 << 2) - 1) - /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -#define BTREE_ITER_SLOTS (1 << 2) +#define BTREE_ITER_SLOTS (1 << 0) /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 3) +#define BTREE_ITER_INTENT (1 << 1) /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 4) +#define BTREE_ITER_PREFETCH (1 << 2) /* * Indicates that this iterator should not be reused until transaction commit, * either because a pending update references it or because the update depends * on that particular key being locked (e.g. by the str_hash code, for hash * table consistency) */ -#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) +#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 3) /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 6) -#define BTREE_ITER_NOT_EXTENTS (1 << 7) -#define BTREE_ITER_ERROR (1 << 8) -#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 9) -#define BTREE_ITER_CACHED_NOFILL (1 << 10) -#define BTREE_ITER_CACHED_NOCREATE (1 << 11) -#define BTREE_ITER_WITH_UPDATES (1 << 12) +#define BTREE_ITER_IS_EXTENTS (1 << 4) +#define BTREE_ITER_NOT_EXTENTS (1 << 5) +#define BTREE_ITER_CACHED (1 << 6) +#define BTREE_ITER_CACHED_NOFILL (1 << 7) +#define BTREE_ITER_CACHED_NOCREATE (1 << 8) +#define BTREE_ITER_WITH_KEY_CACHE (1 << 9) +#define BTREE_ITER_WITH_UPDATES (1 << 10) +#define BTREE_ITER_WITH_JOURNAL (1 << 11) +#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12) #define BTREE_ITER_ALL_SNAPSHOTS (1 << 13) +#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14) +#define BTREE_ITER_NOPRESERVE (1 << 15) -enum btree_iter_uptodate { +enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, - BTREE_ITER_NEED_PEEK = 1, - BTREE_ITER_NEED_RELOCK = 2, - BTREE_ITER_NEED_TRAVERSE = 3, + BTREE_ITER_NEED_RELOCK = 1, + BTREE_ITER_NEED_TRAVERSE = 2, }; #define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) @@ -233,74 +228,78 @@ enum btree_iter_uptodate { #define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) #define BTREE_ITER_NO_NODE_CACHED ((struct btree *) 8) -/* - * @pos - iterator's current position - * @level - current btree depth - * @locks_want - btree level below which we start taking intent locks - * @nodes_locked - bitmask indicating which nodes in @nodes are locked - * @nodes_intent_locked - bitmask indicating which locks are intent locks - */ -struct btree_iter { - struct btree_trans *trans; - unsigned long ip_allocated; - +struct btree_path { u8 idx; - u8 child_idx; u8 sorted_idx; + u8 ref; + u8 intent_ref; /* btree_iter_copy starts here: */ - u16 flags; - - /* When we're filtering by snapshot, the snapshot ID we're looking for: */ - unsigned snapshot; - struct bpos pos; - struct bpos real_pos; - struct bpos pos_after_commit; enum btree_id btree_id:4; - enum btree_iter_uptodate uptodate:3; + bool cached:1; + bool preserve:1; + enum btree_path_uptodate uptodate:2; /* - * True if we've returned a key (and thus are expected to keep it - * 
locked), false after set_pos - for avoiding spurious transaction - * restarts in bch2_trans_relock(): + * When true, failing to relock this path will cause the transaction to + * restart: */ bool should_be_locked:1; - unsigned level:4, - min_depth:4, + unsigned level:3, locks_want:4, nodes_locked:4, nodes_intent_locked:4; - struct btree_iter_level { + struct btree_path_level { struct btree *b; struct btree_node_iter iter; u32 lock_seq; } l[BTREE_MAX_DEPTH]; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +#endif +}; + +static inline struct btree_path_level *path_l(struct btree_path *path) +{ + return path->l + path->level; +} + +/* + * @pos - iterator's current position + * @level - current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ +struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; + struct btree_path *update_path; + struct btree_path *key_cache_path; + enum btree_id btree_id:4; + unsigned min_depth:4; + + /* btree_iter_copy starts here: */ + u16 flags; + + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; + + struct bpos pos; + struct bpos pos_after_commit; /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. */ struct bkey k; +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned long ip_allocated; +#endif }; -static inline enum btree_iter_type -btree_iter_type(const struct btree_iter *iter) -{ - return iter->flags & BTREE_ITER_TYPE; -} - -static inline bool btree_iter_is_cached(const struct btree_iter *iter) -{ - return btree_iter_type(iter) == BTREE_ITER_CACHED; -} - -static inline struct btree_iter_level *iter_l(struct btree_iter *iter) -{ - return iter->l + iter->level; -} - struct btree_key_cache { struct mutex lock; struct rhashtable table; @@ -345,9 +344,12 @@ struct btree_insert_entry { u8 bkey_type; enum btree_id btree_id:8; u8 level; - unsigned trans_triggers_run:1; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; struct bkey_i *k; - struct btree_iter *iter; + struct btree_path *path; + unsigned long ip_allocated; }; #ifndef CONFIG_LOCKDEP @@ -368,40 +370,37 @@ struct btree_trans_commit_hook { struct btree_trans { struct bch_fs *c; -#ifdef CONFIG_BCACHEFS_DEBUG + const char *fn; struct list_head list; struct btree *locking; - unsigned locking_iter_idx; + unsigned locking_path_idx; struct bpos locking_pos; u8 locking_btree_id; u8 locking_level; + u8 locking_lock_type; pid_t pid; -#endif - unsigned long ip; int srcu_idx; u8 nr_sorted; u8 nr_updates; bool used_mempool:1; - bool error:1; bool in_traverse_all:1; bool restarted:1; + bool journal_transaction_names:1; /* * For when bch2_trans_update notices we'll be splitting a compressed * extent: */ unsigned extra_journal_res; - u64 iters_linked; - u64 iters_live; - u64 iters_touched; + u64 paths_allocated; unsigned mem_top; unsigned mem_bytes; void *mem; - u8 *sorted; - struct btree_iter *iters; + u8 sorted[BTREE_ITER_MAX]; + struct btree_path *paths; struct btree_insert_entry *updates; /* update path: */ @@ -605,16 +604,6 @@ static inline bool btree_node_is_extents(struct btree *b) return btree_node_type_is_extents(btree_node_type(b)); } -static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) -{ - return __btree_node_type(iter->level, iter->btree_id); -} - 
-static inline bool btree_iter_is_extents(struct btree_iter *iter) -{ - return btree_node_type_is_extents(btree_iter_key_type(iter)); -} - #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ((1U << BKEY_TYPE_extents)| \ (1U << BKEY_TYPE_inodes)| \ @@ -624,7 +613,9 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ ((1U << BKEY_TYPE_alloc)| \ - (1U << BKEY_TYPE_stripes)) + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_snapshots)) #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ @@ -647,6 +638,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id) enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -659,6 +651,7 @@ enum btree_update_flags { }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) @@ -670,8 +663,13 @@ enum btree_update_flags { #define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) #define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ - ((1U << KEY_TYPE_stripe)| \ - (1U << KEY_TYPE_inode)) + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ + (1U << KEY_TYPE_snapshot)) static inline bool btree_node_type_needs_gc(enum btree_node_type type) { @@ -688,11 +686,6 @@ struct btree_root { s8 error; }; -/* - * Optional hook that will be called just prior to a btree node update, when - * we're holding the write lock and we know what key is about to be overwritten: - */ - enum btree_insert_ret { BTREE_INSERT_OK, /* leaf node needs to be split */ @@ -713,8 +706,4 @@ enum btree_node_sibling { btree_next_sib, }; -typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, - struct btree *, - struct btree_node_iter *); - #endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 217b52e..d9a406a 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -8,10 +8,11 @@ struct bch_fs; struct btree; -void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_iter *, +void bch2_btree_node_lock_for_insert(struct btree_trans *, struct btree_path *, struct btree *); -bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bkey_i *); +bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_i *); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { @@ -60,20 +61,24 @@ int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, - struct bpos, struct bpos, u64 *); + struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_delete_range(struct bch_fs *, enum btree_id, - struct bpos, struct bpos, u64 *); + struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - __le64, unsigned); + struct btree *, unsigned); void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); int bch2_btree_node_update_key(struct btree_trans *, struct 
btree_iter *, struct btree *, struct bkey_i *, bool); int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, struct bkey_i *, bool); -int bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); +int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); + +int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); + void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *); @@ -119,14 +124,14 @@ static inline int bch2_trans_commit(struct btree_trans *trans, #define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ({ \ struct btree_trans trans; \ - int _ret, _ret2; \ + int _ret; \ \ bch2_trans_init(&trans, (_c), 0, 0); \ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ _do); \ - _ret2 = bch2_trans_exit(&trans); \ + bch2_trans_exit(&trans); \ \ - _ret ?: _ret2; \ + _ret; \ }) #define trans_for_each_update(_trans, _i) \ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index c8c3382..088c320 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -16,6 +16,7 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -23,8 +24,9 @@ #include static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - struct btree_iter *, struct btree *, + struct btree_path *, struct btree *, struct keylist *, unsigned); +static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); /* Debug code: */ @@ -43,7 +45,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) BUG_ON(!b->c.level); - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) return; bch2_btree_node_iter_init_from_start(&iter, b); @@ -152,38 +154,26 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) clear_btree_node_noevict(b); - bch2_btree_node_hash_remove(&c->btree_cache, b); - mutex_lock(&c->btree_cache.lock); list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); } -void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) +static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree *b) { - struct open_buckets ob = b->ob; + struct bch_fs *c = trans->c; + struct btree_path *path; - b->ob.nr = 0; + trans_for_each_path(trans, path) + BUG_ON(path->l[b->c.level].b == b && + path->l[b->c.level].lock_seq == b->c.lock.state.seq); - clear_btree_node_dirty(c, b); + six_lock_write(&b->c.lock, NULL, NULL); - btree_node_lock_type(c, b, SIX_LOCK_write); + bch2_btree_node_hash_remove(&c->btree_cache, b); __btree_node_free(c, b); - six_unlock_write(&b->c.lock); - bch2_open_buckets_put(c, &ob); -} - -void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) -{ - struct btree_iter *linked; - - trans_for_each_iter(iter->trans, linked) - BUG_ON(linked->l[b->c.level].b == b); - - six_lock_write(&b->c.lock, NULL, NULL); - __btree_node_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); } @@ -234,12 +224,12 @@ retry: if (IS_ERR(wp)) return ERR_CAST(wp); - if (wp->sectors_free < c->opts.btree_node_size) { + if (wp->sectors_free < btree_sectors(c)) { 
struct open_bucket *ob; unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ob->sectors_free < c->opts.btree_node_size) + if (ob->sectors_free < btree_sectors(c)) ob->sectors_free = 0; bch2_alloc_sectors_done(c, wp); @@ -247,12 +237,14 @@ retry: } bkey_btree_ptr_v2_init(&tmp.k); - bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(c); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); /* we hold cannibalize_lock: */ BUG_ON(IS_ERR(b)); @@ -275,6 +267,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); + set_btree_node_accessed(b); set_btree_node_dirty(c, b); set_btree_node_need_write(b); @@ -388,7 +383,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) while (as->nr_prealloc_nodes) { struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; - six_unlock_write(&b->c.lock); + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); if (c->btree_reserve_cache_nr < ARRAY_SIZE(c->btree_reserve_cache)) { @@ -402,10 +398,8 @@ static void bch2_btree_reserve_put(struct btree_update *as) bch2_open_buckets_put(c, &b->ob); } - btree_node_lock_type(c, b, SIX_LOCK_write); __btree_node_free(c, b); six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); } @@ -413,39 +407,52 @@ static void bch2_btree_reserve_put(struct btree_update *as) } static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, - unsigned flags, struct closure *cl) + unsigned flags) { struct bch_fs *c = as->c; + struct closure cl; struct btree *b; int ret; + closure_init_stack(&cl); +retry: + BUG_ON(nr_nodes > BTREE_RESERVE_MAX); /* * Protects reaping from the btree node cache and using the btree node * open bucket reserve: + * + * BTREE_INSERT_NOWAIT only applies to btree node allocation, not + * blocking on this lock: */ - ret = bch2_btree_cache_cannibalize_lock(c, cl); + ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) - return ret; + goto err; while (as->nr_prealloc_nodes < nr_nodes) { b = __bch2_btree_node_alloc(c, &as->disk_res, flags & BTREE_INSERT_NOWAIT - ? NULL : cl, flags); + ? 
NULL : &cl, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); - goto err_free; + goto err; } as->prealloc_nodes[as->nr_prealloc_nodes++] = b; } bch2_btree_cache_cannibalize_unlock(c); + closure_sync(&cl); return 0; -err_free: +err: bch2_btree_cache_cannibalize_unlock(c); - trace_btree_reserve_get_fail(c, nr_nodes, cl); + closure_sync(&cl); + + if (ret == -EAGAIN) + goto retry; + + trace_btree_reserve_get_fail(c, nr_nodes, &cl); return ret; } @@ -466,15 +473,23 @@ static void bch2_btree_update_free(struct btree_update *as) bch2_disk_reservation_put(c, &as->disk_res); bch2_btree_reserve_put(as); + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], + as->start_time); + mutex_lock(&c->btree_interior_update_lock); list_del(&as->unwritten_list); list_del(&as->list); - mutex_unlock(&c->btree_interior_update_lock); closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); + /* + * Have to do the wakeup with btree_interior_update_lock still held, + * since being on btree_interior_update_list is our ref on @c: + */ closure_wake_up(&c->btree_interior_update_wait); + + mutex_unlock(&c->btree_interior_update_lock); } static void btree_update_will_delete_key(struct btree_update *as, @@ -605,8 +620,8 @@ err: * we're in journal error state: */ - btree_node_lock_type(c, b, SIX_LOCK_intent); - btree_node_lock_type(c, b, SIX_LOCK_write); + six_lock_intent(&b->c.lock, NULL, NULL); + six_lock_write(&b->c.lock, NULL, NULL); mutex_lock(&c->btree_interior_update_lock); list_del(&as->write_blocked_list); @@ -660,7 +675,7 @@ err: for (i = 0; i < as->nr_new_nodes; i++) { b = as->new_nodes[i]; - btree_node_lock_type(c, b, SIX_LOCK_read); + six_lock_read(&b->c.lock, NULL, NULL); btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); } @@ -773,7 +788,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) * And it adds @b to the list of @as's new nodes, so that we can update sector * counts in bch2_btree_update_nodes_written: */ -void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) +static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; @@ -827,7 +842,7 @@ found: closure_put(&as->cl); } -void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) +static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) { while (b->ob.nr) as->open_buckets[as->nr_open_buckets++] = @@ -839,7 +854,7 @@ void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b * nodes and thus outstanding btree_updates - redirect @b's * btree_updates to point to this btree_update: */ -void bch2_btree_interior_update_will_free_node(struct btree_update *as, +static void bch2_btree_interior_update_will_free_node(struct btree_update *as, struct btree *b) { struct bch_fs *c = as->c; @@ -911,8 +926,11 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as, as->nr_old_nodes++; } -void bch2_btree_update_done(struct btree_update *as) +static void bch2_btree_update_done(struct btree_update *as) { + struct bch_fs *c = as->c; + u64 start_time = as->start_time; + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); if (as->took_gc_lock) @@ -923,38 +941,39 @@ void bch2_btree_update_done(struct btree_update *as) continue_at(&as->cl, btree_update_set_nodes_written, as->c->btree_interior_update_worker); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], + start_time); } -struct 
btree_update * -bch2_btree_update_start(struct btree_iter *iter, unsigned level, - unsigned nr_nodes, unsigned flags) +static struct btree_update * +bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + unsigned level, unsigned nr_nodes, unsigned flags) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct btree_update *as; - struct closure cl; + u64 start_time = local_clock(); int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0; int journal_flags = 0; int ret = 0; - BUG_ON(!iter->should_be_locked); + BUG_ON(!path->should_be_locked); if (flags & BTREE_INSERT_JOURNAL_RESERVED) journal_flags |= JOURNAL_RES_GET_RESERVED; - - closure_init_stack(&cl); -retry: + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; /* * XXX: figure out how far we might need to split, * instead of locking/reserving all the way to the root: */ - if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { - trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_, - iter->btree_id, - &iter->real_pos); - return ERR_PTR(-EINTR); + if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) { + trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_, + path->btree_id, &path->pos); + ret = btree_trans_restart(trans); + return ERR_PTR(ret); } if (flags & BTREE_INSERT_GC_LOCK_HELD) @@ -972,9 +991,10 @@ retry: memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); as->c = c; + as->start_time = start_time; as->mode = BTREE_INTERIOR_NO_UPDATE; as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); - as->btree_id = iter->btree_id; + as->btree_id = path->btree_id; INIT_LIST_HEAD(&as->list); INIT_LIST_HEAD(&as->unwritten_list); INIT_LIST_HEAD(&as->write_blocked_list); @@ -998,43 +1018,34 @@ retry: if (ret) goto err; + bch2_trans_unlock(trans); + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, BTREE_UPDATE_JOURNAL_RES, - journal_flags|JOURNAL_RES_GET_NONBLOCK); - if (ret == -EAGAIN) { - bch2_trans_unlock(trans); - - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { - bch2_btree_update_free(as); - btree_trans_restart(trans); - return ERR_PTR(ret); - } - - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, - BTREE_UPDATE_JOURNAL_RES, - journal_flags); - if (ret) { - trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_); - goto err; - } - - if (!bch2_trans_relock(trans)) { - ret = -EINTR; - goto err; - } + journal_flags); + if (ret) { + bch2_btree_update_free(as); + trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_); + btree_trans_restart(trans); + return ERR_PTR(ret); } ret = bch2_disk_reservation_get(c, &as->disk_res, - nr_nodes * c->opts.btree_node_size, + nr_nodes * btree_sectors(c), c->opts.metadata_replicas, disk_res_flags); if (ret) goto err; - ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(as, nr_nodes, flags); if (ret) goto err; + if (!bch2_trans_relock(trans)) { + ret = -EINTR; + goto err; + } + bch2_journal_pin_add(&c->journal, atomic64_read(&c->journal.seq), &as->journal, NULL); @@ -1042,16 +1053,6 @@ retry: return as; err: bch2_btree_update_free(as); - - if (ret == -EAGAIN) { - bch2_trans_unlock(trans); - closure_sync(&cl); - ret = -EINTR; - } - - if (ret == -EINTR && bch2_trans_relock(trans)) - goto retry; - return ERR_PTR(ret); } @@ -1092,8 +1093,10 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) * is nothing new to be done. This just guarantees that there is a * journal write. 
*/ -static void bch2_btree_set_root(struct btree_update *as, struct btree *b, - struct btree_iter *iter) +static void bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b) { struct bch_fs *c = as->c; struct btree *old; @@ -1108,7 +1111,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, * Ensure no one is using the old root while we switch to the * new root: */ - bch2_btree_node_lock_write(old, iter); + bch2_btree_node_lock_write(trans, path, old); bch2_btree_set_root_inmem(c, b); @@ -1121,15 +1124,17 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b, * an intent lock on the new root, and any updates that would * depend on the new root would have to update the new root. */ - bch2_btree_node_unlock_write(old, iter); + bch2_btree_node_unlock_write(trans, path, old); } /* Interior node updates: */ -static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, - struct btree_iter *iter, - struct bkey_i *insert, - struct btree_node_iter *node_iter) +static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) { struct bch_fs *c = as->c; struct bkey_packed *k; @@ -1138,6 +1143,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && !btree_ptr_sectors_written(insert)); + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); if (invalid) { @@ -1161,15 +1169,18 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) bch2_btree_node_iter_advance(node_iter, b); - bch2_btree_bset_insert_key(iter, b, node_iter, insert); + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty(c, b); set_btree_node_need_write(b); } static void -__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys, - struct btree_node_iter node_iter) +__bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) { struct bkey_i *insert = bch2_keylist_front(keys); struct bkey_packed *k; @@ -1181,8 +1192,8 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, ; while (!bch2_keylist_empty(keys)) { - bch2_insert_fixup_btree_ptr(as, b, iter, - bch2_keylist_front(keys), &node_iter); + bch2_insert_fixup_btree_ptr(as, trans, path, b, + &node_iter, bch2_keylist_front(keys)); bch2_keylist_pop_front(keys); } } @@ -1192,8 +1203,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * node) */ static struct btree *__btree_split_node(struct btree_update *as, - struct btree *n1, - struct btree_iter *iter) + struct btree *n1) { struct bkey_format_state s; size_t nr_packed = 0, nr_unpacked = 0; @@ -1308,8 +1318,10 @@ static struct btree *__btree_split_node(struct btree_update *as, * nodes that were coalesced, and thus in the middle of a child node post * coalescing: */ -static void btree_split_insert_keys(struct btree_update *as, struct btree 
*b, - struct btree_iter *iter, +static void btree_split_insert_keys(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, struct keylist *keys) { struct btree_node_iter node_iter; @@ -1319,7 +1331,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, bch2_btree_node_iter_init(&node_iter, b, &k->k.p); - __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter); + __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); /* * We can't tolerate whiteouts here - with whiteouts there can be @@ -1349,18 +1361,17 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b, btree_node_interior_verify(as->c, b); } -static void btree_split(struct btree_update *as, - struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, struct keylist *keys, - unsigned flags) +static void btree_split(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; - struct btree *parent = btree_node_parent(iter, b); + struct btree *parent = btree_node_parent(path, b); struct btree *n1, *n2 = NULL, *n3 = NULL; u64 start_time = local_clock(); BUG_ON(!parent && (b != btree_node_root(c, b))); - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); bch2_btree_interior_update_will_free_node(as, b); @@ -1368,12 +1379,12 @@ static void btree_split(struct btree_update *as, bch2_btree_update_add_new_node(as, n1); if (keys) - btree_split_insert_keys(as, n1, iter, keys); + btree_split_insert_keys(as, trans, path, n1, keys); if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { trace_btree_split(c, b); - n2 = __btree_split_node(as, n1, iter); + n2 = __btree_split_node(as, n1); bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); @@ -1398,7 +1409,7 @@ static void btree_split(struct btree_update *as, n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; - btree_split_insert_keys(as, n3, iter, &as->parent_keys); + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); bch2_btree_node_write(c, n3, SIX_LOCK_intent); } @@ -1418,12 +1429,12 @@ static void btree_split(struct btree_update *as, if (parent) { /* Split a non root node */ - bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); + bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); } else if (n3) { - bch2_btree_set_root(as, n3, iter); + bch2_btree_set_root(as, trans, path, n3); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(as, n1, iter); + bch2_btree_set_root(as, trans, path, n1); } bch2_btree_update_get_open_buckets(as, n1); @@ -1432,15 +1443,14 @@ static void btree_split(struct btree_update *as, if (n3) bch2_btree_update_get_open_buckets(as, n3); - /* Successful split, update the iterator to point to the new nodes: */ + /* Successful split, update the path to point to the new nodes: */ six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); if (n3) - bch2_btree_iter_node_replace(iter, n3); + bch2_trans_node_add(trans, n3); if (n2) - bch2_btree_iter_node_replace(iter, n2); - bch2_btree_iter_node_replace(iter, n1); + bch2_trans_node_add(trans, n2); + bch2_trans_node_add(trans, n1); /* * The old node must be freed (in memory) _before_ unlocking the new @@ -1448,7 +1458,7 @@ static void btree_split(struct btree_update *as, * node 
after another thread has locked and updated the new node, thus * seeing stale data: */ - bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(trans, b); if (n3) six_unlock_intent(&n3->c.lock); @@ -1456,26 +1466,32 @@ static void btree_split(struct btree_update *as, six_unlock_intent(&n2->c.lock); six_unlock_intent(&n1->c.lock); - bch2_btree_trans_verify_locks(trans); + bch2_trans_verify_locks(trans); - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], + bch2_time_stats_update(&c->times[n2 + ? BCH_TIME_btree_node_split + : BCH_TIME_btree_node_compact], start_time); } static void -bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, - struct btree_iter *iter, struct keylist *keys) +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct keylist *keys) { - struct btree_iter *linked; + struct btree_path *linked; - __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter); + __bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); btree_update_updated_node(as, b); - trans_for_each_iter_with_node(iter->trans, b, linked) + trans_for_each_path_with_node(trans, b, linked) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); - bch2_btree_trans_verify_iters(iter->trans, b); + bch2_trans_verify_paths(trans); } /** @@ -1490,10 +1506,9 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic. */ -static void bch2_btree_insert_node(struct btree_update *as, - struct btree_trans *trans, struct btree_iter *iter, - struct btree *b, struct keylist *keys, - unsigned flags) +static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) { struct bch_fs *c = as->c; int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); @@ -1501,21 +1516,21 @@ static void bch2_btree_insert_node(struct btree_update *as, int live_u64s_added, u64s_added; lockdep_assert_held(&c->gc_lock); - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); + BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - bch2_btree_node_lock_for_insert(trans, iter, b); + bch2_btree_node_lock_for_insert(trans, path, b); if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { - bch2_btree_node_unlock_write(b, iter); + bch2_btree_node_unlock_write(trans, path, b); goto split; } btree_node_interior_verify(c, b); - bch2_btree_insert_keys_interior(as, b, iter, keys); + bch2_btree_insert_keys_interior(as, trans, path, b, keys); live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; @@ -1527,48 +1542,48 @@ static void bch2_btree_insert_node(struct btree_update *as, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_trans_node_reinit_iter(trans, b); - bch2_btree_node_unlock_write(b, iter); + bch2_btree_node_unlock_write(trans, path, b); btree_node_interior_verify(c, b); return; split: - btree_split(as, trans, iter, b, keys, flags); + btree_split(as, trans, path, b, keys, flags); } int bch2_btree_split_leaf(struct btree_trans *trans, - struct 
btree_iter *iter, + struct btree_path *path, unsigned flags) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; + struct btree *b = path_l(path)->b; struct btree_update *as; unsigned l; int ret = 0; - as = bch2_btree_update_start(iter, iter->level, + as = bch2_btree_update_start(trans, path, path->level, btree_update_reserve_required(c, b), flags); if (IS_ERR(as)) return PTR_ERR(as); - btree_split(as, trans, iter, b, NULL, flags); + btree_split(as, trans, path, b, NULL, flags); bch2_btree_update_done(as); - for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++) - ret = bch2_foreground_maybe_merge(trans, iter, l, flags); + for (l = path->level + 1; btree_path_node(path, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(trans, path, l, flags); return ret; } int __bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct bch_fs *c = trans->c; - struct btree_iter *sib_iter = NULL; + struct btree_path *sib_path = NULL; struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; @@ -1576,39 +1591,36 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, struct btree *b, *m, *n, *prev, *next, *parent; struct bpos sib_pos; size_t sib_u64s; - int ret = 0, ret2 = 0; - -retry: - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; + u64 start_time = local_clock(); + int ret = 0; - BUG_ON(!iter->should_be_locked); - BUG_ON(!btree_node_locked(iter, level)); + BUG_ON(!path->should_be_locked); + BUG_ON(!btree_node_locked(path, level)); - b = iter->l[level].b; + b = path->l[level].b; if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) || (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { b->sib_u64s[sib] = U16_MAX; - goto out; + return 0; } sib_pos = sib == btree_prev_sib ? 
bpos_predecessor(b->data->min_key) : bpos_successor(b->data->max_key); - sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id, - sib_pos, U8_MAX, level, - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(sib_iter); + sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, sib_path, false); if (ret) goto err; - m = sib_iter->l[level].b; + sib_path->should_be_locked = true; - if (btree_node_parent(iter, b) != - btree_node_parent(sib_iter, m)) { + m = sib_path->l[level].b; + + if (btree_node_parent(path, b) != + btree_node_parent(sib_path, m)) { b->sib_u64s[sib] = U16_MAX; goto out; } @@ -1659,8 +1671,8 @@ retry: if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) goto out; - parent = btree_node_parent(iter, b); - as = bch2_btree_update_start(iter, level, + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, level, btree_update_reserve_required(c, parent) + 1, flags| BTREE_INSERT_NOFAIL| @@ -1696,47 +1708,34 @@ retry: bch2_keylist_add(&as->parent_keys, &delete); bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, trans, iter, parent, &as->parent_keys, flags); + bch2_trans_verify_paths(trans); + + bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + + bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); six_lock_increment(&m->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); - bch2_btree_iter_node_drop(iter, m); - bch2_btree_iter_node_replace(iter, n); + bch2_trans_node_add(trans, n); - bch2_btree_trans_verify_iters(trans, n); + bch2_trans_verify_paths(trans); - bch2_btree_node_free_inmem(c, b, iter); - bch2_btree_node_free_inmem(c, m, iter); + bch2_btree_node_free_inmem(trans, b); + bch2_btree_node_free_inmem(trans, m); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); -out: - bch2_btree_trans_verify_locks(trans); - bch2_trans_iter_free(trans, sib_iter); - /* - * Don't downgrade locks here: we're called after successful insert, - * and the caller will downgrade locks after a successful insert - * anyways (in case e.g. a split was required first) - * - * And we're also called when inserting into interior nodes in the - * split path, and downgrading to read locks in there is potentially - * confusing: - */ - return ret ?: ret2; + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); +out: err: - bch2_trans_iter_put(trans, sib_iter); - sib_iter = NULL; - - if (ret == -EINTR && bch2_trans_relock(trans)) - goto retry; - - goto out; + bch2_path_put(trans, sib_path, true); + bch2_trans_verify_locks(trans); + return ret; } /** @@ -1744,32 +1743,23 @@ err: */ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, - __le64 seq, unsigned flags) + struct btree *b, + unsigned flags) { struct bch_fs *c = trans->c; - struct btree *b, *n, *parent; + struct btree *n, *parent; struct btree_update *as; int ret; flags |= BTREE_INSERT_NOFAIL; -retry: - ret = bch2_btree_iter_traverse(iter); - if (ret) - goto out; - b = bch2_btree_iter_peek_node(iter); - if (!b || b->data->keys.seq != seq) - goto out; - - parent = btree_node_parent(iter, b); - as = bch2_btree_update_start(iter, b->c.level, + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, (parent ? 
btree_update_reserve_required(c, parent) : 0) + 1, flags); ret = PTR_ERR_OR_ZERO(as); - if (ret == -EINTR) - goto retry; if (ret) { trace_btree_gc_rewrite_node_fail(c, b); goto out; @@ -1789,23 +1779,22 @@ retry: if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - bch2_btree_insert_node(as, trans, iter, parent, + bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys, flags); } else { - bch2_btree_set_root(as, n, iter); + bch2_btree_set_root(as, trans, iter->path, n); } bch2_btree_update_get_open_buckets(as, n); six_lock_increment(&b->c.lock, SIX_LOCK_intent); - bch2_btree_iter_node_drop(iter, b); - bch2_btree_iter_node_replace(iter, n); - bch2_btree_node_free_inmem(c, b, iter); + bch2_trans_node_add(trans, n); + bch2_btree_node_free_inmem(trans, b); six_unlock_intent(&n->c.lock); bch2_btree_update_done(as); out: - bch2_btree_iter_downgrade(iter); + bch2_btree_path_downgrade(iter->path); return ret; } @@ -1818,20 +1807,38 @@ struct async_btree_rewrite { __le64 seq; }; +static int async_btree_node_rewrite_trans(struct btree_trans *trans, + struct async_btree_rewrite *a) +{ + struct btree_iter iter; + struct btree *b; + int ret; + + bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + BTREE_MAX_DEPTH, a->level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + + if (!b || b->data->keys.seq != a->seq) + goto out; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +out : + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - struct btree_trans trans; - struct btree_iter *iter; - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos, - BTREE_MAX_DEPTH, a->level, 0); - bch2_btree_node_rewrite(&trans, iter, a->seq, 0); - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); + bch2_trans_do(c, NULL, NULL, 0, + async_btree_node_rewrite_trans(&trans, a)); percpu_ref_put(&c->writes); kfree(a); } @@ -1840,9 +1847,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) - return; - if (!percpu_ref_tryget(&c->writes)) return; @@ -1869,7 +1873,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bool skip_triggers) { struct bch_fs *c = trans->c; - struct btree_iter *iter2 = NULL; + struct btree_iter iter2 = { NULL }; struct btree *parent; u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX]; int ret; @@ -1897,19 +1901,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BUG_ON(ret); } - parent = btree_node_parent(iter, b); + parent = btree_node_parent(iter->path, b); if (parent) { - iter2 = bch2_trans_copy_iter(trans, iter); + bch2_trans_copy_iter(&iter2, iter); - BUG_ON(iter2->level != b->c.level); - BUG_ON(bpos_cmp(iter2->pos, new_key->k.p)); + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, + iter2.flags & BTREE_ITER_INTENT, + _THIS_IP_); - btree_node_unlock(iter2, iter2->level); - iter2->l[iter2->level].b = BTREE_ITER_NO_NODE_UP; - iter2->level++; + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); - ret = bch2_btree_iter_traverse(iter2) ?: - bch2_trans_update(trans, iter2, new_key, BTREE_TRIGGER_NORUN); + btree_node_unlock(iter2.path, iter2.path->level); + path_l(iter2.path)->b = 
BTREE_ITER_NO_NODE_UP; + iter2.path->level++; + + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); if (ret) goto err; } else { @@ -1926,12 +1934,13 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| BTREE_INSERT_JOURNAL_RESERVED); if (ret) goto err; - bch2_btree_node_lock_write(b, iter); + bch2_btree_node_lock_write(trans, iter->path, b); if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -1946,9 +1955,9 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, bkey_copy(&b->key, new_key); } - bch2_btree_node_unlock_write(b, iter); + bch2_btree_node_unlock_write(trans, iter->path, b); out: - bch2_trans_iter_put(trans, iter2); + bch2_trans_iter_exit(trans, &iter2); return ret; err: if (new_hash) { @@ -1965,9 +1974,16 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite { struct bch_fs *c = trans->c; struct btree *new_hash = NULL; + struct btree_path *path = iter->path; struct closure cl; int ret = 0; + if (!btree_node_intent_locked(path, b->c.level) && + !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) { + btree_trans_restart(trans); + return -EINTR; + } + closure_init_stack(&cl); /* @@ -1986,8 +2002,10 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite new_hash = bch2_btree_node_mem_alloc(c); } + path->intent_ref++; ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, skip_triggers); + --path->intent_ref; if (new_hash) { mutex_lock(&c->btree_cache.lock); @@ -2006,18 +2024,18 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, struct btree *b, struct bkey_i *new_key, bool skip_triggers) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_trans_get_node_iter(trans, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); + bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter); if (ret) goto out; /* has node been freed? 
*/ - if (iter->l[b->c.level].b != b) { + if (iter.path->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); goto out; @@ -2025,9 +2043,9 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); - ret = bch2_btree_node_update_key(trans, iter, b, new_key, skip_triggers); + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index e88e737..8dc86fa 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -35,6 +35,7 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, struct btree_update { struct closure cl; struct bch_fs *c; + u64 start_time; struct list_head list; struct list_head unwritten_list; @@ -81,12 +82,12 @@ struct btree_update { /* Nodes being freed: */ struct keylist old_keys; u64 _old_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_VAL_U64s_MAX]; + BKEY_BTREE_PTR_U64s_MAX]; /* Nodes being added: */ struct keylist new_keys; u64 _new_keys[BTREE_UPDATE_NODES_MAX * - BKEY_BTREE_PTR_VAL_U64s_MAX]; + BKEY_BTREE_PTR_U64s_MAX]; /* New nodes, that will be made reachable by this update: */ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; @@ -113,57 +114,39 @@ struct btree_update { u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; }; -void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, - struct btree_iter *); -void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); - -void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); - struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, struct btree *, struct bkey_format); -void bch2_btree_update_done(struct btree_update *); -struct btree_update * -bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned); - -void bch2_btree_interior_update_will_free_node(struct btree_update *, - struct btree *); -void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); - -int bch2_btree_split_leaf(struct btree_trans *, struct btree_iter *, unsigned); +int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); -int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_iter *, +int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, unsigned, unsigned, enum btree_node_sibling); static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned level, unsigned flags, enum btree_node_sibling sib) { struct btree *b; - if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) - return 0; - - if (!bch2_btree_node_relock(iter, level)) - return 0; + EBUG_ON(!btree_node_locked(path, level)); - b = iter->l[level].b; + b = path->l[level].b; if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) return 0; - return __bch2_foreground_maybe_merge(trans, iter, level, flags, sib); + return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); } static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned level, unsigned flags) { - return bch2_foreground_maybe_merge_sibling(trans, iter, level, flags, + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_prev_sib) ?: - bch2_foreground_maybe_merge_sibling(trans, iter, 
level, flags, + bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_next_sib); } @@ -235,7 +218,7 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, { ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + b->whiteout_u64s; - ssize_t total = c->opts.btree_node_size << 6; + ssize_t total = c->opts.btree_node_size >> 3; /* Always leave one extra u64 for bch2_varint_decode: */ used++; diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 7e9909e..4b37a48 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -15,12 +15,18 @@ #include "journal.h" #include "journal_reclaim.h" #include "keylist.h" +#include "recovery.h" +#include "subvolume.h" #include "replicas.h" #include #include #include +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags); + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { @@ -29,40 +35,59 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, bpos_cmp(l->k->k.p, r->k->k.p); } +static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +{ + return i->path->l + i->level; +} + static inline bool same_leaf_as_prev(struct btree_trans *trans, struct btree_insert_entry *i) { return i != trans->updates && - iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; + insert_l(&i[0])->b == insert_l(&i[-1])->b; } -inline void bch2_btree_node_lock_for_insert(struct btree_trans *trans, - struct btree_iter *iter, - struct btree *b) +static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) { - struct bch_fs *c = trans->c; + return i + 1 < trans->updates + trans->nr_updates && + insert_l(&i[0])->b == insert_l(&i[1])->b; +} - bch2_btree_node_lock_write(b, iter); +static inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; - if (btree_iter_type(iter) == BTREE_ITER_CACHED) + if (path->cached) return; if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_trans_node_reinit_iter(trans, b); /* * If the last bset has been written, or if it's gotten too big - start * a new bset to insert into: */ if (want_new_bset(c, b)) - bch2_btree_init_next(trans, iter, b); + bch2_btree_init_next(trans, b); +} + +void bch2_btree_node_lock_for_insert(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + bch2_btree_node_lock_write(trans, path, b); + bch2_btree_node_prep_for_write(trans, path, b); } /* Inserting into a given leaf node (last stage of insert): */ /* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_iter *iter, +bool bch2_btree_bset_insert_key(struct btree_trans *trans, + struct btree_path *path, struct btree *b, struct btree_node_iter *node_iter, struct bkey_i *insert) @@ -76,8 +101,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); EBUG_ON(insert->k.u64s > - bch_btree_keys_u64s_remaining(iter->trans->c, b)); - EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + bch_btree_keys_u64s_remaining(trans->c, b)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) @@ -96,7 +120,7 @@ bool 
bch2_btree_bset_insert_key(struct btree_iter *iter, k->type = KEY_TYPE_deleted; if (k->needs_whiteout) - push_whiteout(iter->trans->c, b, insert->k.p); + push_whiteout(trans->c, b, insert->k.p); k->needs_whiteout = false; if (k >= btree_bset_last(b)->start) { @@ -104,7 +128,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, bch2_bset_delete(b, k, clobber_u64s); goto fix_iter; } else { - bch2_btree_iter_fix_key_modified(iter, b, k); + bch2_btree_path_fix_key_modified(trans, b, k); } return true; @@ -122,7 +146,7 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter, clobber_u64s = k->u64s; goto overwrite; } else { - bch2_btree_iter_fix_key_modified(iter, b, k); + bch2_btree_path_fix_key_modified(trans, b, k); } } @@ -132,7 +156,7 @@ overwrite: new_u64s = k->u64s; fix_iter: if (clobber_u64s != new_u64s) - bch2_btree_node_iter_fix(iter, b, node_iter, k, + bch2_btree_node_iter_fix(trans, path, b, node_iter, k, clobber_u64s, new_u64s); return true; } @@ -144,7 +168,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, struct btree_write *w = container_of(pin, struct btree_write, journal); struct btree *b = container_of(w, struct btree, writes[i]); - btree_node_lock_type(c, b, SIX_LOCK_read); + six_lock_read(&b->c.lock, NULL, NULL); bch2_btree_node_write_cond(c, b, (btree_current_write(b) == w && w->journal.seq == seq)); six_unlock_read(&b->c.lock); @@ -176,22 +200,18 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, * btree_insert_key - insert a key one key into a leaf node */ static bool btree_insert_key_leaf(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *insert) + struct btree_insert_entry *insert) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; + struct btree *b = insert_l(insert)->b; struct bset_tree *t = bset_tree_last(b); struct bset *i = bset(b, t); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - EBUG_ON(!iter->level && - !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); - - if (unlikely(!bch2_btree_bset_insert_key(iter, b, - &iter_l(iter)->iter, insert))) + if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, + &insert_l(insert)->iter, insert->k))) return false; i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, @@ -212,9 +232,8 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, if (u64s_added > live_u64s_added && bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_trans_node_reinit_iter(trans, b); - trace_btree_insert_key(c, b, insert); return true; } @@ -225,9 +244,15 @@ static bool btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(bpos_cmp(i->k->k.p, i->iter->real_pos)); - BUG_ON(i->level != i->iter->level); - BUG_ON(i->btree_id != i->iter->btree_id); + BUG_ON(bpos_cmp(i->k->k.p, i->path->pos)); + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); } static noinline int @@ -245,7 +270,7 @@ bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, return ret; if (!bch2_trans_relock(trans)) { - trace_trans_restart_journal_preres_get(trans->ip, 
trace_ip); + trace_trans_restart_journal_preres_get(trans->fn, trace_ip); return -EINTR; } @@ -267,13 +292,38 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; } -static enum btree_insert_ret +#define JSET_ENTRY_LOG_U64s 4 + +static noinline void journal_transaction_name(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct jset_entry *entry = journal_res_entry(&c->journal, &trans->journal_res); + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned u64s = JSET_ENTRY_LOG_U64s - 1; + unsigned b, buflen = u64s * sizeof(u64); + + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 0; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + b = min_t(unsigned, strlen(trans->fn), buflen); + memcpy(l->d, trans->fn, b); + while (b < buflen) + l->d[b++] = '\0'; + + trans->journal_res.offset += JSET_ENTRY_LOG_U64s; + trans->journal_res.u64s -= JSET_ENTRY_LOG_U64s; +} + +static inline enum btree_insert_ret btree_key_can_insert(struct btree_trans *trans, - struct btree_iter *iter, + struct btree *b, unsigned u64s) { struct bch_fs *c = trans->c; - struct btree *b = iter_l(iter)->b; if (!bch2_btree_node_insert_fits(c, b, u64s)) return BTREE_INSERT_BTREE_NODE_FULL; @@ -283,17 +333,18 @@ btree_key_can_insert(struct btree_trans *trans, static enum btree_insert_ret btree_key_can_insert_cached(struct btree_trans *trans, - struct btree_iter *iter, + struct btree_path *path, unsigned u64s) { - struct bkey_cached *ck = (void *) iter->l[0].b; + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; unsigned new_u64s; struct bkey_i *new_k; - BUG_ON(iter->level); + EBUG_ON(path->level); if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && - bch2_btree_key_cache_must_wait(trans->c) && + bch2_btree_key_cache_must_wait(c) && !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) return BTREE_INSERT_NEED_JOURNAL_RECLAIM; @@ -308,8 +359,11 @@ btree_key_can_insert_cached(struct btree_trans *trans, new_u64s = roundup_pow_of_two(u64s); new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); - if (!new_k) + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[path->btree_id], new_u64s); return -ENOMEM; + } ck->u64s = new_u64s; ck->k = new_k; @@ -328,9 +382,9 @@ static inline void do_btree_insert_one(struct btree_trans *trans, i->k->k.needs_whiteout = false; - did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED) - ? btree_insert_key_leaf(trans, i->iter, i->k) - : bch2_btree_insert_key_cached(trans, i->iter, i->k); + did_work = !i->cached + ? 
btree_insert_key_leaf(trans, i) + : bch2_btree_insert_key_cached(trans, i->path, i->k); if (!did_work) return; @@ -340,29 +394,33 @@ static inline void do_btree_insert_one(struct btree_trans *trans, i->level, i->k); - bch2_journal_set_has_inode(j, &trans->journal_res, - i->k->k.p.inode); - if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } } -static noinline void bch2_trans_mark_gc(struct btree_trans *trans) +static noinline int bch2_trans_mark_gc(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + int ret = 0; trans_for_each_update(trans, i) { /* * XXX: synchronization of cached update triggers with gc + * XXX: synchronization of interior node updates with gc */ - BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); + BUG_ON(i->cached || i->level); - if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) - bch2_mark_update(trans, i->iter, i->k, - i->flags|BTREE_TRIGGER_GC); + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + ret = bch2_mark_update(trans, i->path, i->k, + i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } } + + return ret; } static inline int @@ -378,7 +436,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, int ret; if (race_fault()) { - trace_trans_restart_fault_inject(trans->ip, trace_ip); + trace_trans_restart_fault_inject(trans->fn, trace_ip); trans->restarted = true; return -EINTR; } @@ -405,9 +463,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s = 0; u64s += i->k->k.u64s; - ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED - ? btree_key_can_insert(trans, i->iter, u64s) - : btree_key_can_insert_cached(trans, i->iter, u64s); + ret = !i->cached + ? btree_key_can_insert(trans, insert_l(i)->b, u64s) + : btree_key_can_insert_cached(trans, i->path, u64s); if (ret) { *stopped_at = i; return ret; @@ -417,17 +475,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, marking = true; } - if (marking) { - percpu_down_read(&c->mark_lock); - } - - /* Must be called under mark_lock: */ - if (marking && trans->fs_usage_deltas && - !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) { - ret = BTREE_INSERT_NEED_MARK_REPLICAS; - goto err; - } - /* * Don't get journal reservation until after we know insert will * succeed: @@ -436,7 +483,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_NONBLOCK); if (ret) - goto err; + return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); } else { trans->journal_res.seq = c->journal.replay_journal_seq; } @@ -464,63 +514,139 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - trans_for_each_update(trans, i) - if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) - bch2_mark_update(trans, i->iter, i->k, - i->flags); + if (trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) + return BTREE_INSERT_NEED_MARK_REPLICAS; - if (marking && trans->fs_usage_deltas) - bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas); + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { + ret = bch2_mark_update(trans, i->path, i->k, i->flags); + if (ret) + return ret; + } - if (unlikely(c->gc_pos.phase)) - bch2_trans_mark_gc(trans); + if (unlikely(c->gc_pos.phase)) { + ret = bch2_trans_mark_gc(trans); + if (ret) + return ret; + } trans_for_each_update(trans, i) do_btree_insert_one(trans, i); -err: - if (marking) { - 
percpu_up_read(&c->mark_lock); - } return ret; } -static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter) +static inline void path_upgrade_readers(struct btree_trans *trans, struct btree_path *path) { - struct btree_insert_entry *i; - struct btree *b = iter_l(iter)->b; - struct bkey_s_c old; - int u64s_delta = 0; - int ret; + unsigned l; - /* - * Inserting directly into interior nodes is an uncommon operation with - * various weird edge cases: also, a lot of things about - * BTREE_ITER_NODES iters need to be audited - */ - if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS)) - return 0; + for (l = 0; l < BTREE_MAX_DEPTH; l++) + if (btree_node_read_locked(path, l)) + BUG_ON(!bch2_btree_node_upgrade(trans, path, l)); +} + +static inline void upgrade_readers(struct btree_trans *trans, struct btree_path *path) +{ + struct btree *b = path_l(path)->b; + + do { + if (path->nodes_locked && + path->nodes_locked != path->nodes_intent_locked) + path_upgrade_readers(trans, path); + } while ((path = prev_btree_path(trans, path)) && + path_l(path)->b == b); +} + +/* + * Check for nodes that we have both read and intent locks on, and upgrade the + * readers to intent: + */ +static inline void normalize_read_intent_locks(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i, nr_read = 0, nr_intent = 0; + + trans_for_each_path_inorder(trans, path, i) { + struct btree_path *next = i + 1 < trans->nr_sorted + ? trans->paths + trans->sorted[i + 1] + : NULL; + + if (path->nodes_locked) { + if (path->nodes_intent_locked) + nr_intent++; + else + nr_read++; + } + + if (!next || path_l(path)->b != path_l(next)->b) { + if (nr_read && nr_intent) + upgrade_readers(trans, path); + + nr_read = nr_intent = 0; + } + } + + bch2_trans_verify_locks(trans); +} + +static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos) +{ + struct btree_path *path; + unsigned i; + + trans_for_each_path_inorder(trans, path, i) { + //if (path == pos) + // break; - BUG_ON(iter->level); + if (path->nodes_locked != path->nodes_intent_locked && + !bch2_btree_path_upgrade(trans, path, path->level + 1)) + return true; + } + + return false; +} + +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; trans_for_each_update(trans, i) { - if (iter_l(i->iter)->b != b) + if (same_leaf_as_prev(trans, i)) continue; - old = bch2_btree_iter_peek_slot(i->iter); - ret = bkey_err(old); - if (ret) - return ret; + if (!six_trylock_write(&insert_l(i)->b->c.lock)) { + if (have_conflicting_read_lock(trans, i->path)) + goto fail; - u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; - u64s_delta -= !bkey_deleted(old.k) ? 
old.k->u64s : 0; + btree_node_lock_type(trans, i->path, + insert_l(i)->b, + i->path->pos, i->level, + SIX_LOCK_write, NULL, NULL); + } + + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } - if (u64s_delta > 0) - return 0; + return 0; +fail: + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; - return bch2_foreground_maybe_merge(trans, iter, - iter->level, trans->flags); + bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b); + } + + trace_trans_restart_would_deadlock_write(trans->fn); + return btree_trans_restart(trans); +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); } /* @@ -532,29 +658,53 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct btree_iter *iter; - int ret; + struct bkey_s_c old; + int ret, u64s_delta = 0; trans_for_each_update(trans, i) { - struct btree *b; + const char *invalid = bch2_bkey_invalid(c, + bkey_i_to_s_c(i->k), i->bkey_type); + if (invalid) { + char buf[200]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); + bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n", + buf, trans->fn, (void *) i->ip_allocated, invalid); + return -EINVAL; + } + btree_insert_entry_checks(trans, i); + } - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + trans_for_each_update(trans, i) { + struct bkey u; - if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) + /* + * peek_slot() doesn't yet work on iterators that point to + * interior nodes: + */ + if (i->cached || i->level) continue; - b = iter_l(i->iter)->b; - if (b->sib_u64s[0] < c->btree_foreground_merge_threshold || - b->sib_u64s[1] < c->btree_foreground_merge_threshold) { - ret = maybe_do_btree_merge(trans, i->iter); - if (unlikely(ret)) - return ret; + old = bch2_btree_path_peek_slot(i->path, &u); + ret = bkey_err(old); + if (unlikely(ret)) + return ret; + + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->path, + i->level, trans->flags); + if (unlikely(ret)) + return ret; + } + + u64s_delta = 0; } } - trans_for_each_update(trans, i) - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); - ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| @@ -566,52 +716,21 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) return ret; - /* - * Can't be holding any read locks when we go to take write locks: - * another thread could be holding an intent lock on the same node we - * have a read lock on, and it'll block trying to take a write lock - * (because we hold a read lock) and it could be blocking us by holding - * its own read lock (while we're trying to to take write locks). 
- * - * note - this must be done after bch2_trans_journal_preres_get_cold() - * or anything else that might call bch2_trans_relock(), since that - * would just retake the read locks: - */ - trans_for_each_iter(trans, iter) - if (iter->nodes_locked != iter->nodes_intent_locked && - !bch2_btree_iter_upgrade(iter, 1)) { - trace_trans_restart_upgrade(trans->ip, trace_ip, - iter->btree_id, - &iter->real_pos); - trans->restarted = true; - return -EINTR; - } + normalize_read_intent_locks(trans); - trans_for_each_update(trans, i) { - const char *invalid = bch2_bkey_invalid(c, - bkey_i_to_s_c(i->k), i->bkey_type); - if (invalid) { - char buf[200]; - - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); - bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid); - bch2_fatal_error(c); - } - btree_insert_entry_checks(trans, i); - } - bch2_btree_trans_verify_locks(trans); - - trans_for_each_update(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_lock_for_insert(trans, i->iter, - iter_l(i->iter)->b); + ret = trans_lock_write(trans); + if (unlikely(ret)) + return ret; ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_drop_overwrites_from_journal(trans); + trans_for_each_update(trans, i) if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, - i->iter); + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); if (!ret && trans->journal_pin) bch2_journal_pin_add(&c->journal, trans->journal_res.seq, @@ -650,14 +769,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, switch (ret) { case BTREE_INSERT_BTREE_NODE_FULL: - ret = bch2_btree_split_leaf(trans, i->iter, trans->flags); + ret = bch2_btree_split_leaf(trans, i->path, trans->flags); if (!ret) return 0; if (ret == -EINTR) - trace_trans_restart_btree_node_split(trans->ip, trace_ip, - i->iter->btree_id, - &i->iter->real_pos); + trace_trans_restart_btree_node_split(trans->fn, trace_ip, + i->btree_id, &i->path->pos); break; case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); @@ -669,7 +787,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_mark_replicas(trans->ip, trace_ip); + trace_trans_restart_mark_replicas(trans->fn, trace_ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RES: @@ -689,13 +807,13 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_journal_res_get(trans->ip, trace_ip); + trace_trans_restart_journal_res_get(trans->fn, trace_ip); ret = -EINTR; break; case BTREE_INSERT_NEED_JOURNAL_RECLAIM: bch2_trans_unlock(trans); - trace_trans_blocked_journal_reclaim(trans->ip, trace_ip); + trace_trans_blocked_journal_reclaim(trans->fn, trace_ip); wait_event_freezable(c->journal.reclaim_wait, (ret = journal_reclaim_wait_done(c))); @@ -705,7 +823,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, if (bch2_trans_relock(trans)) return 0; - trace_trans_restart_journal_reclaim(trans->ip, trace_ip); + trace_trans_restart_journal_reclaim(trans->fn, trace_ip); ret = -EINTR; break; default: @@ -714,7 +832,9 @@ int bch2_trans_commit_error(struct btree_trans *trans, } BUG_ON((ret == EINTR || ret == -EAGAIN) && !trans->restarted); - BUG_ON(ret == -ENOSPC && (trans->flags & BTREE_INSERT_NOFAIL)); + BUG_ON(ret == -ENOSPC && + !(trans->flags & BTREE_INSERT_NOWAIT) && + (trans->flags & BTREE_INSERT_NOFAIL)); return ret; } @@ 
-725,7 +845,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) struct bch_fs *c = trans->c; int ret; - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) + if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || + test_bit(BCH_FS_STARTED, &c->flags)) return -EROFS; bch2_trans_unlock(trans); @@ -734,125 +855,128 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans) if (ret) return ret; + if (!bch2_trans_relock(trans)) + return -EINTR; + percpu_ref_get(&c->writes); return 0; } -static int extent_handle_overwrites(struct btree_trans *trans, - struct btree_insert_entry *i) +static int run_one_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) { - struct bch_fs *c = trans->c; - struct btree_iter *iter, *update_iter; - struct bpos start = bkey_start_pos(&i->k->k); - struct bkey_i *update; - struct bkey_s_c k; - int ret = 0, compressed_sectors; - - iter = bch2_trans_get_iter(trans, i->btree_id, start, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek(iter); - if (!k.k || (ret = bkey_err(k))) - goto out; - - if (bch2_bkey_maybe_mergable(k.k, &i->k->k)) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto out; + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + struct bkey_s_c old; + struct bkey unpacked; + int ret = 0; - bkey_reassemble(update, k); + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; - if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(i->k))) { - update_iter = bch2_trans_copy_iter(trans, iter); - ret = bch2_btree_delete_at(trans, update_iter, i->flags); - bch2_trans_iter_put(trans, update_iter); + if (!overwrite) { + if (i->insert_trigger_run) + return 0; - if (ret) - goto out; + BUG_ON(i->overwrite_trigger_run); + i->insert_trigger_run = true; + } else { + if (i->overwrite_trigger_run) + return 0; - i->k = update; - goto next; - } + BUG_ON(!i->insert_trigger_run); + i->overwrite_trigger_run = true; } - if (!bkey_cmp(k.k->p, bkey_start_pos(&i->k->k))) - goto next; - - while (bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) > 0) { - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (bkey_cmp(bkey_start_pos(k.k), start) < 0 && - bkey_cmp(k.k->p, i->k->k.p) > 0 && - (compressed_sectors = bch2_bkey_sectors_compressed(k))) - trans->extra_journal_res += compressed_sectors; + old = bch2_btree_path_peek_slot(i->path, &unpacked); + _deleted.p = i->path->pos; + + if (overwrite) { + ret = bch2_trans_mark_key(trans, old, deleted, + BTREE_TRIGGER_OVERWRITE|i->flags); + } else if (old.k->type == i->k->k.type && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags); + } else { + ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k), + BTREE_TRIGGER_INSERT|i->flags); + } - if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto out; + if (ret == -EINTR) + trace_trans_restart_mark(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + return ret ?: 1; +} - bkey_reassemble(update, k); +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct 
btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; - bch2_cut_back(start, update); + for (overwrite = 0; overwrite < 2; overwrite++) { - update_iter = bch2_trans_get_iter(trans, i->btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(update_iter); - if (ret) { - bch2_trans_iter_put(trans, update_iter); - goto out; + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + ret = run_one_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; } + } while (trans_trigger_run); + } - bch2_trans_update(trans, update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - i->flags); - bch2_trans_iter_put(trans, update_iter); - } - - if (bkey_cmp(k.k->p, i->k->k.p) <= 0) { - update_iter = bch2_trans_copy_iter(trans, iter); - ret = bch2_btree_delete_at(trans, update_iter, - i->flags); - bch2_trans_iter_put(trans, update_iter); - - if (ret) - goto out; - } + return 0; +} - if (bkey_cmp(k.k->p, i->k->k.p) > 0) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto out; +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; - bkey_reassemble(update, k); - bch2_cut_front(i->k->k.p, update); + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOC_FL_INSERT_RANGE), we don't drop references before + * they are re-added. 
+ */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; - bch2_trans_update(trans, iter, update, i->flags); - goto out; - } -next: - k = bch2_btree_iter_next(iter); - if (!k.k || (ret = bkey_err(k))) - goto out; + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; } - bch2_bkey_merge(c, bkey_i_to_s(i->k), k); -out: - bch2_trans_iter_put(trans, iter); + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); - return ret; + return 0; } int __bch2_trans_commit(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; - struct btree_iter *iter; - bool trans_trigger_run; unsigned u64s; int ret = 0; @@ -861,77 +985,62 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out_reset; if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) - lockdep_assert_held(&trans->c->gc_lock); + lockdep_assert_held(&c->gc_lock); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); trans->journal_u64s = trans->extra_journal_entry_u64s; trans->journal_preres_u64s = 0; + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + + if (trans->journal_transaction_names) + trans->journal_u64s += JSET_ENTRY_LOG_U64s; + if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget(&trans->c->writes))) { + unlikely(!percpu_ref_tryget(&c->writes))) { ret = bch2_trans_commit_get_rw_cold(trans); if (ret) goto out_reset; } #ifdef CONFIG_BCACHEFS_DEBUG + /* + * if BTREE_TRIGGER_NORUN is set, it means we're probably being called + * from the key cache flush code: + */ trans_for_each_update(trans, i) - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && + if (!i->cached && !(i->flags & BTREE_TRIGGER_NORUN)) bch2_btree_key_cache_verify_clean(trans, i->btree_id, i->k->k.p); #endif - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - trans_for_each_update(trans, i) { - if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && - !i->trans_triggers_run) { - i->trans_triggers_run = true; - trans_trigger_run = true; - - ret = bch2_trans_mark_update(trans, i->iter, - i->k, i->flags); - if (unlikely(ret)) { - if (ret == -EINTR) - trace_trans_restart_mark(trans->ip, _RET_IP_, - i->iter->btree_id, - &i->iter->pos); - goto out; - } - } - } - } while (trans_trigger_run); + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out; trans_for_each_update(trans, i) { - BUG_ON(!i->iter->should_be_locked); + BUG_ON(!i->path->should_be_locked); - if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) { - trace_trans_restart_upgrade(trans->ip, _RET_IP_, - i->iter->btree_id, - &i->iter->pos); - trans->restarted = true; - ret = -EINTR; + if (unlikely(!bch2_btree_path_upgrade(trans, i->path, i->level + 1))) { + trace_trans_restart_upgrade(trans->fn, _RET_IP_, + i->btree_id, &i->path->pos); + ret = btree_trans_restart(trans); goto out; } - BUG_ON(!btree_node_intent_locked(i->iter, i->level)); + BUG_ON(!btree_node_intent_locked(i->path, i->level)); u64s = jset_u64s(i->k->k.u64s); - if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && + if (i->cached && likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; 
trans->journal_u64s += u64s; } if (trans->extra_journal_res) { - ret = bch2_disk_reservation_add(trans->c, trans->disk_res, + ret = bch2_disk_reservation_add(c, trans->disk_res, trans->extra_journal_res, (trans->flags & BTREE_INSERT_NOFAIL) ? BCH_DISK_RESERVATION_NOFAIL : 0); @@ -945,21 +1054,19 @@ retry: ret = do_bch2_trans_commit(trans, &i, _RET_IP_); /* make sure we didn't drop or screw up locks: */ - bch2_btree_trans_verify_locks(trans); + bch2_trans_verify_locks(trans); if (ret) goto err; - - trans_for_each_iter(trans, iter) - if (btree_iter_live(trans, iter) && - (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) - bch2_btree_iter_set_pos(iter, iter->pos_after_commit); out: - bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + bch2_journal_preres_put(&c->journal, &trans->journal_preres); if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&trans->c->writes); + percpu_ref_put(&c->writes); out_reset: + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); + trans->extra_journal_res = 0; trans->nr_updates = 0; trans->hooks = NULL; @@ -982,53 +1089,319 @@ err: goto retry; } -int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_update_flags flags) +static int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) { - struct btree_insert_entry *i, n = (struct btree_insert_entry) { - .flags = flags, - .bkey_type = __btree_node_type(iter->level, iter->btree_id), - .btree_id = iter->btree_id, - .level = iter->level, - .iter = iter, - .k = k - }; - bool is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0; - int ret = 0; + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; - BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - BUG_ON(!iter->should_be_locked); + if (!btree_type_has_snapshots(id)) + return 0; -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) - BUG_ON(i != trans->updates && - btree_insert_entry_cmp(i - 1, i) >= 0); -#endif + if (!snapshot_t(c, pos.snapshot)->children[0]) + return 0; - if (is_extent) { - ret = extent_handle_overwrites(trans, &n); + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); if (ret) - return ret; + break; - iter->pos_after_commit = k->k.p; - iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; + if (!k.k) + break; - if (bkey_deleted(&n.k->k)) - return 0; + if (bkey_cmp(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bpos start = bkey_start_pos(&insert->k); + struct bkey_i *update; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0, compressed_sectors; - n.iter = bch2_trans_get_iter(trans, n.btree_id, n.k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); - ret = bch2_btree_iter_traverse(n.iter); - bch2_trans_iter_put(trans, n.iter); + bch2_trans_iter_init(trans, &iter, btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek(&iter); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + if 
(bch2_bkey_maybe_mergable(k.k, &insert->k)) { + /* + * We can't merge extents if they belong to interior snapshot + * tree nodes, and there's a snapshot in which one extent is + * visible and the other is not - i.e. if visibility is + * different. + * + * Instead of checking if visibility of the two extents is + * different, for now we just check if either has been + * overwritten: + */ + ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); + if (ret < 0) + goto err; if (ret) - return ret; + goto nomerge1; + + ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); + if (ret < 0) + goto err; + if (ret) + goto nomerge1; + + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + + if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { + ret = bch2_btree_delete_at(trans, &iter, flags); + if (ret) + goto err; + + insert = update; + goto next; + } + } +nomerge1: + ret = 0; + if (!bkey_cmp(k.k->p, start)) + goto next; + + while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { + bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; + bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; + + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(k))) + trans->extra_journal_res += compressed_sectors; + + if (front_split) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + + bch2_cut_back(start, update); + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; + } + + if (k.k->p.snapshot != insert->k.p.snapshot && + (front_split || back_split)) { + update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + + bch2_cut_front(start, update); + bch2_cut_back(insert->k.p, update); + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) + goto err; + } + + if (bkey_cmp(k.k->p, insert->k.p) <= 0) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_init(&update->k); + update->k.p = k.k->p; + + if (insert->k.p.snapshot != k.k->p.snapshot) { + update->k.p.snapshot = insert->k.p.snapshot; + update->k.type = KEY_TYPE_whiteout; + } + + bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + bch2_trans_iter_exit(trans, &update_iter); + + if (ret) + goto err; + } + + if (back_split) { + update = 
bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_reassemble(update, k); + bch2_cut_front(insert->k.p, update); + + ret = bch2_trans_update_by_path(trans, iter.path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + if (ret) + goto err; + goto out; + } +next: + k = bch2_btree_iter_next(&iter); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + } + + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); + if (ret < 0) + goto err; + if (ret) + goto nomerge2; + + ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); + if (ret < 0) + goto err; + if (ret) + goto nomerge2; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + } +nomerge2: + ret = 0; +out: + if (!bkey_deleted(&insert->k)) { + /* + * Rewinding iterators is expensive: get a new one and the one + * that points to the start of insert will be cloned from: + */ + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, insert, flags); + } +err: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (bkey_cmp(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; + } } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_insert_entry *i, n; - BUG_ON(n.iter->flags & BTREE_ITER_IS_EXTENTS); + BUG_ON(!path->should_be_locked); - n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; + BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + BUG_ON(bpos_cmp(k->k.p, path->pos)); + + n = (struct btree_insert_entry) { + .flags = flags, + .bkey_type = __btree_node_type(path->level, path->btree_id), + .btree_id = path->btree_id, + .level = path->level, + .cached = path->cached, + .path = path, + .k = k, + .ip_allocated = _RET_IP_, + }; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif /* * Pending updates are kept sorted: first, find position of new update, @@ -1040,27 +1413,80 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, if (i < trans->updates + trans->nr_updates && !btree_insert_entry_cmp(&n, i)) { - BUG_ON(i->trans_triggers_run); + BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); - /* - * This is a hack to ensure that inode creates update the btree, - * not the key cache, which helps with cache coherency issues in - * other areas: - */ - if (btree_iter_type(n.iter) == BTREE_ITER_CACHED && - btree_iter_type(i->iter) != BTREE_ITER_CACHED) { - i->k = n.k; - i->flags = n.flags; - } else { - *i = n; - } + bch2_path_put(trans, 
i->path, true); + *i = n; } else array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); + __btree_path_get(n.path, true); return 0; } +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_path *path = iter->update_path ?: iter->path; + struct bkey_cached *ck; + int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + bpos_cmp(iter->key_cache_path->pos, k->k.p)) { + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_); + btree_trans_restart(trans); + return -EINTR; + } + + iter->key_cache_path->should_be_locked = true; + } + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags); +} + void bch2_trans_commit_hook(struct btree_trans *trans, struct btree_trans_commit_hook *h) { @@ -1071,15 +1497,14 @@ void bch2_trans_commit_hook(struct btree_trans *trans, int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, struct bkey_i *k) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, 0); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1115,18 +1540,21 @@ int bch2_btree_delete_at(struct btree_trans *trans, int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, + unsigned iter_flags, u64 *journal_seq) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT|iter_flags); retry: while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(iter)).k) && + (k = bch2_btree_iter_peek(&iter)).k) && !(ret = bkey_err(k)) && - bkey_cmp(iter->pos, end) < 0) { + bkey_cmp(iter.pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; bkey_init(&delete.k); @@ -1145,9 +1573,9 @@ retry: * (bch2_btree_iter_peek() does guarantee that iter.pos >= * bkey_start_pos(k.k)). 
*/ - delete.k.p = iter->pos; + delete.k.p = iter.pos; - if (btree_node_type_is_extents(iter->btree_id)) { + if (iter.flags & BTREE_ITER_IS_EXTENTS) { unsigned max_sectors = KEY_SIZE_MAX & (~0 << trans->c->block_bits); @@ -1155,18 +1583,17 @@ retry: bch2_key_resize(&delete.k, max_sectors); bch2_cut_back(end, &delete); - ret = bch2_extent_trim_atomic(&delete, iter); + ret = bch2_extent_trim_atomic(trans, &iter, &delete); if (ret) break; } - ret = bch2_trans_update(trans, iter, &delete, 0) ?: - bch2_trans_commit(trans, NULL, journal_seq, + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); if (ret) break; - - bch2_trans_cond_resched(trans); } if (ret == -EINTR) { @@ -1174,7 +1601,7 @@ retry: goto retry; } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1185,8 +1612,10 @@ retry: */ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, struct bpos start, struct bpos end, + unsigned iter_flags, u64 *journal_seq) { return bch2_trans_do(c, NULL, journal_seq, 0, - bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); + bch2_btree_delete_range_trans(&trans, id, start, end, + iter_flags, journal_seq)); } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 76945e5..eb0eaa9 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -11,11 +11,15 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "buckets_waiting_for_journal.h" #include "ec.h" #include "error.h" +#include "inode.h" #include "movinggc.h" +#include "recovery.h" #include "reflink.h" #include "replicas.h" +#include "subvolume.h" #include #include @@ -40,43 +44,6 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, } } -/* - * Clear journal_seq_valid for buckets for which it's not needed, to prevent - * wraparound: - */ -void bch2_bucket_seq_cleanup(struct bch_fs *c) -{ - u64 journal_seq = atomic64_read(&c->journal.seq); - u16 last_seq_ondisk = c->journal.last_seq_ondisk; - struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket *g; - struct bucket_mark m; - unsigned i; - - if (journal_seq - c->last_bucket_seq_cleanup < - (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) - return; - - c->last_bucket_seq_cleanup = journal_seq; - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) { - bucket_cmpxchg(g, m, ({ - if (!m.journal_seq_valid || - bucket_needs_journal_commit(m, last_seq_ondisk)) - break; - - m.journal_seq_valid = 0; - })); - } - up_read(&ca->bucket_lock); - } -} - void bch2_fs_usage_initialize(struct bch_fs *c) { struct bch_fs_usage *usage; @@ -114,6 +81,8 @@ static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, unsigned journal_seq, bool gc) { + BUG_ON(!gc && !journal_seq); + return this_cpu_ptr(gc ? ca->usage_gc : ca->usage[journal_seq & JOURNAL_BUF_MASK]); @@ -139,6 +108,9 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, unsigned journal_seq, bool gc) { + percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + return this_cpu_ptr(gc ? c->usage_gc : c->usage[journal_seq & JOURNAL_BUF_MASK]); @@ -315,8 +287,8 @@ static inline int is_unavailable_bucket(struct bucket_mark m) static inline int bucket_sectors_fragmented(struct bch_dev *ca, struct bucket_mark m) { - return bucket_sectors_used(m) - ? 
max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m)) + return m.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors) : 0; } @@ -332,13 +304,6 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m) : m.data_type; } -static bool bucket_became_unavailable(struct bucket_mark old, - struct bucket_mark new) -{ - return is_available_bucket(old) && - !is_available_bucket(new); -} - static inline void account_bucket(struct bch_fs_usage *fs_usage, struct bch_dev_usage *dev_usage, enum bch_data_type type, @@ -357,8 +322,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, struct bch_fs_usage *fs_usage; struct bch_dev_usage *u; - percpu_rwsem_assert_held(&c->mark_lock); - preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); u = dev_usage_ptr(ca, journal_seq, gc); @@ -404,25 +367,48 @@ static inline int __update_replicas(struct bch_fs *c, return 0; } -static inline int update_replicas(struct bch_fs *c, +static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, struct bch_replicas_entry *r, s64 sectors, unsigned journal_seq, bool gc) { struct bch_fs_usage __percpu *fs_usage; - int idx = bch2_replicas_entry_idx(c, r); + int idx, ret = 0; + char buf[200]; - if (idx < 0) - return -1; + percpu_down_read(&c->mark_lock); + + idx = bch2_replicas_entry_idx(c, r); + if (idx < 0 && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err(c, "no replicas entry\n" + " while marking %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))) { + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, r); + if (ret) + return ret; + + percpu_down_read(&c->mark_lock); + idx = bch2_replicas_entry_idx(c, r); + } + if (idx < 0) { + ret = -1; + goto err; + } preempt_disable(); fs_usage = fs_usage_ptr(c, journal_seq, gc); fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); fs_usage->replicas[idx] += sectors; preempt_enable(); - return 0; +err: +fsck_err: + percpu_up_read(&c->mark_lock); + return ret; } static inline int update_cached_sectors(struct bch_fs *c, + struct bkey_s_c k, unsigned dev, s64 sectors, unsigned journal_seq, bool gc) { @@ -430,7 +416,7 @@ static inline int update_cached_sectors(struct bch_fs *c, bch2_replicas_entry_cached(&r.e, dev); - return update_replicas(c, &r.e, sectors, journal_seq, gc); + return update_replicas(c, k, &r.e, sectors, journal_seq, gc); } static struct replicas_delta_list * @@ -496,19 +482,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans, update_replicas_list(trans, &r.e, sectors); } -#define do_mark_fn(fn, c, pos, flags, ...) 
\ -({ \ - int gc, ret = 0; \ - \ - percpu_rwsem_assert_held(&c->mark_lock); \ - \ - for (gc = 0; gc < 2 && !ret; gc++) \ - if (!gc == !(flags & BTREE_TRIGGER_GC) || \ - (gc && gc_visited(c, pos))) \ - ret = fn(c, __VA_ARGS__, gc); \ - ret; \ -}) - void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator) { @@ -522,20 +495,19 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, BUG_ON(owned_by_allocator == old.owned_by_allocator); } -static int bch2_mark_alloc(struct bch_fs *c, +static int bch2_mark_alloc(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; - struct bkey_alloc_unpacked u; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old); + struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new); struct bch_dev *ca; struct bucket *g; struct bucket_mark old_m, m; - - /* We don't do anything for deletions - do we?: */ - if (new.k->type != KEY_TYPE_alloc && - new.k->type != KEY_TYPE_alloc_v2) - return 0; + int ret = 0; /* * alloc btree is read in by bch2_alloc_read, not gc: @@ -544,35 +516,66 @@ static int bch2_mark_alloc(struct bch_fs *c, !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) return 0; - ca = bch_dev_bkey_exists(c, new.k->p.inode); + if ((flags & BTREE_TRIGGER_INSERT) && + !old_u.data_type != !new_u.data_type && + new.k->type == KEY_TYPE_alloc_v3) { + struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v; + u64 old_journal_seq = le64_to_cpu(v->journal_seq); + + BUG_ON(!journal_seq); - if (new.k->p.offset >= ca->mi.nbuckets) + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + new_u.journal_seq = !new_u.data_type && + (journal_seq == old_journal_seq || + bch2_journal_noflush_seq(&c->journal, old_journal_seq)) + ? 
0 : journal_seq; + v->journal_seq = cpu_to_le64(new_u.journal_seq); + } + + if (old_u.data_type && !new_u.data_type && new_u.journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new_u.dev, new_u.bucket, + new_u.journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); + return ret; + } + } + + ca = bch_dev_bkey_exists(c, new_u.dev); + + if (new_u.bucket >= ca->mi.nbuckets) return 0; - g = __bucket(ca, new.k->p.offset, gc); - u = bch2_alloc_unpack(new); + percpu_down_read(&c->mark_lock); + if (!gc && new_u.gen != old_u.gen) + *bucket_gen(ca, new_u.bucket) = new_u.gen; + + g = __bucket(ca, new_u.bucket, gc); old_m = bucket_cmpxchg(g, m, ({ - m.gen = u.gen; - m.data_type = u.data_type; - m.dirty_sectors = u.dirty_sectors; - m.cached_sectors = u.cached_sectors; - m.stripe = u.stripe != 0; - - if (journal_seq) { - m.journal_seq_valid = 1; - m.journal_seq = journal_seq; - } + m.gen = new_u.gen; + m.data_type = new_u.data_type; + m.dirty_sectors = new_u.dirty_sectors; + m.cached_sectors = new_u.cached_sectors; + m.stripe = new_u.stripe != 0; })); bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc); - g->io_time[READ] = u.read_time; - g->io_time[WRITE] = u.write_time; - g->oldest_gen = u.oldest_gen; + g->io_time[READ] = new_u.read_time; + g->io_time[WRITE] = new_u.write_time; + g->oldest_gen = new_u.oldest_gen; g->gen_valid = 1; - g->stripe = u.stripe; - g->stripe_redundancy = u.stripe_redundancy; + g->stripe = new_u.stripe; + g->stripe_redundancy = new_u.stripe_redundancy; + percpu_up_read(&c->mark_lock); /* * need to know if we're getting called from the invalidate path or @@ -581,13 +584,15 @@ static int bch2_mark_alloc(struct bch_fs *c, if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && old_m.cached_sectors) { - if (update_cached_sectors(c, ca->dev_idx, -old_m.cached_sectors, - journal_seq, gc)) { + ret = update_cached_sectors(c, new, ca->dev_idx, + -old_m.cached_sectors, + journal_seq, gc); + if (ret) { bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors"); - return -1; + return ret; } - trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), + trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket), old_m.cached_sectors); } @@ -604,17 +609,27 @@ static int bch2_mark_alloc(struct bch_fs *c, overflow; \ }) -static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, enum bch_data_type data_type, - unsigned sectors, bool gc) +void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type data_type, + unsigned sectors, struct gc_pos pos, + unsigned flags) { - struct bucket *g = __bucket(ca, b, gc); + struct bucket *g; struct bucket_mark old, new; bool overflow; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); BUG_ON(data_type != BCH_DATA_sb && data_type != BCH_DATA_journal); + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return; + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, b); old = bucket_cmpxchg(g, new, ({ new.data_type = data_type; overflow = checked_add(new.dirty_sectors, sectors); @@ -632,88 +647,70 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type ?: data_type], old.dirty_sectors, sectors); - if (c) - bch2_dev_usage_update(c, ca, old, new, 0, gc); - - return 0; -} - -void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev 
*ca, - size_t b, enum bch_data_type type, - unsigned sectors, struct gc_pos pos, - unsigned flags) -{ - BUG_ON(type != BCH_DATA_sb && - type != BCH_DATA_journal); - - /* - * Backup superblock might be past the end of our normal usable space: - */ - if (b >= ca->mi.nbuckets) - return; - - if (likely(c)) { - do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, - ca, b, type, sectors); - } else { - __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); - } + bch2_dev_usage_update(c, ca, old, new, 0, true); + percpu_up_read(&c->mark_lock); } static s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) { - return p.crc.compression_type - ? DIV_ROUND_UP(sectors * p.crc.compressed_size, + EBUG_ON(sectors < 0); + + return p.crc.compression_type && + p.crc.compression_type != BCH_COMPRESSION_TYPE_incompressible + ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, p.crc.uncompressed_size) : sectors; } -static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, +static int check_bucket_ref(struct bch_fs *c, + struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, - u8 bucket_gen, u8 bucket_data_type, + u8 b_gen, u8 bucket_data_type, u16 dirty_sectors, u16 cached_sectors) { - size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); u16 bucket_sectors = !ptr->cached ? dirty_sectors : cached_sectors; char buf[200]; - if (gen_after(ptr->gen, bucket_gen)) { + if (gen_after(ptr->gen, b_gen)) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (bucket_gen != ptr->gen && !ptr->cached) { + if (b_gen != ptr->gen && !ptr->cached) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, + *bucket_gen(ca, bucket_nr), bch2_data_types[bucket_data_type ?: ptr_data_type], ptr->gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); return -EIO; } - if (bucket_gen != ptr->gen) + if (b_gen != ptr->gen) return 1; if (bucket_data_type && ptr_data_type && @@ -721,7 +718,7 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type], bch2_data_types[ptr_data_type], (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); @@ -732,7 +729,7 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %u:%zu gen %u data type 
%s sector count overflow: %u + %lli > U16_MAX\n" "while marking %s", - ptr->dev, bucket_nr, bucket_gen, + ptr->dev, bucket_nr, b_gen, bch2_data_types[bucket_data_type ?: ptr_data_type], bucket_sectors, sectors, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); @@ -742,54 +739,68 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k, - unsigned ptr_idx, - u64 journal_seq, unsigned flags) +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, + unsigned flags) { + struct bch_fs *c = trans->c; + u64 journal_seq = trans->journal_res.seq; const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; + s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; - bool gc = flags & BTREE_TRIGGER_GC; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket *g; struct bucket_mark new, old; char buf[200]; - int ret; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); - if (g->stripe && g->stripe != k.k->p.offset) { + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, ptr); + + if (g->mark.dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { bch2_fs_inconsistent(c, "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - return -EINVAL; + ret = -EINVAL; + goto err; } old = bucket_cmpxchg(g, new, ({ - ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, + ret = check_bucket_ref(c, k, ptr, sectors, data_type, + new.gen, new.data_type, new.dirty_sectors, new.cached_sectors); if (ret) - return ret; + goto err; - if (parity) { - new.data_type = BCH_DATA_parity; - new.dirty_sectors = le16_to_cpu(s->sectors); - } + new.dirty_sectors += sectors; + if (data_type) + new.data_type = data_type; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } + new.stripe = true; })); g->stripe = k.k->p.offset; g->stripe_redundancy = s->nr_redundant; - bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); + bch2_dev_usage_update(c, ca, old, new, journal_seq, true); +err: + percpu_up_read(&c->mark_lock); + return 0; } -static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, +static int __mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 bucket_gen, u8 *bucket_data_type, @@ -798,7 +809,7 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, u16 *dst_sectors = !ptr->cached ? 
dirty_sectors : cached_sectors; - int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, + int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, bucket_gen, *bucket_data_type, *dirty_sectors, *cached_sectors); @@ -811,38 +822,41 @@ static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, - u64 journal_seq, unsigned flags) + unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); + struct bucket *g; u8 bucket_data_type; u64 v; - int ret; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, &p.ptr); v = atomic64_read(&g->_mark.v); do { new.v.counter = old.v.counter = v; bucket_data_type = new.data_type; - ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, new.gen, &bucket_data_type, &new.dirty_sectors, &new.cached_sectors); if (ret) - return ret; + goto err; new.data_type = bucket_data_type; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - if (flags & BTREE_TRIGGER_NOATOMIC) { g->_mark = new; break; @@ -851,25 +865,32 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, old, new, journal_seq, gc); - - BUG_ON(!gc && bucket_became_unavailable(old, new)); + bch2_dev_usage_update(c, ca, old, new, journal_seq, true); +err: + percpu_up_read(&c->mark_lock); - return 0; + return ret; } -static int bch2_mark_stripe_ptr(struct bch_fs *c, +static int bch2_mark_stripe_ptr(struct btree_trans *trans, + struct bkey_s_c k, struct bch_extent_stripe_ptr p, enum bch_data_type data_type, s64 sectors, - unsigned journal_seq, unsigned flags) + unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; + struct bch_fs *c = trans->c; struct bch_replicas_padded r; - struct stripe *m; - unsigned i, blocks_nonempty = 0; + struct gc_stripe *m; - m = genradix_ptr(&c->stripes[gc], p.idx); + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.idx); + return -ENOMEM; + } spin_lock(&c->ec_stripes_heap_lock); @@ -884,30 +905,21 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, m->block_sectors[p.block] += sectors; r = m->r; - - for (i = 0; i < m->nr_blocks; i++) - blocks_nonempty += m->block_sectors[i] != 0; - - if (m->blocks_nonempty != blocks_nonempty) { - m->blocks_nonempty = blocks_nonempty; - if (!gc) - bch2_stripes_heap_update(c, m, p.idx); - } - spin_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; - update_replicas(c, &r.e, sectors, journal_seq, gc); + update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); return 0; } -static int bch2_mark_extent(struct bch_fs *c, +static int bch2_mark_extent(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, - unsigned journal_seq, unsigned flags) + unsigned flags) { - bool gc = flags & BTREE_TRIGGER_GC; - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -916,17 +928,13 @@ static int bch2_mark_extent(struct bch_fs *c, ? BCH_DATA_btree : BCH_DATA_user; s64 sectors = bkey_is_btree_ptr(k.k) - ? c->opts.btree_node_size + ? btree_sectors(c) : k.k->size; s64 dirty_sectors = 0; bool stale; int ret; - BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == - (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - - if (flags & BTREE_TRIGGER_OVERWRITE) - sectors = -sectors; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); r.e.data_type = data_type; r.e.nr_devs = 0; @@ -935,27 +943,31 @@ static int bch2_mark_extent(struct bch_fs *c, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = ptr_disk_sectors(sectors, p); - ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, - journal_seq, flags); + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + + ret = bch2_mark_pointer(trans, k, p, disk_sectors, + data_type, flags); if (ret < 0) return ret; stale = ret > 0; if (p.ptr.cached) { - if (!stale) - if (update_cached_sectors(c, p.ptr.dev, disk_sectors, - journal_seq, gc)) { + if (!stale) { + ret = update_cached_sectors(c, k, p.ptr.dev, + disk_sectors, journal_seq, true); + if (ret) { bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors"); - return -1; - + return ret; } + } } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; } else { - ret = bch2_mark_stripe_ptr(c, p.ec, data_type, - disk_sectors, journal_seq, flags); + ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, + disk_sectors, flags); if (ret) return ret; @@ -969,182 +981,245 @@ static int bch2_mark_extent(struct bch_fs *c, } if (r.e.nr_devs) { - if (update_replicas(c, &r.e, dirty_sectors, journal_seq, gc)) { + ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); + if (ret) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, k); bch2_fs_fatal_error(c, "no replicas entry for %s", buf); - return -1; + return ret; } } return 0; } -static int bch2_mark_stripe(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) +static int bch2_mark_stripe(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; - size_t idx = new.k->p.offset; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + u64 idx = new.k->p.offset; const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ? bkey_s_c_to_stripe(old).v : NULL; const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ? 
bkey_s_c_to_stripe(new).v : NULL; - struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; int ret; BUG_ON(gc && old_s); - if (!m || (old_s && !m->alive)) { - bch_err_ratelimited(c, "error marking nonexistent stripe %zu", - idx); - bch2_inconsistent_error(c); - return -1; - } + if (!gc) { + struct stripe *m = genradix_ptr(&c->stripes, idx); - if (!new_s) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_del(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); + if (!m || (old_s && !m->alive)) { + char buf1[200], buf2[200]; - memset(m, 0, sizeof(*m)); + bch2_bkey_val_to_text(&PBUF(buf1), c, old); + bch2_bkey_val_to_text(&PBUF(buf2), c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" + "old %s\n" + "new %s", idx, buf1, buf2); + bch2_inconsistent_error(c); + return -1; + } + + if (!new_s) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + + memset(m, 0, sizeof(*m)); + } else { + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } } else { + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + idx); + return -ENOMEM; + } + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ m->alive = true; m->sectors = le16_to_cpu(new_s->sectors); - m->algorithm = new_s->algorithm; m->nr_blocks = new_s->nr_blocks; m->nr_redundant = new_s->nr_redundant; - m->blocks_nonempty = 0; - - for (i = 0; i < new_s->nr_blocks; i++) { - m->block_sectors[i] = - stripe_blockcount_get(new_s, i); - m->blocks_nonempty += !!m->block_sectors[i]; + for (i = 0; i < new_s->nr_blocks; i++) m->ptrs[i] = new_s->ptrs[i]; - } bch2_bkey_to_replicas(&m->r.e, new); - if (!gc) { - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_update(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); - } - } - - if (gc) { /* * gc recalculates this field from stripe ptr * references: */ memset(m->block_sectors, 0, sizeof(m->block_sectors)); - m->blocks_nonempty = 0; for (i = 0; i < new_s->nr_blocks; i++) { - ret = mark_stripe_bucket(c, new, i, journal_seq, flags); + ret = mark_stripe_bucket(trans, new, i, flags); if (ret) return ret; } - if (update_replicas(c, &m->r.e, - ((s64) m->sectors * m->nr_redundant), - journal_seq, gc)) { + ret = update_replicas(c, new, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc); + if (ret) { char buf[200]; bch2_bkey_val_to_text(&PBUF(buf), c, new); bch2_fs_fatal_error(c, "no replicas entry for %s", buf); - return -1; + return ret; } } return 0; } -static int bch2_mark_inode(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) +static int bch2_mark_inode(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { + struct bch_fs *c = trans->c; struct bch_fs_usage __percpu *fs_usage; + u64 journal_seq = trans->journal_res.seq; - preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); - fs_usage->nr_inodes += new.k->type 
== KEY_TYPE_inode; - fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode; - preempt_enable(); + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_inode_v2 *v = (struct bch_inode_v2 *) new.v; + + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_inode_v2); + + v->bi_journal_seq = cpu_to_le64(journal_seq); + } + + if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); + preempt_disable(); + + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + } return 0; } -static int bch2_mark_reservation(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) +static int bch2_mark_reservation(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; struct bch_fs_usage __percpu *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; sectors *= replicas; + percpu_down_read(&c->mark_lock); preempt_disable(); - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(fs_usage->persistent_reserved)); fs_usage->reserved += sectors; fs_usage->persistent_reserved[replicas - 1] += sectors; + preempt_enable(); + percpu_up_read(&c->mark_lock); return 0; } static s64 __bch2_mark_reflink_p(struct bch_fs *c, struct bkey_s_c_reflink_p p, - u64 idx, unsigned flags, size_t *r_idx) + u64 *idx, unsigned flags, size_t r_idx) { struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + s64 ret = 0; - while (1) { - if (*r_idx >= c->reflink_gc_nr) - goto not_found; - r = genradix_ptr(&c->reflink_gc_table, *r_idx); - BUG_ON(!r); + if (r_idx >= c->reflink_gc_nr) + goto not_found; - if (idx < r->offset) - break; - (*r_idx)++; - } + r = genradix_ptr(&c->reflink_gc_table, r_idx); + if (*idx < r->offset - r->size) + goto not_found; BUG_ON((s64) r->refcount + add < 0); r->refcount += add; - return r->offset - idx; + *idx = r->offset; + return 0; not_found: - bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); - bch2_inconsistent_error(c); - return -EIO; + *idx = U64_MAX; + ret = -EIO; + + /* + * XXX: we're replacing the entire reflink pointer with an error + * key, we should just be replacing the part that was missing: + */ + if (fsck_err(c, "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, *idx)) { + struct bkey_i_error new; + + bkey_init(&new.k); + new.k.type = KEY_TYPE_error; + new.k.p = p.k->p; + new.k.size = p.k->size; + ret = bch2_journal_key_insert(c, BTREE_ID_extents, 0, &new.k_i); + } +fsck_err: + return ret; } -static int bch2_mark_reflink_p(struct bch_fs *c, - struct bkey_s_c old, struct bkey_s_c new, - u64 journal_seq, unsigned flags) +static int bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); struct reflink_gc *ref; size_t l, r, m; u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = p.k->size; - s64 ret = 0; + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; - BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == - (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); + } l = 0; r = c->reflink_gc_nr; @@ -1158,107 +1233,72 @@ static int bch2_mark_reflink_p(struct bch_fs *c, r = m; } - while (sectors) { - ret = __bch2_mark_reflink_p(c, p, idx, flags, &l); - if (ret < 0) - return ret; + while (idx < end && !ret) + ret = __bch2_mark_reflink_p(c, p, &idx, flags, l++); - ret = min_t(s64, ret, sectors); - idx += ret; - sectors -= ret; - } - - return 0; + return ret; } -static int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c old, - struct bkey_s_c new, - u64 journal_seq, unsigned flags) +int bch2_mark_key(struct btree_trans *trans, + struct bkey_s_c old, + struct bkey_s_c new, + unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; - - BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old: new; switch (k.k->type) { case KEY_TYPE_alloc: case KEY_TYPE_alloc_v2: - return bch2_mark_alloc(c, old, new, journal_seq, flags); + case KEY_TYPE_alloc_v3: + return bch2_mark_alloc(trans, old, new, flags); case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - return bch2_mark_extent(c, old, new, journal_seq, flags); + return bch2_mark_extent(trans, old, new, flags); case KEY_TYPE_stripe: - return bch2_mark_stripe(c, old, new, journal_seq, flags); + return bch2_mark_stripe(trans, old, new, flags); case KEY_TYPE_inode: - return bch2_mark_inode(c, old, new, journal_seq, flags); + case KEY_TYPE_inode_v2: + return bch2_mark_inode(trans, old, new, flags); case KEY_TYPE_reservation: - return bch2_mark_reservation(c, old, new, journal_seq, flags); + return bch2_mark_reservation(trans, old, new, flags); case KEY_TYPE_reflink_p: - return bch2_mark_reflink_p(c, old, new, journal_seq, flags); + return bch2_mark_reflink_p(trans, old, new, flags); + case KEY_TYPE_snapshot: + return bch2_mark_snapshot(trans, old, new, flags); default: return 0; } } -int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned flags) -{ - struct bkey deleted = KEY(0, 0, 0); - struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; - int ret; - - percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, old, new, 0, flags); - percpu_up_read(&c->mark_lock); - - return ret; -} - -int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter, +int bch2_mark_update(struct btree_trans *trans, struct btree_path *path, struct bkey_i *new, unsigned flags) { - struct bch_fs *c = trans->c; struct bkey _deleted = KEY(0, 0, 0); struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; struct bkey_s_c old; - int iter_flags, ret; + struct bkey unpacked; + int ret; + + _deleted.p = path->pos; if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(iter->btree_id)) + if (!btree_node_type_needs_gc(path->btree_id)) return 0; - if (likely(!(iter->flags & 
BTREE_ITER_CACHED_NOFILL))) { - iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; - iter->flags &= ~BTREE_ITER_WITH_UPDATES; - - old = bch2_btree_iter_peek_slot(iter); - iter->flags |= iter_flags; - - ret = bkey_err(old); - if (ret) - return ret; - } else { - /* - * If BTREE_ITER_CACHED_NOFILL was used, we better not be - * running triggers that do anything on removal (alloc btree): - */ - old = deleted; - } + old = bch2_btree_path_peek_slot(path, &unpacked); if (old.k->type == new->k.type && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), - trans->journal_res.seq, + ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { - ret = bch2_mark_key_locked(c, deleted, bkey_i_to_s_c(new), - trans->journal_res.seq, + ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key_locked(c, old, deleted, - trans->journal_res.seq, + bch2_mark_key(trans, old, deleted, BTREE_TRIGGER_OVERWRITE|flags); } @@ -1283,23 +1323,14 @@ void fs_usage_apply_warn(struct btree_trans *trans, pr_err("%s", buf); pr_err("overlapping with"); - if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { - struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter); - struct bkey_s_c k; - int ret; - - for_each_btree_key_continue(copy, 0, k, ret) { - if (btree_node_type_is_extents(i->iter->btree_id) - ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 - : bkey_cmp(i->k->k.p, k.k->p)) - break; + if (!i->cached) { + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot(i->path, &u); - bch2_bkey_val_to_text(&PBUF(buf), c, k); - pr_err("%s", buf); - } - bch2_trans_iter_put(trans, copy); + bch2_bkey_val_to_text(&PBUF(buf), c, k); + pr_err("%s", buf); } else { - struct bkey_cached *ck = (void *) i->iter->l[0].b; + struct bkey_cached *ck = (void *) i->path->l[0].b; if (ck->valid) { bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); @@ -1310,21 +1341,20 @@ void fs_usage_apply_warn(struct btree_trans *trans, __WARN(); } -void bch2_trans_fs_usage_apply(struct btree_trans *trans, - struct replicas_delta_list *deltas) +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) { struct bch_fs *c = trans->c; static int warned_disk_usage = 0; bool warn = false; unsigned disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; - struct replicas_delta *d = deltas->d; + struct replicas_delta *d = deltas->d, *d2; struct replicas_delta *top = (void *) deltas->d + deltas->used; struct bch_fs_usage *dst; s64 added = 0, should_not_have_added; unsigned i; - percpu_rwsem_assert_held(&c->mark_lock); - + percpu_down_read(&c->mark_lock); preempt_disable(); dst = fs_usage_ptr(c, trans->journal_res.seq, false); @@ -1336,7 +1366,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, added += d->delta; } - BUG_ON(__update_replicas(c, dst, &d->r, d->delta)); + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; } dst->nr_inodes += deltas->nr_inodes; @@ -1371,101 +1402,71 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } preempt_enable(); + percpu_up_read(&c->mark_lock); if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); + return 0; +need_mark: + /* revert changes: */ + for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) + BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + return -1; } /* trans_mark: */ -static struct btree_iter *trans_get_update(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos, - struct bkey_s_c *k) -{ - struct btree_insert_entry *i; - - trans_for_each_update(trans, i) - if (i->iter->btree_id == btree_id && - (btree_node_type_is_extents(btree_id) - ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && - bkey_cmp(pos, i->k->k.p) < 0 - : !bkey_cmp(pos, i->iter->pos))) { - *k = bkey_i_to_s_c(i->k); - - /* ugly hack.. */ - BUG_ON(btree_iter_live(trans, i->iter)); - trans->iters_live |= 1ULL << i->iter->idx; - return i->iter; - } - - return NULL; -} - -static struct bkey_alloc_buf * -bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, +static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, const struct bch_extent_ptr *ptr, struct bkey_alloc_unpacked *u) { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); - struct bucket *g; - struct btree_iter *iter; struct bkey_s_c k; - struct bkey_alloc_buf *a; int ret; - a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); - if (IS_ERR(a)) - return a; - - iter = trans_get_update(trans, BTREE_ID_alloc, pos, &k); - if (iter) { - *u = bch2_alloc_unpack(k); - } else { - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter); - if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); - } - - percpu_down_read(&c->mark_lock); - g = bucket(ca, pos.offset); - *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); - percpu_up_read(&c->mark_lock); + bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, + POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)), + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ret; } - *_iter = iter; - return a; + *u = bch2_alloc_unpack(k); + return 0; } static int bch2_trans_mark_pointer(struct btree_trans *trans, struct bkey_s_c k, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type) { - struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; - struct bkey_alloc_buf 
*a; int ret; - a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); - if (IS_ERR(a)) - return PTR_ERR(a); + ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); + if (ret) + return ret; - ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, + ret = __mark_pointer(trans, k, &p.ptr, sectors, data_type, + u.gen, &u.data_type, &u.dirty_sectors, &u.cached_sectors); if (ret) goto out; - bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + ret = bch2_alloc_write(trans, &iter, &u, 0); + if (ret) + goto out; out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1474,16 +1475,16 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i_stripe *s; struct bch_replicas_padded r; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -1514,13 +1515,16 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, stripe_blockcount_set(&s->v, p.ec.block, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - bch2_trans_update(trans, iter, &s->k_i, 0); + + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; update_replicas_list(trans, &r.e, sectors); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1536,18 +1540,12 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, ? BCH_DATA_btree : BCH_DATA_user; s64 sectors = bkey_is_btree_ptr(k.k) - ? c->opts.btree_node_size + ? btree_sectors(c) : k.k->size; s64 dirty_sectors = 0; bool stale; int ret; - BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == - (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - - if (flags & BTREE_TRIGGER_OVERWRITE) - sectors = -sectors; - r.e.data_type = data_type; r.e.nr_devs = 0; r.e.nr_required = 1; @@ -1555,6 +1553,9 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { s64 disk_sectors = ptr_disk_sectors(sectors, p); + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, data_type); if (ret < 0) @@ -1585,54 +1586,79 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } -static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans, - struct bkey_s_c_stripe s, - unsigned idx, bool deleting) +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) { struct bch_fs *c = trans->c; const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; - struct bkey_alloc_buf *a; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; - bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; int ret = 0; - a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); - if (IS_ERR(a)) - return PTR_ERR(a); - - if (parity) { - s64 sectors = le16_to_cpu(s.v->sectors); + if (deleting) + sectors = -sectors; - if (deleting) - sectors = -sectors; + ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u); + if (ret) + return ret; - u.dirty_sectors += sectors; - u.data_type = u.dirty_sectors - ? BCH_DATA_parity - : 0; - } + ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, + u.gen, u.data_type, + u.dirty_sectors, u.cached_sectors); + if (ret) + goto err; if (!deleting) { - if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c, - "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)", - iter->pos.inode, iter->pos.offset, u.gen, + if (bch2_fs_inconsistent_on(u.stripe || + u.stripe_redundancy, c, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", + iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], + u.dirty_sectors, u.stripe, s.k->p.offset)) { ret = -EIO; goto err; } + if (bch2_fs_inconsistent_on(data_type && u.dirty_sectors, c, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, u.gen, + bch2_data_types[u.data_type], + u.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + u.stripe = s.k->p.offset; u.stripe_redundancy = s.v->nr_redundant; } else { + if (bch2_fs_inconsistent_on(u.stripe != s.k->p.offset || + u.stripe_redundancy != s.v->nr_redundant, c, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, u.gen, + s.k->p.offset, u.stripe)) { + ret = -EIO; + goto err; + } + u.stripe = 0; u.stripe_redundancy = 0; } - bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + u.dirty_sectors += sectors; + if (data_type) + u.data_type = !deleting ? data_type : 0; + + ret = bch2_alloc_write(trans, &iter, &u, 0); + if (ret) + goto err; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1643,7 +1669,7 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, struct bkey_s_c_stripe old_s = { .k = NULL }; struct bkey_s_c_stripe new_s = { .k = NULL }; struct bch_replicas_padded r; - unsigned i; + unsigned i, nr_blocks; int ret = 0; if (old.k->type == KEY_TYPE_stripe) @@ -1661,18 +1687,17 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, new_s.v->nr_blocks * sizeof(struct bch_extent_ptr))) return 0; + BUG_ON(new_s.k && old_s.k && + (new_s.v->nr_blocks != old_s.v->nr_blocks || + new_s.v->nr_redundant != old_s.v->nr_redundant)); + + nr_blocks = new_s.k ? 
new_s.v->nr_blocks : old_s.v->nr_blocks; + if (new_s.k) { s64 sectors = le16_to_cpu(new_s.v->sectors); bch2_bkey_to_replicas(&r.e, new); update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant); - - for (i = 0; i < new_s.v->nr_blocks; i++) { - ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s, - i, false); - if (ret) - return ret; - } } if (old_s.k) { @@ -1680,12 +1705,25 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans, bch2_bkey_to_replicas(&r.e, old); update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant); + } - for (i = 0; i < old_s.v->nr_blocks; i++) { - ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s, - i, true); + for (i = 0; i < nr_blocks; i++) { + if (new_s.k && old_s.k && + !memcmp(&new_s.v->ptrs[i], + &old_s.v->ptrs[i], + sizeof(new_s.v->ptrs[i]))) + continue; + + if (new_s.k) { + ret = bch2_trans_mark_stripe_bucket(trans, new_s, i, false); if (ret) - return ret; + break; + } + + if (old_s.k) { + ret = bch2_trans_mark_stripe_bucket(trans, old_s, i, true); + if (ret) + break; } } @@ -1697,8 +1735,7 @@ static int bch2_trans_mark_inode(struct btree_trans *trans, struct bkey_s_c new, unsigned flags) { - int nr = (new.k->type == KEY_TYPE_inode) - - (old.k->type == KEY_TYPE_inode); + int nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); if (nr) { struct replicas_delta_list *d = @@ -1716,9 +1753,6 @@ static int bch2_trans_mark_reservation(struct btree_trans *trans, s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; - BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) == - (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)); - if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; sectors *= replicas; @@ -1734,20 +1768,21 @@ static int bch2_trans_mark_reservation(struct btree_trans *trans, static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p, - u64 idx, unsigned flags) + u64 *idx, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i *n; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; - s64 ret; + char buf[200]; + int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, POS(0, idx), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -1761,15 +1796,38 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, refcount = bkey_refcount(n); if (!refcount) { + bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); bch2_fs_inconsistent(c, - "%llu:%llu len %u points to nonexistent indirect extent %llu", - p.k->p.inode, p.k->p.offset, p.k->size, idx); - bch2_inconsistent_error(c); + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf); ret = -EIO; goto err; } - BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)); + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&PBUF(buf), c, p.s_c); + bch2_fs_inconsistent(c, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(k.k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k.k->p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + le64_add_cpu(refcount, add); if (!*refcount) { @@ -1777,14 +1835,14 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, set_bkey_val_u64s(&n->k, 0); } - bch2_btree_iter_set_pos_to_extent_start(iter); - ret = bch2_trans_update(trans, iter, n, 0); + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, n, 0); if (ret) goto err; - ret = k.k->p.offset - idx; + *idx = k.k->p.offset; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1792,29 +1850,29 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans, struct bkey_s_c k, unsigned flags) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); - unsigned sectors = p.k->size; - s64 ret = 0; + u64 idx, end_idx; + int ret = 0; - while (sectors) { - ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags); - if (ret < 0) - return ret; + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; - ret = min_t(s64, ret, sectors); - idx += ret; - sectors -= ret; + v->front_pad = v->back_pad = 0; } - return 0; + idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + end_idx = le64_to_cpu(p.v->idx) + p.k->size + + le32_to_cpu(p.v->back_pad); + + while (idx < end_idx && !ret) + ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); + + return ret; } int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { - struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; - - BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old: new; switch (k.k->type) { case KEY_TYPE_btree_ptr: @@ -1825,6 +1883,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, case KEY_TYPE_stripe: return bch2_trans_mark_stripe(trans, old, new, flags); case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: return bch2_trans_mark_inode(trans, old, new, flags); case KEY_TYPE_reservation: return bch2_trans_mark_reservation(trans, k, flags); @@ -1835,64 +1894,14 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, } } -int bch2_trans_mark_update(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_i *new, - unsigned flags) -{ - struct bkey _deleted = KEY(0, 0, 0); - struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; - struct bkey_s_c old; - int iter_flags, ret; - - if (unlikely(flags & BTREE_TRIGGER_NORUN)) - return 0; - - if (!btree_node_type_needs_gc(iter->btree_id)) - return 0; - - - if (likely(!(iter->flags & BTREE_ITER_CACHED_NOFILL))) { - iter_flags = iter->flags & BTREE_ITER_WITH_UPDATES; - iter->flags &= ~BTREE_ITER_WITH_UPDATES; - - old = bch2_btree_iter_peek_slot(iter); - iter->flags |= iter_flags; - - ret = bkey_err(old); - if (ret) - return ret; - } else { - /* - * If BTREE_ITER_CACHED_NOFILL was used, we better not be - * running triggers that do anything on removal (alloc btree): - */ - old = deleted; - } - - if (old.k->type == new->k.type && - ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - } else { - ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new), - BTREE_TRIGGER_INSERT|flags) ?: - bch2_trans_mark_key(trans, old, deleted, - BTREE_TRIGGER_OVERWRITE|flags); - } - - return ret; -} - static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_alloc_unpacked u; - struct bkey_alloc_buf *a; struct bch_extent_ptr ptr = { .dev = ca->dev_idx, .offset = bucket_to_sector(ca, b), @@ -1905,15 +1914,15 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (b >= ca->mi.nbuckets) return 0; - a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); - if (IS_ERR(a)) - return PTR_ERR(a); + ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u); + if (ret) + return ret; if (u.data_type && u.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", - iter->pos.inode, iter->pos.offset, u.gen, + iter.pos.inode, iter.pos.offset, u.gen, bch2_data_types[u.data_type], bch2_data_types[type], bch2_data_types[type]); @@ -1924,10 +1933,11 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, u.data_type = type; u.dirty_sectors = sectors; - bch2_alloc_pack(c, a, u); - bch2_trans_update(trans, iter, &a->k, 0); + ret = bch2_alloc_write(trans, &iter, &u, 0); + if (ret) + goto out; out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2092,20 +2102,29 @@ static void buckets_free_rcu(struct rcu_head *rcu) container_of(rcu, struct bucket_array, rcu); kvpfree(buckets, - sizeof(struct bucket_array) + + sizeof(*buckets) + buckets->nbuckets * sizeof(struct bucket)); } +static void bucket_gens_free_rcu(struct rcu_head *rcu) +{ + struct bucket_gens *buckets = + container_of(rcu, struct 
bucket_gens, rcu); + + kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); +} + int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; + struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, - ca->mi.bucket_size / c->opts.btree_node_size); + ca->mi.bucket_size / btree_sectors(c)); /* XXX: these should be tunable */ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6); @@ -2122,9 +2141,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO)) || - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), + !(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, GFP_KERNEL|__GFP_ZERO)) || + (c->opts.buckets_nouse && + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO))) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || @@ -2134,6 +2156,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = nbuckets; + bucket_gens->first_bucket = ca->mi.first_bucket; + bucket_gens->nbuckets = nbuckets; bch2_copygc_stop(c); @@ -2144,6 +2168,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) } old_buckets = bucket_array(ca); + old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { size_t n = min(buckets->nbuckets, old_buckets->nbuckets); @@ -2151,13 +2176,19 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets->b, old_buckets->b, n * sizeof(struct bucket)); - memcpy(buckets_nouse, - ca->buckets_nouse, - BITS_TO_LONGS(n) * sizeof(unsigned long)); + memcpy(bucket_gens->b, + old_bucket_gens->b, + n); + if (buckets_nouse) + memcpy(buckets_nouse, + ca->buckets_nouse, + BITS_TO_LONGS(n) * sizeof(unsigned long)); } rcu_assign_pointer(ca->buckets[0], buckets); - buckets = old_buckets; + rcu_assign_pointer(ca->bucket_gens, bucket_gens); + buckets = old_buckets; + bucket_gens = old_bucket_gens; swap(ca->buckets_nouse, buckets_nouse); @@ -2191,8 +2222,10 @@ err: free_fifo(&free[i]); kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + if (bucket_gens) + call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); if (buckets) - call_rcu(&old_buckets->rcu, buckets_free_rcu); + call_rcu(&buckets->rcu, buckets_free_rcu); return ret; } @@ -2207,6 +2240,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free[i]); kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), + sizeof(struct bucket_gens) + ca->mi.nbuckets); kvpfree(rcu_dereference_protected(ca->buckets[0], 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 0f544b6..7c6c59c 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -53,11 +53,34 @@ static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) return buckets->b + b; } 
+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) +{ + return __bucket(ca, b, true); +} + static inline struct bucket *bucket(struct bch_dev *ca, size_t b) { return __bucket(ca, b, false); } +static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->bucket_gens, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); + +} + +static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) +{ + struct bucket_gens *gens = bucket_gens(ca); + + BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + return gens->b + b; +} + /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. @@ -74,11 +97,10 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, return sector_to_bucket(ca, ptr->offset); } -static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - bool gc) +static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) { - return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); + return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); } static inline enum bch_data_type ptr_data_type(const struct bkey *k, @@ -91,18 +113,6 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; } -static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, - const struct bch_extent_ptr *ptr) -{ - struct bucket_mark m; - - rcu_read_lock(); - m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); - rcu_read_unlock(); - - return m; -} - static inline int gen_cmp(u8 a, u8 b) { return (s8) (a - b); @@ -122,28 +132,22 @@ static inline int gen_after(u8 a, u8 b) static inline u8 ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); -} + u8 ret; -/* bucket gc marks */ + rcu_read_lock(); + ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + rcu_read_unlock(); -static inline unsigned bucket_sectors_used(struct bucket_mark mark) -{ - return mark.dirty_sectors + mark.cached_sectors; + return ret; } +/* bucket gc marks */ + static inline bool is_available_bucket(struct bucket_mark mark) { return !mark.dirty_sectors && !mark.stripe; } -static inline bool bucket_needs_journal_commit(struct bucket_mark m, - u16 last_seq_ondisk) -{ - return m.journal_seq_valid && - ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); -} - /* Device usage: */ struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); @@ -218,7 +222,6 @@ bch2_fs_usage_read_short(struct bch_fs *); /* key/bucket marking: */ -void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *); void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool); @@ -226,16 +229,14 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned); +int bch2_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_update(struct btree_trans *, struct btree_iter *, +int bch2_mark_update(struct btree_trans *, struct btree_path *, struct bkey_i *, unsigned); int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, - struct 
bkey_i *insert, unsigned); -void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, size_t, enum bch_data_type, unsigned); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index b2de299..2c73dc6 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -15,18 +15,9 @@ struct bucket_mark { u8 gen; u8 data_type:3, owned_by_allocator:1, - journal_seq_valid:1, stripe:1; u16 dirty_sectors; u16 cached_sectors; - - /* - * low bits of journal sequence number when this bucket was most - * recently modified: if journal_seq_valid is set, this bucket can't be - * reused until the journal sequence number written to disk is >= the - * bucket's journal sequence number: - */ - u16 journal_seq; }; }; }; @@ -39,7 +30,6 @@ struct bucket { u64 io_time[2]; u8 oldest_gen; - u8 gc_gen; unsigned gen_valid:1; u8 stripe_redundancy; u32 stripe; @@ -52,6 +42,13 @@ struct bucket_array { struct bucket b[]; }; +struct bucket_gens { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + u8 b[]; +}; + struct bch_dev_usage { u64 buckets_ec; u64 buckets_unavailable; diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c new file mode 100644 index 0000000..2e5b955 --- /dev/null +++ b/libbcachefs/buckets_waiting_for_journal.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "buckets_waiting_for_journal.h" +#include + +static inline struct bucket_hashed * +bucket_hash(struct buckets_waiting_for_journal_table *t, + unsigned hash_seed_idx, u64 dev_bucket) +{ + unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); + + BUG_ON(!is_power_of_2(t->size)); + + return t->d + (h & (t->size - 1)); +} + +static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) +{ + unsigned i; + + t->size = size; + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) + get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); + memset(t->d, 0, sizeof(t->d[0]) * size); +} + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket) +{ + struct buckets_waiting_for_journal_table *t; + u64 dev_bucket = (u64) dev << 56 | bucket; + bool ret = false; + unsigned i; + + mutex_lock(&b->lock); + t = b->t; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); + + if (h->dev_bucket == dev_bucket) { + ret = h->journal_seq > flushed_seq; + break; + } + } + + mutex_unlock(&b->lock); + + return ret; +} + +static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, + struct bucket_hashed *new, + u64 flushed_seq) +{ + struct bucket_hashed *last_evicted = NULL; + unsigned tries, i; + + for (tries = 0; tries < 10; tries++) { + struct bucket_hashed *old, *victim = NULL; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + old = bucket_hash(t, i, new->dev_bucket); + + if (old->dev_bucket == new->dev_bucket || + old->journal_seq <= flushed_seq) { + *old = *new; + return true; + } + + if (last_evicted != old) + victim = old; + } + + /* hashed to same slot 3 times: */ + if (!victim) + break; + + /* Failed to find an empty slot: */ + swap(*new, *victim); + last_evicted = victim; + } + + return false; +} + +int bch2_set_bucket_needs_journal_commit(struct 
buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket, + u64 journal_seq) +{ + struct buckets_waiting_for_journal_table *t, *n; + struct bucket_hashed tmp, new = { + .dev_bucket = (u64) dev << 56 | bucket, + .journal_seq = journal_seq, + }; + size_t i, new_size, nr_elements = 1, nr_rehashes = 0; + int ret = 0; + + mutex_lock(&b->lock); + + if (likely(bucket_table_insert(b->t, &new, flushed_seq))) + goto out; + + t = b->t; + for (i = 0; i < t->size; i++) + nr_elements += t->d[i].journal_seq > flushed_seq; + + new_size = nr_elements < t->size / 3 ? t->size : t->size * 2; + + n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + +retry_rehash: + nr_rehashes++; + bucket_table_init(n, new_size); + + tmp = new; + BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); + + for (i = 0; i < t->size; i++) { + if (t->d[i].journal_seq <= flushed_seq) + continue; + + tmp = t->d[i]; + if (!bucket_table_insert(n, &tmp, flushed_seq)) + goto retry_rehash; + } + + b->t = n; + kvfree(t); + + pr_debug("took %zu rehashes, table at %zu/%zu elements", + nr_rehashes, nr_elements, b->t->size); +out: + mutex_unlock(&b->lock); + + return ret; +} + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + kvfree(b->t); +} + +#define INITIAL_TABLE_SIZE 8 + +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + mutex_init(&b->lock); + + b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); + if (!b->t) + return -ENOMEM; + + bucket_table_init(b->t, INITIAL_TABLE_SIZE); + return 0; +} diff --git a/libbcachefs/buckets_waiting_for_journal.h b/libbcachefs/buckets_waiting_for_journal.h new file mode 100644 index 0000000..d2ae19c --- /dev/null +++ b/libbcachefs/buckets_waiting_for_journal.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H +#define _BUCKETS_WAITING_FOR_JOURNAL_H + +#include "buckets_waiting_for_journal_types.h" + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64); +int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64, u64); + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/libbcachefs/buckets_waiting_for_journal_types.h b/libbcachefs/buckets_waiting_for_journal_types.h new file mode 100644 index 0000000..fea7f94 --- /dev/null +++ b/libbcachefs/buckets_waiting_for_journal_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H +#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H + +#include + +struct bucket_hashed { + u64 dev_bucket; + u64 journal_seq; +}; + +struct buckets_waiting_for_journal_table { + size_t size; + siphash_key_t hash_seeds[3]; + struct bucket_hashed d[]; +}; + +struct buckets_waiting_for_journal { + struct mutex lock; + struct buckets_waiting_for_journal_table *t; +}; + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index db68a78..aa26588 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -568,8 +568,11 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!capable(CAP_SYS_ADMIN)) return -EPERM; 
+ if (!dev) + return -EINVAL; + for_each_online_member(ca, c, i) - if (ca->disk_sb.bdev->bd_dev == dev) { + if (ca->dev == dev) { percpu_ref_put(&ca->io_ref); return i; } diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index d20924e..a1d8992 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -35,18 +35,18 @@ struct bch2_checksum_state { static void bch2_checksum_init(struct bch2_checksum_state *state) { switch (state->type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: state->seed = 0; break; - case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_crc32c_nonzero: state->seed = U32_MAX; break; - case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_crc64_nonzero: state->seed = U64_MAX; break; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: xxh64_reset(&state->h64state, 0); break; default: @@ -57,15 +57,15 @@ static void bch2_checksum_init(struct bch2_checksum_state *state) static u64 bch2_checksum_final(const struct bch2_checksum_state *state) { switch (state->type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: return state->seed; - case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_crc32c_nonzero: return state->seed ^ U32_MAX; - case BCH_CSUM_CRC64_NONZERO: + case BCH_CSUM_crc64_nonzero: return state->seed ^ U64_MAX; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: return xxh64_digest(&state->h64state); default: BUG(); @@ -75,17 +75,17 @@ static u64 bch2_checksum_final(const struct bch2_checksum_state *state) static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) { switch (state->type) { - case BCH_CSUM_NONE: + case BCH_CSUM_none: return; - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC32C: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc32c: state->seed = crc32c(state->seed, data, len); break; - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC64: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc64: state->seed = crc64_be(state->seed, data, len); break; - case BCH_CSUM_XXHASH: + case BCH_CSUM_xxhash: xxh64_update(&state->h64state, data, len); break; default: @@ -161,12 +161,12 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, struct nonce nonce, const void *data, size_t len) { switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC32C: - case BCH_CSUM_XXHASH: - case BCH_CSUM_CRC64: { + case BCH_CSUM_none: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { struct bch2_checksum_state state; state.type = type; @@ -177,8 +177,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; @@ -212,13 +212,13 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, struct bio_vec bv; switch (type) { - case BCH_CSUM_NONE: + case BCH_CSUM_none: return (struct bch_csum) { 0 }; - case BCH_CSUM_CRC32C_NONZERO: - case BCH_CSUM_CRC64_NONZERO: - case BCH_CSUM_CRC32C: - case BCH_CSUM_XXHASH: - case BCH_CSUM_CRC64: { + case 
BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { struct bch2_checksum_state state; state.type = type; @@ -238,8 +238,8 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; } - case BCH_CSUM_CHACHA20_POLY1305_80: - case BCH_CSUM_CHACHA20_POLY1305_128: { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { SHASH_DESC_ON_STACK(desc, c->poly1305); u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; @@ -407,16 +407,12 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, } #ifdef __KERNEL__ -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +static int __bch2_request_key(char *key_description, struct bch_key *key) { - char key_description[60]; struct key *keyring_key; const struct user_key_payload *ukp; int ret; - snprintf(key_description, sizeof(key_description), - "bcachefs:%pUb", &sb->user_uuid); - keyring_key = request_key(&key_type_logon, key_description, NULL); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); @@ -436,16 +432,10 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) } #else #include -#include -int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +static int __bch2_request_key(char *key_description, struct bch_key *key) { key_serial_t key_id; - char key_description[60]; - char uuid[40]; - - uuid_unparse_lower(sb->user_uuid.b, uuid); - sprintf(key_description, "bcachefs:%s", uuid); key_id = request_key("user", key_description, NULL, KEY_SPEC_USER_KEYRING); @@ -459,6 +449,17 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) } #endif +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + char key_description[60]; + char uuid[40]; + + uuid_unparse_lower(sb->user_uuid.b, uuid); + sprintf(key_description, "bcachefs:%s", uuid); + + return __bch2_request_key(key_description, key); +} + int bch2_decrypt_sb_key(struct bch_fs *c, struct bch_sb_field_crypt *crypt, struct bch_key *key) diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 6841fb1..f5c1a60 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -13,9 +13,9 @@ static inline bool bch2_checksum_mergeable(unsigned type) { switch (type) { - case BCH_CSUM_NONE: - case BCH_CSUM_CRC32C: - case BCH_CSUM_CRC64: + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: return true; default: return false; @@ -78,13 +78,13 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, { switch (type) { case BCH_CSUM_OPT_none: - return BCH_CSUM_NONE; + return BCH_CSUM_none; case BCH_CSUM_OPT_crc32c: - return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; + return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; case BCH_CSUM_OPT_crc64: - return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; + return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; case BCH_CSUM_OPT_xxhash: - return BCH_CSUM_XXHASH; + return BCH_CSUM_xxhash; default: BUG(); } @@ -95,8 +95,8 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, { if (c->sb.encryption_type) return c->opts.wide_macs - ? BCH_CSUM_CHACHA20_POLY1305_128 - : BCH_CSUM_CHACHA20_POLY1305_80; + ? 
BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; return bch2_csum_opt_to_type(opt, true); } @@ -104,7 +104,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) { if (c->sb.encryption_type) - return BCH_CSUM_CHACHA20_POLY1305_128; + return BCH_CSUM_chacha20_poly1305_128; return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); } diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index f63651d..8e4179d 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -26,7 +26,7 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) { void *b; - BUG_ON(size > c->sb.encoded_extent_max << 9); + BUG_ON(size > c->opts.encoded_extent_max); b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); if (b) @@ -68,7 +68,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct page **pages = NULL; void *data; - BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); + BUG_ON(start.bi_size > c->opts.encoded_extent_max); if (!PageHighMem(bio_iter_page(bio, start)) && bio_phys_contig(bio, start)) @@ -231,8 +231,8 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size > c->sb.encoded_extent_max || - crc->compressed_size > c->sb.encoded_extent_max) { + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || + crc->compressed_size << 9 > c->opts.encoded_extent_max) { bch_err(c, "error rewriting existing data: extent too big"); return -EIO; } @@ -272,8 +272,8 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, size_t dst_len = crc.uncompressed_size << 9; int ret = -ENOMEM; - if (crc.uncompressed_size > c->sb.encoded_extent_max || - crc.compressed_size > c->sb.encoded_extent_max) + if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || + crc.compressed_size << 9 > c->opts.encoded_extent_max) return -EIO; dst_data = dst_len == dst_iter.bi_size @@ -376,7 +376,7 @@ static unsigned __bio_compress(struct bch_fs *c, BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); /* If it's only one block, don't bother trying to compress: */ - if (bio_sectors(src) <= c->opts.block_size) + if (src->bi_iter.bi_size <= c->opts.block_size) return 0; dst_data = bio_map_or_bounce(c, dst, WRITE); @@ -466,7 +466,7 @@ unsigned bch2_bio_compress(struct bch_fs *c, /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, - c->sb.encoded_extent_max << 9); + c->opts.encoded_extent_max); /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); @@ -544,10 +544,9 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { - size_t max_extent = c->sb.encoded_extent_max << 9; size_t decompress_workspace_size = 0; bool decompress_workspace_needed; - ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); + ZSTD_parameters params = ZSTD_getParams(0, c->opts.encoded_extent_max, 0); struct { unsigned feature; unsigned type; @@ -579,14 +578,14 @@ have_compressed: if (!mempool_initialized(&c->compression_bounce[READ])) { ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, max_extent); + 1, c->opts.encoded_extent_max); if (ret) goto out; } if 
(!mempool_initialized(&c->compression_bounce[WRITE])) { ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, max_extent); + 1, c->opts.encoded_extent_max); if (ret) goto out; } diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index b0a8eb5..ee5b7f6 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -243,7 +243,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int err; @@ -260,10 +260,10 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, - BTREE_ITER_PREFETCH| - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(&trans, &iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); while (k.k && !(err = bkey_err(k))) { bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); @@ -272,8 +272,8 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->buf[i->bytes] = '\n'; i->bytes++; - k = bch2_btree_iter_next(iter); - i->from = iter->pos; + k = bch2_btree_iter_next(&iter); + i->from = iter.pos; err = flush_buf(i); if (err) @@ -282,7 +282,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, if (!i->size) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -301,7 +301,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; int err; @@ -318,7 +318,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, err) { bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); i->bytes = strlen(i->buf); err = flush_buf(i); @@ -336,7 +336,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (!i->size) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); @@ -355,7 +355,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, { struct dump_iter *i = file->private_data; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct btree *prev_node = NULL; int err; @@ -373,11 +373,13 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_trans_init(&trans, i->c, 0, 0); - iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(&iter)).k && !(err = bkey_err(k))) { - struct btree_iter_level *l = &iter->l[0]; + struct btree_path_level *l = &iter.path->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); @@ -396,8 +398,8 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, if (err) break; - bch2_btree_iter_advance(iter); - i->from = iter->pos; + bch2_btree_iter_advance(&iter); + i->from = iter.pos; err = flush_buf(i); if (err) @@ -406,6 +408,8 @@ static ssize_t bch2_read_bfloat_failed(struct 
file *file, char __user *buf, if (!i->size) break; } + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); return err < 0 ? err : i->ret; diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 02b2968..6f699b7 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -8,6 +8,7 @@ #include "fs.h" #include "keylist.h" #include "str_hash.h" +#include "subvolume.h" #include @@ -63,6 +64,15 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); } +static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_SUBVOL) + return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; + return true; +} + const struct bch_hash_desc bch2_dirent_hash_desc = { .btree_id = BTREE_ID_dirents, .key_type = KEY_TYPE_dirent, @@ -70,6 +80,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { .hash_bkey = dirent_hash_bkey, .cmp_key = dirent_cmp_key, .cmp_bkey = dirent_cmp_bkey, + .is_visible = dirent_is_visible, }; const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -99,7 +110,8 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (memchr(d.v->d_name, '/', len)) return "invalid name"; - if (le64_to_cpu(d.v->d_inum) == d.k->p.inode) + if (d.v->d_type != DT_SUBVOL && + le64_to_cpu(d.v->d_inum) == d.k->p.inode) return "dirent points to own directory"; return NULL; @@ -112,14 +124,16 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, bch_scnmemcpy(out, d.v->d_name, bch2_dirent_name_bytes(d)); - pr_buf(out, " -> %llu type %s", d.v->d_inum, - d.v->d_type < DT_MAX - ? bch2_d_types[d.v->d_type] - : "(bad d_type)"); + pr_buf(out, " -> %llu type %s", + d.v->d_type != DT_SUBVOL + ? 
le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), + bch2_d_type_str(d.v->d_type)); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, - u8 type, const struct qstr *name, u64 dst) + subvol_inum dir, u8 type, + const struct qstr *name, u64 dst) { struct bkey_i_dirent *dirent; unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); @@ -135,7 +149,14 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, bkey_dirent_init(&dirent->k_i); dirent->k.u64s = u64s; - dirent->v.d_inum = cpu_to_le64(dst); + + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); + } + dirent->v.d_type = type; memcpy(dirent->v.d_name, name->name, name->len); @@ -149,21 +170,21 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } -int bch2_dirent_create(struct btree_trans *trans, - u64 dir_inum, const struct bch_hash_info *hash_info, +int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, int flags) { struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, type, name, dst_inum); + dirent = dirent_create_key(trans, dir, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -176,82 +197,130 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } +int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) +{ + struct bch_subvolume s; + int ret = 0; + + if (d.v->d_type == DT_SUBVOL && + d.v->d_parent_subvol != dir.subvol) + return 1; + + if (likely(d.v->d_type != DT_SUBVOL)) { + target->subvol = dir.subvol; + target->inum = le64_to_cpu(d.v->d_inum); + } else { + target->subvol = le32_to_cpu(d.v->d_child_subvol); + + ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + + target->inum = le64_to_cpu(s.inode); + } + + return ret; +} + int bch2_dirent_rename(struct btree_trans *trans, - u64 src_dir, struct bch_hash_info *src_hash, - u64 dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, u64 *src_inum, u64 *src_offset, - const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, + const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) { - struct btree_iter *src_iter = NULL, *dst_iter = NULL; - struct bkey_s_c old_src, old_dst; + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; + struct bkey_s_c old_src, old_dst = bkey_s_c_null; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = - POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); + unsigned src_type = 0, dst_type = 0, src_update_flags = 0; int ret = 0; - *src_inum = *dst_inum = 0; + if (src_dir.subvol != dst_dir.subvol) + return -EXDEV; - /* - * Lookup dst: - * - * Note that in BCH_RENAME mode, we're _not_ checking if - * the 
target already exists - we're relying on the VFS - * to do that check for us for correctness: - */ - dst_iter = mode == BCH_RENAME - ? bch2_hash_hole(trans, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name) - : bch2_hash_lookup(trans, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_iter); - if (ret) - goto out; + memset(src_inum, 0, sizeof(*src_inum)); + memset(dst_inum, 0, sizeof(*dst_inum)); - old_dst = bch2_btree_iter_peek_slot(dst_iter); - ret = bkey_err(old_dst); + /* Lookup src: */ + ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_INTENT); if (ret) goto out; - if (mode != BCH_RENAME) - *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); - if (mode != BCH_RENAME_EXCHANGE) - *src_offset = dst_iter->pos.offset; - - /* Lookup src: */ - src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_iter); + old_src = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(old_src); if (ret) goto out; - old_src = bch2_btree_iter_peek_slot(src_iter); - ret = bkey_err(old_src); + ret = bch2_dirent_read_target(trans, src_dir, + bkey_s_c_to_dirent(old_src), src_inum); if (ret) goto out; - *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); + src_type = bkey_s_c_to_dirent(old_src).v->d_type; + + if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) + return -EOPNOTSUPP; + + + /* Lookup dst: */ + if (mode == BCH_RENAME) { + /* + * Note that we're _not_ checking if the target already exists - + * we're relying on the VFS to do that check for us for + * correctness: + */ + ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name); + if (ret) + goto out; + } else { + ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_INTENT); + if (ret) + goto out; + + old_dst = bch2_btree_iter_peek_slot(&dst_iter); + ret = bkey_err(old_dst); + if (ret) + goto out; + + ret = bch2_dirent_read_target(trans, dst_dir, + bkey_s_c_to_dirent(old_dst), dst_inum); + if (ret) + goto out; + + dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; + + if (dst_type == DT_SUBVOL) + return -EOPNOTSUPP; + } + + if (mode != BCH_RENAME_EXCHANGE) + *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, 0, dst_name, 0); + new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); - new_dst->k.p = dst_iter->pos; + new_dst->k.p = dst_iter.pos; /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, 0, src_name, 0); + new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); - new_src->k.p = src_iter->pos; + new_src->k.p = src_iter.pos; } else { new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ret = PTR_ERR_OR_ZERO(new_src); @@ -259,10 +328,10 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; bkey_init(&new_src->k); - new_src->k.p = src_iter->pos; + new_src->k.p = src_iter.pos; - if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && - bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { + if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && + bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { /* * We have a hash collision for the new dst 
key, * and new_src - the key we're deleting - is between @@ -275,10 +344,9 @@ int bch2_dirent_rename(struct btree_trans *trans, * If we're not overwriting, we can just insert * new_dst at the src position: */ - new_dst->k.p = src_iter->pos; - bch2_trans_update(trans, src_iter, - &new_dst->k_i, 0); - goto out_set_offset; + new_src = new_dst; + new_src->k.p = src_iter.pos; + goto out_set_src; } else { /* If we're overwriting, we can't insert new_dst * at a different slot because it has to @@ -290,7 +358,7 @@ int bch2_dirent_rename(struct btree_trans *trans, } else { /* Check if we need a whiteout to delete src: */ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, - src_hash, src_iter); + src_hash, &src_iter); if (ret < 0) goto out; @@ -299,75 +367,112 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, src_iter, &new_src->k_i, 0); - bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); -out_set_offset: + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); + if (ret) + goto out; +out_set_src: + + /* + * If we're deleting a subvolume, we need to really delete the dirent, + * not just emit a whiteout in the current snapshot: + */ + if (src_type == DT_SUBVOL) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter); + if (ret) + goto out; + + new_src->k.p = src_iter.pos; + src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + } + + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); + if (ret) + goto out; + if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; *dst_offset = new_dst->k.p.offset; out: - bch2_trans_iter_put(trans, src_iter); - bch2_trans_iter_put(trans, dst_iter); + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); return ret; } -int bch2_dirent_delete_at(struct btree_trans *trans, - const struct bch_hash_info *hash_info, - struct btree_iter *iter) -{ - return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, iter); -} - -struct btree_iter * -__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name, unsigned flags) -{ - return bch2_hash_lookup(trans, bch2_dirent_hash_desc, - hash_info, dir_inum, name, flags); -} - -u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name) +int __bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - struct btree_trans trans; - struct btree_iter *iter; struct bkey_s_c k; - u64 inum = 0; - int ret = 0; + struct bkey_s_c_dirent d; + u32 snapshot; + int ret; - bch2_trans_init(&trans, c, 0, 0); + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; - iter = __bch2_dirent_lookup_trans(&trans, dir_inum, - hash_info, name, 0); - ret = PTR_ERR_OR_ZERO(iter); + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); if (ret) - goto out; + return ret; k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) - goto out; + goto err; - inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); - bch2_trans_iter_put(&trans, iter); -out: - BUG_ON(ret == -EINTR); + d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, inum); + if (ret > 0) + ret = -ENOENT; +err: + if (ret) + 
bch2_trans_iter_exit(trans, iter); + + return ret; +} + +u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum) +{ + struct btree_trans trans; + struct btree_iter iter; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); + if (ret == -EINTR) + goto retry; + if (!ret) + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return inum; + return ret; } -int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; + u32 snapshot; int ret; - for_each_btree_key(trans, iter, BTREE_ID_dirents, - POS(dir_inum, 0), 0, k, ret) { - if (k.k->p.inode > dir_inum) + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_dirents, + SPOS(dir.inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode > dir.inum) break; if (k.k->type == KEY_TYPE_dirent) { @@ -375,24 +480,32 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; + subvol_inum target; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; - for_each_btree_key(&trans, iter, BTREE_ID_dirents, - POS(inum, ctx->pos), 0, k, ret) { - if (k.k->p.inode > inum) + for_each_btree_key_norestart(&trans, iter, BTREE_ID_dirents, + SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { + if (k.k->p.inode > inum.inum) break; if (k.k->type != KEY_TYPE_dirent) @@ -400,6 +513,12 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) dirent = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inum, dirent, &target); + if (ret < 0) + break; + if (ret) + continue; + /* * XXX: dir_emit() can fault and block, while we're holding * locks @@ -407,14 +526,25 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ctx->pos = dirent.k->p.offset; if (!dir_emit(ctx, dirent.v->d_name, bch2_dirent_name_bytes(dirent), - le64_to_cpu(dirent.v->d_inum), - dirent.v->d_type)) + target.inum, + vfs_d_type(dirent.v->d_type))) break; ctx->pos = dirent.k->p.offset + 1; + + /* + * read_target looks up subvolumes, we can overflow paths if the + * directory has many subvolumes in it + */ + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); return ret; } diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index e1d8ce3..1bb4d80 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -29,13 +29,17 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -int bch2_dirent_create(struct btree_trans *, u64, +int bch2_dirent_read_target(struct btree_trans *, subvol_inum, + 
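bch2_dirent_lookup() above also illustrates the caller-side restart convention used throughout this snapshot: helpers that can race return -EINTR and the caller loops back to bch2_trans_begin(), as bch2_readdir() below does as well. A hedged sketch of that shape, mirroring the function above; the wrapper name lookup_one_name is an assumption for illustration only:

static int lookup_one_name(struct bch_fs *c, subvol_inum dir,
			   const struct bch_hash_info *hash_info,
			   const struct qstr *name, subvol_inum *target)
{
	struct btree_trans trans;
	struct btree_iter iter;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
retry:
	bch2_trans_begin(&trans);

	ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info,
					 name, target, 0);
	if (ret == -EINTR)
		goto retry;	/* transaction restart: run the whole thing again */
	if (!ret)
		bch2_trans_iter_exit(&trans, &iter);	/* iter is only left live on success */

	bch2_trans_exit(&trans);
	return ret;
}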
struct bkey_s_c_dirent, subvol_inum *); + +int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); -int bch2_dirent_delete_at(struct btree_trans *, - const struct bch_hash_info *, - struct btree_iter *); +static inline unsigned vfs_d_type(unsigned type) +{ + return type == DT_SUBVOL ? DT_DIR : type; +} enum bch_rename_mode { BCH_RENAME, @@ -44,20 +48,20 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - u64, struct bch_hash_info *, - u64, struct bch_hash_info *, - const struct qstr *, u64 *, u64 *, - const struct qstr *, u64 *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -struct btree_iter * -__bch2_dirent_lookup_trans(struct btree_trans *, u64, - const struct bch_hash_info *, - const struct qstr *, unsigned); -u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *); +int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, + subvol_inum, const struct bch_hash_info *, + const struct qstr *, subvol_inum *, unsigned); +u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, + const struct bch_hash_info *, + const struct qstr *, subvol_inum *); -int bch2_empty_dir_trans(struct btree_trans *, u64); -int bch2_readdir(struct bch_fs *, u64, struct dir_context *); +int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); +int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); #endif /* _BCACHEFS_DIRENT_H */ diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index c52b6fa..6c84297 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -17,24 +17,20 @@ static int group_cmp(const void *_l, const void *_r) strncmp(l->label, r->label, sizeof(l->label)); } -static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_disk_groups *groups = field_to_type(f, disk_groups); struct bch_disk_group *g, *sorted = NULL; - struct bch_sb_field_members *mi; - struct bch_member *m; - unsigned i, nr_groups, len; - const char *err = NULL; - - mi = bch2_sb_get_members(sb); - groups = bch2_sb_get_disk_groups(sb); - nr_groups = disk_groups_nr(groups); + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + unsigned nr_groups = disk_groups_nr(groups); + unsigned i, len; + int ret = -EINVAL; - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) { + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; unsigned g; if (!BCH_MEMBER_GROUP(m)) @@ -42,45 +38,53 @@ static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, g = BCH_MEMBER_GROUP(m) - 1; - if (g >= nr_groups || - BCH_GROUP_DELETED(&groups->entries[g])) - return "disk has invalid group"; + if (g >= nr_groups) { + pr_buf(err, "disk %u has invalid label %u (have %u)", + i, g, nr_groups); + return -EINVAL; + } + + if (BCH_GROUP_DELETED(&groups->entries[g])) { + pr_buf(err, "disk %u has deleted label %u", i, g); + return -EINVAL; + } } if (!nr_groups) - return NULL; + return 0; + + for (i = 0; i < nr_groups; i++) { + g = groups->entries + i; - for (g = groups->entries; - g < groups->entries + nr_groups; - g++) { if (BCH_GROUP_DELETED(g)) continue; len = strnlen(g->label, sizeof(g->label)); if (!len) 
{ - err = "group with empty label"; - goto err; + pr_buf(err, "label %u empty", i); + return -EINVAL; } } sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); if (!sorted) - return "cannot allocate memory"; + return -ENOMEM; memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); - for (i = 0; i + 1 < nr_groups; i++) - if (!BCH_GROUP_DELETED(sorted + i) && - !group_cmp(sorted + i, sorted + i + 1)) { - err = "duplicate groups"; + for (g = sorted; g + 1 < sorted + nr_groups; g++) + if (!BCH_GROUP_DELETED(g) && + !group_cmp(&g[0], &g[1])) { + pr_buf(err, "duplicate label %llu.", BCH_GROUP_PARENT(g)); + bch_scnmemcpy(err, g->label, strnlen(g->label, sizeof(g->label))); goto err; } - err = NULL; + ret = 0; err: kfree(sorted); - return err; + return 0; } static void bch2_sb_disk_groups_to_text(struct printbuf *out, diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 328e042..9b45640 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -15,6 +15,7 @@ #include "io.h" #include "keylist.h" #include "recovery.h" +#include "replicas.h" #include "super-io.h" #include "util.h" @@ -142,8 +143,8 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, } /* returns blocknr in stripe that we matched: */ -static int bkey_matches_stripe(struct bch_stripe *s, - struct bkey_s_c k) +static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, + struct bkey_s_c k, unsigned *block) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -152,10 +153,12 @@ static int bkey_matches_stripe(struct bch_stripe *s, bkey_for_each_ptr(ptrs, ptr) for (i = 0; i < nr_data; i++) if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, - le16_to_cpu(s->sectors))) - return i; + le16_to_cpu(s->sectors))) { + *block = i; + return ptr; + } - return -1; + return NULL; } static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) @@ -429,13 +432,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, + POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -445,6 +449,7 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip } bkey_reassemble(&stripe->key.k_i, k); err: + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -542,29 +547,29 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) free_heap(&n); } - if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) + if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) return -ENOMEM; if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && - !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) + !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -ENOMEM; return 0; } -static int ec_stripe_mem_alloc(struct bch_fs *c, +static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { size_t idx = iter->pos.offset; int ret = 0; - if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) + if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) return ret; - bch2_trans_unlock(iter->trans); + 
bch2_trans_unlock(trans); ret = -EINTR; - if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) + if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL)) return ret; return -ENOMEM; @@ -591,13 +596,13 @@ static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, { struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; + genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; } static void heap_verify_backpointer(struct bch_fs *c, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m = genradix_ptr(&c->stripes[0], idx); + struct stripe *m = genradix_ptr(&c->stripes, idx); BUG_ON(!m->alive); BUG_ON(m->heap_idx >= h->used); @@ -672,7 +677,7 @@ static int ec_stripe_delete(struct bch_fs *c, size_t idx) return bch2_btree_delete_range(c, BTREE_ID_stripes, POS(0, idx), POS(0, idx + 1), - NULL); + 0, NULL); } static void ec_stripe_delete_work(struct work_struct *work) @@ -689,7 +694,7 @@ static void ec_stripe_delete_work(struct work_struct *work) break; } - bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); + bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); spin_unlock(&c->ec_stripes_heap_lock); if (ec_stripe_delete(c, idx)) @@ -699,27 +704,23 @@ static void ec_stripe_delete_work(struct work_struct *work) /* stripe creation: */ -static int ec_stripe_bkey_insert(struct bch_fs *c, +static int ec_stripe_bkey_insert(struct btree_trans *trans, struct bkey_i_stripe *stripe, struct disk_reservation *res) { - struct btree_trans trans; - struct btree_iter *iter; + struct bch_fs *c = trans->c; + struct btree_iter iter; struct bkey_s_c k; struct bpos min_pos = POS(0, 1); struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos, + for_each_btree_key(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { if (start_pos.offset) { start_pos = min_pos; - bch2_btree_iter_set_pos(iter, start_pos); + bch2_btree_iter_set_pos(&iter, start_pos); continue; } @@ -733,41 +734,36 @@ retry: goto err; found_slot: - start_pos = iter->pos; + start_pos = iter.pos; - ret = ec_stripe_mem_alloc(c, iter); + ret = ec_stripe_mem_alloc(trans, &iter); if (ret) goto err; - stripe->k.p = iter->pos; + stripe->k.p = iter.pos; - ret = bch2_trans_update(&trans, iter, &stripe->k_i, 0) ?: - bch2_trans_commit(&trans, res, NULL, - BTREE_INSERT_NOFAIL); -err: - bch2_trans_iter_put(&trans, iter); + ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); - if (ret == -EINTR) - goto retry; - - c->ec_stripe_hint = ret ? 
start_pos.offset : start_pos.offset + 1; - bch2_trans_exit(&trans); + c->ec_stripe_hint = start_pos.offset; +err: + bch2_trans_iter_exit(trans, &iter); return ret; } static int ec_stripe_bkey_update(struct btree_trans *trans, - struct bkey_i_stripe *new) + struct bkey_i_stripe *new, + struct disk_reservation *res) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; const struct bch_stripe *existing; unsigned i; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -790,9 +786,9 @@ static int ec_stripe_bkey_update(struct btree_trans *trans, stripe_blockcount_set(&new->v, i, stripe_blockcount_get(existing, i)); - ret = bch2_trans_update(trans, iter, &new->k_i, 0); + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -820,10 +816,11 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, struct bkey *pos) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_extent e; struct bkey_buf sk; + struct bpos next_pos; int ret = 0, dev, block; bch2_bkey_buf_init(&sk); @@ -831,23 +828,29 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, /* XXX this doesn't support the reflink btree */ - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - bkey_start_pos(pos), - BTREE_ITER_INTENT); - - while ((k = bch2_btree_iter_peek(iter)).k && + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(pos), + BTREE_ITER_INTENT); +retry: + while (bch2_trans_begin(&trans), + (k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + const struct bch_extent_ptr *ptr_c; struct bch_extent_ptr *ptr, *ec_ptr = NULL; if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } - block = bkey_matches_stripe(&s->key.v, k); - if (block < 0) { - bch2_btree_iter_advance(iter); + ptr_c = bkey_matches_stripe(&s->key.v, k, &block); + /* + * It doesn't generally make sense to erasure code cached ptrs: + * XXX: should we be incrementing a counter? + */ + if (!ptr_c || ptr_c->cached) { + bch2_btree_iter_advance(&iter); continue; } @@ -862,17 +865,21 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ec_ptr, block); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, sk.k, 0) ?: + bch2_btree_iter_set_pos(&iter, bkey_start_pos(&sk.k->k)); + next_pos = sk.k->k.p; + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, sk.k, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); - if (ret == -EINTR) - ret = 0; + if (!ret) + bch2_btree_iter_set_pos(&iter, next_pos); if (ret) break; } - bch2_trans_iter_put(&trans, iter); + if (ret == -EINTR) + goto retry; + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -938,10 +945,10 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } - ret = s->have_existing_stripe - ? 
bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, - ec_stripe_bkey_update(&trans, &s->new_stripe.key)) - : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res); + ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, + s->have_existing_stripe + ? ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) + : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); goto err_put_writes; @@ -956,7 +963,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) } spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset); + m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); BUG_ON(m->on_heap); bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); @@ -1056,22 +1063,20 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) if (!ob) return NULL; - ca = bch_dev_bkey_exists(c, ob->ptr.dev); + ca = bch_dev_bkey_exists(c, ob->dev); offset = ca->mi.bucket_size - ob->sectors_free; return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); } -void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, - struct bpos pos, unsigned sectors) +void bch2_ob_add_backpointer(struct bch_fs *c, struct open_bucket *ob, + struct bkey *k) { - struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); - struct ec_stripe_new *ec; + struct ec_stripe_new *ec = ob->ec; - if (!ob) + if (!ec) return; - ec = ob->ec; mutex_lock(&ec->lock); if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, @@ -1081,8 +1086,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, } bkey_init(&ec->keys.top->k); - ec->keys.top->k.p = pos; - bch2_key_resize(&ec->keys.top->k, sectors); + ec->keys.top->k.p = k->p; + ec->keys.top->k.size = k->size; bch2_keylist_push(&ec->keys); mutex_unlock(&ec->lock); @@ -1147,8 +1152,8 @@ static void ec_stripe_key_init(struct bch_fs *c, s->v.algorithm = 0; s->v.nr_blocks = nr_data + nr_parity; s->v.nr_redundant = nr_parity; - s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); - s->v.csum_type = BCH_CSUM_CRC32C; + s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); + s->v.csum_type = BCH_CSUM_crc32c; s->v.pad = 0; while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { @@ -1266,16 +1271,15 @@ found: return h; } -static enum bucket_alloc_ret -new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, - struct closure *cl) +static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, + struct closure *cl) { struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; - enum bucket_alloc_ret ret = ALLOC_SUCCESS; + int ret = 0; for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { if (test_bit(i, h->s->blocks_gotten)) { @@ -1314,7 +1318,7 @@ new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(j >= h->s->nr_data + h->s->nr_parity); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1342,7 +1346,7 @@ new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(j >= h->s->nr_data); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = ob->ptr; + h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1375,7 +1379,7 @@ static s64 
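ec_stripe_create() above now funnels both the insert and the update path through bch2_trans_do(), which, per the retry loop and bch2_trans_commit() call removed from ec_stripe_bkey_insert(), takes care of committing and restarting on -EINTR on behalf of the expression it runs, with the transaction available as trans. A minimal hedged fragment of a caller in that style; res and new_key stand in for a disk reservation and a prepared stripe key owned by the caller and are not taken from the patch:

	/*
	 * bch2_trans_do() opens a transaction, evaluates the expression,
	 * commits with the given flags, and retries on -EINTR:
	 */
	ret = bch2_trans_do(c, &res, NULL, BTREE_INSERT_NOFAIL,
			    ec_stripe_bkey_insert(&trans, &new_key, &res));
	if (ret)
		bch_err(c, "error creating stripe: %i", ret);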
get_existing_stripe(struct bch_fs *c, continue; stripe_idx = h->data[heap_idx].idx; - m = genradix_ptr(&c->stripes[0], stripe_idx); + m = genradix_ptr(&c->stripes, stripe_idx); if (m->algorithm == head->algo && m->nr_redundant == head->redundancy && @@ -1510,7 +1514,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, err: bch2_ec_stripe_head_put(c, h); - return ERR_PTR(-ret); + return ERR_PTR(ret); } void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) @@ -1531,7 +1535,7 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) continue; ob = c->open_buckets + h->s->blocks[i]; - if (ob->ptr.dev == ca->dev_idx) + if (ob->dev == ca->dev_idx) goto found; } goto unlock; @@ -1549,145 +1553,59 @@ void bch2_stripes_heap_start(struct bch_fs *c) struct genradix_iter iter; struct stripe *m; - genradix_for_each(&c->stripes[0], iter, m) + genradix_for_each(&c->stripes, iter, m) if (m->alive) bch2_stripes_heap_insert(c, m, iter.pos); } -static int __bch2_stripe_write_key(struct btree_trans *trans, - struct btree_iter *iter, - struct stripe *m, - size_t idx, - struct bkey_i_stripe *new_key) +int bch2_stripes_read(struct bch_fs *c) { - const struct bch_stripe *v; + struct btree_trans trans; + struct btree_iter iter; struct bkey_s_c k; + const struct bch_stripe *s; + struct stripe *m; unsigned i; int ret; - bch2_btree_iter_set_pos(iter, POS(0, idx)); - - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (ret) - return ret; - - if (k.k->type != KEY_TYPE_stripe) - return -EIO; - - v = bkey_s_c_to_stripe(k).v; - for (i = 0; i < v->nr_blocks; i++) - if (m->block_sectors[i] != stripe_blockcount_get(v, i)) - goto write; - return 0; -write: - bkey_reassemble(&new_key->k_i, k); - - for (i = 0; i < new_key->v.nr_blocks; i++) - stripe_blockcount_set(&new_key->v, i, - m->block_sectors[i]); - - return bch2_trans_update(trans, iter, &new_key->k_i, 0); -} - -int bch2_stripes_write(struct bch_fs *c, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct genradix_iter giter; - struct bkey_i_stripe *new_key; - struct stripe *m; - int ret = 0; - - new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); - BUG_ON(!new_key); - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - genradix_for_each(&c->stripes[0], giter, m) { - if (!m->alive) + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) continue; - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL|flags, - __bch2_stripe_write_key(&trans, iter, m, - giter.pos, new_key)); - + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); if (ret) break; - } - bch2_trans_iter_put(&trans, iter); - bch2_trans_exit(&trans); + s = bkey_s_c_to_stripe(k).v; - kfree(new_key); - - return ret; -} + m = genradix_ptr(&c->stripes, k.k->p.offset); + m->alive = true; + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; -static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k) -{ - int ret = 0; + for (i = 0; i < s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(s, i); - if (k.k->type == KEY_TYPE_stripe) - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: - bch2_mark_key(c, k, - BTREE_TRIGGER_INSERT| - BTREE_TRIGGER_NOATOMIC); + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_update(c, m, 
k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); + } + bch2_trans_iter_exit(&trans, &iter); - return ret; -} + bch2_trans_exit(&trans); -int bch2_stripes_read(struct bch_fs *c) -{ - int ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes, - bch2_stripes_read_fn); if (ret) bch_err(c, "error reading stripes: %i", ret); return ret; } -int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - size_t i, idx = 0; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0); - - k = bch2_btree_iter_prev(iter); - if (!IS_ERR_OR_NULL(k.k)) - idx = k.k->p.offset + 1; - - bch2_trans_iter_put(&trans, iter); - ret = bch2_trans_exit(&trans); - if (ret) - return ret; - - if (!idx) - return 0; - - if (!gc && - !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), - GFP_KERNEL)) - return -ENOMEM; -#if 0 - ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); -#else - for (i = 0; i < idx; i++) - if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) - return -ENOMEM; -#endif - return 0; -} - void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; @@ -1696,7 +1614,7 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) spin_lock(&c->ec_stripes_heap_lock); for (i = 0; i < min_t(size_t, h->used, 20); i++) { - m = genradix_ptr(&c->stripes[0], h->data[i].idx); + m = genradix_ptr(&c->stripes, h->data[i].idx); pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, h->data[i].blocks_nonempty, @@ -1754,7 +1672,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) BUG_ON(!list_empty(&c->ec_stripe_new_list)); free_heap(&c->ec_stripes_heap); - genradix_free(&c->stripes[0]); + genradix_free(&c->stripes); bioset_exit(&c->ec_bioset); } diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index e79626b..78d468c 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -108,7 +108,7 @@ static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, le16_to_cpu(s->sectors)); } -static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m, +static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, struct extent_ptr_decoded p) { unsigned nr_data = m->nr_blocks - m->nr_redundant; @@ -193,8 +193,8 @@ struct ec_stripe_head { int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, - struct bpos, unsigned); +void bch2_ob_add_backpointer(struct bch_fs *, struct open_bucket *, + struct bkey *); void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); @@ -216,9 +216,6 @@ void bch2_ec_flush_new_stripes(struct bch_fs *); void bch2_stripes_heap_start(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -int bch2_stripes_write(struct bch_fs *, unsigned); - -int bch2_ec_mem_alloc(struct bch_fs *, bool); void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index 3fc3122..edd93da 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -21,6 +21,15 @@ struct stripe { unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ unsigned on_heap:1; u8 blocks_nonempty; +}; + +struct gc_stripe { + u16 sectors; + + u8 nr_blocks; + u8 nr_redundant; + + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h new file mode 100644 index 0000000..f7d1291 --- /dev/null +++ b/libbcachefs/errcode.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H + +enum { + /* Bucket allocator: */ + OPEN_BUCKETS_EMPTY = 2048, + FREELIST_EMPTY, /* Allocator thread not keeping up */ + INSUFFICIENT_DEVICES, +}; + +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 2cea694..8279a9b 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); + bch_err(c, "inconsistency detected - emergency read only"); return true; case BCH_ON_ERROR_panic: panic(bch2_fmt(c, "panic after error")); @@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c) void bch2_fatal_error(struct bch_fs *c) { if (bch2_fs_emergency_read_only(c)) - bch_err(c, "emergency read only"); + bch_err(c, "fatal error - emergency read only"); } void bch2_io_error_work(struct work_struct *work) diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 4a8dd08..58b2c96 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -58,10 +58,10 @@ static int count_iters_for_insert(struct btree_trans *trans, u64 idx = le64_to_cpu(p.v->idx); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c r_k; - for_each_btree_key(trans, iter, + for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), BTREE_ITER_SLOTS, r_k, ret2) { if (bkey_cmp(bkey_start_pos(r_k.k), @@ -83,8 +83,8 @@ static int count_iters_for_insert(struct btree_trans *trans, break; } } + bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_put(trans, iter); break; } } @@ -94,12 +94,12 @@ static int count_iters_for_insert(struct btree_trans *trans, #define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) -int bch2_extent_atomic_end(struct btree_iter *iter, +int bch2_extent_atomic_end(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_i *insert, struct bpos *end) { - struct btree_trans *trans = iter->trans; - struct btree_iter *copy; + struct btree_iter copy; struct bkey_s_c k; unsigned nr_iters = 0; int ret; @@ -118,9 +118,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter, if (ret < 0) return ret; - copy = bch2_trans_copy_iter(trans, iter); + bch2_trans_copy_iter(©, iter); - for_each_btree_key_continue(copy, 0, k, ret) { + for_each_btree_key_continue_norestart(copy, 0, k, ret) { unsigned offset = 0; if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) @@ -149,31 +149,21 @@ int bch2_extent_atomic_end(struct btree_iter *iter, break; } - bch2_trans_iter_put(trans, copy); + bch2_trans_iter_exit(trans, ©); return ret < 0 ? 
ret : 0; } -int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) +int bch2_extent_trim_atomic(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k) { struct bpos end; int ret; - ret = bch2_extent_atomic_end(iter, k, &end); + ret = bch2_extent_atomic_end(trans, iter, k, &end); if (ret) return ret; bch2_cut_back(end, k); return 0; } - -int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -{ - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(iter, k, &end); - if (ret) - return ret; - - return !bkey_cmp(end, k->k.p); -} diff --git a/libbcachefs/extent_update.h b/libbcachefs/extent_update.h index 2fa4602..6f5cf44 100644 --- a/libbcachefs/extent_update.h +++ b/libbcachefs/extent_update.h @@ -4,9 +4,9 @@ #include "bcachefs.h" -int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, - struct bpos *); -int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); +int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bpos *); +int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, + struct bkey_i *); #endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 563e130..44c584e 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -303,7 +303,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) if (lp.crc.csum_type && lp.crc.uncompressed_size + - rp.crc.uncompressed_size > c->sb.encoded_extent_max) + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) return false; if (lp.crc.uncompressed_size + rp.crc.uncompressed_size > @@ -480,7 +480,7 @@ restart_narrow_pointers: bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); @@ -612,38 +612,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) return false; } -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, - BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - bch2_trans_iter_put(&trans, iter); - - bch2_trans_exit(&trans); - - return ret; -} - unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -817,41 +785,85 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, return i; } -union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *next = extent_entry_next(entry); + + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); + + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} + +/* + * Returns pointer to the next entry after the one 
being dropped: + */ +union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - union bch_extent_entry *dst, *src, *prev; + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; bool drop_crc = true; EBUG_ON(ptr < &ptrs.start->ptr || ptr >= &ptrs.end->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - src = extent_entry_next(to_entry(ptr)); - if (src != ptrs.end && - !extent_entry_is_crc(src)) - drop_crc = false; - - dst = to_entry(ptr); - while ((prev = extent_entry_prev(ptrs, dst))) { - if (extent_entry_is_ptr(prev)) + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { break; - - if (extent_entry_is_crc(prev)) { - if (drop_crc) - dst = prev; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; break; } + } - dst = prev; + extent_entry_drop(k, entry); + + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) + break; + + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); + } } - memmove_u64s_down(dst, src, - (u64 *) ptrs.end - (u64 *) src); - k.k->u64s -= (u64 *) src - (u64 *) dst; + return ret; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + __bch2_bkey_drop_ptr(k, ptr); + + /* + * If we deleted all the dirty pointers and there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: + */ + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } - return dst; + return ret; } void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) @@ -921,10 +933,6 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); - /* will only happen if all pointers were cached: */ - if (!bch2_bkey_nr_ptrs(k.s_c)) - k.k->type = KEY_TYPE_deleted; - return bkey_deleted(k.k); } @@ -961,12 +969,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", crc.compressed_size, crc.uncompressed_size, crc.offset, crc.nonce, - crc.csum_type, - crc.compression_type); + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); break; case BCH_EXTENT_ENTRY_stripe_ptr: ec = &entry->stripe_ptr; @@ -1030,7 +1038,7 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) if (k.k->type == KEY_TYPE_btree_ptr || k.k->type == KEY_TYPE_btree_ptr_v2) - size_ondisk = c->opts.btree_node_size; + size_ondisk = btree_sectors(c); bkey_extent_entry_for_each(ptrs, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 43cef0a..9c25672 100644 --- a/libbcachefs/extents.h 
+++ b/libbcachefs/extents.h @@ -78,12 +78,12 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { - switch (extent_entry_type(e)) { - case BCH_EXTENT_ENTRY_ptr: - return true; - default: - return false; - } + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; +} + +static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; } static inline bool extent_entry_is_crc(const union bch_extent_entry *e) @@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); @@ -579,6 +578,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); +union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h index 26d5cad..05429c9 100644 --- a/libbcachefs/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -17,10 +17,6 @@ * * With one based indexing each level of the tree starts at a power of two - * good for cacheline alignment: - * - * Size parameter is treated as if we were using 0 based indexing, however: - * valid nodes, and inorder indices, are in the range [1..size) - that is, there - * are actually size - 1 elements */ static inline unsigned eytzinger1_child(unsigned i, unsigned child) @@ -42,12 +38,12 @@ static inline unsigned eytzinger1_right_child(unsigned i) static inline unsigned eytzinger1_first(unsigned size) { - return rounddown_pow_of_two(size - 1); + return rounddown_pow_of_two(size); } static inline unsigned eytzinger1_last(unsigned size) { - return rounddown_pow_of_two(size) - 1; + return rounddown_pow_of_two(size + 1) - 1; } /* @@ -62,13 +58,13 @@ static inline unsigned eytzinger1_last(unsigned size) static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EBUG_ON(i >= size); + EBUG_ON(i > size); - if (eytzinger1_right_child(i) < size) { + if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); - i <<= __fls(size) - __fls(i); - i >>= i >= size; + i <<= __fls(size + 1) - __fls(i); + i >>= i > size; } else { i >>= ffz(i) + 1; } @@ -78,14 +74,14 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EBUG_ON(i >= size); + EBUG_ON(i > size); - if (eytzinger1_left_child(i) < size) { + if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; - i <<= __fls(size) - __fls(i); + i <<= __fls(size + 1) - __fls(i); i -= 1; - i >>= i >= size; + i >>= i > size; } else { i >>= __ffs(i) + 1; } @@ -95,17 +91,17 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) static inline unsigned eytzinger1_extra(unsigned size) { - return (size - rounddown_pow_of_two(size - 1)) << 1; + return (size + 1 - rounddown_pow_of_two(size)) << 1; } static inline unsigned 
__eytzinger1_to_inorder(unsigned i, unsigned size, unsigned extra) { unsigned b = __fls(i); - unsigned shift = __fls(size - 1) - b; + unsigned shift = __fls(size) - b; int s; - EBUG_ON(!i || i >= size); + EBUG_ON(!i || i > size); i ^= 1U << b; i <<= 1; @@ -130,7 +126,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, unsigned shift; int s; - EBUG_ON(!i || i >= size); + EBUG_ON(!i || i > size); /* * sign bit trick: @@ -144,7 +140,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, shift = __ffs(i); i >>= shift + 1; - i |= 1U << (__fls(size - 1) - shift); + i |= 1U << (__fls(size) - shift); return i; } @@ -185,39 +181,39 @@ static inline unsigned eytzinger0_right_child(unsigned i) static inline unsigned eytzinger0_first(unsigned size) { - return eytzinger1_first(size + 1) - 1; + return eytzinger1_first(size) - 1; } static inline unsigned eytzinger0_last(unsigned size) { - return eytzinger1_last(size + 1) - 1; + return eytzinger1_last(size) - 1; } static inline unsigned eytzinger0_next(unsigned i, unsigned size) { - return eytzinger1_next(i + 1, size + 1) - 1; + return eytzinger1_next(i + 1, size) - 1; } static inline unsigned eytzinger0_prev(unsigned i, unsigned size) { - return eytzinger1_prev(i + 1, size + 1) - 1; + return eytzinger1_prev(i + 1, size) - 1; } static inline unsigned eytzinger0_extra(unsigned size) { - return eytzinger1_extra(size + 1); + return eytzinger1_extra(size); } static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, unsigned extra) { - return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; + return __eytzinger1_to_inorder(i + 1, size, extra) - 1; } static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, unsigned extra) { - return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; + return __inorder_to_eytzinger1(i + 1, size, extra) - 1; } static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 2189a11..d543480 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -6,114 +6,207 @@ #include "dirent.h" #include "fs-common.h" #include "inode.h" +#include "subvolume.h" #include "xattr.h" #include -int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, +static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) +{ + return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; +} + +int bch2_create_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *new_inode, const struct qstr *name, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct posix_acl *default_acl, - struct posix_acl *acl) + struct posix_acl *acl, + subvol_inum snapshot_src, + unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL; - struct btree_iter *inode_iter = NULL; - struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); - u64 dir_offset = 0; + u64 dir_target; + u32 snapshot; + unsigned dir_type = mode_to_type(mode); int ret; - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); if (ret) goto err; - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); - - if (!name) - new_inode->bi_flags |= 
BCH_INODE_UNLINKED; - - inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; - if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, - default_acl, ACL_TYPE_DEFAULT); + if (!(flags & BCH_CREATE_SNAPSHOT)) { + /* Normal create path - allocate a new inode: */ + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (flags & BCH_CREATE_TMPFILE) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); if (ret) goto err; + + snapshot_src = (subvol_inum) { 0 }; + } else { + /* + * Creating a snapshot - we're not allocating a new inode, but + * we do have to lookup the root inode of the subvolume we're + * snapshotting and update it (in the new snapshot): + */ + + if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, + BTREE_ITER_CACHED, &s); + if (ret) + goto err; + + snapshot_src.inum = le64_to_cpu(s.inode); + } + + ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (new_inode->bi_subvol != snapshot_src.subvol) { + /* Not a subvolume root: */ + ret = -EINVAL; + goto err; + } + + /* + * If we're not root, we have to own the subvolume being + * snapshotted: + */ + if (uid && new_inode->bi_uid != uid) { + ret = -EPERM; + goto err; + } + + flags |= BCH_CREATE_SUBVOL; } - if (acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, - acl, ACL_TYPE_ACCESS); + new_inum.inum = new_inode->bi_inum; + dir_target = new_inode->bi_inum; + + if (flags & BCH_CREATE_SUBVOL) { + u32 new_subvol, dir_snapshot; + + ret = bch2_subvolume_create(trans, new_inode->bi_inum, + snapshot_src.subvol, + &new_subvol, &snapshot, + (flags & BCH_CREATE_SNAPSHOT_RO) != 0); + if (ret) + goto err; + + new_inode->bi_parent_subvol = dir.subvol; + new_inode->bi_subvol = new_subvol; + new_inum.subvol = new_subvol; + dir_target = new_subvol; + dir_type = DT_SUBVOL; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(&dir_iter); if (ret) goto err; } - if (name) { + if (!(flags & BCH_CREATE_SNAPSHOT)) { + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + goto err; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + acl, ACL_TYPE_ACCESS); + if (ret) + goto err; + } + } + + if (!(flags & BCH_CREATE_TMPFILE)) { struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); - dir_u->bi_mtime = dir_u->bi_ctime = now; + u64 dir_offset; - if (S_ISDIR(new_inode->bi_mode)) + if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; - ret = bch2_inode_write(trans, dir_iter, dir_u); + ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, - mode_to_type(new_inode->bi_mode), - name, new_inode->bi_inum, + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; - } - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; + if 
(c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } } - /* XXX use bch2_btree_iter_set_snapshot() */ - inode_iter->snapshot = U32_MAX; - bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); - ret = bch2_btree_iter_traverse(inode_iter) ?: - bch2_inode_write(trans, inode_iter, new_inode); + ret = bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, new_inode); err: - bch2_trans_iter_put(trans, inode_iter); - bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dir_iter); return ret; } -int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, - u64 inum, struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *inode_u, const struct qstr *name) +int bch2_link_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, + subvol_inum inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL, *inode_iter = NULL; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; u64 now = bch2_current_time(c); u64 dir_offset = 0; int ret; - inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + if (dir.subvol != inum.subvol) + return -EXDEV; + + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); - ret = PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; @@ -121,84 +214,110 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, dir_hash = bch2_hash_info_init(c, dir_u); - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum, &dir_offset, + name, inum.inum, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - inode_u->bi_dir = dir_inum; + inode_u->bi_dir = dir.inum; inode_u->bi_dir_offset = dir_offset; } - ret = bch2_inode_write(trans, dir_iter, dir_u) ?: - bch2_inode_write(trans, inode_iter, inode_u); + ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); err: - bch2_trans_iter_put(trans, dir_iter); - bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_exit(trans, &dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); return ret; } int bch2_unlink_trans(struct btree_trans *trans, - u64 dir_inum, struct bch_inode_unpacked *dir_u, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, - const struct qstr *name) + const struct qstr *name, + bool deleting_snapshot) { struct bch_fs *c = trans->c; - struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, - *inode_iter = NULL; + struct btree_iter dir_iter = { NULL }; + struct btree_iter dirent_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; - u64 inum, now = bch2_current_time(c); + subvol_inum inum; + u64 now = bch2_current_time(c); struct bkey_s_c k; int ret; - dir_iter = bch2_inode_peek(trans, dir_u, 
dir_inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dir_iter); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); - dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, - name, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dirent_iter); + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; - k = bch2_btree_iter_peek_slot(dirent_iter); - ret = bkey_err(k); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, + BTREE_ITER_INTENT); if (ret) goto err; - inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } - inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); - if (ret) + if (deleting_snapshot && !inode_u->bi_subvol) { + ret = -ENOENT; goto err; + } + + if (deleting_snapshot || inode_u->bi_subvol) { + ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); + if (ret) + goto err; - if (inode_u->bi_dir == k.k->p.inode && - inode_u->bi_dir_offset == k.k->p.offset) { + k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + /* + * If we're deleting a subvolume, we need to really delete the + * dirent, not just emit a whiteout in the current snapshot: + */ + bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dirent_iter); + if (ret) + goto err; + } else { + bch2_inode_nlink_dec(inode_u); + } + + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { inode_u->bi_dir = 0; inode_u->bi_dir_offset = 0; } dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; - dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); - bch2_inode_nlink_dec(inode_u); - - ret = (S_ISDIR(inode_u->bi_mode) - ? 
bch2_empty_dir_trans(trans, inum) - : 0) ?: - bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: - bch2_inode_write(trans, dir_iter, dir_u) ?: - bch2_inode_write(trans, inode_iter, inode_u); + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); err: - bch2_trans_iter_put(trans, inode_iter); - bch2_trans_iter_put(trans, dirent_iter); - bch2_trans_iter_put(trans, dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dirent_iter); + bch2_trans_iter_exit(trans, &dir_iter); return ret; } @@ -210,6 +329,7 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, bool ret = false; for (id = 0; id < Inode_opt_nr; id++) { + /* Skip attributes that were explicitly set on this inode */ if (dst_u->bi_fields_set & (1 << id)) continue; @@ -227,8 +347,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, } int bch2_rename_trans(struct btree_trans *trans, - u64 src_dir, struct bch_inode_unpacked *src_dir_u, - u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, + subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, + subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, struct bch_inode_unpacked *src_inode_u, struct bch_inode_unpacked *dst_inode_u, const struct qstr *src_name, @@ -236,25 +356,27 @@ int bch2_rename_trans(struct btree_trans *trans, enum bch_rename_mode mode) { struct bch_fs *c = trans->c; - struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; - struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; + struct btree_iter src_dir_iter = { NULL }; + struct btree_iter dst_dir_iter = { NULL }; + struct btree_iter src_inode_iter = { NULL }; + struct btree_iter dst_inode_iter = { NULL }; struct bch_hash_info src_hash, dst_hash; - u64 src_inode, src_offset, dst_inode, dst_offset; + subvol_inum src_inum, dst_inum; + u64 src_offset, dst_offset; u64 now = bch2_current_time(c); int ret; - src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_dir_iter); + ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, + BTREE_ITER_INTENT); if (ret) goto err; src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir != src_dir) { - dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_dir_iter); + if (dst_dir.inum != src_dir.inum || + dst_dir.subvol != src_dir.subvol) { + ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); if (ret) goto err; @@ -267,22 +389,20 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_dirent_rename(trans, src_dir, &src_hash, dst_dir, &dst_hash, - src_name, &src_inode, &src_offset, - dst_name, &dst_inode, &dst_offset, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, mode); if (ret) goto err; - src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(src_inode_iter); + ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, + BTREE_ITER_INTENT); if (ret) goto err; - if (dst_inode) { - dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dst_inode_iter); + if (dst_inum.inum) { + ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, + BTREE_ITER_INTENT); if (ret) goto err; } 
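For context (an illustrative sketch, not part of the patch): the fs-common.c hunks above replace bare u64 inode numbers with subvol_inum, a (subvolume ID, inode number) pair, so that bch2_create_trans(), bch2_link_trans(), bch2_unlink_trans() and bch2_rename_trans() can resolve the correct snapshot via bch2_subvolume_get_snapshot() before touching the btrees. A minimal standalone C sketch of that convention follows; the struct layout simply mirrors how the pair is used in the diff (the real typedef added elsewhere in this series uses kernel integer types), and check_link_allowed() is a hypothetical helper illustrating the -EXDEV rule that the patched bch2_link_trans() enforces for cross-subvolume hard links:

#include <errno.h>
#include <stdint.h>

/* A (subvolume, inode number) pair, as threaded through fs-common.c. */
typedef struct {
	uint32_t subvol;	/* subvolume ID */
	uint64_t inum;		/* inode number within that subvolume */
} subvol_inum;

/*
 * Hypothetical helper: hard links may not cross subvolumes, so a link
 * request is rejected with -EXDEV unless the directory and the target
 * share a subvolume -- the same early check added to bch2_link_trans().
 */
static int check_link_allowed(subvol_inum dir, subvol_inum target)
{
	if (dir.subvol != target.subvol)
		return -EXDEV;
	return 0;
}

Carrying the subvolume alongside the inode number is what lets the create path snapshot an existing subvolume (BCH_CREATE_SNAPSHOT) and lets unlink distinguish removing a dirent from deleting a whole subvolume without extra lookups.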
@@ -312,7 +432,7 @@ int bch2_rename_trans(struct btree_trans *trans, } if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inode)) { + bch2_empty_dir_trans(trans, dst_inum)) { ret = -ENOTEMPTY; goto err; } @@ -331,12 +451,12 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(src_inode_u->bi_mode)) { + if (is_subdir_for_nlink(src_inode_u)) { src_dir_u->bi_nlink--; dst_dir_u->bi_nlink++; } - if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; } @@ -347,28 +467,28 @@ int bch2_rename_trans(struct btree_trans *trans, src_dir_u->bi_mtime = now; src_dir_u->bi_ctime = now; - if (src_dir != dst_dir) { + if (src_dir.inum != dst_dir.inum) { dst_dir_u->bi_mtime = now; dst_dir_u->bi_ctime = now; } src_inode_u->bi_ctime = now; - if (dst_inode) + if (dst_inum.inum) dst_inode_u->bi_ctime = now; - ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: - (src_dir != dst_dir - ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) + ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: + (src_dir.inum != dst_dir.inum + ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) : 0 ) ?: - bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: - (dst_inode - ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) + bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: + (dst_inum.inum + ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) : 0 ); err: - bch2_trans_iter_put(trans, dst_inode_iter); - bch2_trans_iter_put(trans, src_inode_iter); - bch2_trans_iter_put(trans, dst_dir_iter); - bch2_trans_iter_put(trans, src_dir_iter); + bch2_trans_iter_exit(trans, &dst_inode_iter); + bch2_trans_iter_exit(trans, &src_inode_iter); + bch2_trans_iter_exit(trans, &dst_dir_iter); + bch2_trans_iter_exit(trans, &src_dir_iter); return ret; } diff --git a/libbcachefs/fs-common.h b/libbcachefs/fs-common.h index 2273b79..dde2378 100644 --- a/libbcachefs/fs-common.h +++ b/libbcachefs/fs-common.h @@ -4,27 +4,33 @@ struct posix_acl; -int bch2_create_trans(struct btree_trans *, u64, +#define BCH_CREATE_TMPFILE (1U << 0) +#define BCH_CREATE_SUBVOL (1U << 1) +#define BCH_CREATE_SNAPSHOT (1U << 2) +#define BCH_CREATE_SNAPSHOT_RO (1U << 3) + +int bch2_create_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, uid_t, gid_t, umode_t, dev_t, struct posix_acl *, - struct posix_acl *); + struct posix_acl *, + subvol_inum, unsigned); -int bch2_link_trans(struct btree_trans *, u64, - u64, struct bch_inode_unpacked *, - struct bch_inode_unpacked *, +int bch2_link_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, const struct qstr *); -int bch2_unlink_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, +int bch2_unlink_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, struct bch_inode_unpacked *, - const struct qstr *); + const struct qstr *, bool); int bch2_rename_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, - u64, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 3333f61..1d0871f 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -223,6 +223,9 @@ static void 
i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, return; mutex_lock(&inode->ei_quota_lock); + BUG_ON((s64) inode->v.i_blocks + sectors < 0); + inode->v.i_blocks += sectors; + #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { BUG_ON(sectors > quota_res->sectors); @@ -234,7 +237,6 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); } #endif - inode->v.i_blocks += sectors; mutex_unlock(&inode->ei_quota_lock); } @@ -243,24 +245,26 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* stored in page->private: */ struct bch_page_sector { - /* Uncompressed, fully allocated replicas: */ - unsigned nr_replicas:3; + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; - /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ - unsigned replicas_reserved:3; + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; /* i_sectors: */ enum { SECTOR_UNALLOCATED, SECTOR_RESERVED, SECTOR_DIRTY, + SECTOR_DIRTY_RESERVED, SECTOR_ALLOCATED, - } state:2; + } state:8; }; struct bch_page_state { spinlock_t lock; atomic_t write_count; + bool uptodate; struct bch_page_sector s[PAGE_SECTORS]; }; @@ -311,6 +315,212 @@ static struct bch_page_state *bch2_page_state_create(struct page *page, return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); } +static unsigned bkey_to_sector_state(const struct bkey *k) +{ + if (k->type == KEY_TYPE_reservation) + return SECTOR_RESERVED; + if (bkey_extent_is_allocation(k)) + return SECTOR_ALLOCATED; + return SECTOR_UNALLOCATED; +} + +static void __bch2_page_state_set(struct page *page, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); + unsigned i; + + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + s->s[i].state = state; + } + + if (i == PAGE_SECTORS) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, + struct page **pages, unsigned nr_pages) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; + unsigned pg_idx = 0; + u32 snapshot; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + + while (pg_idx < nr_pages) { + struct page *page = pages[pg_idx]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; + unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; + + BUG_ON(k.k->p.offset < pg_start); + BUG_ON(bkey_start_offset(k.k) > pg_end); + + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) + __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); + + if (k.k->p.offset < pg_end) + break; + pg_idx++; + } + + if (pg_idx == 
nr_pages) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return ret; +} + +static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k.k); + + bio_for_each_segment(bv, bio, iter) + __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, + bv.bv_len >> 9, nr_ptrs, state); +} + +static void mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct pagevec pvec; + + if (end <= start) + return; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i, j; + + nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, + &index, end_index); + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(start, pg_start) - pg_start; + unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + struct bch_page_state *s; + + BUG_ON(end <= pg_start); + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + lock_page(page); + s = bch2_page_state(page); + + if (s) { + spin_lock(&s->lock); + for (j = pg_offset; j < pg_offset + pg_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + unlock_page(page); + } + pagevec_release(&pvec); + } while (index <= end_index); +} + +static void mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct pagevec pvec; + s64 i_sectors_delta = 0; + + if (end <= start) + return; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i, j; + + nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, + &index, end_index); + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + u64 pg_start = page->index << PAGE_SECTORS_SHIFT; + u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; + unsigned pg_offset = max(start, pg_start) - pg_start; + unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; + struct bch_page_state *s; + + BUG_ON(end <= pg_start); + BUG_ON(pg_offset >= PAGE_SECTORS); + BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + + lock_page(page); + s = bch2_page_state(page); + + if (s) { + spin_lock(&s->lock); + for (j = pg_offset; j < pg_offset + pg_len; j++) + switch (s->s[j].state) { + case SECTOR_UNALLOCATED: + s->s[j].state = SECTOR_RESERVED; + break; + case SECTOR_DIRTY: + s->s[j].state = SECTOR_DIRTY_RESERVED; + i_sectors_delta--; + break; + default: + break; + } + spin_unlock(&s->lock); + } + + unlock_page(page); + } + pagevec_release(&pvec); + } while (index <= end_index); + + i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) { /* XXX: this should not be open coded */ @@ -395,6 +605,8 @@ static int bch2_page_reservation_get(struct bch_fs *c, if (!s) return -ENOMEM; + BUG_ON(!s->uptodate); + for (i = round_down(offset, block_bytes(c)) >> 9; i < round_up(offset + len, block_bytes(c)) >> 9; i++) { @@ -449,16 +661,22 @@ static void bch2_clear_page_bits(struct 
page *page) disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - if (s->s[i].state == SECTOR_DIRTY) { - dirty_sectors++; + switch (s->s[i].state) { + case SECTOR_DIRTY: s->s[i].state = SECTOR_UNALLOCATED; + --dirty_sectors; + break; + case SECTOR_DIRTY_RESERVED: + s->s[i].state = SECTOR_RESERVED; + break; + default: + break; } } bch2_disk_reservation_put(c, &disk_res); - if (dirty_sectors) - i_sectors_acct(c, inode, NULL, -dirty_sectors); + i_sectors_acct(c, inode, NULL, dirty_sectors); bch2_page_state_release(page); } @@ -491,16 +709,22 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; - if (s->s[i].state == SECTOR_UNALLOCATED) + switch (s->s[i].state) { + case SECTOR_UNALLOCATED: + s->s[i].state = SECTOR_DIRTY; dirty_sectors++; - - s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); + break; + case SECTOR_RESERVED: + s->s[i].state = SECTOR_DIRTY_RESERVED; + break; + default: + break; + } } spin_unlock(&s->lock); - if (dirty_sectors) - i_sectors_acct(c, inode, &res->quota, dirty_sectors); + i_sectors_acct(c, inode, &res->quota, dirty_sectors); if (!PageDirty(page)) __set_page_dirty_nobuffers(page); @@ -554,7 +778,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct bch2_page_reservation res; unsigned len; loff_t isize; - int ret = VM_FAULT_LOCKED; + int ret; bch2_page_reservation_init(c, inode, &res); @@ -580,6 +804,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { + unlock_page(page); + ret = VM_FAULT_SIGBUS; + goto out; + } + } + if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { unlock_page(page); ret = VM_FAULT_SIGBUS; @@ -590,6 +822,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) bch2_page_reservation_put(c, inode, &res); wait_for_stable_page(page); + ret = VM_FAULT_LOCKED; out: bch2_pagecache_add_put(&inode->ei_pagecache_lock); sb_end_pagefault(inode->v.i_sb); @@ -703,29 +936,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter) return iter->pages[iter->idx]; } -static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) -{ - struct bvec_iter iter; - struct bio_vec bv; - unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v - ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); - unsigned state = k.k->type == KEY_TYPE_reservation - ? 
SECTOR_RESERVED - : SECTOR_ALLOCATED; - - bio_for_each_segment(bv, bio, iter) { - struct bch_page_state *s = bch2_page_state(bv.bv_page); - unsigned i; - - for (i = bv.bv_offset >> 9; - i < (bv.bv_offset + bv.bv_len) >> 9; - i++) { - s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; - } - } -} - static bool extent_partial_reads_expensive(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -745,7 +955,7 @@ static void readpage_bio_extend(struct readpages_iter *iter, { while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { - pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; + pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; struct page *page = readpage_iter_next(iter); int ret; @@ -786,23 +996,35 @@ static void readpage_bio_extend(struct readpages_iter *iter, } } -static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, - struct bch_read_bio *rbio, u64 inum, +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, struct readpages_iter *readpages_iter) { struct bch_fs *c = trans->c; + struct btree_iter iter; struct bkey_buf sk; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + u32 snapshot; int ret = 0; rbio->c = c; rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); retry: bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); while (1) { struct bkey_s_c k; unsigned bytes, sectors, offset_into_extent; @@ -817,15 +1039,15 @@ retry: break; } - bch2_btree_iter_set_pos(iter, - POS(inum, rbio->bio.bi_iter.bi_sector)); + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) break; - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -840,8 +1062,6 @@ retry: sectors = min(sectors, k.k->size - offset_into_extent); - bch2_trans_unlock(trans); - if (readpages_iter) readpage_bio_extend(readpages_iter, &rbio->bio, sectors, extent_partial_reads_expensive(k)); @@ -852,10 +1072,9 @@ retry: if (rbio->bio.bi_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - if (bkey_extent_is_allocation(k.k)) - bch2_add_page_sectors(&rbio->bio, k); + bch2_bio_page_state_set(&rbio->bio, k); - bch2_read_extent(trans, rbio, iter->pos, + bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) @@ -863,13 +1082,19 @@ retry: swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; } +err: + bch2_trans_iter_exit(trans, &iter); if (ret == -EINTR) goto retry; if (ret) { - bch_err_inum_ratelimited(c, inum, + bch_err_inum_ratelimited(c, inum.inum, "read error %i from btree lookup", ret); rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); @@ -884,7 +1109,6 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_io_opts opts = io_opts(c, &inode->ei_inode); struct btree_trans trans; - struct btree_iter *iter; struct page *page; struct readpages_iter readpages_iter; int ret; @@ 
-893,8 +1117,6 @@ void bch2_readahead(struct readahead_control *ractl) BUG_ON(ret); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); bch2_pagecache_add_get(&inode->ei_pagecache_lock); @@ -911,41 +1133,34 @@ void bch2_readahead(struct readahead_control *ractl) readpages_iter.idx++; bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); - rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; + rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); - bchfs_read(&trans, iter, rbio, inode->v.i_ino, + bchfs_read(&trans, rbio, inode_inum(inode), &readpages_iter); } bch2_pagecache_add_put(&inode->ei_pagecache_lock); - bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); kfree(readpages_iter.pages); } static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inum, struct page *page) + subvol_inum inum, struct page *page) { struct btree_trans trans; - struct btree_iter *iter; bch2_page_state_create(page, __GFP_NOFAIL); bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); rbio->bio.bi_iter.bi_sector = - (sector_t) page->index << PAGE_SECTOR_SHIFT; + (sector_t) page->index << PAGE_SECTORS_SHIFT; BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, - BTREE_ITER_SLOTS); - - bchfs_read(&trans, iter, rbio, inum, NULL); - - bch2_trans_iter_put(&trans, iter); + bchfs_read(&trans, rbio, inum, NULL); bch2_trans_exit(&trans); } @@ -959,7 +1174,7 @@ int bch2_readpage(struct file *file, struct page *page) rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); rbio->bio.bi_end_io = bch2_readpages_end_io; - __bchfs_readpage(c, rbio, inode->v.i_ino, page); + __bchfs_readpage(c, rbio, inode_inum(inode), page); return 0; } @@ -982,7 +1197,7 @@ static int bch2_read_single_page(struct page *page, rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; - __bchfs_readpage(c, rbio, inode->v.i_ino, page); + __bchfs_readpage(c, rbio, inode_inum(inode), page); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -1063,7 +1278,7 @@ static void bch2_writepage_io_done(struct closure *cl) * racing with fallocate can cause us to add fewer sectors than * expected - but we shouldn't add more sectors than expected: */ - BUG_ON(io->op.i_sectors_delta > 0); + WARN_ON(io->op.i_sectors_delta > 0); /* * (error (due to going RO) halfway through a page can screw that up @@ -1122,10 +1337,10 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op = &w->io->op; bch2_write_op_init(op, c, w->opts); op->target = w->opts.foreground_target; - op_journal_seq_set(op, &inode->ei_journal_seq); op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; op->pos = POS(inode->v.i_ino, sector); op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); @@ -1168,16 +1383,16 @@ static int __bch2_writepage(struct page *page, do_io: s = bch2_page_state_create(page, __GFP_NOFAIL); - ret = bch2_get_page_disk_reservation(c, inode, page, true); - if (ret) { - SetPageError(page); - mapping_set_error(page->mapping, ret); - unlock_page(page); - return 0; - } + /* + * Things get really hairy with errors during writeback: + */ + ret = 
bch2_get_page_disk_reservation(c, inode, page, false); + BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ + spin_lock(&s->lock); orig = *s; + spin_unlock(&s->lock); for (i = 0; i < PAGE_SECTORS; i++) { if (s->s[i].state < SECTOR_DIRTY) @@ -1210,7 +1425,7 @@ do_io: offset = 0; while (1) { - unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; u64 sector; while (offset < PAGE_SECTORS && @@ -1220,16 +1435,15 @@ do_io: if (offset == PAGE_SECTORS) break; - sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; - while (offset + sectors < PAGE_SECTORS && - orig.s[offset + sectors].state >= SECTOR_DIRTY) + orig.s[offset + sectors].state >= SECTOR_DIRTY) { + reserved_sectors += orig.s[offset + sectors].replicas_reserved; + dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; sectors++; - - for (i = offset; i < offset + sectors; i++) { - reserved_sectors += orig.s[i].replicas_reserved; - dirty_sectors += orig.s[i].state == SECTOR_DIRTY; } + BUG_ON(!sectors); + + sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || @@ -1346,6 +1560,12 @@ readpage: if (ret) goto err; out: + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); + if (ret) + goto out; + } + ret = bch2_page_reservation_get(c, inode, page, res, offset, len, true); if (ret) { @@ -1475,20 +1695,21 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, } while (reserved < len) { - struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; + unsigned i = (offset + reserved) >> PAGE_SHIFT; + struct page *page = pages[i]; unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); unsigned pg_len = min_t(unsigned, len - reserved, PAGE_SIZE - pg_offset); -retry_reservation: - ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len, true); - if (ret && !PageUptodate(page)) { - ret = bch2_read_single_page(page, mapping); - if (!ret) - goto retry_reservation; + if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { + ret = bch2_page_state_set(c, inode_inum(inode), + pages + i, nr_pages - i); + if (ret) + goto out; } + ret = bch2_page_reservation_get(c, inode, page, &res, + pg_offset, pg_len, true); if (ret) goto out; @@ -1504,8 +1725,8 @@ retry_reservation: unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); unsigned pg_len = min_t(unsigned, len - copied, PAGE_SIZE - pg_offset); - unsigned pg_copied = iov_iter_copy_from_user_atomic(page, - iter, pg_offset, pg_len); + unsigned pg_copied = copy_page_from_iter_atomic(page, + pg_offset, pg_len,iter); if (!pg_copied) break; @@ -1518,7 +1739,6 @@ retry_reservation: } flush_dcache_page(page); - iov_iter_advance(iter, pg_copied); copied += pg_copied; if (pg_copied != pg_len) @@ -1758,7 +1978,7 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); } iter->count += shorten; @@ -1813,6 +2033,50 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) /* O_DIRECT writes */ +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; + + bch2_trans_init(&trans, c, 0, 
0); +retry: + bch2_trans_begin(&trans); + + err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (err == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return err ? false : ret; +} + static void bch2_dio_write_loop_async(struct bch_write_op *); static long bch2_dio_write_loop(struct dio_write *dio) @@ -1888,9 +2152,9 @@ static long bch2_dio_write_loop(struct dio_write *dio) bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); dio->op.end_io = bch2_dio_write_loop_async; dio->op.target = dio->op.opts.foreground_target; - op_journal_seq_set(&dio->op, &inode->ei_journal_seq); dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); if ((req->ki_flags & IOCB_DSYNC) && @@ -1901,8 +2165,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && - !bch2_check_range_allocated(c, dio->op.pos, - bio_sectors(bio), + !bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), dio->op.opts.data_replicas, dio->op.opts.compression != 0)) goto err; @@ -2119,45 +2383,58 @@ unlock: /* fsync: */ -int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +/* + * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an + * insert trigger: look up the btree inode instead + */ +static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2; + struct bch_inode_unpacked inode; + int ret; - ret = file_write_and_wait_range(file, start, end); + if (c->opts.journal_flush_disabled) + return 0; + + ret = bch2_inode_find_by_inum(c, inum, &inode); if (ret) return ret; - if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) - goto out; + return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); +} - ret = sync_inode_metadata(&inode->v, 1); - if (ret) - return ret; -out: - if (!c->opts.journal_flush_disabled) - ret = bch2_journal_flush_seq(&c->journal, - inode->ei_journal_seq); - ret2 = file_check_and_advance_wb_err(file); +int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret, ret2, ret3; - return ret ?: ret2; + ret = file_write_and_wait_range(file, start, end); + ret2 = sync_inode_metadata(&inode->v, 1); + ret3 = bch2_flush_inode(c, inode_inum(inode)); + + return ret ?: ret2 ?: ret3; } /* truncate: */ -static inline int range_has_data(struct bch_fs *c, - struct bpos start, - struct bpos end) +static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos start, + struct bpos end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; bch2_trans_init(&trans, c, 0, 
0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + if (ret) + goto err; - for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; @@ -2166,9 +2443,14 @@ static inline int range_has_data(struct bch_fs *c, break; } } - bch2_trans_iter_put(&trans, iter); + start = iter.pos; + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } static int __bch2_truncate_page(struct bch_inode_info *inode, @@ -2181,6 +2463,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; unsigned i; struct page *page; + s64 i_sectors_delta = 0; int ret = 0; /* Page boundary? Nothing to do */ @@ -2198,9 +2481,9 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: we're doing two index lookups when we end up reading the * page */ - ret = range_has_data(c, - POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), - POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); + ret = range_has_data(c, inode->ei_subvol, + POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), + POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); if (ret <= 0) return ret; @@ -2232,9 +2515,21 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, i < round_down(end_offset, block_bytes(c)) >> 9; i++) { s->s[i].nr_replicas = 0; + if (s->s[i].state == SECTOR_DIRTY) + i_sectors_delta--; s->s[i].state = SECTOR_UNALLOCATED; } + i_sectors_acct(c, inode, NULL, i_sectors_delta); + + /* + * Caller needs to know whether this page will be written out by + * writeback - doing an i_size update if necessary - or whether it will + * be responsible for the i_size update: + */ + ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), + PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; + zero_user_segment(page, start_offset, end_offset); /* @@ -2243,8 +2538,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: because we aren't currently tracking whether the page has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. 
*/ - ret = bch2_get_page_disk_reservation(c, inode, page, false); - BUG_ON(ret); + BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); /* * This removes any writeable userspace mappings; we need to force @@ -2266,6 +2560,20 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, round_up(from, PAGE_SIZE)); } +static int bch2_truncate_pages(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, + start, end); + + if (ret >= 0 && + start >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_page(inode, + end >> PAGE_SHIFT, + start, end); + return ret; +} + static int bch2_extend(struct user_namespace *mnt_userns, struct bch_inode_info *inode, struct bch_inode_unpacked *inode_u, @@ -2332,7 +2640,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); - ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u); + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) goto err; @@ -2356,7 +2664,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, iattr->ia_valid &= ~ATTR_SIZE; ret = bch2_truncate_page(inode, iattr->ia_size); - if (unlikely(ret)) + if (unlikely(ret < 0)) goto err; /* @@ -2390,11 +2698,14 @@ int bch2_truncate(struct user_namespace *mnt_userns, truncate_setsize(&inode->v, iattr->ia_size); - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); + U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); + WARN_ON(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal)); + if (unlikely(ret)) goto err; @@ -2422,49 +2733,39 @@ static int inode_update_times_fn(struct bch_inode_info *inode, static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - u64 discard_start = round_up(offset, block_bytes(c)) >> 9; - u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; + u64 end = offset + len; + u64 block_start = round_up(offset, block_bytes(c)); + u64 block_end = round_down(end, block_bytes(c)); + bool truncated_last_page; int ret = 0; - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) + ret = bch2_truncate_pages(inode, offset, end); + if (unlikely(ret < 0)) goto err; - if (offset >> PAGE_SHIFT != - (offset + len) >> PAGE_SHIFT) { - ret = __bch2_truncate_page(inode, - (offset + len) >> PAGE_SHIFT, - offset, offset + len); - if (unlikely(ret)) - goto err; - } + truncated_last_page = ret; - truncate_pagecache_range(&inode->v, offset, offset + len - 1); + truncate_pagecache_range(&inode->v, offset, end - 1); - if (discard_start < discard_end) { + if (block_start < block_end ) { s64 i_sectors_delta = 0; - ret = bch2_fpunch(c, inode->v.i_ino, - discard_start, discard_end, - &inode->ei_journal_seq, + ret = bch2_fpunch(c, inode_inum(inode), + block_start >> 9, block_end >> 9, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); } mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_MTIME|ATTR_CTIME) ?: ret; + if (end >= inode->v.i_size && !truncated_last_page) { + ret = bch2_write_inode_size(c, inode, 
inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); + } else { + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + } mutex_unlock(&inode->ei_update_lock); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - return ret; } @@ -2476,7 +2777,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct address_space *mapping = inode->v.i_mapping; struct bkey_buf copy; struct btree_trans trans; - struct btree_iter *src, *dst, *del; + struct btree_iter src, dst, del; loff_t shift, new_size; u64 src_start; int ret = 0; @@ -2484,31 +2785,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, if ((offset | len) & (block_bytes(c) - 1)) return -EINVAL; - /* - * We need i_mutex to keep the page cache consistent with the extents - * btree, and the btree consistent with i_size - we don't need outside - * locking for the extents btree itself, because we're using linked - * iterators - */ - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); - if (insert) { - ret = -EFBIG; if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) - goto err; + return -EFBIG; - ret = -EINVAL; if (offset >= inode->v.i_size) - goto err; + return -EINVAL; src_start = U64_MAX; shift = len; } else { - ret = -EINVAL; if (offset + len >= inode->v.i_size) - goto err; + return -EINVAL; src_start = offset + len; shift = -len; @@ -2518,7 +2806,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); if (ret) - goto err; + return ret; if (insert) { i_size_write(&inode->v, new_size); @@ -2529,23 +2817,22 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, } else { s64 i_sectors_delta = 0; - ret = bch2_fpunch(c, inode->v.i_ino, + ret = bch2_fpunch(c, inode_inum(inode), offset >> 9, (offset + len) >> 9, - &inode->ei_journal_seq, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); if (ret) - goto err; + return ret; } bch2_bkey_buf_init(©); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - src = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, POS(inode->v.i_ino, src_start >> 9), BTREE_ITER_INTENT); - dst = bch2_trans_copy_iter(&trans, src); - del = bch2_trans_copy_iter(&trans, src); + bch2_trans_copy_iter(&dst, &src); + bch2_trans_copy_iter(&del, &src); while (ret == 0 || ret == -EINTR) { struct disk_reservation disk_res = @@ -2556,12 +2843,24 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); struct bpos atomic_end; unsigned trigger_flags = 0; + u32 snapshot; + + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src, snapshot); + bch2_btree_iter_set_snapshot(&dst, snapshot); + bch2_btree_iter_set_snapshot(&del, snapshot); bch2_trans_begin(&trans); k = insert - ? bch2_btree_iter_peek_prev(src) - : bch2_btree_iter_peek(src); + ? 
bch2_btree_iter_peek_prev(&src) + : bch2_btree_iter_peek(&src); if ((ret = bkey_err(k))) continue; @@ -2579,9 +2878,9 @@ reassemble: bch2_cut_front(move_pos, copy.k); copy.k->k.p.offset += shift >> 9; - bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); + bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); - ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); + ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); if (ret) continue; @@ -2599,7 +2898,7 @@ reassemble: delete.k.p = copy.k->k.p; delete.k.size = copy.k->k.size; delete.k.p.offset -= shift >> 9; - bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k)); + bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p; @@ -2620,36 +2919,36 @@ reassemble: BUG_ON(ret); } - ret = bch2_btree_iter_traverse(del) ?: - bch2_trans_update(&trans, del, &delete, trigger_flags) ?: - bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: - bch2_trans_commit(&trans, &disk_res, - &inode->ei_journal_seq, + ret = bch2_btree_iter_traverse(&del) ?: + bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: + bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); if (!ret) - bch2_btree_iter_set_pos(src, next_pos); + bch2_btree_iter_set_pos(&src, next_pos); } - bch2_trans_iter_put(&trans, del); - bch2_trans_iter_put(&trans, dst); - bch2_trans_iter_put(&trans, src); + bch2_trans_iter_exit(&trans, &del); + bch2_trans_iter_exit(&trans, &dst); + bch2_trans_iter_exit(&trans, &src); bch2_trans_exit(&trans); bch2_bkey_buf_exit(©, c); if (ret) - goto err; + return ret; + mutex_lock(&inode->ei_update_lock); if (!insert) { i_size_write(&inode->v, new_size); - mutex_lock(&inode->ei_update_lock); ret = bch2_write_inode_size(c, inode, new_size, ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); } -err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); + mutex_unlock(&inode->ei_update_lock); return ret; } @@ -2658,41 +2957,49 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; int ret = 0; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (!ret && bkey_cmp(iter->pos, end_pos) < 0) { + while (!ret && bkey_cmp(iter.pos, end_pos) < 0) { s64 i_sectors_delta = 0; struct disk_reservation disk_res = { 0 }; struct quota_res quota_res = { 0 }; struct bkey_i_reservation reservation; struct bkey_s_c k; unsigned sectors; + u32 snapshot; bch2_trans_begin(&trans); - k = bch2_btree_iter_peek_slot(iter); + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto bkey_err; /* already reserved */ if (k.k->type == KEY_TYPE_reservation && 
bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } @@ -2701,7 +3008,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.k.p = k.k->p; reservation.k.size = k.k->size; - bch2_cut_front(iter->pos, &reservation.k_i); + bch2_cut_front(iter.pos, &reservation.k_i); bch2_cut_back(end_pos, &reservation.k_i); sectors = reservation.k.size; @@ -2725,9 +3032,12 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = disk_res.nr_replicas; } - ret = bch2_extent_update(&trans, iter, &reservation.k_i, - &disk_res, &inode->ei_journal_seq, + ret = bch2_extent_update(&trans, inode_inum(inode), &iter, + &reservation.k_i, + &disk_res, NULL, 0, &i_sectors_delta, true); + if (ret) + goto bkey_err; i_sectors_acct(c, inode, "a_res, i_sectors_delta); bkey_err: bch2_quota_reservation_put(c, inode, "a_res); @@ -2735,7 +3045,21 @@ bkey_err: if (ret == -EINTR) ret = 0; } - bch2_trans_iter_put(&trans, iter); + + bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ + mark_pagecache_reserved(inode, start_sector, iter.pos.offset); + + if (ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + + bch2_fpunch_at(&trans, &iter, inode_inum(inode), + end_sector, &i_sectors_delta); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_quota_reservation_put(c, inode, "a_res); + } + + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -2743,77 +3067,58 @@ bkey_err: static long bchfs_fallocate(struct bch_inode_info *inode, int mode, loff_t offset, loff_t len) { - struct address_space *mapping = inode->v.i_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - loff_t end = offset + len; - loff_t block_start = round_down(offset, block_bytes(c)); - loff_t block_end = round_up(end, block_bytes(c)); - int ret; - - inode_lock(&inode->v); - inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + u64 end = offset + len; + u64 block_start = round_down(offset, block_bytes(c)); + u64 block_end = round_up(end, block_bytes(c)); + bool truncated_last_page = false; + int ret, ret2 = 0; if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ret = inode_newsize_ok(&inode->v, end); if (ret) - goto err; + return ret; } if (mode & FALLOC_FL_ZERO_RANGE) { - ret = __bch2_truncate_page(inode, - offset >> PAGE_SHIFT, - offset, end); - - if (!ret && - offset >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - offset, end); + ret = bch2_truncate_pages(inode, offset, end); + if (unlikely(ret < 0)) + return ret; - if (unlikely(ret)) - goto err; + truncated_last_page = ret; truncate_pagecache_range(&inode->v, offset, end - 1); + + block_start = round_up(offset, block_bytes(c)); + block_end = round_down(end, block_bytes(c)); } ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); - if (ret) - goto err; /* - * Do we need to extend the file? 
- * - * If we zeroed up to the end of the file, we dropped whatever writes - * were going to write out the current i_size, so we have to extend - * manually even if FL_KEEP_SIZE was set: + * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, + * so that the VFS cache i_size is consistent with the btree i_size: */ - if (end >= inode->v.i_size && - (!(mode & FALLOC_FL_KEEP_SIZE) || - (mode & FALLOC_FL_ZERO_RANGE))) { + if (ret && + !(ret == -ENOSPC && (mode & FALLOC_FL_ZERO_RANGE))) + return ret; - /* - * Sync existing appends before extending i_size, - * as in bch2_extend(): - */ - ret = filemap_write_and_wait_range(mapping, - inode->ei_inode.bi_size, S64_MAX); - if (ret) - goto err; + if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) + end = inode->v.i_size; - if (mode & FALLOC_FL_KEEP_SIZE) - end = inode->v.i_size; - else - i_size_write(&inode->v, end); + if (end >= inode->v.i_size && + (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || + !(mode & FALLOC_FL_KEEP_SIZE))) { + spin_lock(&inode->v.i_lock); + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, end, 0); + ret2 = bch2_write_inode_size(c, inode, end, 0); mutex_unlock(&inode->ei_update_lock); } -err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); - inode_unlock(&inode->v); - return ret; + + return ret ?: ret2; } long bch2_fallocate_dispatch(struct file *file, int mode, @@ -2826,6 +3131,10 @@ long bch2_fallocate_dispatch(struct file *file, int mode, if (!percpu_ref_tryget(&c->writes)) return -EROFS; + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(&inode->ei_pagecache_lock); + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ret = bchfs_fallocate(inode, mode, offset, len); else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) @@ -2837,48 +3146,14 @@ long bch2_fallocate_dispatch(struct file *file, int mode, else ret = -EOPNOTSUPP; + + bch2_pagecache_block_put(&inode->ei_pagecache_lock); + inode_unlock(&inode->v); percpu_ref_put(&c->writes); return ret; } -static void mark_range_unallocated(struct bch_inode_info *inode, - loff_t start, loff_t end) -{ - pgoff_t index = start >> PAGE_SHIFT; - pgoff_t end_index = (end - 1) >> PAGE_SHIFT; - struct pagevec pvec; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i, j; - - nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, - &index, end_index); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - struct bch_page_state *s; - - lock_page(page); - s = bch2_page_state(page); - - if (s) { - spin_lock(&s->lock); - for (j = 0; j < PAGE_SECTORS; j++) - s->s[j].nr_replicas = 0; - spin_unlock(&s->lock); - } - - unlock_page(page); - } - pagevec_release(&pvec); - } while (index <= end_index); -} - loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, struct file *file_dst, loff_t pos_dst, loff_t len, unsigned remap_flags) @@ -2924,13 +3199,13 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if (ret) goto err; - mark_range_unallocated(src, pos_src, pos_src + aligned_len); + mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); ret = bch2_remap_range(c, - POS(dst->v.i_ino, pos_dst >> 9), - POS(src->v.i_ino, pos_src >> 9), + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, aligned_len >> 9, - &dst->ei_journal_seq, pos_dst + len, &i_sectors_delta); if (ret < 0) goto err; @@ -2948,10 +3223,9 
@@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, i_size_write(&dst->v, pos_dst + ret); spin_unlock(&dst->v.i_lock); - if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || - IS_SYNC(file_inode(file_dst))) && - !c->opts.journal_flush_disabled) - ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq); + if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || + IS_SYNC(file_inode(file_dst))) + ret = bch2_flush_inode(c, inode_inum(dst)); err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); @@ -3017,9 +3291,11 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3027,9 +3303,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), 0, k, ret) { + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; } else if (bkey_extent_is_data(k.k)) { @@ -3038,9 +3320,12 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) } else if (k.k->p.offset >> 9 > isize) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; @@ -3113,9 +3398,11 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3123,9 +3410,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; - for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, @@ -3142,9 +3435,12 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) offset = max(offset, bkey_start_offset(k.k) << 9); } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); if (ret) return ret; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 91a0e76..9f329a6 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -10,7 +10,11 @@ #include "quota.h" #include +#include #include +#include +#include +#include #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) #define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -192,7 +196,7 @@ 
static int bch2_ioc_reinherit_attrs(struct bch_fs *c, char *kname = NULL; struct qstr qstr; int ret = 0; - u64 inum; + subvol_inum inum; kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); if (!kname) @@ -205,10 +209,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.len = ret; qstr.name = kname; - ret = -ENOENT; - inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, - &qstr); - if (!inum) + ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); + if (ret) goto err1; vinode = bch2_vfs_inode_get(c, inum); @@ -294,6 +296,161 @@ err: return ret; } +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct inode *dir; + struct bch_inode_info *inode; + struct user_namespace *s_user_ns; + struct dentry *dst_dentry; + struct path src_path, dst_path; + int how = LOOKUP_FOLLOW; + int error; + subvol_inum snapshot_src = { 0 }; + unsigned lookup_flags = 0; + unsigned create_flags = BCH_CREATE_SUBVOL; + + if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| + BCH_SUBVOL_SNAPSHOT_RO)) + return -EINVAL; + + if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + (arg.src_ptr || + (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) + return -EINVAL; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + create_flags |= BCH_CREATE_SNAPSHOT; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) + create_flags |= BCH_CREATE_SNAPSHOT_RO; + + /* why do we need this lock? */ + down_read(&c->vfs_sb->s_umount); + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + sync_inodes_sb(c->vfs_sb); +retry: + if (arg.src_ptr) { + error = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.src_ptr, + how, &src_path); + if (error) + goto err1; + + if (src_path.dentry->d_sb->s_fs_info != c) { + path_put(&src_path); + error = -EXDEV; + goto err1; + } + + snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); + } + + dst_dentry = user_path_create(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + &dst_path, lookup_flags); + error = PTR_ERR_OR_ZERO(dst_dentry); + if (error) + goto err2; + + if (dst_dentry->d_sb->s_fs_info != c) { + error = -EXDEV; + goto err3; + } + + if (dst_dentry->d_inode) { + error = -EEXIST; + goto err3; + } + + dir = dst_path.dentry->d_inode; + if (IS_DEADDIR(dir)) { + error = -ENOENT; + goto err3; + } + + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) { + error = -EOVERFLOW; + goto err3; + } + + error = inode_permission(file_mnt_user_ns(filp), + dir, MAY_WRITE | MAY_EXEC); + if (error) + goto err3; + + if (!IS_POSIXACL(dir)) + arg.mode &= ~current_umask(); + + error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); + if (error) + goto err3; + + if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + !arg.src_ptr) + snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; + + inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir), + dst_dentry, arg.mode|S_IFDIR, + 0, snapshot_src, create_flags); + error = PTR_ERR_OR_ZERO(inode); + if (error) + goto err3; + + d_instantiate(dst_dentry, &inode->v); + fsnotify_mkdir(dir, dst_dentry); +err3: + done_path_create(&dst_path, dst_dentry); +err2: + if (arg.src_ptr) + path_put(&src_path); + + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +err1: + up_read(&c->vfs_sb->s_umount); + + return error; +} + +static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct path path; + struct inode *dir; 
+ int ret = 0; + + if (arg.flags) + return -EINVAL; + + ret = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + LOOKUP_FOLLOW, &path); + if (ret) + return ret; + + if (path.dentry->d_sb->s_fs_info != c) { + path_put(&path); + return -EXDEV; + } + + dir = path.dentry->d_parent->d_inode; + + ret = __bch2_unlink(dir, path.dentry, true); + if (!ret) { + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); + } + path_put(&path); + + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); @@ -324,6 +481,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC_GOINGDOWN: return bch2_ioc_goingdown(c, (u32 __user *) arg); + case BCH_IOCTL_SUBVOLUME_CREATE: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_create(c, file, i); + } + + case BCH_IOCTL_SUBVOLUME_DESTROY: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_destroy(c, file, i); + } + default: return bch2_fs_ioctl(c, cmd, (void __user *) arg); } diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 631fb87..91fa189 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -36,30 +36,10 @@ static struct kmem_cache *bch2_inode_cache; -static void bch2_vfs_inode_init(struct bch_fs *, +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, - struct bch_inode_unpacked *); - -static void journal_seq_copy(struct bch_fs *c, - struct bch_inode_info *dst, - u64 journal_seq) -{ - /* - * atomic64_cmpxchg has a fallback for archs that don't support it, - * cmpxchg does not: - */ - atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; - u64 old, v = READ_ONCE(dst->ei_journal_seq); - - do { - old = v; - - if (old >= journal_seq) - break; - } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); - - bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); -} + struct bch_inode_unpacked *, + struct bch_subvolume *); static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { @@ -113,11 +93,19 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock) __pagecache_lock_get(lock, -1); } -void bch2_inode_update_after_write(struct bch_fs *c, +void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, unsigned fields) { + struct bch_fs *c = trans->c; + + BUG_ON(bi->bi_inum != inode->v.i_ino); + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), + c->opts.inodes_use_key_cache); + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); @@ -141,7 +129,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, void *p, unsigned fields) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bch_inode_unpacked inode_u; int ret; @@ -149,23 +137,20 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(&trans); - iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter) ?: + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT) ?: (set ? 
set(inode, &inode_u, p) : 0) ?: - bch2_inode_write(&trans, iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOFAIL); + bch2_inode_write(&trans, &iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; * this is important for inode updates via bchfs_write_index_update */ if (!ret) - bch2_inode_update_after_write(c, inode, &inode_u, fields); + bch2_inode_update_after_write(&trans, inode, &inode_u, fields); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -209,44 +194,73 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +static int bch2_iget5_test(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + return inode->ei_subvol == inum->subvol && + inode->ei_inode.bi_inum == inum->inum; +} + +static int bch2_iget5_set(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + inode->v.i_ino = inum->inum; + inode->ei_subvol = inum->subvol; + inode->ei_inode.bi_inum = inum->inum; + return 0; +} + +static unsigned bch2_inode_hash(subvol_inum inum) +{ + return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; + struct btree_trans trans; + struct bch_subvolume subvol; int ret; - inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); + inode = to_bch_ei(iget5_locked(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum, &inode_u); + bch2_trans_init(&trans, c, 8, 0); + ret = lockrestart_do(&trans, + bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + + if (!ret) + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + bch2_trans_exit(&trans); + if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); } - bch2_vfs_inode_init(c, inode, &inode_u); - - inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); - unlock_new_inode(&inode->v); return &inode->v; } -static int inum_test(struct inode *inode, void *p) -{ - unsigned long *ino = p; - - return *ino == inode->i_ino; -} - -static struct bch_inode_info * +struct bch_inode_info * __bch2_create(struct user_namespace *mnt_userns, struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, bool tmpfile) + umode_t mode, dev_t rdev, subvol_inum snapshot_src, + unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans trans; @@ -254,6 +268,8 @@ __bch2_create(struct user_namespace *mnt_userns, struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; + subvol_inum inum; + struct bch_subvolume subvol; u64 journal_seq = 0; int ret; @@ -274,26 +290,34 @@ __bch2_create(struct user_namespace *mnt_userns, bch2_inode_init_early(c, &inode_u); - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); bch2_trans_init(&trans, c, 8, - 2048 + (!tmpfile ? dentry->d_name.len : 0)); + 2048 + (!(flags & BCH_CREATE_TMPFILE) + ? 
dentry->d_name.len : 0)); retry: bch2_trans_begin(&trans); - ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, - !tmpfile ? &dentry->d_name : NULL, + ret = bch2_create_trans(&trans, + inode_inum(dir), &dir_u, &inode_u, + !(flags & BCH_CREATE_TMPFILE) + ? &dentry->d_name : NULL, from_kuid(mnt_userns, current_fsuid()), from_kgid(mnt_userns, current_fsgid()), mode, rdev, - default_acl, acl) ?: + default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) goto err_before_quota; - ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0); + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + ret = bch2_subvolume_get(&trans, inum.subvol, true, + BTREE_ITER_WITH_UPDATES, &subvol) ?: + bch2_trans_commit(&trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); @@ -303,15 +327,14 @@ err_before_quota: goto err_trans; } - if (!tmpfile) { - bch2_inode_update_after_write(c, dir, &dir_u, + if (!(flags & BCH_CREATE_TMPFILE)) { + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, dir, journal_seq); mutex_unlock(&dir->ei_update_lock); } - bch2_vfs_inode_init(c, inode, &inode_u); - journal_seq_copy(c, inode, journal_seq); + bch2_iget5_set(&inode->v, &inum); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -323,8 +346,12 @@ err_before_quota: */ inode->v.i_state |= I_CREATING; - old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, - inum_test, NULL, &inode->v.i_ino)); + + old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); BUG_ON(!old); if (unlikely(old != inode)) { @@ -332,7 +359,6 @@ err_before_quota: * We raced, another process pulled the new inode into cache * before us: */ - journal_seq_copy(c, old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -351,7 +377,7 @@ err: posix_acl_release(acl); return inode; err_trans: - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); bch2_trans_exit(&trans); @@ -370,12 +396,13 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; - u64 inum; + subvol_inum inum = { .subvol = 1 }; + int ret; - inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, - &dentry->d_name); + ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, + &dentry->d_name, &inum); - if (inum) + if (!ret) vinode = bch2_vfs_inode_get(c, inum); return d_splice_alias(vinode, dentry); @@ -386,7 +413,8 @@ static int bch2_mknod(struct user_namespace *mnt_userns, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, false); + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, + (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -414,19 +442,16 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_link_trans(&trans, - dir->v.i_ino, - inode->v.i_ino, &dir_u, &inode_u, + inode_inum(dir), &dir_u, + inode_inum(inode), 
&inode_u, &dentry->d_name)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - journal_seq_copy(c, inode, dir->ei_journal_seq); - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); } bch2_trans_exit(&trans); @@ -453,7 +478,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, return 0; } -static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + bool deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); @@ -465,19 +491,17 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, + ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, - dir->v.i_ino, &dir_u, - &inode_u, &dentry->d_name)); + inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name, + deleting_snapshot)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - journal_seq_copy(c, inode, dir->ei_journal_seq); - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_MTIME); } @@ -487,6 +511,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) return ret; } +static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +{ + return __bch2_unlink(vdir, dentry, false); +} + static int bch2_symlink(struct user_namespace *mnt_userns, struct inode *vdir, struct dentry *dentry, const char *symname) @@ -495,7 +524,8 @@ static int bch2_symlink(struct user_namespace *mnt_userns, struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); + inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -510,8 +540,6 @@ static int bch2_symlink(struct user_namespace *mnt_userns, if (unlikely(ret)) goto err; - journal_seq_copy(c, dir, inode->ei_journal_seq); - ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) goto err; @@ -546,7 +574,6 @@ static int bch2_rename2(struct user_namespace *mnt_userns, ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; - u64 journal_seq = 0; int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) @@ -586,10 +613,10 @@ static int bch2_rename2(struct user_namespace *mnt_userns, goto err; } - ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_rename_trans(&trans, - src_dir->v.i_ino, &src_dir_u, - dst_dir->v.i_ino, &dst_dir_u, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, &src_inode_u, &dst_inode_u, &src_dentry->d_name, @@ -602,25 +629,19 @@ static int bch2_rename2(struct user_namespace *mnt_userns, BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); - bch2_inode_update_after_write(c, src_dir, &src_dir_u, + bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, src_dir, journal_seq); - if (src_dir != dst_dir) { - bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, + if (src_dir != dst_dir) + bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, dst_dir, journal_seq); - } - bch2_inode_update_after_write(c, src_inode, &src_inode_u, + bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, ATTR_CTIME); - journal_seq_copy(c, src_inode, journal_seq); - if (dst_inode) { - bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, + if (dst_inode) + bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, ATTR_CTIME); - journal_seq_copy(c, dst_inode, journal_seq); - } err: bch2_trans_exit(&trans); @@ -686,7 +707,7 @@ int bch2_setattr_nonsize(struct user_namespace *mnt_userns, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans trans; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; int ret; @@ -712,33 +733,32 @@ retry: kfree(acl); acl = NULL; - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); if (ret) goto btree_err; bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); + ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + inode_u.bi_mode, &acl); if (ret) goto btree_err; } - ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); btree_err: - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, &inode_iter); if (ret == -EINTR) goto retry; if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); + bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -812,7 +832,8 @@ static int bch2_tmpfile(struct user_namespace *mnt_userns, struct inode *vdir, struct dentry *dentry, umode_t mode) { struct bch_inode_info *inode = - __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, true); + __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -844,8 +865,8 @@ static int bch2_fill_extent(struct bch_fs *c, else offset += p.crc.offset; - if ((offset & (c->opts.block_size - 1)) || - 
(k.k->size & (c->opts.block_size - 1))) + if ((offset & (block_sectors(c) - 1)) || + (k.k->size & (block_sectors(c) - 1))) flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ret = fiemap_fill_next_extent(info, @@ -881,12 +902,13 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; + u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); @@ -896,27 +918,33 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < start) return -EINVAL; + start >>= 9; + bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(ei->v.i_ino, start >> 9), 0); retry: bch2_trans_begin(&trans); - while ((k = bch2_btree_iter_peek(iter)).k && + ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && - bkey_cmp(iter->pos, end) < 0) { + bkey_cmp(iter.pos, end) < 0) { enum btree_id data_btree = BTREE_ID_extents; if (!bkey_extent_is_data(k.k) && k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -937,7 +965,7 @@ retry: offset_into_extent), cur.k); bch2_key_resize(&cur.k->k, sectors); - cur.k->k.p = iter->pos; + cur.k->k.p = iter.pos; cur.k->k.p.offset += cur.k->k.size; if (have_extent) { @@ -950,10 +978,12 @@ retry: bkey_copy(prev.k, cur.k); have_extent = true; - bch2_btree_iter_set_pos(iter, - POS(iter->pos.inode, iter->pos.offset + sectors)); + bch2_btree_iter_set_pos(&iter, + POS(iter.pos.inode, iter.pos.offset + sectors)); } - + start = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: if (ret == -EINTR) goto retry; @@ -961,8 +991,7 @@ retry: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); - bch2_trans_iter_put(&trans, iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); return ret < 0 ? 
ret : 0; @@ -998,7 +1027,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - return bch2_readdir(c, inode->v.i_ino, ctx); + return bch2_readdir(c, inode_inum(inode), ctx); } static const struct file_operations bch_file_operations = { @@ -1098,51 +1127,243 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = generic_error_remove_page, }; -static struct inode *bch2_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) +struct bcachefs_fid { + u64 inum; + u32 subvol; + u32 gen; +} __packed; + +struct bcachefs_fid_with_parent { + struct bcachefs_fid fid; + struct bcachefs_fid dir; +} __packed; + +static int bcachefs_fid_valid(int fh_len, int fh_type) { - struct bch_fs *c = sb->s_fs_info; - struct inode *vinode; + switch (fh_type) { + case FILEID_BCACHEFS_WITHOUT_PARENT: + return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); + case FILEID_BCACHEFS_WITH_PARENT: + return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); + default: + return false; + } +} + +static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) +{ + return (struct bcachefs_fid) { + .inum = inode->ei_inode.bi_inum, + .subvol = inode->ei_subvol, + .gen = inode->ei_inode.bi_generation, + }; +} - if (ino < BCACHEFS_ROOT_INO) - return ERR_PTR(-ESTALE); +static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, + struct inode *vdir) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_inode_info *dir = to_bch_ei(vdir); + + if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) + return FILEID_INVALID; + + if (!S_ISDIR(inode->v.i_mode) && dir) { + struct bcachefs_fid_with_parent *fid = (void *) fh; + + fid->fid = bch2_inode_to_fid(inode); + fid->dir = bch2_inode_to_fid(dir); + + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITH_PARENT; + } else { + struct bcachefs_fid *fid = (void *) fh; + + *fid = bch2_inode_to_fid(inode); + + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITHOUT_PARENT; + } +} - vinode = bch2_vfs_inode_get(c, ino); - if (IS_ERR(vinode)) - return ERR_CAST(vinode); - if (generation && vinode->i_generation != generation) { - /* we didn't find the right inode.. 
*/ +static struct inode *bch2_nfs_get_inode(struct super_block *sb, + struct bcachefs_fid fid) +{ + struct bch_fs *c = sb->s_fs_info; + struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { + .subvol = fid.subvol, + .inum = fid.inum, + }); + if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { iput(vinode); - return ERR_PTR(-ESTALE); + vinode = ERR_PTR(-ESTALE); } return vinode; } -static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - bch2_nfs_get_inode); + struct bcachefs_fid *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type)) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); } -static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, int fh_len, int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - bch2_nfs_get_inode); + struct bcachefs_fid_with_parent *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type) || + fh_type != FILEID_BCACHEFS_WITH_PARENT) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); +} + +static struct dentry *bch2_get_parent(struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + subvol_inum parent_inum = { + .subvol = inode->ei_inode.bi_parent_subvol ?: + inode->ei_subvol, + .inum = inode->ei_inode.bi_dir, + }; + + if (!parent_inum.inum) + return NULL; + + return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); +} + +static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_inode_info *dir = to_bch_ei(parent->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter1; + struct btree_iter iter2; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked inode_u; + subvol_inum target; + u32 snapshot; + unsigned name_len; + int ret; + + if (!S_ISDIR(dir->v.i_mode)) + return -EINVAL; + + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); + bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter1, snapshot); + bch2_btree_iter_set_snapshot(&iter2, snapshot); + + ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); + if (ret) + goto err; + + if (inode_u.bi_dir == dir->ei_inode.bi_inum) { + bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + + k = bch2_btree_iter_peek_slot(&iter1); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_dirent) { + ret = -ENOENT; + goto err; + } + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } else { + /* + * File with multiple hardlinks and our backref is to the wrong + * directory - linear search: + */ + 
for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + if (k.k->p.inode > dir->ei_inode.bi_inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret < 0) + break; + if (ret) + continue; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } + } + + ret = -ENOENT; + goto err; +found: + name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); + + memcpy(name, d.v->d_name, name_len); + name[name_len] = '\0'; +err: + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter1); + bch2_trans_iter_exit(&trans, &iter2); + bch2_trans_exit(&trans); + + return ret; } static const struct export_operations bch_export_ops = { + .encode_fh = bch2_encode_fh, .fh_to_dentry = bch2_fh_to_dentry, .fh_to_parent = bch2_fh_to_parent, - //.get_parent = bch2_get_parent, + .get_parent = bch2_get_parent, + .get_name = bch2_get_name, }; -static void bch2_vfs_inode_init(struct bch_fs *c, +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, - struct bch_inode_unpacked *bi) + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) { - bch2_inode_update_after_write(c, inode, bi, ~0); + bch2_inode_update_after_write(trans, inode, bi, ~0); + + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + else + clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; @@ -1151,9 +1372,9 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->v.i_size = bi->bi_size; inode->ei_flags = 0; - inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; inode->v.i_mapping->a_ops = &bch_address_space_operations; @@ -1189,7 +1410,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) mutex_init(&inode->ei_update_lock); pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); - inode->ei_journal_seq = 0; return &inode->v; } @@ -1251,10 +1471,57 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode->v.i_ino, true); + bch2_inode_rm(c, inode_inum(inode)); } } +void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) +{ + struct super_block *sb = c->vfs_sb; + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + spin_unlock(&sb->s_inode_list_lock); +again: + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + if (!(inode->i_state & I_DONTCACHE)) { + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + + spin_lock(&inode->i_lock); + if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && + !(inode->i_state & I_FREEING)) { + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + 
schedule(); + finish_wait(wq, &wait.wq_entry); + goto again; + } + + spin_unlock(&inode->i_lock); + } + spin_unlock(&sb->s_inode_list_lock); +} + static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -1413,7 +1680,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->mode & OPT_MOUNT)) + if (!(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) @@ -1595,7 +1862,9 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif - vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); + sb->s_shrink.seeks = 0; + + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); if (IS_ERR(vinode)) { bch_err(c, "error mounting: error getting root inode %i", (int) PTR_ERR(vinode)); diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 36cc6ba..b2211ec 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -36,7 +36,6 @@ struct bch_inode_info { unsigned long ei_flags; struct mutex ei_update_lock; - u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; @@ -45,16 +44,32 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; + u32 ei_subvol; + /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; +static inline subvol_inum inode_inum(struct bch_inode_info *inode) +{ + return (subvol_inum) { + .subvol = inode->ei_subvol, + .inum = inode->ei_inode.bi_inum, + }; +} + /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: */ #define EI_INODE_ERROR 0 +/* + * Set if the inode is in a snapshot subvolume - we don't do quota accounting in + * those: + */ +#define EI_INODE_SNAPSHOT 1 + #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) @@ -135,6 +150,10 @@ struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS +struct bch_inode_info * +__bch2_create(struct user_namespace *, struct bch_inode_info *, + struct dentry *, umode_t, dev_t, subvol_inum, unsigned); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, @@ -154,13 +173,13 @@ static inline int bch2_set_projid(struct bch_fs *c, KEY_TYPE_QUOTA_PREALLOC); } -struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); -void bch2_inode_update_after_write(struct bch_fs *, +void bch2_inode_update_after_write(struct btree_trans *, struct bch_inode_info *, struct bch_inode_unpacked *, unsigned); @@ -170,12 +189,17 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, int bch2_setattr_nonsize(struct user_namespace *, struct bch_inode_info *, struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, bool); + +void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); void bch2_vfs_exit(void); int bch2_vfs_init(void); #else +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 36eba46..ced4d67 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -9,6 +9,7 @@ #include "fsck.h" #include "inode.h" #include "keylist.h" +#include 
"subvolume.h" #include "super.h" #include "xattr.h" @@ -17,15 +18,16 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) +static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, + u32 snapshot) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 sectors = 0; int ret; for_each_btree_key(trans, iter, BTREE_ID_extents, - POS(inum, 0), 0, k, ret) { + SPOS(inum, 0, snapshot), 0, k, ret) { if (k.k->p.inode != inum) break; @@ -33,33 +35,138 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) sectors += k.k->size; } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret ?: sectors; } +static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u64 subdirs = 0; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode != inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_DIR) + subdirs++; + } + + bch2_trans_iter_exit(trans, &iter); + + return ret ?: subdirs; +} + +static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS(0, snapshot), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(trans->c, "snapshot %u not found", snapshot); + ret = -ENOENT; + goto err; + } + + *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; + +} + +static int __subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + struct bch_subvolume s; + int ret; + + ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + + *snapshot = le32_to_cpu(s.snapshot); + *inum = le64_to_cpu(s.inode); + return ret; +} + +static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); +} + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + POS(0, inode_nr), + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) { + ret = -ENOENT; + goto err; + } + + ret = bch2_inode_unpack(k, inode); +err: + if (ret && ret != -EINTR) + bch_err(trans->c, "error %i fetching inode %llu", + ret, inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bch_inode_unpacked *inode, u32 *snapshot) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, - POS(0, inode_nr), 0); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, *snapshot), 0); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (snapshot) - *snapshot = iter->pos.snapshot; - ret = k.k->type == KEY_TYPE_inode - ? 
bch2_inode_unpack(bkey_s_c_to_inode(k), inode) + ret = bkey_is_inode(k.k) + ? bch2_inode_unpack(k, inode) : -ENOENT; + if (!ret) + *snapshot = iter.pos.snapshot; err: - bch2_trans_iter_free(trans, iter); + if (ret && ret != -EINTR) + bch_err(trans->c, "error %i fetching inode %llu:%u", + ret, inode_nr, *snapshot); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -70,17 +177,41 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr, return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); } +static int __lookup_dirent(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, + u64 *target, unsigned *type) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0); + if (ret) + return ret; + + d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + *target = le64_to_cpu(d.v->d_inum); + *type = d.v->d_type; + bch2_trans_iter_exit(trans, &iter); + return 0; +} + static int __write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - struct btree_iter *inode_iter = - bch2_trans_get_iter(trans, BTREE_ID_inodes, - SPOS(0, inode->bi_inum, snapshot), - BTREE_ITER_INTENT); - int ret = bch2_btree_iter_traverse(inode_iter) ?: - bch2_inode_write(trans, inode_iter, inode); - bch2_trans_iter_put(trans, inode_iter); + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode->bi_inum, snapshot), + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_inode_write(trans, &iter, inode); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -97,110 +228,176 @@ static int write_inode(struct btree_trans *trans, return ret; } +static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + int ret; + + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(trans->c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? 
*/ + if (inode_u.bi_subvol) { + ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); + if (ret) + goto err; + } + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (ret == -EINTR) + goto retry; + + return ret; +} + static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bch_inode_unpacked dir_inode; struct bch_hash_info dir_hash_info; int ret; - ret = lookup_inode(trans, pos.inode, &dir_inode, NULL); + ret = lookup_first_inode(trans, pos.inode, &dir_inode); if (ret) return ret; dir_hash_info = bch2_hash_info_init(c, &dir_inode); - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, iter); - bch2_trans_iter_put(trans, iter); - return ret; -} - -static int remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - int ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - __remove_dirent(trans, pos)); - if (ret) - bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret); + &dir_hash_info, &iter, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } /* Get lost+found, create if it doesn't exist: */ -static int lookup_lostfound(struct btree_trans *trans, +static int lookup_lostfound(struct btree_trans *trans, u32 subvol, struct bch_inode_unpacked *lostfound) { struct bch_fs *c = trans->c; struct bch_inode_unpacked root; struct bch_hash_info root_hash_info; struct qstr lostfound_str = QSTR("lost+found"); - u64 inum; + subvol_inum root_inum = { .subvol = subvol }; + u64 inum = 0; + unsigned d_type = 0; u32 snapshot; int ret; - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot); - if (ret && ret != -ENOENT) + ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + if (ret) + return ret; + + ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); + if (ret) return ret; root_hash_info = bch2_hash_info_init(c, &root); - inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, - &lostfound_str); - if (!inum) { + + ret = __lookup_dirent(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type); + if (ret == -ENOENT) { bch_notice(c, "creating lost+found"); goto create_lostfound; } - ret = lookup_inode(trans, inum, lostfound, &snapshot); - if (ret && ret != -ENOENT) { - /* - * The check_dirents pass has already run, dangling dirents - * shouldn't exist here: - */ + if (ret && ret != -EINTR) bch_err(c, "error looking up lost+found: %i", ret); + if (ret) return ret; - } - if (ret == -ENOENT) { -create_lostfound: - bch2_inode_init_early(c, lostfound); - - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_create_trans(trans, - BCACHEFS_ROOT_INO, &root, - lostfound, - &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL)); - if (ret) - bch_err(c, "error creating lost+found: %i", ret); + if (d_type != DT_DIR) { + bch_err(c, "error looking up lost+found: not a directory"); + return ret; } - return 0; + /* + * The check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ + return __lookup_inode(trans, inum, 
lostfound, &snapshot); + +create_lostfound: + bch2_inode_init_early(c, lostfound); + + ret = bch2_create_trans(trans, root_inum, &root, + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0); + if (ret && ret != -EINTR) + bch_err(c, "error creating lost+found: %i", ret); + return ret; } -static int reattach_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode) +static int __reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) { struct bch_hash_info dir_hash; struct bch_inode_unpacked lostfound; char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 subvol; int ret; - ret = lookup_lostfound(trans, &lostfound); + ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); + if (ret) + return ret; + + ret = lookup_lostfound(trans, subvol, &lostfound); if (ret) return ret; if (S_ISDIR(inode->bi_mode)) { lostfound.bi_nlink++; - ret = write_inode(trans, &lostfound, U32_MAX); + ret = __write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } @@ -210,33 +407,51 @@ static int reattach_inode(struct btree_trans *trans, snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); - ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash, - mode_to_type(inode->bi_mode), - &name, inode->bi_inum, &dir_offset, - BCH_HASH_SET_MUST_CREATE)); + ret = bch2_dirent_create(trans, + (subvol_inum) { + .subvol = subvol, + .inum = lostfound.bi_inum, + }, + &dir_hash, + inode_d_type(inode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + return ret; + + inode->bi_dir = lostfound.bi_inum; + inode->bi_dir_offset = dir_offset; + + return __write_inode(trans, inode, inode_snapshot); +} + +static int reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ + int ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + __reattach_inode(trans, inode, inode_snapshot)); if (ret) { bch_err(trans->c, "error %i reattaching inode %llu", ret, inode->bi_inum); return ret; } - inode->bi_dir = lostfound.bi_inum; - inode->bi_dir_offset = dir_offset; - - return write_inode(trans, inode, U32_MAX); + return ret; } static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto out; @@ -245,51 +460,254 @@ static int remove_backpointer(struct btree_trans *trans, goto out; } - ret = remove_dirent(trans, k.k->p); + ret = __remove_dirent(trans, k.k->p); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, struct bpos pos) +{ + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; + + if (bkey_cmp(s->pos, pos)) + s->nr = 0; + s->pos = pos; + + /* Might get called multiple times due to lock restarts */ + if (s->nr && s->d[s->nr - 1] == pos.snapshot) + return 0; + + return snapshots_seen_add(c, s, pos.snapshot); +} + +/** + * key_visible_in_snapshot - returns true if @id is a descendent of 
@ancestor, + * and @ancestor hasn't been overwritten in @seen + * + * That is, returns whether key in @ancestor snapshot is visible in @id snapshot + */ +static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, + u32 id, u32 ancestor) +{ + ssize_t i; + + BUG_ON(id > ancestor); + + id = snapshot_t(c, id)->equiv; + ancestor = snapshot_t(c, ancestor)->equiv; + + /* @ancestor should be the snapshot most recently added to @seen */ + BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); + BUG_ON(seen->pos.snapshot != ancestor); + + if (id == ancestor) + return true; + + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + + for (i = seen->nr - 2; + i >= 0 && seen->d[i] >= id; + --i) + if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && + bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) + return false; + + return true; +} + +/** + * ref_visible - given a key with snapshot id @src that points to a key with + * snapshot id @dst, test whether there is some snapshot in which @dst is + * visible. + * + * This assumes we're visiting @src keys in natural key order. + * + * @s - list of snapshot IDs already seen at @src + * @src - snapshot ID of src key + * @dst - snapshot ID of dst key + */ +static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) +{ + return dst <= src + ? key_visible_in_snapshot(c, s, dst, src) + : bch2_snapshot_is_ancestor(c, src, dst); +} + +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + struct inode_walker { - bool first_this_inode; - bool have_inode; - u64 cur_inum; - u32 snapshot; - struct bch_inode_unpacked inode; + bool first_this_inode; + u64 cur_inum; + + size_t nr; + size_t size; + struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; + } *d; }; +static void inode_walker_exit(struct inode_walker *w) +{ + kfree(w->d); + w->d = NULL; +} + static struct inode_walker inode_walker_init(void) { - return (struct inode_walker) { - .cur_inum = -1, - .have_inode = false, + return (struct inode_walker) { 0, }; +} + +static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w) +{ + if (w->nr == w->size) { + size_t new_size = max_t(size_t, 8UL, w->size * 2); + void *d = krealloc(w->d, new_size * sizeof(w->d[0]), + GFP_KERNEL); + if (!d) { + bch_err(c, "fsck: error allocating memory for inode_walker, size %zu", + new_size); + return -ENOMEM; + } + + w->d = d; + w->size = new_size; + } + + return 0; +} + +static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c inode) +{ + struct bch_inode_unpacked u; + int ret; + + ret = inode_walker_realloc(c, w); + if (ret) + return ret; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + w->d[w->nr++] = (struct inode_walker_entry) { + .inode = u, + .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, }; + + return 0; } static int __walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) + struct inode_walker *w, struct bpos pos) { - if (inum != w->cur_inum) { - int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + unsigned i, ancestor_pos; + int ret; - if (ret && ret != -ENOENT) - return ret; + pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; - w->have_inode = !ret; - w->cur_inum = inum; - w->first_this_inode = true; - } else { + if 
(pos.inode == w->cur_inum) { w->first_this_inode = false; + goto lookup_snapshot; } - return 0; + w->nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != pos.inode) + break; + + if (bkey_is_inode(k.k)) + add_inode(c, w, k); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + w->cur_inum = pos.inode; + w->first_this_inode = true; +lookup_snapshot: + for (i = 0; i < w->nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) + goto found; + return INT_MAX; +found: + BUG_ON(pos.snapshot > w->d[i].snapshot); + + if (pos.snapshot != w->d[i].snapshot) { + ancestor_pos = i; + + while (i && w->d[i - 1].snapshot > pos.snapshot) + --i; + + ret = inode_walker_realloc(c, w); + if (ret) + return ret; + + array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); + w->d[i].snapshot = pos.snapshot; + w->d[i].count = 0; + } + + return i; } -static int walk_inode(struct btree_trans *trans, - struct inode_walker *w, u64 inum) +static int __get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) { - return lockrestart_do(trans, __walk_inode(trans, w, inum)); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + w->nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != inum) + break; + + if (!bkey_is_inode(k.k)) + continue; + + if (ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) { + add_inode(c, w, k); + if (k.k->p.snapshot >= s->pos.snapshot) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + char buf[200]; + int ret = 0; + + if (mustfix_fsck_err_on(!snapshot_t(c, k.k->p.snapshot)->equiv, c, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; +fsck_err: + return ret; } static int hash_redo_key(struct btree_trans *trans, @@ -297,6 +715,9 @@ static int hash_redo_key(struct btree_trans *trans, struct bch_hash_info *hash_info, struct btree_iter *k_iter, struct bkey_s_c k) { + bch_err(trans->c, "hash_redo_key() not implemented yet"); + return -EINVAL; +#if 0 struct bkey_i *delete; struct bkey_i *tmp; @@ -315,26 +736,7 @@ static int hash_redo_key(struct btree_trans *trans, return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0); -} - -static int fsck_hash_delete_at(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *info, - struct btree_iter *iter) -{ - int ret; -retry: - ret = bch2_hash_delete_at(trans, desc, info, iter) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret == -EINTR) { - ret = bch2_btree_iter_traverse(iter); - if (!ret) - goto retry; - } - - return ret; +#endif } static int hash_check_key(struct btree_trans *trans, @@ -343,7 +745,7 @@ static int hash_check_key(struct btree_trans *trans, struct btree_iter *k_iter, struct bkey_s_c hash_k) { struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; char buf[200]; struct bkey_s_c k; u64 hash; @@ -370,20 +772,17 @@ static int hash_check_key(struct 
btree_trans *trans, "duplicate hash table keys:\n%s", (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf))) { - ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter); - if (ret) - return ret; - ret = 1; + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; break; } if (bkey_deleted(k.k)) { - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); goto bad_hash; } } - bch2_trans_iter_free(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; bad_hash: if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, " @@ -392,9 +791,7 @@ bad_hash: (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE) return 0; - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - hash_redo_key(trans, desc, hash_info, k_iter, hash_k)); + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); if (ret) { bch_err(c, "hash_redo_key err %i", ret); return ret; @@ -406,30 +803,64 @@ fsck_err: static int check_inode(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_inode inode) + struct bch_inode_unpacked *prev, + bool full) { struct bch_fs *c = trans->c; + struct bkey_s_c k; struct bch_inode_unpacked u; bool do_update = false; - int ret = 0; + int ret; - ret = bch2_inode_unpack(inode, &u); + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; - if (bch2_fs_inconsistent_on(ret, c, - "error unpacking inode %llu in fsck", - inode.k->p.inode)) + ret = bkey_err(k); + if (ret) return ret; + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? ret : 0; + + /* + * if snapshot id isn't a leaf node, skip it - deletion in + * particular is not atomic, so on the internal snapshot nodes + * we can see inodes marked for deletion after a clean shutdown + */ + if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) + return 0; + + if (!bkey_is_inode(k.k)) + return 0; + + BUG_ON(bch2_inode_unpack(k, &u)); + + if (!full && + !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) + return 0; + + if (prev->bi_inum != u.bi_inum) + *prev = u; + + if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || + inode_d_type(prev) != inode_d_type(&u), c, + "inodes in different snapshots don't match")) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + if (u.bi_flags & BCH_INODE_UNLINKED && (!c->sb.clean || fsck_err(c, "filesystem marked clean, but inode %llu unlinked", u.bi_inum))) { - bch_verbose(c, "deleting inode %llu", u.bi_inum); - bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); - ret = bch2_inode_rm(c, u.bi_inum, false); + ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); return ret; @@ -449,9 +880,10 @@ static int check_inode(struct btree_trans *trans, * just switch units to bytes and that issue goes away */ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9), + SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, + iter->pos.snapshot), POS(u.bi_inum, U64_MAX), - NULL); + 0, NULL); if (ret) { bch_err(c, "error in fsck: error %i truncating inode", ret); return ret; @@ -476,7 +908,7 @@ static int check_inode(struct btree_trans *trans, bch_verbose(c, "recounting sectors for inode %llu", u.bi_inum); - sectors = bch2_count_inode_sectors(trans, u.bi_inum); + sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); if (sectors < 0) { bch_err(c, "error in 
fsck: error %i recounting inode sectors", (int) sectors); @@ -496,11 +928,7 @@ static int check_inode(struct btree_trans *trans, } if (do_update) { - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_inode_write(trans, iter, &u)); + ret = write_inode(trans, &u, iter->pos.snapshot); if (ret) bch_err(c, "error in fsck: error %i " "updating inode", ret); @@ -513,41 +941,99 @@ noinline_for_stack static int check_inodes(struct bch_fs *c, bool full) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_s_c_inode inode; + struct btree_iter iter; + struct bch_inode_unpacked prev = { 0 }; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_inode) - continue; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - inode = bkey_s_c_to_inode(k); + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_inode(&trans, &iter, &prev, full)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); - if (full || - (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| - BCH_INODE_I_SECTORS_DIRTY| - BCH_INODE_UNLINKED))) { - ret = check_inode(&trans, iter, inode); - if (ret) - break; - } + bch2_trans_exit(&trans); + return ret; +} + +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + subvol = bkey_s_c_to_subvolume(k); + + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + if (ret && ret != -EINTR) + bch_err(trans->c, "error deleting subvolume %llu: %i", + iter->pos.offset, ret); + if (ret) + return ret; } - bch2_trans_iter_put(&trans, iter); - BUG_ON(ret == -EINTR); + return 0; +} - return bch2_trans_exit(&trans) ?: ret; +noinline_for_stack +static int check_subvols(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_subvolumes, + POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH); + + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_subvol(&trans, &iter)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; } +/* + * Checking for overlapping extents needs to be reimplemented + */ +#if 0 static int fix_overlapping_extent(struct btree_trans *trans, struct bkey_s_c k, struct bpos cut_at) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i *u; int ret; @@ -567,46 +1053,208 @@ static int fix_overlapping_extent(struct btree_trans *trans, * assume things about extent overwrites - we should be running the * triggers manually here */ - iter = bch2_trans_get_iter(trans, BTREE_ID_extents, u->k.p, - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); - BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); 
- ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, u, BTREE_TRIGGER_NORUN) ?: + BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } +#endif -static int inode_backpointer_exists(struct btree_trans *trans, - struct bch_inode_unpacked *inode) +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) { - struct btree_iter *iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); + bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); + if (!ret && k.k->type != KEY_TYPE_dirent) + ret = -ENOENT; + if (ret) { + bch2_trans_iter_exit(trans, iter); + return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }; + } + + return bkey_s_c_to_dirent(k); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + +static int inode_backpointer_exists(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); + ret = bkey_err(d.s_c); if (ret) - goto out; - if (k.k->type != KEY_TYPE_dirent) - goto out; + return ret; - ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum; -out: - bch2_trans_iter_free(trans, iter); + ret = dirent_points_to_inode(d, inode); + bch2_trans_iter_exit(trans, &iter); return ret; } -static bool inode_backpointer_matches(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) +static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret = 0, ret2 = 0; + s64 count2; + + for (i = w->d; i < w->d + w->nr; i++) { + if (i->inode.bi_sectors == i->count) + continue; + + count2 = lockrestart_do(trans, + bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot)); + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_sectors == i->count) + continue; + } + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->cur_inum, i->snapshot, + i->inode.bi_sectors, i->count) == FSCK_ERR_IGNORE) + continue; + + i->inode.bi_sectors = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + ret2 = -EINTR; + } +fsck_err: + return ret ?: ret2; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct inode_walker *inode, + struct snapshots_seen *s) { - return d.k->p.inode == inode->bi_dir && - d.k->p.offset == inode->bi_dir_offset; + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct inode_walker_entry *i; + 
char buf[200]; + int ret = 0; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? ret : 0; + + ret = snapshots_seen_update(c, s, k.k->p); + if (ret) + return ret; + + if (k.k->type == KEY_TYPE_whiteout) + return 0; + + if (inode->cur_inum != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) + return ret; + } +#if 0 + if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { + char buf1[200]; + char buf2[200]; + + bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); + bch2_bkey_val_to_text(&PBUF(buf2), c, k); + + if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) + return fix_overlapping_extent(trans, k, prev.k->k.p) ?: -EINTR; + } +#endif + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) + return ret; + + if (fsck_err_on(ret == INT_MAX, c, + "extent in missing inode:\n %s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + + if (ret == INT_MAX) + return 0; + + i = inode->d + ret; + ret = 0; + + if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + + if (!bch2_snapshot_internal_node(c, k.k->p.snapshot)) { + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->type != KEY_TYPE_reservation && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9, c, + "extent type %u offset %llu past end of inode %llu, i_size %llu", + k.k->type, k.k->p.offset, k.k->p.inode, i->inode.bi_size)) { + bch2_fs_lazy_rw(c); + return bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(k.k->p.inode, round_up(i->inode.bi_size, block_bytes(c)) >> 9, + k.k->p.snapshot), + POS(k.k->p.inode, U64_MAX), + 0, NULL) ?: -EINTR; + } + } + } + + if (bkey_extent_is_allocation(k.k)) + for_each_visible_inode(c, s, inode, k.k->p.snapshot, i) + i->count += k.k->size; +#if 0 + bch2_bkey_buf_reassemble(&prev, c, k); +#endif + +fsck_err: + return ret; } /* @@ -617,111 +1265,208 @@ noinline_for_stack static int check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); + struct snapshots_seen s; struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_buf prev; - u64 i_sectors = 0; + struct btree_iter iter; int ret = 0; +#if 0 + struct bkey_buf prev; bch2_bkey_buf_init(&prev); prev.k->k = KEY(0, 0, 0); +#endif + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); bch_verbose(c, "checking extents"); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { - if (w.have_inode && - w.cur_inum != k.k->p.inode && - !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && - fsck_err_on(w.inode.bi_sectors != i_sectors, c, - "inode %llu has incorrect i_sectors: got %llu, should be %llu", - w.inode.bi_inum, - w.inode.bi_sectors, i_sectors)) { - w.inode.bi_sectors = i_sectors; - - ret = write_inode(&trans, &w.inode, w.snapshot); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + 
BTREE_ITER_ALL_SNAPSHOTS); + + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent(&trans, &iter, &w, &s)); + if (ret) + break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); +#if 0 + bch2_bkey_buf_exit(&prev, c); +#endif + inode_walker_exit(&w); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + + return ret; +} + +static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret = 0, ret2 = 0; + s64 count2; + + for (i = w->d; i < w->d + w->nr; i++) { + if (i->inode.bi_nlink == i->count) + continue; + + count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); + if (count2 < 0) + return count2; + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_nlink == i->count) + continue; + } + + if (fsck_err_on(i->inode.bi_nlink != i->count, c, + "directory %llu:%u with wrong i_nlink: got %u, should be %llu", + w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { + i->inode.bi_nlink = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); if (ret) break; + ret2 = -EINTR; } + } +fsck_err: + return ret ?: ret2; +} - if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { - char buf1[200]; - char buf2[200]; +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + bool backpointer_exists = true; + char buf[200]; + int ret = 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); - bch2_bkey_val_to_text(&PBUF(buf2), c, k); + if (!inode_points_to_dirent(target, d)) { + ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); + if (ret < 0) + goto err; - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) - return fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR; + backpointer_exists = ret; + ret = 0; + + if (fsck_err_on(S_ISDIR(target->bi_mode) && + backpointer_exists, c, + "directory %llu with multiple links", + target->bi_inum)) { + ret = __remove_dirent(trans, d.k->p); + if (ret) + goto err; + return 0; } - ret = walk_inode(&trans, &w, k.k->p.inode); - if (ret) - break; + if (fsck_err_on(backpointer_exists && + !target->bi_nlink, c, + "inode %llu has multiple links but i_nlink 0", + target->bi_inum)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_UNLINKED; - if (w.first_this_inode) - i_sectors = 0; - - if (fsck_err_on(!w.have_inode, c, - "extent type %u for missing inode %llu", - k.k->type, k.k->p.inode) || - fsck_err_on(w.have_inode && - !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, - "extent type %u for non regular file, inode %llu mode %o", - k.k->type, k.k->p.inode, w.inode.bi_mode)) { - bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - POS(k.k->p.inode, 0), - POS(k.k->p.inode, U64_MAX), - NULL) ?: -EINTR; + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; } - if (fsck_err_on(w.have_inode && - !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->type != KEY_TYPE_reservation && - 
k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, - "extent type %u offset %llu past end of inode %llu, i_size %llu", - k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { - bch2_fs_lazy_rw(c); - return bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9), - POS(k.k->p.inode, U64_MAX), - NULL) ?: -EINTR; + if (fsck_err_on(!backpointer_exists, c, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; } + } - if (bkey_extent_is_allocation(k.k)) - i_sectors += k.k->size; - bch2_bkey_buf_reassemble(&prev, c, k); + if (fsck_err_on(d.v->d_type != inode_d_type(target), c, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (bch2_bkey_val_to_text(&PBUF(buf), c, d.s_c), buf))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + return ret; - bch2_btree_iter_advance(iter); + d = dirent_i_to_s_c(n); } + + if (d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && + (c->sb.version < bcachefs_metadata_version_subvol_dirent || + fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + return ret; + + d = dirent_i_to_s_c(n); + } +err: fsck_err: - if (ret == -EINTR) - goto retry; - bch2_trans_iter_put(&trans, iter); - bch2_bkey_buf_exit(&prev, c); - return bch2_trans_exit(&trans) ?: ret; + return ret; } static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_hash_info *hash_info, - struct inode_walker *w, unsigned *nr_subdirs) + struct inode_walker *dir, + struct inode_walker *target, + struct snapshots_seen *s) { struct bch_fs *c = trans->c; struct bkey_s_c k; struct bkey_s_c_dirent d; - struct bch_inode_unpacked target; - u32 target_snapshot; - bool have_target; - bool backpointer_exists = true; - u64 d_inum; + struct inode_walker_entry *i; char buf[200]; int ret; @@ -733,38 +1478,47 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (w->have_inode && - w->cur_inum != k.k->p.inode && - fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c, - "directory %llu with wrong i_nlink: got %u, should be %u", - w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) { - w->inode.bi_nlink = *nr_subdirs; - ret = write_inode(trans, &w->inode, w->snapshot); - return ret ?: -EINTR; - } + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret < 0 ? 
ret : 0; - ret = __walk_inode(trans, w, k.k->p.inode); + ret = snapshots_seen_update(c, s, k.k->p); if (ret) return ret; - if (w->first_this_inode) - *nr_subdirs = 0; + if (k.k->type == KEY_TYPE_whiteout) + return 0; + + if (dir->cur_inum != k.k->p.inode) { + ret = check_subdir_count(trans, dir); + if (ret) + return ret; + } + + ret = __walk_inode(trans, dir, k.k->p); + if (ret < 0) + return ret; - if (fsck_err_on(!w->have_inode, c, + if (fsck_err_on(ret == INT_MAX, c, "dirent in nonexisting directory:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) || - fsck_err_on(!S_ISDIR(w->inode.bi_mode), c, - "dirent in non directory inode type %u:\n%s", - mode_to_type(w->inode.bi_mode), (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) - return __bch2_trans_do(trans, NULL, NULL, 0, - bch2_btree_delete_at(trans, iter, 0)); + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - if (!w->have_inode) + if (ret == INT_MAX) return 0; - if (w->first_this_inode) - *hash_info = bch2_hash_info_init(c, &w->inode); + i = dir->d + ret; + ret = 0; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) + return bch2_btree_delete_at(trans, iter, 0); + + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); @@ -777,105 +1531,76 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, return 0; d = bkey_s_c_to_dirent(k); - d_inum = le64_to_cpu(d.v->d_inum); - ret = __lookup_inode(trans, d_inum, &target, &target_snapshot); - if (ret && ret != -ENOENT) - return ret; + if (d.v->d_type == DT_SUBVOL) { + struct bch_inode_unpacked subvol_root; + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 target_snapshot; + u64 target_inum; - have_target = !ret; - ret = 0; + ret = __subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && ret != -ENOENT) + return ret; - if (fsck_err_on(!have_target, c, - "dirent points to missing inode:\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) - return remove_dirent(trans, d.k->p); + if (fsck_err_on(ret, c, + "dirent points to missing subvolume %llu", + le64_to_cpu(d.v->d_child_subvol))) + return __remove_dirent(trans, d.k->p); - if (!have_target) - return 0; + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && ret != -ENOENT) + return ret; + + if (fsck_err_on(ret, c, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } - if (!target.bi_dir && - !target.bi_dir_offset) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = __write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } - ret = __write_inode(trans, &target, target_snapshot) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); if (ret) return ret; - return -EINTR; - } - - if (!inode_backpointer_matches(d, &target)) { - ret = inode_backpointer_exists(trans, &target); - 
if (ret < 0) + } else { + ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) return ret; - backpointer_exists = ret; - ret = 0; - - if (fsck_err_on(S_ISDIR(target.bi_mode) && - backpointer_exists, c, - "directory %llu with multiple links", - target.bi_inum)) - return remove_dirent(trans, d.k->p); - - if (fsck_err_on(backpointer_exists && - !target.bi_nlink, c, - "inode %llu has multiple links but i_nlink 0", - d_inum)) { - target.bi_nlink++; - target.bi_flags &= ~BCH_INODE_UNLINKED; - - ret = write_inode(trans, &target, target_snapshot); - return ret ?: -EINTR; + if (fsck_err_on(!target->nr, c, + "dirent points to missing inode:\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, + k), buf))) { + ret = __remove_dirent(trans, d.k->p); + if (ret) + return ret; } - if (fsck_err_on(!backpointer_exists, c, - "inode %llu has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - d_inum, - target.bi_dir, - target.bi_dir_offset, - k.k->p.inode, - k.k->p.offset)) { - target.bi_dir = k.k->p.inode; - target.bi_dir_offset = k.k->p.offset; - - ret = write_inode(trans, &target, target_snapshot); - return ret ?: -EINTR; + for (i = target->d; i < target->d + target->nr; i++) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) + return ret; } } - if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c, - "incorrect d_type: should be %u:\n%s", - mode_to_type(target.bi_mode), - (bch2_bkey_val_to_text(&PBUF(buf), c, - k), buf))) { - struct bkey_i_dirent *n; - - n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); - if (!n) - return -ENOMEM; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = mode_to_type(target.bi_mode); - - ret = __bch2_trans_do(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, &n->k_i, 0)); - kfree(n); - return ret ?: -EINTR; - } + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) + i->count++; - *nr_subdirs += d.v->d_type == DT_DIR; - return 0; fsck_err: return ret; } @@ -887,31 +1612,83 @@ fsck_err: noinline_for_stack static int check_dirents(struct bch_fs *c) { - struct inode_walker w = inode_walker_init(); + struct inode_walker dir = inode_walker_init(); + struct inode_walker target = inode_walker_init(); + struct snapshots_seen s; struct bch_hash_info hash_info; struct btree_trans trans; - struct btree_iter *iter; - unsigned nr_subdirs = 0; + struct btree_iter iter; int ret = 0; bch_verbose(c, "checking dirents"); + snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); do { - ret = lockrestart_do(&trans, - check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs)); + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_dirent(&trans, &iter, &hash_info, + &dir, &target, &s)); if (ret) break; - } while (bch2_btree_iter_advance(iter)); - bch2_trans_iter_put(&trans, iter); + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + inode_walker_exit(&dir); + inode_walker_exit(&target); + return ret; +} + +static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + 
struct bch_hash_info *hash_info, + struct inode_walker *inode) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; + + ret = __walk_inode(trans, inode, k.k->p); + if (ret < 0) + return ret; + + if (fsck_err_on(ret == INT_MAX, c, + "xattr for missing inode %llu", + k.k->p.inode)) + return bch2_btree_delete_at(trans, iter, 0); + + if (ret == INT_MAX) + return 0; + + ret = 0; + + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &inode->d[0].inode); - return bch2_trans_exit(&trans) ?: ret; + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: + return ret; } /* @@ -920,90 +1697,101 @@ static int check_dirents(struct bch_fs *c) noinline_for_stack static int check_xattrs(struct bch_fs *c) { - struct inode_walker w = inode_walker_init(); + struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + struct btree_iter iter; int ret = 0; bch_verbose(c, "checking xattrs"); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, - POS(BCACHEFS_ROOT_INO, 0), - BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH); -retry: - while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k))) { - ret = walk_inode(&trans, &w, k.k->p.inode); - if (ret) - break; - - if (fsck_err_on(!w.have_inode, c, - "xattr for missing inode %llu", - k.k->p.inode)) { - ret = bch2_btree_delete_at(&trans, iter, 0); - if (ret) - break; - continue; - } - - if (w.first_this_inode && w.have_inode) - hash_info = bch2_hash_info_init(c, &w.inode); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - ret = hash_check_key(&trans, bch2_xattr_hash_desc, - &hash_info, iter, k); + do { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_xattr(&trans, &iter, &hash_info, + &inode)); if (ret) break; + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(&trans, &iter); - bch2_btree_iter_advance(iter); - } -fsck_err: - if (ret == -EINTR) - goto retry; - - bch2_trans_iter_put(&trans, iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } -/* Get root directory, create if it doesn't exist: */ -static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) +static int check_root_trans(struct btree_trans *trans) { - struct bkey_inode_buf packed; + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root_inode; u32 snapshot; + u64 inum; int ret; - bch_verbose(c, "checking root directory"); - - ret = bch2_trans_do(c, NULL, NULL, 0, - lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot)); + ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); if (ret && ret != -ENOENT) return ret; - if (fsck_err_on(ret, c, "root directory missing")) - goto create_root; + if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { + struct bkey_i_subvolume root_subvol; - if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, - "root inode not a directory")) - goto create_root; + snapshot = U32_MAX; + inum = BCACHEFS_ROOT_INO; - return 0; + bkey_subvolume_init(&root_subvol.k_i); + root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol.v.flags = 0; + 
root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + if (ret) { + bch_err(c, "error writing root subvol: %i", ret); + goto err; + } + + } + + ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + if (ret && ret != -ENOENT) + return ret; + + if (mustfix_fsck_err_on(ret, c, "root directory missing") || + mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, + "root inode not a directory")) { + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, + 0, NULL); + root_inode.bi_inum = inum; + + ret = __write_inode(trans, &root_inode, snapshot); + if (ret) + bch_err(c, "error writing root inode: %i", ret); + } +err: fsck_err: return ret; -create_root: - bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, - 0, NULL); - root_inode->bi_inum = BCACHEFS_ROOT_INO; +} - bch2_inode_pack(c, &packed, root_inode); +/* Get root directory, create if it doesn't exist: */ +noinline_for_stack +static int check_root(struct bch_fs *c) +{ + bch_verbose(c, "checking root directory"); - return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); + return bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + check_root_trans(&trans)); } struct pathbuf { @@ -1012,10 +1800,24 @@ struct pathbuf { struct pathbuf_entry { u64 inum; + u32 snapshot; } *entries; }; -static int path_down(struct pathbuf *p, u64 inum) +static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) +{ + struct pathbuf_entry *i; + + for (i = p->entries; i < p->entries + p->nr; i++) + if (i->inum == inum && + i->snapshot == snapshot) + return true; + + return false; +} + +static int path_down(struct bch_fs *c, struct pathbuf *p, + u64 inum, u32 snapshot) { if (p->nr == p->size) { size_t new_size = max_t(size_t, 256UL, p->size * 2); @@ -1023,6 +1825,8 @@ static int path_down(struct pathbuf *p, u64 inum) new_size * sizeof(p->entries[0]), GFP_KERNEL); if (!n) { + bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", + new_size); return -ENOMEM; } @@ -1031,73 +1835,109 @@ static int path_down(struct pathbuf *p, u64 inum) }; p->entries[p->nr++] = (struct pathbuf_entry) { - .inum = inum, + .inum = inum, + .snapshot = snapshot, }; return 0; } +/* + * Check that a given inode is reachable from the root: + * + * XXX: we should also be verifying that inodes are in the right subvolumes + */ static int check_path(struct btree_trans *trans, struct pathbuf *p, - struct bch_inode_unpacked *inode) + struct bch_inode_unpacked *inode, + u32 snapshot) { struct bch_fs *c = trans->c; - u32 snapshot; - size_t i; int ret = 0; + snapshot = snapshot_t(c, snapshot)->equiv; p->nr = 0; - while (inode->bi_inum != BCACHEFS_ROOT_INO) { + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + struct btree_iter dirent_iter; + struct bkey_s_c_dirent d; + u32 parent_snapshot = snapshot; + + if (inode->bi_subvol) { + u64 inum; + + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &parent_snapshot, &inum); + if (ret) + break; + } + ret = lockrestart_do(trans, - inode_backpointer_exists(trans, inode)); - if (ret < 0) + PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, + parent_snapshot))).k)); + if (ret && ret != -ENOENT) break; - if (!ret) { - if (fsck_err(c, "unreachable 
inode %llu, type %u nlink %u backptr %llu:%llu", - inode->bi_inum, - mode_to_type(inode->bi_mode), + if (!ret && !dirent_points_to_inode(d, inode)) { + bch2_trans_iter_exit(trans, &dirent_iter); + ret = -ENOENT; + } + + if (ret == -ENOENT) { + if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, + bch2_d_type_str(inode_d_type(inode)), inode->bi_nlink, inode->bi_dir, inode->bi_dir_offset)) - ret = reattach_inode(trans, inode); + ret = reattach_inode(trans, inode, snapshot); break; } - ret = 0; + + bch2_trans_iter_exit(trans, &dirent_iter); if (!S_ISDIR(inode->bi_mode)) break; - ret = path_down(p, inode->bi_inum); + ret = path_down(c, p, inode->bi_inum, snapshot); if (ret) { bch_err(c, "memory allocation failure"); return ret; } - for (i = 0; i < p->nr; i++) { - if (inode->bi_dir != p->entries[i].inum) - continue; + snapshot = parent_snapshot; + + ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + if (ret) { + /* Should have been caught in dirents pass */ + bch_err(c, "error looking up parent directory: %i", ret); + break; + } + + if (path_is_dup(p, inode->bi_inum, snapshot)) { + struct pathbuf_entry *i; /* XXX print path */ + bch_err(c, "directory structure loop"); + + for (i = p->entries; i < p->entries + p->nr; i++) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + if (!fsck_err(c, "directory structure loop")) return 0; - ret = lockrestart_do(trans, - remove_backpointer(trans, inode)); + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + remove_backpointer(trans, inode)); if (ret) { bch_err(c, "error removing dirent: %i", ret); break; } - ret = reattach_inode(trans, inode); - break; - } - - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); - if (ret) { - /* Should have been caught in dirents pass */ - bch_err(c, "error looking up parent directory: %i", ret); - break; + ret = reattach_inode(trans, inode, snapshot); } } fsck_err: @@ -1111,10 +1951,11 @@ fsck_err: * After check_dirents(), if an inode backpointer doesn't exist that means it's * unreachable: */ +noinline_for_stack static int check_directory_structure(struct bch_fs *c) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; struct pathbuf path = { 0, 0, NULL }; @@ -1124,28 +1965,33 @@ static int check_directory_structure(struct bch_fs *c) for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_inode) + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (!bkey_is_inode(k.k)) continue; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); + ret = bch2_inode_unpack(k, &u); if (ret) { /* Should have been caught earlier in fsck: */ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); break; } - ret = check_path(&trans, &path, &u); + if (u.bi_flags & BCH_INODE_UNLINKED) + continue; + + ret = check_path(&trans, &path, &u, iter.pos.snapshot); if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(ret == -EINTR); kfree(path.entries); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } struct nlink_table { @@ -1159,12 +2005,15 @@ struct nlink_table { } *d; }; -static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot) +static int add_nlink(struct bch_fs *c, struct nlink_table *t, + u64 inum, u32 snapshot) { if (t->nr == 
t->size) { size_t new_size = max_t(size_t, 128UL, t->size * 2); void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL); if (!d) { + bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", + new_size); return -ENOMEM; } @@ -1193,8 +2042,9 @@ static int nlink_cmp(const void *_l, const void *_r) return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); } -static void inc_link(struct bch_fs *c, struct nlink_table *links, - u64 range_start, u64 range_end, u64 inum) +static void inc_link(struct bch_fs *c, struct snapshots_seen *s, + struct nlink_table *links, + u64 range_start, u64 range_end, u64 inum, u32 snapshot) { struct nlink *link, key = { .inum = inum, .snapshot = U32_MAX, @@ -1205,8 +2055,18 @@ static void inc_link(struct bch_fs *c, struct nlink_table *links, link = __inline_bsearch(&key, links->d, links->nr, sizeof(links->d[0]), nlink_cmp); - if (link) - link->count++; + if (!link) + return; + + while (link > links->d && link[0].inum == link[-1].inum) + --link; + + for (; link < links->d + links->nr && link->inum == inum; link++) + if (ref_visible(c, s, snapshot, link->snapshot)) { + link->count++; + if (link->snapshot >= snapshot) + break; + } } noinline_for_stack @@ -1215,9 +2075,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, u64 start, u64 *end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_inode inode; struct bch_inode_unpacked u; int ret = 0; @@ -1226,26 +2085,25 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, start), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { - if (k.k->type != KEY_TYPE_inode) + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (!bkey_is_inode(k.k)) continue; - inode = bkey_s_c_to_inode(k); + /* Should never fail, checked by bch2_inode_invalid: */ + BUG_ON(bch2_inode_unpack(k, &u)); /* * Backpointer and directory structure checks are sufficient for * directories, since they can't have hardlinks: */ - if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) + if (S_ISDIR(le16_to_cpu(u.bi_mode))) continue; - /* Should never fail, checked by bch2_inode_invalid: */ - BUG_ON(bch2_inode_unpack(inode, &u)); - if (!u.bi_nlink) continue; - ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot); + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); if (ret) { *end = k.k->p.offset; ret = 0; @@ -1253,7 +2111,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret) @@ -1267,34 +2125,43 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links u64 range_start, u64 range_end) { struct btree_trans trans; - struct btree_iter *iter; + struct snapshots_seen s; + struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent d; int ret; + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = snapshots_seen_update(c, &s, k.k->p); + if (ret) + break; + switch (k.k->type) { case KEY_TYPE_dirent: d = bkey_s_c_to_dirent(k); - if (d.v->d_type != DT_DIR) - inc_link(c, links, range_start, range_end, - le64_to_cpu(d.v->d_inum)); + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + 
le64_to_cpu(d.v->d_inum), + d.k->p.snapshot); break; } - - bch2_trans_cond_resched(&trans); } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; if (ret) bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); return ret; } @@ -1304,9 +2171,8 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, u64 range_start, u64 range_end) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; - struct bkey_s_c_inode inode; struct bch_inode_unpacked u; struct nlink *link = links->d; int ret = 0; @@ -1316,23 +2182,24 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS(0, range_start), BTREE_ITER_INTENT| - BTREE_ITER_PREFETCH, k, ret) { + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { if (k.k->p.offset >= range_end) break; - if (k.k->type != KEY_TYPE_inode) + if (!bkey_is_inode(k.k)) continue; - inode = bkey_s_c_to_inode(k); - if (S_ISDIR(le16_to_cpu(inode.v->bi_mode))) - continue; + BUG_ON(bch2_inode_unpack(k, &u)); - BUG_ON(bch2_inode_unpack(inode, &u)); + if (S_ISDIR(le16_to_cpu(u.bi_mode))) + continue; if (!u.bi_nlink) continue; - while (link->inum < k.k->p.offset) { + while ((cmp_int(link->inum, k.k->p.offset) ?: + cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { link++; BUG_ON(link >= links->d + links->nr); } @@ -1343,17 +2210,13 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __bch2_trans_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_btree_iter_traverse(iter) ?: - bch2_inode_write(&trans, iter, &u)); + ret = write_inode(&trans, &u, k.k->p.snapshot); if (ret) bch_err(c, "error in fsck: error %i updating inode", ret); } } fsck_err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret) @@ -1399,21 +2262,91 @@ static int check_nlinks(struct bch_fs *c) return ret; } +static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter) +{ + struct bkey_s_c k; + struct bkey_s_c_reflink_p p; + struct bkey_i_reflink_p *u; + int ret; + + k = bch2_btree_iter_peek(iter); + if (!k.k) + return 0; + + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_reflink_p) + return 0; + + p = bkey_s_c_to_reflink_p(k); + + if (!p.v->front_pad && !p.v->back_pad) + return 0; + + u = bch2_trans_kmalloc(trans, sizeof(*u)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(&u->k_i, k); + u->v.front_pad = 0; + u->v.back_pad = 0; + + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); +} + +noinline_for_stack +static int fix_reflink_p(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) + return 0; + + bch_verbose(c, "fixing reflink_p keys"); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->type == KEY_TYPE_reflink_p) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(&trans, &iter)); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + 
bch2_trans_exit(&trans); + return ret; +} + /* * Checks for inconsistencies that shouldn't happen, unless we have a bug. * Doesn't fix them yet, mainly because they haven't yet been observed: */ int bch2_fsck_full(struct bch_fs *c) { - struct bch_inode_unpacked root_inode; - - return check_inodes(c, true) ?: + return bch2_fs_snapshots_check(c) ?: + check_inodes(c, true) ?: + check_subvols(c) ?: check_extents(c) ?: check_dirents(c) ?: check_xattrs(c) ?: - check_root(c, &root_inode) ?: + check_root(c) ?: check_directory_structure(c) ?: - check_nlinks(c); + check_nlinks(c) ?: + fix_reflink_p(c); } int bch2_fsck_walk_inodes_only(struct bch_fs *c) diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 3b67108..78e2db6 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -4,10 +4,13 @@ #include "btree_key_cache.h" #include "bkey_methods.h" #include "btree_update.h" +#include "buckets.h" #include "error.h" #include "extents.h" +#include "extent_update.h" #include "inode.h" #include "str_hash.h" +#include "subvolume.h" #include "varint.h" #include @@ -22,39 +25,6 @@ const char * const bch2_inode_opts[] = { }; static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -static const u8 bits_table[8] = { - 1 * 8 - 1, - 2 * 8 - 2, - 3 * 8 - 3, - 4 * 8 - 4, - 6 * 8 - 5, - 8 * 8 - 6, - 10 * 8 - 7, - 13 * 8 - 8, -}; - -static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) -{ - __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; - unsigned shift, bytes, bits = likely(!hi) - ? fls64(lo) - : fls64(hi) + 64; - - for (shift = 1; shift <= 8; shift++) - if (bits < bits_table[shift - 1]) - goto got_shift; - - BUG(); -got_shift: - bytes = byte_table[shift - 1]; - - BUG_ON(out + bytes > end); - - memcpy(out, (u8 *) in + 16 - bytes, bytes); - *out |= (1 << 8) >> shift; - - return bytes; -} static int inode_decode_field(const u8 *in, const u8 *end, u64 out[2], unsigned *out_bits) @@ -90,42 +60,11 @@ static int inode_decode_field(const u8 *in, const u8 *end, return bytes; } -static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - struct bkey_i_inode *k = &packed->inode; - u8 *out = k->v.fields; - u8 *end = (void *) &packed[1]; - u8 *last_nonzero_field = out; - unsigned nr_fields = 0, last_nonzero_fieldnr = 0; - unsigned bytes; - -#define x(_name, _bits) \ - out += inode_encode_field(out, end, 0, inode->_name); \ - nr_fields++; \ - \ - if (inode->_name) { \ - last_nonzero_field = out; \ - last_nonzero_fieldnr = nr_fields; \ - } - - BCH_INODE_FIELDS() -#undef x - - out = last_nonzero_field; - nr_fields = last_nonzero_fieldnr; - - bytes = out - (u8 *) &packed->inode.v; - set_bkey_val_bytes(&packed->inode.k, bytes); - memset_u64s_tail(&packed->inode.v, 0, bytes); - - SET_INODE_NR_FIELDS(&k->v, nr_fields); -} - -static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) +void bch2_inode_pack(struct bch_fs *c, + struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) { - struct bkey_i_inode *k = &packed->inode; + struct bkey_i_inode_v2 *k = &packed->inode; u8 *out = k->v.fields; u8 *end = (void *) &packed[1]; u8 *last_nonzero_field = out; @@ -133,6 +72,14 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, unsigned bytes; int ret; + bkey_inode_v2_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + 
packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); + packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); + #define x(_name, _bits) \ nr_fields++; \ \ @@ -163,30 +110,12 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, set_bkey_val_bytes(&packed->inode.k, bytes); memset_u64s_tail(&packed->inode.v, 0, bytes); - SET_INODE_NR_FIELDS(&k->v, nr_fields); -} - -void bch2_inode_pack(struct bch_fs *c, - struct bkey_inode_buf *packed, - const struct bch_inode_unpacked *inode) -{ - bkey_inode_init(&packed->inode.k_i); - packed->inode.k.p.offset = inode->bi_inum; - packed->inode.v.bi_hash_seed = inode->bi_hash_seed; - packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); - packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); - - if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { - SET_INODE_NEW_VARINT(&packed->inode.v, true); - bch2_inode_pack_v2(packed, inode); - } else { - bch2_inode_pack_v1(packed, inode); - } + SET_INODEv2_NR_FIELDS(&k->v, nr_fields); if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { struct bch_inode_unpacked unpacked; - int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), + int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), &unpacked); BUG_ON(ret); BUG_ON(unpacked.bi_inum != inode->bi_inum); @@ -235,17 +164,16 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, return 0; } -static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, - struct bch_inode_unpacked *unpacked) +static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, + const u8 *in, const u8 *end, + unsigned nr_fields) { - const u8 *in = inode.v->fields; - const u8 *end = bkey_val_end(inode); unsigned fieldnr = 0; int ret; u64 v[2]; #define x(_name, _bits) \ - if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ + if (fieldnr < nr_fields) { \ ret = bch2_varint_decode_fast(in, end, &v[0]); \ if (ret < 0) \ return ret; \ @@ -275,52 +203,79 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, return 0; } -int bch2_inode_unpack(struct bkey_s_c_inode inode, +int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - unpacked->bi_inum = inode.k->p.offset; - unpacked->bi_hash_seed = inode.v->bi_hash_seed; - unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); - unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + switch (k.k->type) { + case KEY_TYPE_inode: { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - if (INODE_NEW_VARINT(inode.v)) { - return bch2_inode_unpack_v2(inode, unpacked); - } else { - return bch2_inode_unpack_v1(inode, unpacked); + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= 0; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODE_NR_FIELDS(inode.v)); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } + break; + } + case KEY_TYPE_inode_v2: { + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODEv2_NR_FIELDS(inode.v)); + } + 
default: + BUG(); } - - return 0; } -struct btree_iter *bch2_inode_peek(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u64 inum, unsigned flags) +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) { - struct btree_iter *iter; struct bkey_s_c k; + u32 snapshot; int ret; - if (trans->c->opts.inodes_use_key_cache) - flags |= BTREE_ITER_CACHED; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags); + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_CACHED); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; - ret = k.k->type == KEY_TYPE_inode ? 0 : -ENOENT; + ret = bkey_is_inode(k.k) ? 0 : -ENOENT; if (ret) goto err; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bch2_inode_unpack(k, inode); if (ret) goto err; - return iter; + return 0; err: - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } int bch2_inode_write(struct btree_trans *trans, @@ -340,8 +295,8 @@ int bch2_inode_write(struct btree_trans *trans, const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - struct bch_inode_unpacked unpacked; + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + struct bch_inode_unpacked unpacked; if (k.k->p.inode) return "nonzero k.p.inode"; @@ -355,7 +310,7 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) return "invalid str hash type"; - if (bch2_inode_unpack(inode, &unpacked)) + if (bch2_inode_unpack(k, &unpacked)) return "invalid variable length fields"; if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) @@ -368,15 +323,56 @@ const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) unpacked.bi_nlink != 0) return "flagged as unlinked but bi_nlink != 0"; + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) + return "subvolume root but not a directory"; + + return NULL; +} + +const char *bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + struct bch_inode_unpacked unpacked; + + if (k.k->p.inode) + return "nonzero k.p.inode"; + + if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) + return "incorrect value size"; + + if (k.k->p.offset < BLOCKDEV_INODE_MAX) + return "fs inode in blockdev range"; + + if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) + return "invalid str hash type"; + + if (bch2_inode_unpack(k, &unpacked)) + return "invalid variable length fields"; + + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) + return "invalid data checksum type"; + + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) + return "invalid data checksum type"; + + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && + unpacked.bi_nlink != 0) + return "flagged as unlinked but bi_nlink != 0"; + + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) + return "subvolume root but not a directory"; + return NULL; } static void __bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) { - pr_buf(out, "mode %o flags %x ", inode->bi_mode, inode->bi_flags); + pr_buf(out, "mode %o flags %x journal_seq %llu", + inode->bi_mode, inode->bi_flags, + inode->bi_journal_seq); 
#define x(_name, _bits) \ - pr_buf(out, #_name " %llu ", (u64) inode->_name); + pr_buf(out, " "#_name " %llu", (u64) inode->_name); BCH_INODE_FIELDS() #undef x } @@ -390,15 +386,14 @@ void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - struct bch_inode_unpacked unpacked; + struct bch_inode_unpacked inode; - if (bch2_inode_unpack(inode, &unpacked)) { + if (bch2_inode_unpack(k, &inode)) { pr_buf(out, "(unpack error)"); return; } - __bch2_inode_unpacked_to_text(out, &unpacked); + __bch2_inode_unpacked_to_text(out, &inode); } const char *bch2_inode_generation_invalid(const struct bch_fs *c, @@ -474,6 +469,7 @@ static inline u32 bkey_generation(struct bkey_s_c k) { switch (k.k->type) { case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: BUG(); case KEY_TYPE_inode_generation: return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); @@ -482,12 +478,15 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -struct btree_iter *bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) +/* + * This just finds an empty slot: + */ +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) { struct bch_fs *c = trans->c; - struct btree_iter *iter = NULL; struct bkey_s_c k; u64 min, max, start, pos, *hint; int ret = 0; @@ -513,9 +512,9 @@ struct btree_iter *bch2_inode_create(struct btree_trans *trans, start = min; pos = start; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos), - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -528,7 +527,7 @@ again: } if (k.k->p.snapshot == snapshot && - k.k->type != KEY_TYPE_inode && + !bkey_is_inode(k.k) && !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { bch2_btree_iter_advance(iter); continue; @@ -553,8 +552,8 @@ again: ret = -ENOSPC; if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } /* Retry from start */ @@ -566,36 +565,80 @@ found_slot: k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) { - bch2_trans_iter_put(trans, iter); - return ERR_PTR(ret); + bch2_trans_iter_exit(trans, iter); + return ret; } /* We may have raced while the iterator wasn't pointing at pos: */ - if (k.k->type == KEY_TYPE_inode || + if (bkey_is_inode(k.k) || bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) goto again; *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); - return iter; + return 0; } -int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + u32 snapshot; + int ret = 0; + + /* + * We're never going to be deleting extents, no need to use an extent + * iterator: + */ + bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + + while (1) { + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + 
bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || iter.pos.inode != inum.inum) + break; + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + if (ret && ret != -EINTR) + break; + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans trans; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; - struct bpos start = POS(inode_nr, 0); - struct bpos end = POS(inode_nr + 1, 0); struct bch_inode_unpacked inode_u; struct bkey_s_c k; - unsigned iter_flags = BTREE_ITER_INTENT; + u32 snapshot; int ret; - if (cached && c->opts.inodes_use_key_cache) - iter_flags |= BTREE_ITER_CACHED; - bch2_trans_init(&trans, c, 0, 1024); /* @@ -606,44 +649,49 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs, - start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents, - start, end, NULL); + ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); if (ret) goto err; retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, - POS(0, inode_nr), iter_flags); - k = bch2_btree_iter_peek_slot(iter); + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (k.k->type != KEY_TYPE_inode) { + if (!bkey_is_inode(k.k)) { bch2_fs_inconsistent(trans.c, "inode %llu not found when deleting", - inode_nr); + inum.inum); ret = -EIO; goto err; } - bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? 
*/ + BUG_ON(inode_u.bi_subvol); bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter->pos; + delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - ret = bch2_trans_update(&trans, iter, &delete.k_i, 0) ?: + ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; @@ -651,21 +699,22 @@ err: return ret; } -static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_inode_peek(trans, inode, inode_nr, 0); - ret = PTR_ERR_OR_ZERO(iter); - bch2_trans_iter_put(trans, iter); + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); + bch2_inode_find_by_inum_trans(&trans, inum, inode)); } diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index d67af4f..77957cc 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -7,6 +7,7 @@ extern const char * const bch2_inode_opts[]; const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); +const char *bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode (struct bkey_ops) { \ @@ -14,6 +15,17 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_inode_to_text, \ } +#define bch2_bkey_ops_inode_v2 (struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ +} + +static inline bool bkey_is_inode(const struct bkey *k) +{ + return k->type == KEY_TYPE_inode || + k->type == KEY_TYPE_inode_v2; +} + const char *bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, @@ -34,6 +46,7 @@ typedef u64 u96; struct bch_inode_unpacked { u64 bi_inum; + u64 bi_journal_seq; __le64 bi_hash_seed; u32 bi_flags; u16 bi_mode; @@ -44,7 +57,7 @@ struct bch_inode_unpacked { }; struct bkey_inode_buf { - struct bkey_i_inode inode; + struct bkey_i_inode_v2 inode; #define x(_name, _bits) + 8 + _bits / 8 u8 _pad[0 + BCH_INODE_FIELDS()]; @@ -53,12 +66,12 @@ struct bkey_inode_buf { void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, const struct bch_inode_unpacked *); -int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); -struct btree_iter *bch2_inode_peek(struct btree_trans *, - struct bch_inode_unpacked *, u64, unsigned); +int bch2_inode_peek(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_write(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *); @@ -71,12 +84,15 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, 
gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -struct btree_iter *bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, u32, u64); +int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); -int bch2_inode_rm(struct bch_fs *, u64, bool); +int bch2_inode_rm(struct bch_fs *, subvol_inum); -int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) { @@ -133,6 +149,11 @@ static inline u8 mode_to_type(umode_t mode) return (mode >> 12) & 15; } +static inline u8 inode_d_type(struct bch_inode_unpacked *inode) +{ + return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 4585a40..10f8b3a 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -27,6 +27,7 @@ #include "keylist.h" #include "move.h" #include "rebalance.h" +#include "subvolume.h" #include "super.h" #include "super-io.h" @@ -186,26 +187,24 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, int bch2_sum_sector_overwrites(struct btree_trans *trans, struct btree_iter *extent_iter, struct bkey_i *new, - bool *maybe_extending, bool *usage_increasing, s64 *i_sectors_delta, s64 *disk_sectors_delta) { struct bch_fs *c = trans->c; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c old; unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; - *maybe_extending = true; *usage_increasing = false; *i_sectors_delta = 0; *disk_sectors_delta = 0; - iter = bch2_trans_copy_iter(trans, extent_iter); + bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); @@ -220,42 +219,21 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, : 0; if (!*usage_increasing && - (new_replicas > bch2_bkey_replicas(c, old) || + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; - if (bkey_cmp(old.k->p, new->k.p) >= 0) { - /* - * Check if there's already data above where we're - * going to be writing to - this means we're definitely - * not extending the file: - * - * Note that it's not sufficient to check if there's - * data up to the sector offset we're going to be - * writing to, because i_size could be up to one block - * less: - */ - if (!bkey_cmp(old.k->p, new->k.p)) { - old = bch2_btree_iter_next(iter); - ret = bkey_err(old); - if (ret) - break; - } - - if (old.k && !bkey_err(old) && - old.k->p.inode == extent_iter->pos.inode && - bkey_extent_is_data(old.k)) - *maybe_extending = false; - + if (bkey_cmp(old.k->p, new->k.p) >= 0) break; - } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, struct btree_iter *iter, struct bkey_i *k, struct disk_reservation *disk_res, @@ 
-264,213 +242,208 @@ int bch2_extent_update(struct btree_trans *trans, s64 *i_sectors_delta_total, bool check_enospc) { - /* this must live until after bch2_trans_commit(): */ - struct bkey_inode_buf inode_p; - bool extending = false, usage_increasing; + struct btree_iter inode_iter; + struct bch_inode_unpacked inode_u; + struct bpos next_pos; + bool usage_increasing; s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; - ret = bch2_extent_trim_atomic(k, iter); + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); if (ret) return ret; + ret = bch2_extent_trim_atomic(trans, iter, k); + if (ret) + return ret; + + new_i_size = min(k->k.p.offset << 9, new_i_size); + next_pos = k->k.p; + ret = bch2_sum_sector_overwrites(trans, iter, k, - &extending, &usage_increasing, &i_sectors_delta, &disk_sectors_delta); if (ret) return ret; - if (!usage_increasing) - check_enospc = false; - if (disk_res && disk_sectors_delta > (s64) disk_res->sectors) { ret = bch2_disk_reservation_add(trans->c, disk_res, disk_sectors_delta - disk_res->sectors, - !check_enospc + !check_enospc || !usage_increasing ? BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) return ret; } - new_i_size = extending - ? min(k->k.p.offset << 9, new_i_size) - : 0; - - if (i_sectors_delta || new_i_size) { - struct btree_iter *inode_iter; - struct bch_inode_unpacked inode_u; - - inode_iter = bch2_inode_peek(trans, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); - if (ret) - return ret; - - /* - * XXX: - * writeback can race a bit with truncate, because truncate - * first updates the inode then truncates the pagecache. This is - * ugly, but lets us preserve the invariant that the in memory - * i_size is always >= the on disk i_size. 
- * - BUG_ON(new_i_size > inode_u.bi_size && - (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); - */ - BUG_ON(new_i_size > inode_u.bi_size && !extending); - - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) - inode_u.bi_size = new_i_size; - else - new_i_size = 0; - - inode_u.bi_sectors += i_sectors_delta; - - if (i_sectors_delta || new_i_size) { - bch2_inode_pack(trans->c, &inode_p, &inode_u); - - inode_p.inode.k.p.snapshot = iter->snapshot; - - ret = bch2_trans_update(trans, inode_iter, - &inode_p.inode.k_i, 0); - } + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, + BTREE_ITER_INTENT); + if (ret) + return ret; - bch2_trans_iter_put(trans, inode_iter); + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) + inode_u.bi_size = new_i_size; - if (ret) - return ret; - } + inode_u.bi_sectors += i_sectors_delta; ret = bch2_trans_update(trans, iter, k, 0) ?: + bch2_inode_write(trans, &inode_iter, &inode_u) ?: bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL); - BUG_ON(ret == -ENOSPC); + bch2_trans_iter_exit(trans, &inode_iter); + if (ret) return ret; if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); + return 0; } +/* + * Returns -EINTR if we had to drop locks: + */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u64 *journal_seq, + subvol_inum inum, u64 end, s64 *i_sectors_delta) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); struct bkey_s_c k; int ret = 0, ret2 = 0; + u32 snapshot; - while ((bch2_trans_begin(trans), - (k = bch2_btree_iter_peek(iter)).k) && - bkey_cmp(iter->pos, end) < 0) { + while (!ret || ret == -EINTR) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + if (ret) + ret2 = ret; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + k = bch2_btree_iter_peek(iter); + if (bkey_cmp(iter->pos, end_pos) >= 0) { + bch2_btree_iter_set_pos(iter, end_pos); + break; + } + ret = bkey_err(k); if (ret) - goto btree_err; + continue; bkey_init(&delete.k); delete.k.p = iter->pos; /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); + bch2_cut_back(end_pos, &delete); - ret = bch2_extent_update(trans, iter, &delete, - &disk_res, journal_seq, + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, NULL, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); -btree_err: - if (ret == -EINTR) { - ret2 = ret; - ret = 0; - } - if (ret) - break; - } - - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); } return ret ?: ret2; } -int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, - u64 *journal_seq, s64 *i_sectors_delta) +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) { struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; + struct btree_iter iter; + int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(inum, start), - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); - ret = 
bch2_fpunch_at(&trans, iter, POS(inum, end), - journal_seq, i_sectors_delta); + ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - if (ret == -EINTR) - ret = 0; - - return ret; + return ret == -EINTR ? 0 : ret; } int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; struct bkey_buf sk; + struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; int ret; + BUG_ON(!inum.subvol); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - do { bch2_trans_begin(&trans); k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); - k->k.p.snapshot = iter->snapshot; + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); + if (ret == -EINTR) + continue; + if (ret) + break; - bch2_bkey_buf_realloc(&sk, c, k->k.u64s); - bkey_copy(sk.k, k); - bch2_cut_front(iter->pos, sk.k); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_extent_update(&trans, iter, sk.k, + ret = bch2_extent_update(&trans, inum, &iter, sk.k, &op->res, op_journal_seq(op), op->new_i_size, &op->i_sectors_delta, op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + if (ret == -EINTR) continue; if (ret) break; - if (bkey_cmp(iter->pos, k->k.p) >= 0) - bch2_keylist_pop_front(keys); + if (ec_ob) + bch2_ob_add_backpointer(c, ec_ob, &sk.k->k); + + if (bkey_cmp(iter.pos, k->k.p) >= 0) + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); - bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); @@ -692,11 +665,7 @@ static void init_append_extent(struct bch_write_op *op, { struct bch_fs *c = op->c; struct bkey_i_extent *e; - struct open_bucket *ob; - unsigned i; - BUG_ON(crc.compressed_size > wp->sectors_free); - wp->sectors_free -= crc.compressed_size; op->pos.offset += crc.uncompressed_size; e = bkey_extent_init(op->insert_keys.top); @@ -709,22 +678,8 @@ static void init_append_extent(struct bch_write_op *op, crc.nonce) bch2_extent_crc_append(&e->k_i, crc); - open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - union bch_extent_entry *end = - bkey_val_end(bkey_i_to_s(&e->k_i)); - - end->ptr = ob->ptr; - end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - end->ptr.cached = !ca->mi.durability || - (op->flags & BCH_WRITE_CACHED) != 0; - end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; - - e->k.u64s++; - - BUG_ON(crc.compressed_size > ob->sectors_free); - ob->sectors_free -= crc.compressed_size; - } + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_CACHED); bch2_keylist_push(&op->insert_keys); } @@ -744,6 +699,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ? 
((unsigned long) buf & (PAGE_SIZE - 1)) : 0), PAGE_SIZE); + pages = min(pages, BIO_MAX_VECS); + bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = true; @@ -763,7 +720,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, */ bch2_bio_alloc_pages_pool(c, bio, min_t(unsigned, output_available, - c->sb.encoded_extent_max << 9)); + c->opts.encoded_extent_max)); if (bio->bi_iter.bi_size < output_available) *page_alloc_failed = @@ -909,7 +866,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; void *ec_buf; - struct bpos ec_pos = op->pos; unsigned total_output = 0, total_input = 0; bool bounce = false; bool page_alloc_failed = false; @@ -926,7 +882,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ret = -EIO; goto err; case PREP_ENCODED_CHECKSUM_ERR: - BUG(); goto csum_err; case PREP_ENCODED_DO_WRITE: /* XXX look for bug here */ @@ -962,8 +917,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, size_t dst_len, src_len; if (page_alloc_failed && - bio_sectors(dst) < wp->sectors_free && - bio_sectors(dst) < c->sb.encoded_extent_max) + dst->bi_iter.bi_size < (wp->sectors_free << 9) && + dst->bi_iter.bi_size < c->opts.encoded_extent_max) break; BUG_ON(op->compression_type && @@ -983,7 +938,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if (op->csum_type) dst_len = min_t(unsigned, dst_len, - c->sb.encoded_extent_max << 9); + c->opts.encoded_extent_max); if (bounce) { swap(dst->bi_iter.bi_size, dst_len); @@ -1080,9 +1035,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, dst->bi_iter.bi_size = total_output; do_write: - /* might have done a realloc... 
*/ - bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); - *_dst = dst; return more; csum_err: @@ -1141,7 +1093,7 @@ again: */ wp = bch2_alloc_sectors_start(c, op->target, - op->opts.erasure_code, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), op->write_point, &op->devs_have, op->nr_replicas, @@ -1319,7 +1271,7 @@ void bch2_write(struct closure *cl) bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; - if (bio_sectors(bio) & (c->opts.block_size - 1)) { + if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { bch_err_inum_ratelimited(c, op->pos.inode, "misaligned write"); op->error = -EIO; @@ -1631,12 +1583,12 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) } static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; int ret; @@ -1647,12 +1599,12 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, rbio->data_btree, - rbio->read_pos, BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); if (bkey_err(k)) goto err; @@ -1679,7 +1631,7 @@ retry: goto err; out: bch2_rbio_done(rbio); - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return; @@ -1695,7 +1647,10 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->read_pos.inode; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); @@ -1711,12 +1666,12 @@ static void bch2_rbio_retry(struct work_struct *work) flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); } else { flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - __bch2_read(c, rbio, iter, inode, &failed, flags); + __bch2_read(c, rbio, iter, inum, &failed, flags); } } @@ -1745,7 +1700,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_fs *c = rbio->c; u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; - struct btree_iter *iter = NULL; + struct btree_iter iter; struct bkey_i *new; struct bkey_s_c k; int ret = 0; @@ -1753,9 +1708,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto out; @@ -1790,9 +1745,10 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - ret = bch2_trans_update(trans, iter, new, 0); + 
ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1937,9 +1893,8 @@ static void bch2_read_endio(struct bio *bio) return; } - if (rbio->pick.ptr.cached && - (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(ca, &rbio->pick.ptr))) { + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -1963,7 +1918,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, struct bkey_buf *orig_k) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 reflink_offset; int ret; @@ -1971,10 +1926,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; - iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, - POS(0, reflink_offset), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -1991,13 +1946,40 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, goto err; } - *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); bch2_bkey_buf_reassemble(orig_k, trans->c, k); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + char buf[200]; + int ret; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch2_fs_inconsistent(c, "Attempting to read from stale dirty pointer: %s", buf); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ptr.dev, PTR_BUCKET_NR(ca, &ptr)), + BTREE_ITER_CACHED); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) + return; + + bch2_bkey_val_to_text(&PBUF(buf), c, k); + bch_err(c, "%s", buf); + bch_err(c, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + bch2_trans_iter_exit(trans, &iter); +} + int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, @@ -2007,7 +1989,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct bch_dev *ca; + struct bch_dev *ca = NULL; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); @@ -2024,7 +2006,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, zero_fill_bio_iter(&orig->bio, iter); goto out_read_done; } - +retry_pick: pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); /* hole or reservation - just zero fill: */ @@ -2037,8 +2019,20 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, goto err; } - if (pick_ret > 0) - ca = bch_dev_bkey_exists(c, pick.ptr.dev); + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + if (!pick.ptr.cached && + 
unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); if (flags & BCH_READ_NODECODE) { /* @@ -2065,7 +2059,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_NONE && + (pick.crc.csum_type != BCH_CSUM_none && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || @@ -2158,6 +2152,7 @@ get_bio: /* XXX: only initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; + rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; @@ -2260,26 +2255,31 @@ out_read_done: } void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, subvol_inum inum, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + u32 snapshot; int ret; BUG_ON(flags & BCH_READ_NODECODE); bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); retry: bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); while (1) { unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; @@ -2293,15 +2293,15 @@ retry: break; } - bch2_btree_iter_set_pos(iter, - POS(inode, bvec_iter.bi_sector)); + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) break; - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; @@ -2320,19 +2320,13 @@ retry: */ sectors = min(sectors, k.k->size - offset_into_extent); - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(&trans); - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, data_btree, k, offset_into_extent, failed, flags); if (ret) @@ -2343,17 +2337,22 @@ retry: swap(bvec_iter.bi_size, bytes); bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; } +err: + bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; - bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); if (ret) { - bch_err_inum_ratelimited(c, inode, + bch_err_inum_ratelimited(c, inum.inum, "read error %i from btree lookup", ret); rbio->bio.bi_status = 
BLK_STS_IOERR; bch2_rbio_done(rbio); @@ -2381,8 +2380,8 @@ int bch2_fs_io_init(struct bch_fs *c) mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, c->opts.btree_node_size, - c->sb.encoded_extent_max) / - PAGE_SECTORS, 0) || + c->opts.encoded_extent_max) / + PAGE_SIZE, 0) || rhashtable_init(&c->promote_table, &bch_promote_params)) return -ENOMEM; diff --git a/libbcachefs/io.h b/libbcachefs/io.h index bc0a0bd..1aa422d 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -48,12 +48,6 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) ? op->journal_seq_p : &op->journal_seq; } -static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) -{ - op->journal_seq_p = journal_seq; - op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; -} - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_MOVINGGC @@ -62,13 +56,14 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) } int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, - struct bkey_i *, bool *, bool *, s64 *, s64 *); -int bch2_extent_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct disk_reservation *, - u64 *, u64, s64 *, bool); + struct bkey_i *, bool *, s64 *, s64 *); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64 *, u64, s64 *, bool); + int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - struct bpos, u64 *, s64 *); -int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); int bch2_write_index_default(struct bch_write_op *); @@ -90,6 +85,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->devs_have.nr = 0; op->target = 0; op->opts = opts; + op->subvol = 0; op->pos = POS_MAX; op->version = ZERO_VERSION; op->write_point = (struct write_point_specifier) { 0 }; @@ -157,10 +153,10 @@ static inline void bch2_read_extent(struct btree_trans *trans, } void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - u64, struct bch_io_failures *, unsigned flags); + subvol_inum, struct bch_io_failures *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inode) + subvol_inum inum) { struct bch_io_failures failed = { .nr = 0 }; @@ -168,8 +164,9 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, rbio->c = c; rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed, + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE| BCH_READ_USER_MAPPED); diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 0aab779..78bff13 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -62,6 +62,7 @@ struct bch_read_bio { /* * pos we read from - different from data_pos for indirect extents: */ + u32 subvol; struct bpos read_pos; /* @@ -122,6 +123,7 @@ struct bch_write_op { u16 nonce; struct bch_io_opts opts; + u32 subvol; struct bpos pos; struct bversion version; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index ac4071f..158df42 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -88,8 +88,6 @@ static void bch2_journal_buf_init(struct journal *j) buf->must_flush = false; buf->separate_flush = false; - memset(buf->has_inode, 0, 
sizeof(buf->has_inode)); - memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); buf->data->u64s = 0; @@ -109,7 +107,12 @@ void bch2_journal_halt(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - j->err_seq = journal_cur_seq(j); + /* + * XXX: we're not using j->lock here because this can be called from + * interrupt context, this can race with journal_write_done() + */ + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); journal_wake(j); closure_wake_up(&journal_cur_buf(j)->wait); } @@ -308,7 +311,7 @@ static int journal_entry_open(struct journal *j) mod_delayed_work(c->io_complete_wq, &j->write_work, - msecs_to_jiffies(j->write_delay_ms)); + msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); return 0; } @@ -335,55 +338,6 @@ static void journal_write_work(struct work_struct *work) journal_entry_close(j); } -/* - * Given an inode number, if that inode number has data in the journal that - * hasn't yet been flushed, return the journal sequence number that needs to be - * flushed: - */ -u64 bch2_inode_journal_seq(struct journal *j, u64 inode) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - union journal_res_state s; - unsigned i; - u64 seq; - - - spin_lock(&j->lock); - seq = journal_cur_seq(j); - s = READ_ONCE(j->reservations); - i = s.idx; - - while (1) { - if (test_bit(h, j->buf[i].has_inode)) - goto out; - - if (i == s.unwritten_idx) - break; - - i = (i - 1) & JOURNAL_BUF_MASK; - seq--; - } - - seq = 0; -out: - spin_unlock(&j->lock); - - return seq; -} - -void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - struct journal_buf *buf; - - spin_lock(&j->lock); - - if ((buf = journal_seq_to_buf(j, seq))) - set_bit(h, buf->has_inode); - - spin_unlock(&j->lock); -} - static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { @@ -602,7 +556,10 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); + if (WARN_ONCE(seq > journal_cur_seq(j), + "requested to flush journal seq %llu, but currently at %llu", + seq, journal_cur_seq(j))) + goto out; /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { @@ -669,6 +626,12 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) u64 start_time = local_clock(); int ret, ret2; + /* + * Don't update time_stats when @seq is already flushed: + */ + if (seq <= j->flushed_seq_ondisk) + return 0; + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); if (!ret) @@ -679,6 +642,7 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) int bch2_journal_meta(struct journal *j) { + struct journal_buf *buf; struct journal_res res; int ret; @@ -688,6 +652,10 @@ int bch2_journal_meta(struct journal *j) if (ret) return ret; + buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + set_bit(JOURNAL_NEED_WRITE, &j->flags); + bch2_journal_res_put(j, &res); return bch2_journal_flush_seq(j, res.seq); @@ -737,6 +705,44 @@ int bch2_journal_flush(struct journal *j) return bch2_journal_flush_seq(j, seq); } +/* + * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * @seq + */ +bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 unwritten_seq; + bool ret = false; + + if (!(c->sb.features & (1ULL << 
BCH_FEATURE_journal_no_flush))) + return false; + + if (seq <= c->journal.flushed_seq_ondisk) + return false; + + spin_lock(&j->lock); + if (seq <= c->journal.flushed_seq_ondisk) + goto out; + + for (unwritten_seq = last_unwritten_seq(j); + unwritten_seq < seq; + unwritten_seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); + + /* journal write is already in flight, and was a flush write: */ + if (unwritten_seq == last_unwritten_seq(j) && !buf->noflush) + goto out; + + buf->noflush = true; + } + + ret = true; +out: + spin_unlock(&j->lock); + return ret; +} + /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) @@ -807,11 +813,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, long b; if (new_fs) { - if (c) - percpu_down_read(&c->mark_lock); b = bch2_bucket_alloc_new_fs(ca); if (b < 0) { - percpu_up_read(&c->mark_lock); ret = -ENOSPC; goto err; } @@ -825,7 +828,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, goto err; } - b = sector_to_bucket(ca, ob->ptr.offset); + b = ob->bucket; } if (c) @@ -859,14 +862,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) spin_unlock(&c->journal.lock); - if (new_fs) { - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, - ca->mi.bucket_size, - gc_phase(GC_PHASE_SB), - 0); - if (c) - percpu_up_read(&c->mark_lock); - } else { + if (!new_fs) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_trans_mark_metadata_bucket(&trans, ca, b, BCH_DATA_journal, @@ -1032,10 +1028,14 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, j->replay_journal_seq = last_seq; j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; + j->flushed_seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; atomic64_set(&j->seq, cur_seq - 1); + if (list_empty(journal_entries)) + j->last_empty_seq = cur_seq - 1; + fifo_for_each_entry_ptr(p, &j->pin, seq) journal_pin_list_init(p, 1); @@ -1048,6 +1048,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, if (seq < last_seq) continue; + if (journal_entry_empty(&i->j)) + j->last_empty_seq = le64_to_cpu(i->j.seq); + p = journal_seq_pin(j, seq); p->devs.nr = 0; @@ -1055,6 +1058,9 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq, bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); } + if (list_empty(journal_entries)) + j->last_empty_seq = cur_seq; + spin_lock(&j->lock); set_bit(JOURNAL_STARTED, &j->flags); @@ -1144,9 +1150,6 @@ int bch2_fs_journal_init(struct journal *j) lockdep_init_map(&j->res_map, "journal res", &res_key, 0); - j->write_delay_ms = 1000; - j->reclaim_delay_ms = 100; - atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); @@ -1178,44 +1181,29 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); union journal_res_state s; struct bch_dev *ca; + unsigned long now = jiffies; unsigned i; rcu_read_lock(); s = READ_ONCE(j->reservations); - pr_buf(out, - "active journal entries:\t%llu\n" - "seq:\t\t\t%llu\n" - "last_seq:\t\t%llu\n" - "last_seq_ondisk:\t%llu\n" - "flushed_seq_ondisk:\t%llu\n" - "prereserved:\t\t%u/%u\n" - "each entry reserved:\t%u\n" - "nr flush writes:\t%llu\n" - "nr noflush writes:\t%llu\n" - "nr direct reclaim:\t%llu\n" - "nr background reclaim:\t%llu\n" - "reclaim kicked:\t\t%u\n" - "reclaim runs in:\t%u ms\n" - "current entry sectors:\t%u\n" - "current entry 
error:\t%u\n" - "current entry:\t\t", - fifo_used(&j->pin), - journal_cur_seq(j), - journal_last_seq(j), - j->last_seq_ondisk, - j->flushed_seq_ondisk, - j->prereserved.reserved, - j->prereserved.remaining, - j->entry_u64s_reserved, - j->nr_flush_writes, - j->nr_noflush_writes, - j->nr_direct_reclaim, - j->nr_background_reclaim, - j->reclaim_kicked, - jiffies_to_msecs(j->next_reclaim - jiffies), - j->cur_entry_sectors, - j->cur_entry_error); + pr_buf(out, "active journal entries:\t%llu\n", fifo_used(&j->pin)); + pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); + pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); + pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + pr_buf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + pr_buf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); + pr_buf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error); + pr_buf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: @@ -1225,15 +1213,11 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "closed\n"); break; default: - pr_buf(out, "%u/%u\n", - s.cur_entry_offset, - j->cur_entry_u64s); + pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_buf(out, - "current entry:\t\tidx %u refcount %u\n", - s.idx, journal_state_count(s, s.idx)); + pr_buf(out, "current entry:\t\tidx %u refcount %u\n", s.idx, journal_state_count(s, s.idx)); i = s.idx; while (i != s.unwritten_idx) { @@ -1273,22 +1257,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) if (!ja->nr) continue; - pr_buf(out, - "dev %u:\n" - "\tnr\t\t%u\n" - "\tbucket size\t%u\n" - "\tavailable\t%u:%u\n" - "\tdiscard_idx\t%u\n" - "\tdirty_ondisk\t%u (seq %llu)\n" - "\tdirty_idx\t%u (seq %llu)\n" - "\tcur_idx\t\t%u (seq %llu)\n", - i, ja->nr, ca->mi.bucket_size, - bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), - ja->sectors_free, - ja->discard_idx, - ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], - ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], - ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + pr_buf(out, "dev %u:\n", i); + pr_buf(out, "\tnr\t\t%u\n", ja->nr); + pr_buf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); + pr_buf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + pr_buf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); + pr_buf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + pr_buf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + pr_buf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); } rcu_read_unlock(); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 1d55679..b298873 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -141,7 
+141,6 @@ static inline u64 journal_cur_seq(struct journal *j) return j->pin.back - 1; } -u64 bch2_inode_journal_seq(struct journal *, u64); void bch2_journal_set_has_inum(struct journal *, u64, u64); static inline int journal_state_count(union journal_res_state s, int idx) @@ -163,18 +162,6 @@ static inline void journal_state_inc(union journal_res_state *s) s->buf3_count += s->idx == 3; } -static inline void bch2_journal_set_has_inode(struct journal *j, - struct journal_res *res, - u64 inum) -{ - struct journal_buf *buf = &j->buf[res->idx]; - unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); - - /* avoid atomic op if possible */ - if (unlikely(!test_bit(bit, buf->has_inode))) - set_bit(bit, buf->has_inode); -} - /* * Amount of space that will be taken up by some keys in the journal (i.e. * including the jset header) @@ -446,6 +433,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, ret = 0; if ((flags & JOURNAL_RES_GET_RESERVED) || + test_bit(JOURNAL_NOCHANGES, &j->flags) || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; @@ -489,6 +477,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); +bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 66a0e26..b5c204e 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -274,7 +274,7 @@ fsck_err: return ret; } -static int journal_entry_validate_btree_keys(struct bch_fs *c, +static int journal_entry_btree_keys_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -295,7 +295,24 @@ static int journal_entry_validate_btree_keys(struct bch_fs *c, return 0; } -static int journal_entry_validate_btree_root(struct bch_fs *c, +static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct bkey_i *k; + bool first = true; + + vstruct_for_each(entry, k) { + if (!first) { + printbuf_newline(out); + pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + } + pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); + first = false; + } +} + +static int journal_entry_btree_root_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -323,7 +340,13 @@ fsck_err: return ret; } -static int journal_entry_validate_prio_ptrs(struct bch_fs *c, +static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + +static int journal_entry_prio_ptrs_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -332,7 +355,12 @@ static int journal_entry_validate_prio_ptrs(struct bch_fs *c, return 0; } -static int journal_entry_validate_blacklist(struct bch_fs *c, +static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ +} + +static int journal_entry_blacklist_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -347,7 +375,16 @@ fsck_err: return ret; } -static int 
journal_entry_validate_blacklist_v2(struct bch_fs *c, +static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist *bl = + container_of(entry, struct jset_entry_blacklist, entry); + + pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq)); +} + +static int journal_entry_blacklist_v2_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -373,7 +410,18 @@ fsck_err: return ret; } -static int journal_entry_validate_usage(struct bch_fs *c, +static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist_v2 *bl = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + pr_buf(out, "start=%llu end=%llu", + le64_to_cpu(bl->start), + le64_to_cpu(bl->end)); +} + +static int journal_entry_usage_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -394,7 +442,18 @@ fsck_err: return ret; } -static int journal_entry_validate_data_usage(struct bch_fs *c, +static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + pr_buf(out, "type=%s v=%llu", + bch2_fs_usage_types[u->entry.btree_id], + le64_to_cpu(u->v)); +} + +static int journal_entry_data_usage_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -416,7 +475,17 @@ fsck_err: return ret; } -static int journal_entry_validate_clock(struct bch_fs *c, +static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + + bch2_replicas_entry_to_text(out, &u->r); + pr_buf(out, "=%llu", le64_to_cpu(u->v)); +} + +static int journal_entry_clock_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -442,7 +511,16 @@ fsck_err: return ret; } -static int journal_entry_validate_dev_usage(struct bch_fs *c, +static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + pr_buf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); +} + +static int journal_entry_dev_usage_validate(struct bch_fs *c, const char *where, struct jset_entry *entry, unsigned version, int big_endian, int write) @@ -479,15 +557,59 @@ fsck_err: return ret; } +static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + pr_buf(out, "dev=%u", le32_to_cpu(u->dev)); + + for (i = 0; i < nr_types; i++) { + if (i < BCH_DATA_NR) + pr_buf(out, " %s", bch2_data_types[i]); + else + pr_buf(out, " (unknown data type %u)", i); + pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu", + le64_to_cpu(u->d[i].buckets), + le64_to_cpu(u->d[i].sectors), + le64_to_cpu(u->d[i].fragmented)); + } + + pr_buf(out, " buckets_ec: %llu buckets_unavailable: %llu", + le64_to_cpu(u->buckets_ec), + le64_to_cpu(u->buckets_unavailable)); +} + +static int journal_entry_log_validate(struct bch_fs *c, + const char *where, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + return 0; +} + +static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); + + bch_scnmemcpy(out, l->d, strnlen(l->d, bytes)); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, const char *, struct jset_entry *, unsigned, int, int); + void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; static const struct jset_entry_ops bch2_jset_entry_ops[] = { #define x(f, nr) \ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ - .validate = journal_entry_validate_##f, \ + .validate = journal_entry_##f##_validate, \ + .to_text = journal_entry_##f##_to_text, \ }, BCH_JSET_ENTRY_TYPES() #undef x @@ -503,6 +625,17 @@ int bch2_journal_entry_validate(struct bch_fs *c, const char *where, : 0; } +void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + if (entry->type < BCH_JSET_ENTRY_NR) { + pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_jset_entry_ops[entry->type].to_text(out, c, entry); + } else { + pr_buf(out, "(unknown type %u)", entry->type); + } +} + static int jset_validate_entries(struct bch_fs *c, struct jset *jset, int write) { @@ -710,7 +843,7 @@ reread: case JOURNAL_ENTRY_NONE: if (!saw_bad) return 0; - sectors = c->opts.block_size; + sectors = block_sectors(c); goto next_block; case JOURNAL_ENTRY_BAD: saw_bad = true; @@ -719,7 +852,7 @@ reread: * field of the journal entry we read, so try reading * again at next block boundary: */ - sectors = c->opts.block_size; + sectors = block_sectors(c); break; default: return ret; @@ -766,12 +899,13 @@ static void bch2_journal_read_device(struct closure *cl) struct journal_device *ja = container_of(cl, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); + struct bch_fs *c = ca->fs; struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); struct journal_read_buf buf = { NULL, 0 }; u64 min_seq = U64_MAX; unsigned i; - int ret; + int ret = 0; if (!ja->nr) goto out; @@ -817,6 +951,7 @@ static void bch2_journal_read_device(struct closure *cl) ja->discard_idx = ja->dirty_idx_ondisk = ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: + 
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); closure_return(cl); @@ -1238,7 +1373,9 @@ static void journal_write_done(struct closure *cl) u64 v, seq; int err = 0; - bch2_time_stats_update(j->write_time, j->write_start_time); + bch2_time_stats_update(!JSET_NO_FLUSH(w->data) + ? j->flush_write_time + : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); @@ -1259,14 +1396,15 @@ static void journal_write_done(struct closure *cl) if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; - j->seq_ondisk = seq; - if (err && (!j->err_seq || seq < j->err_seq)) - j->err_seq = seq; + if (!err) { + j->seq_ondisk = seq; - if (!JSET_NO_FLUSH(w->data)) { - j->flushed_seq_ondisk = seq; - j->last_seq_ondisk = w->last_seq; - } + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; /* * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard @@ -1396,9 +1534,10 @@ void bch2_journal_write(struct closure *cl) spin_lock(&j->lock); if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) && - !w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) { + (w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; @@ -1445,7 +1584,7 @@ void bch2_journal_write(struct closure *cl) SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); - if (journal_entry_empty(jset)) + if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) j->last_empty_seq = le64_to_cpu(jset->seq); if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) @@ -1515,7 +1654,7 @@ retry_alloc: w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (c->opts.nochanges) + if (test_bit(JOURNAL_NOCHANGES, &j->flags)) goto no_io; for_each_rw_member(ca, c, i) @@ -1538,16 +1677,12 @@ retry_alloc: } } - bch2_bucket_seq_cleanup(c); - continue_at(cl, do_journal_write, c->io_complete_wq); return; no_io: - bch2_bucket_seq_cleanup(c); - continue_at(cl, journal_write_done, c->io_complete_wq); return; err: - bch2_inconsistent_error(c); + bch2_fatal_error(c); continue_at(cl, journal_write_done, c->io_complete_wq); } diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index f34281a..d8425fe 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -40,8 +40,10 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ vstruct_for_each_safe(entry, k, _n) -int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *, - unsigned, int, int); +int bch2_journal_entry_validate(struct bch_fs *, const char *, + struct jset_entry *, unsigned, int, int); +void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, + struct jset_entry *); int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 7a0ae5d..52a3935 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -34,8 +34,10 @@ unsigned 
bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { - unsigned available = (journal_space_from(ja, from) - - ja->cur_idx - 1 + ja->nr) % ja->nr; + unsigned available = !test_bit(JOURNAL_NOCHANGES, &j->flags) + ? ((journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr) + : ja->nr; /* * Don't use the last bucket unless writing the new last_seq @@ -487,9 +489,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, u64 seq; int err; - if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) - return 0; - lockdep_assert_held(&j->reclaim_lock); while (1) { @@ -635,7 +634,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) * make sure to flush at least one journal pin: */ if (time_after(jiffies, j->last_flushed + - msecs_to_jiffies(j->reclaim_delay_ms))) + msecs_to_jiffies(c->opts.journal_reclaim_delay))) min_nr = 1; if (j->prereserved.reserved * 4 > j->prereserved.remaining) @@ -644,6 +643,9 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) if (fifo_free(&j->pin) <= 32) min_nr = 1; + if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + min_nr = 1; + trace_journal_reclaim_start(c, min_nr, j->prereserved.reserved, @@ -653,7 +655,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct) atomic_long_read(&c->btree_key_cache.nr_dirty), atomic_long_read(&c->btree_key_cache.nr_keys)); - min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL); + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr, min_key_cache); @@ -681,13 +683,12 @@ int bch2_journal_reclaim(struct journal *j) static int bch2_journal_reclaim_thread(void *arg) { struct journal *j = arg; + struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned long delay, now; int ret = 0; set_freezable(); - kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)); - j->last_flushed = jiffies; while (!ret && !kthread_should_stop()) { @@ -698,7 +699,7 @@ static int bch2_journal_reclaim_thread(void *arg) mutex_unlock(&j->reclaim_lock); now = jiffies; - delay = msecs_to_jiffies(j->reclaim_delay_ms); + delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); j->next_reclaim = j->last_flushed + delay; if (!time_in_range(j->next_reclaim, now, now + delay)) diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index f2060f9..3cc63fc 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -66,6 +66,12 @@ blacklist_entry_try_merge(struct bch_fs *c, return bl; } +static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, + u64 start, u64 end) +{ + return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); +} + int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; @@ -76,28 +82,21 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); nr = blacklist_nr_entries(bl); - if (bl) { - for (i = 0; i < nr; i++) { - struct journal_seq_blacklist_entry *e = - bl->start + i; - - if (start == le64_to_cpu(e->start) && - end == le64_to_cpu(e->end)) - goto out; - - if (start <= le64_to_cpu(e->start) && - end >= le64_to_cpu(e->end)) { - e->start = cpu_to_le64(start); - e->end = cpu_to_le64(end); - - if (i + 1 < nr) - bl = blacklist_entry_try_merge(c, - bl, i); - if (i) - bl = 
blacklist_entry_try_merge(c, - bl, i - 1); - goto out_write_sb; - } + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; + + if (bl_entry_contig_or_overlaps(e, start, end)) { + e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); + e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; } } @@ -189,27 +188,34 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) return 0; } -static const char * -bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_journal_seq_blacklist *bl = field_to_type(f, journal_seq_blacklist); - struct journal_seq_blacklist_entry *i; - unsigned nr = blacklist_nr_entries(bl); + unsigned i, nr = blacklist_nr_entries(bl); - for (i = bl->start; i < bl->start + nr; i++) { - if (le64_to_cpu(i->start) >= - le64_to_cpu(i->end)) - return "entry start >= end"; - - if (i + 1 < bl->start + nr && - le64_to_cpu(i[0].end) > - le64_to_cpu(i[1].start)) - return "entries out of order"; + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = bl->start + i; + + if (le64_to_cpu(e->start) >= + le64_to_cpu(e->end)) { + pr_buf(err, "entry %u start >= end (%llu >= %llu)", + i, le64_to_cpu(e->start), le64_to_cpu(e->end)); + return -EINVAL; + } + + if (i + 1 < nr && + le64_to_cpu(e[0].end) > + le64_to_cpu(e[1].start)) { + pr_buf(err, "entry %u out of order with next entry (%llu > %llu)", + i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); + return -EINVAL; + } } - return NULL; + return 0; } static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, @@ -250,19 +256,28 @@ void bch2_blacklist_entries_gc(struct work_struct *work) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; - for_each_btree_node(&trans, iter, i, POS_MIN, - BTREE_ITER_PREFETCH, b) - if (test_bit(BCH_FS_STOPPING, &c->flags)) { - bch2_trans_exit(&trans); - return; - } - bch2_trans_iter_free(&trans, iter); + bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + 0, 0, BTREE_ITER_PREFETCH); +retry: + bch2_trans_begin(&trans); + + b = bch2_btree_iter_peek_node(&iter); + + while (!(ret = PTR_ERR_OR_ZERO(b)) && + b && + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); } - ret = bch2_trans_exit(&trans); + bch2_trans_exit(&trans); if (ret) return; diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 61674ae..d6d7512 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -34,8 +34,6 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; - /* bloom filter: */ - unsigned long has_inode[1024 / sizeof(unsigned long)]; }; /* @@ -150,10 +148,10 @@ enum journal_space_from { enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_RECLAIM_STARTED, JOURNAL_NEED_WRITE, JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NOCHANGES, }; /* Embedded in struct bch_fs */ @@ -263,8 +261,6 @@ struct journal { struct mutex discard_lock; bool can_discard; - unsigned write_delay_ms; - unsigned reclaim_delay_ms; 
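The journal-seq blacklist rework above replaces the exact-match/containment checks with a single "contiguous or overlapping" test and then widens the existing entry with min()/max() before trying to merge neighbours. The interval logic in isolation, with illustrative names:

#include <stdint.h>
#include <stdio.h>

static int ranges_touch(uint64_t a_start, uint64_t a_end,
                        uint64_t b_start, uint64_t b_end)
{
        /* closed ranges overlap or touch unless one ends before the
         * other starts */
        return !(b_end < a_start || a_end < b_start);
}

static void widen(uint64_t *start, uint64_t *end, uint64_t s, uint64_t e)
{
        if (s < *start)
                *start = s;
        if (e > *end)
                *end = e;
}

int main(void)
{
        uint64_t start = 10, end = 20;

        if (ranges_touch(start, end, 18, 30))
                widen(&start, &end, 18, 30);

        printf("[%llu, %llu]\n", (unsigned long long) start,
               (unsigned long long) end);
        return 0;
}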
unsigned long last_flush_write; u64 res_get_blocked_start; @@ -274,8 +270,8 @@ struct journal { u64 nr_flush_writes; u64 nr_noflush_writes; - struct time_stats *write_time; - struct time_stats *delay_time; + struct time_stats *flush_write_time; + struct time_stats *noflush_write_time; struct time_stats *blocked_time; struct time_stats *flush_seq_time; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 1f65eca..6defc33 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -39,7 +39,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags enum btree_id btree_id) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_buf sk; int ret = 0; @@ -47,13 +47,15 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); - while ((k = bch2_btree_iter_peek(iter)).k && + while ((bch2_trans_begin(&trans), + (k = bch2_btree_iter_peek(&iter)).k) && !(ret = bkey_err(k))) { if (!bch2_bkey_has_device(k, dev_idx)) { - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); continue; } @@ -71,10 +73,18 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags */ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); - bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&sk.k->k)) + sk.k->k.size = 0; - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, sk.k, 0) ?: + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, sk.k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); @@ -88,9 +98,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); BUG_ON(ret == -EINTR); @@ -107,7 +117,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct closure cl; struct btree *b; struct bkey_buf k; @@ -123,12 +133,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) closure_init_stack(&cl); for (id = 0; id < BTREE_ID_NR; id++) { - for_each_btree_node(&trans, iter, id, POS_MIN, - BTREE_ITER_PREFETCH, b) { + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); retry: + ret = 0; + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), dev_idx)) - continue; + goto next; bch2_bkey_buf_copy(&k, c, &b->key); @@ -139,18 +153,23 @@ retry: break; } - ret = bch2_btree_node_update_key(&trans, iter, b, k.k, false); + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); if (ret == 
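The __bch2_dev_usrdata_drop() hunk above (and the migrate/move code that follows) moves to the pattern of beginning the transaction at the top of every loop iteration and restarting the iteration when an update returns -EINTR, instead of bailing out. A rough userspace analogue of that restart idiom, with do_one_key() as a made-up stand-in for the real btree update and commit:

#include <errno.h>
#include <stdio.h>

static int do_one_key(int key, int *attempts)
{
        (void) key;
        /* pretend the first attempt races with another transaction */
        return (*attempts)++ == 0 ? -EINTR : 0;
}

static int process_keys(int nr)
{
        int ret = 0;

        for (int key = 0; key < nr; key++) {
                int attempts = 0;
retry:
                /* bch2_trans_begin() analogue: start each attempt fresh */
                ret = do_one_key(key, &attempts);
                if (ret == -EINTR)
                        goto retry;
                if (ret)
                        break;
        }
        return ret;
}

int main(void)
{
        printf("ret %d\n", process_keys(3));
        return 0;
}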
-EINTR) { - b = bch2_btree_iter_peek_node(iter); ret = 0; - goto retry; + continue; } + if (ret) { bch_err(c, "Error updating btree node key: %i", ret); break; } +next: + bch2_btree_iter_next_node(&iter); } - bch2_trans_iter_free(&trans, iter); + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); if (ret) goto err; @@ -162,7 +181,7 @@ retry: ret = 0; err: - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&k, c); BUG_ON(ret == -EINTR); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index ee0f155..7ca7ce3 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -8,11 +8,13 @@ #include "btree_update_interior.h" #include "buckets.h" #include "disk_groups.h" +#include "ec.h" #include "inode.h" #include "io.h" #include "journal_reclaim.h" #include "move.h" #include "replicas.h" +#include "subvolume.h" #include "super-io.h" #include "keylist.h" @@ -53,13 +55,89 @@ struct moving_context { wait_queue_head_t wait; }; +static int insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, update_iter; + struct bkey_s_c k; + struct snapshots_seen s; + int ret; + + if (!btree_type_has_snapshots(id)) + return 0; + + snapshots_seen_init(&s); + + if (!bkey_cmp(old_pos, new_pos)) + return 0; + + if (!snapshot_t(c, old_pos.snapshot)->children[0]) + return 0; + + bch2_trans_iter_init(trans, &iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { +next: + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (bkey_cmp(old_pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { + struct bkey_i *update; + size_t i; + + for (i = 0; i < s.nr; i++) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, s.d[i])) + goto next; + + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = new_pos; + update->k.p.snapshot = k.k->p.snapshot; + + bch2_trans_iter_init(trans, &update_iter, id, update->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&update_iter) ?: + bch2_trans_update(trans, &update_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &update_iter); + if (ret) + break; + + ret = snapshots_seen_add(c, &s, k.k->p.snapshot); + if (ret) + break; + } + } + bch2_trans_iter_exit(trans, &iter); + kfree(s.d); + + return ret; +} + static int bch2_migrate_index_update(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct migrate_write *m = container_of(op, struct migrate_write, op); + struct open_bucket *ec_ob = ec_open_bucket(c, &op->open_buckets); struct keylist *keys = &op->insert_keys; struct bkey_buf _new, _insert; int ret = 0; @@ -70,9 +148,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, m->btree_id, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { struct bkey_s_c k; @@ -80,13 +158,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op) struct 
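insert_snapshot_whiteouts() walks keys at the old position across snapshots and emits a whiteout at the new position for each snapshot branch that could still see the old key, using a "snapshots seen" list so each branch is only handled once. A toy model of that dedup step; is_ancestor() here uses a fake divisibility rule purely for illustration and does not reflect how bch2_snapshot_is_ancestor() actually walks the snapshot tree:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct seen {
        unsigned *d;
        size_t nr, size;
};

/* toy rule only: "ancestor" divides "id" */
static bool is_ancestor(unsigned id, unsigned ancestor)
{
        return id % ancestor == 0;
}

static bool already_covered(const struct seen *s, unsigned id)
{
        for (size_t i = 0; i < s->nr; i++)
                if (is_ancestor(id, s->d[i]))
                        return true;
        return false;
}

static int seen_add(struct seen *s, unsigned id)
{
        if (s->nr == s->size) {
                size_t n = s->size ? s->size * 2 : 8;
                unsigned *d = realloc(s->d, n * sizeof(*d));

                if (!d)
                        return -1;
                s->d = d;
                s->size = n;
        }
        s->d[s->nr++] = id;
        return 0;
}

int main(void)
{
        struct seen s = { 0 };
        unsigned snapshots[] = { 2, 4, 3 };

        for (size_t i = 0; i < 3; i++) {
                if (already_covered(&s, snapshots[i]))
                        continue;
                printf("would emit whiteout in snapshot %u\n", snapshots[i]);
                seen_add(&s, snapshots[i]);
        }
        free(s.d);
        return 0;
}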
bkey_i_extent *new; const union bch_extent_entry *entry; struct extent_ptr_decoded p; + struct bpos next_pos; bool did_work = false; - bool extending = false, should_check_enospc; + bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; bch2_trans_begin(&trans); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; @@ -102,9 +181,9 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); new = bkey_i_to_extent(_new.k); - bch2_cut_front(iter->pos, &new->k_i); + bch2_cut_front(iter.pos, &new->k_i); - bch2_cut_front(iter->pos, insert); + bch2_cut_front(iter.pos, insert); bch2_cut_back(new->k.p, insert); bch2_cut_back(insert->k.p, &new->k_i); @@ -119,7 +198,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) extent_for_each_ptr(extent_i_to_s(new), new_ptr) new_ptr->cached = true; - bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); + __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); } extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { @@ -146,8 +225,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.background_target, op->opts.data_replicas); - ret = bch2_sum_sector_overwrites(&trans, iter, insert, - &extending, + ret = bch2_sum_sector_overwrites(&trans, &iter, insert, &should_check_enospc, &i_sectors_delta, &disk_sectors_delta); @@ -163,20 +241,29 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto out; } - ret = bch2_trans_update(&trans, iter, insert, 0) ?: + next_pos = insert->k.p; + + ret = insert_snapshot_whiteouts(&trans, m->btree_id, + k.k->p, insert->k.p) ?: + bch2_trans_update(&trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); -err: - if (!ret) + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); atomic_long_inc(&c->extent_migrate_done); + if (ec_ob) + bch2_ob_add_backpointer(c, ec_ob, &insert->k); + } +err: if (ret == -EINTR) ret = 0; if (ret) break; next: - while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { + while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) goto out; @@ -184,18 +271,18 @@ next: continue; nomatch: if (m->ctxt) { - BUG_ON(k.k->p.offset <= iter->pos.offset); + BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); - atomic64_add(k.k->p.offset - iter->pos.offset, + atomic64_add(k.k->p.offset - iter.pos.offset, &m->ctxt->stats->sectors_raced); } atomic_long_inc(&c->extent_migrate_raced); trace_move_race(&new->k); - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); goto next; } out: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); bch2_bkey_buf_exit(&_insert, c); bch2_bkey_buf_exit(&_new, c); @@ -216,11 +303,6 @@ void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) m->op.crc = rbio->pick.crc; m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; - if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { - m->op.nonce = m->op.crc.nonce + m->op.crc.offset; - m->op.csum_type = m->op.crc.csum_type; - } - if (m->data_cmd == DATA_REWRITE) bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); } @@ -235,6 +317,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, { struct bkey_ptrs_c 
ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; struct extent_ptr_decoded p; int ret; @@ -255,6 +338,18 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.target = data_opts.target, m->op.write_point = wp; + /* + * op->csum_type is normally initialized from the fs/file's current + * options - but if an extent is encrypted, we require that it stays + * encrypted: + */ + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (bch2_csum_type_is_encryption(crc.csum_type)) { + m->op.nonce = crc.nonce + crc.offset; + m->op.csum_type = crc.csum_type; + break; + } + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { m->op.alloc_reserve = RESERVE_MOVINGGC; m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; @@ -299,10 +394,14 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, unsigned compressed_sectors = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (p.ptr.dev == data_opts.rewrite_dev && - !p.ptr.cached && - crc_is_compressed(p.crc)) - compressed_sectors += p.crc.compressed_size; + if (p.ptr.dev == data_opts.rewrite_dev) { + if (p.ptr.cached) + m->op.flags |= BCH_WRITE_CACHED; + + if (!p.ptr.cached && + crc_is_compressed(p.crc)) + compressed_sectors += p.crc.compressed_size; + } if (compressed_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, @@ -388,19 +487,22 @@ static void move_read_endio(struct bio *bio) closure_put(&ctxt->cl); } -static void do_pending_writes(struct moving_context *ctxt) +static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) { struct moving_io *io; + if (trans) + bch2_trans_unlock(trans); + while ((io = next_pending_write(ctxt))) { list_del(&io->list); closure_call(&io->cl, move_write, NULL, &ctxt->cl); } } -#define move_ctxt_wait_event(_ctxt, _cond) \ +#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ do { \ - do_pending_writes(_ctxt); \ + do_pending_writes(_ctxt, _trans); \ \ if (_cond) \ break; \ @@ -408,11 +510,12 @@ do { \ next_pending_write(_ctxt) || (_cond)); \ } while (1) -static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, + struct btree_trans *trans) { unsigned sectors_pending = atomic_read(&ctxt->write_sectors); - move_ctxt_wait_event(ctxt, + move_ctxt_wait_event(ctxt, trans, !atomic_read(&ctxt->write_sectors) || atomic_read(&ctxt->write_sectors) != sectors_pending); } @@ -434,14 +537,6 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->write_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->read_sectors) < - SECTORS_IN_FLIGHT_PER_DEVICE); - /* write path might have to decompress data: */ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -511,13 +606,13 @@ err: static int lookup_inode(struct btree_trans *trans, struct bpos pos, struct bch_inode_unpacked *inode) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos, - BTREE_ITER_ALL_SNAPSHOTS); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; @@ -527,15 +622,15 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, goto err; } 
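bch2_migrate_write_init() now scans the extent's existing crc entries and, if any of them uses an encrypting checksum type, carries its nonce and checksum type into the write op, so an encrypted extent stays encrypted even though op->csum_type would otherwise come from the filesystem options. A simplified userspace model of that rule; the enum values and struct layout here are invented:

#include <stdbool.h>
#include <stdio.h>

enum csum_type { CSUM_NONE, CSUM_CRC32C, CSUM_CHACHA20_POLY1305 };

struct crc_entry { enum csum_type type; unsigned nonce, offset; };
struct write_op  { enum csum_type csum_type; unsigned nonce; };

static bool csum_is_encryption(enum csum_type t)
{
        return t == CSUM_CHACHA20_POLY1305;
}

static void rewrite_init(struct write_op *op, enum csum_type fs_default,
                         const struct crc_entry *crcs, unsigned nr)
{
        op->csum_type = fs_default;
        op->nonce = 0;

        /* if the extent is already encrypted, keep it that way */
        for (unsigned i = 0; i < nr; i++)
                if (csum_is_encryption(crcs[i].type)) {
                        op->nonce = crcs[i].nonce + crcs[i].offset;
                        op->csum_type = crcs[i].type;
                        break;
                }
}

int main(void)
{
        struct crc_entry crcs[] = { { CSUM_CHACHA20_POLY1305, 42, 8 } };
        struct write_op op;

        rewrite_init(&op, CSUM_CRC32C, crcs, 1);
        printf("csum_type %d nonce %u\n", op.csum_type, op.nonce);
        return 0;
}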
- ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; + ret = bkey_is_inode(k.k) ? 0 : -EIO; if (ret) goto err; - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); + ret = bch2_inode_unpack(k, inode); if (ret) goto err; err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -553,7 +648,7 @@ static int __bch2_move_data(struct bch_fs *c, struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct bkey_buf sk; struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct data_opts data_opts; enum data_cmd data_cmd; @@ -567,8 +662,9 @@ static int __bch2_move_data(struct bch_fs *c, stats->btree_id = btree_id; stats->pos = start; - iter = bch2_trans_get_iter(&trans, btree_id, start, - BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &iter, btree_id, start, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); if (rate) bch2_ratelimit_reset(rate); @@ -591,26 +687,36 @@ static int __bch2_move_data(struct bch_fs *c, schedule_timeout(delay); if (unlikely(freezing(current))) { - bch2_trans_unlock(&trans); - move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); + move_ctxt_wait_event(ctxt, &trans, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); - bch2_trans_begin(&trans); + move_ctxt_wait_event(ctxt, &trans, + atomic_read(&ctxt->write_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); - k = bch2_btree_iter_peek(iter); + move_ctxt_wait_event(ctxt, &trans, + atomic_read(&ctxt->read_sectors) < + SECTORS_IN_FLIGHT_PER_DEVICE); - stats->pos = iter->pos; + bch2_trans_begin(&trans); + k = bch2_btree_iter_peek(&iter); if (!k.k) break; + ret = bkey_err(k); + if (ret == -EINTR) + continue; if (ret) break; + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) break; + stats->pos = iter.pos; + if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; @@ -645,22 +751,22 @@ static int __bch2_move_data(struct bch_fs *c, BUG(); } - /* unlock before doing IO: */ + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: + */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { - if (ret2 == -EINTR) { - bch2_trans_begin(&trans); + if (ret2 == -EINTR) continue; - } if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); + bch2_move_ctxt_wait_for_io(ctxt, &trans); continue; } @@ -671,21 +777,43 @@ static int __bch2_move_data(struct bch_fs *c, if (rate) bch2_ratelimit_increment(rate, k.k->size); next: - atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), - &stats->sectors_seen); + atomic64_add(k.k->size, &stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(iter); - bch2_trans_cond_resched(&trans); + bch2_btree_iter_advance(&iter); } out: - bch2_trans_iter_put(&trans, iter); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; } +inline void bch_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); + + scnprintf(stats->name, sizeof(stats->name), + "%s", name); +} + +static inline void progress_list_add(struct bch_fs *c, + struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_add(&stats->list, &c->data_progress_list); + mutex_unlock(&c->data_progress_lock); +} + +static inline void progress_list_del(struct bch_fs *c, + 
struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_del(&stats->list); + mutex_unlock(&c->data_progress_lock); +} + int bch2_move_data(struct bch_fs *c, enum btree_id start_btree_id, struct bpos start_pos, enum btree_id end_btree_id, struct bpos end_pos, @@ -698,6 +826,7 @@ int bch2_move_data(struct bch_fs *c, enum btree_id id; int ret; + progress_list_add(c, stats); closure_init_stack(&ctxt.cl); INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); @@ -722,7 +851,7 @@ int bch2_move_data(struct bch_fs *c, } - move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + move_ctxt_wait_event(&ctxt, NULL, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); EBUG_ON(atomic_read(&ctxt.write_sectors)); @@ -731,6 +860,7 @@ int bch2_move_data(struct bch_fs *c, atomic64_read(&stats->sectors_moved), atomic64_read(&stats->keys_moved)); + progress_list_del(c, stats); return ret; } @@ -747,7 +877,7 @@ static int bch2_move_btree(struct bch_fs *c, bool kthread = (current->flags & PF_KTHREAD) != 0; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct btree *b; enum btree_id id; struct data_opts data_opts; @@ -755,6 +885,7 @@ static int bch2_move_btree(struct bch_fs *c, int ret = 0; bch2_trans_init(&trans, c, 0, 0); + progress_list_add(c, stats); stats->data_type = BCH_DATA_btree; @@ -763,9 +894,13 @@ static int bch2_move_btree(struct bch_fs *c, id++) { stats->btree_id = id; - for_each_btree_node(&trans, iter, id, - id == start_btree_id ? start_pos : POS_MIN, - BTREE_ITER_PREFETCH, b) { + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +retry: + ret = 0; + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) break; @@ -773,7 +908,7 @@ static int bch2_move_btree(struct bch_fs *c, bpos_cmp(b->key.k.p, end_pos)) > 0) break; - stats->pos = iter->pos; + stats->pos = iter.pos; switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) { case DATA_SKIP: @@ -787,13 +922,19 @@ static int bch2_move_btree(struct bch_fs *c, BUG(); } - ret = bch2_btree_node_rewrite(&trans, iter, - b->data->keys.seq, 0) ?: ret; + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; + if (ret == -EINTR) + continue; + if (ret) + break; next: - bch2_trans_cond_resched(&trans); + bch2_btree_iter_next_node(&iter); } + if (ret == -EINTR) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); - ret = bch2_trans_iter_free(&trans, iter) ?: ret; if (kthread && kthread_should_stop()) break; } @@ -803,6 +944,11 @@ next: if (ret) bch_err(c, "error %i in bch2_move_btree", ret); + /* flush relevant btree updates */ + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_nr_pending(c)); + + progress_list_del(c, stats); return ret; } @@ -822,16 +968,9 @@ static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, struct data_opts *data_opts) { unsigned nr_good = bch2_bkey_durability(c, k); - unsigned replicas = 0; - - switch (k.k->type) { - case KEY_TYPE_btree_ptr: - replicas = c->opts.metadata_replicas; - break; - case KEY_TYPE_extent: - replicas = io_opts->data_replicas; - break; - } + unsigned replicas = bkey_is_btree_ptr(k.k) + ? 
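The new bch_move_stats_init()/progress_list_add()/progress_list_del() helpers give every data job a named stats struct that sits on a filesystem-wide list, under a mutex, for as long as the job runs, so in-flight jobs can be reported. A plain pthread sketch of the same bookkeeping using a singly linked list:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct move_stats {
        char name[32];
        struct move_stats *next;
};

static pthread_mutex_t progress_lock = PTHREAD_MUTEX_INITIALIZER;
static struct move_stats *progress_list;

static void stats_init(struct move_stats *s, const char *name)
{
        memset(s, 0, sizeof(*s));
        snprintf(s->name, sizeof(s->name), "%s", name);
}

static void progress_add(struct move_stats *s)
{
        pthread_mutex_lock(&progress_lock);
        s->next = progress_list;
        progress_list = s;
        pthread_mutex_unlock(&progress_lock);
}

static void progress_del(struct move_stats *s)
{
        pthread_mutex_lock(&progress_lock);
        for (struct move_stats **p = &progress_list; *p; p = &(*p)->next)
                if (*p == s) {
                        *p = s->next;
                        break;
                }
        pthread_mutex_unlock(&progress_lock);
}

int main(void)
{
        struct move_stats s;

        stats_init(&s, "rereplicate");
        progress_add(&s);
        printf("running: %s\n", progress_list->name);
        progress_del(&s);
        return 0;
}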
c->opts.metadata_replicas + : io_opts->data_replicas; if (!nr_good || nr_good >= replicas) return DATA_SKIP; @@ -944,6 +1083,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: + bch_move_stats_init(stats, "rereplicate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -951,10 +1091,6 @@ int bch2_data_job(struct bch_fs *c, op.start_btree, op.start_pos, op.end_btree, op.end_pos, rereplicate_btree_pred, c, stats) ?: ret; - - closure_wait_event(&c->btree_interior_update_wait, - !bch2_btree_interior_updates_nr_pending(c)); - ret = bch2_replicas_gc2(c) ?: ret; ret = bch2_move_data(c, @@ -968,6 +1104,7 @@ int bch2_data_job(struct bch_fs *c, if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; + bch_move_stats_init(stats, "migrate"); stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); @@ -985,6 +1122,7 @@ int bch2_data_job(struct bch_fs *c, ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_REWRITE_OLD_NODES: + bch_move_stats_init(stats, "rewrite_old_nodes"); ret = bch2_scan_old_btree_nodes(c, stats); break; default: diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 5076153..2a789a1 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -66,4 +66,8 @@ int bch2_data_job(struct bch_fs *, struct bch_move_stats *, struct bch_ioctl_data); +inline void bch_move_stats_init(struct bch_move_stats *stats, + char *name); + + #endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h index fc0de16..9df6d18 100644 --- a/libbcachefs/move_types.h +++ b/libbcachefs/move_types.h @@ -6,6 +6,8 @@ struct bch_move_stats { enum bch_data_type data_type; enum btree_id btree_id; struct bpos pos; + struct list_head list; + char name[32]; atomic64_t keys_moved; atomic64_t keys_raced; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 2acca0d..c82ecff 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -6,6 +6,7 @@ */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" @@ -69,10 +70,14 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, .dev = p.ptr.dev, .offset = p.ptr.offset, }; + ssize_t i; - ssize_t i = eytzinger0_find_le(h->data, h->used, - sizeof(h->data[0]), - bucket_offset_cmp, &search); + if (p.ptr.cached) + continue; + + i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); #if 0 /* eytzinger search verify code: */ ssize_t j = -1, k; @@ -85,6 +90,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, BUG_ON(i != j); #endif if (i >= 0 && + p.ptr.dev == h->data[i].dev && p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && p.ptr.gen == h->data[i].gen) { /* @@ -132,21 +138,110 @@ static inline int fragmentation_cmp(copygc_heap *heap, return cmp_int(l.fragmentation, r.fragmentation); } +static int walk_buckets_to_copygc(struct bch_fs *c) +{ + copygc_heap *h = &c->copygc_heap; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); + struct copygc_heap_entry e; + + u = bch2_alloc_unpack(k); + + if (u.data_type != BCH_DATA_user || + u.dirty_sectors >= ca->mi.bucket_size || + 
bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) + continue; + + e = (struct copygc_heap_entry) { + .dev = iter.pos.inode, + .gen = u.gen, + .replicas = 1 + u.stripe_redundancy, + .fragmentation = u.dirty_sectors * (1U << 15) + / ca->mi.bucket_size, + .sectors = u.dirty_sectors, + .offset = bucket_to_sector(ca, iter.pos.offset), + }; + heap_add_or_replace(h, e, -fragmentation_cmp, NULL); + + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + +static int bucket_inorder_cmp(const void *_l, const void *_r) +{ + const struct copygc_heap_entry *l = _l; + const struct copygc_heap_entry *r = _r; + + return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset); +} + +static int check_copygc_was_done(struct bch_fs *c, + u64 *sectors_not_moved, + u64 *buckets_not_moved) +{ + copygc_heap *h = &c->copygc_heap; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_alloc_unpacked u; + struct copygc_heap_entry *i; + int ret = 0; + + sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL); + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0); + + for (i = h->data; i < h->data + h->used; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); + + bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset))); + + ret = lockrestart_do(&trans, + bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (ret) + break; + + u = bch2_alloc_unpack(k); + + if (u.gen == i->gen && u.dirty_sectors) { + *sectors_not_moved += u.dirty_sectors; + *buckets_not_moved += 1; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + static int bch2_copygc(struct bch_fs *c) { copygc_heap *h = &c->copygc_heap; struct copygc_heap_entry e, *i; - struct bucket_array *buckets; struct bch_move_stats move_stats; - u64 sectors_to_move = 0, sectors_not_moved = 0; + u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0; u64 sectors_reserved = 0; u64 buckets_to_move, buckets_not_moved = 0; struct bch_dev *ca; unsigned dev_idx; - size_t b, heap_size = 0; + size_t heap_size = 0; int ret; - memset(&move_stats, 0, sizeof(move_stats)); + bch_move_stats_init(&move_stats, "copygc"); + /* * Find buckets with lowest sector counts, skipping completely * empty buckets, by building a maxheap sorted by sector count, @@ -172,59 +267,45 @@ static int bch2_copygc(struct bch_fs *c) spin_lock(&ca->fs->freelist_lock); sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; spin_unlock(&ca->fs->freelist_lock); + } - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - struct bucket *g = buckets->b + b; - struct bucket_mark m = READ_ONCE(g->mark); - struct copygc_heap_entry e; - - if (m.owned_by_allocator || - m.data_type != BCH_DATA_user || - !bucket_sectors_used(m) || - bucket_sectors_used(m) >= ca->mi.bucket_size) - continue; - - WARN_ON(m.stripe && !g->stripe_redundancy); - - e = (struct copygc_heap_entry) { - .dev = dev_idx, - .gen = m.gen, - .replicas = 1 + g->stripe_redundancy, - .fragmentation = bucket_sectors_used(m) * (1U << 15) - / ca->mi.bucket_size, - .sectors = bucket_sectors_used(m), - .offset = bucket_to_sector(ca, b), - }; - heap_add_or_replace(h, e, -fragmentation_cmp, NULL); - } - up_read(&ca->bucket_lock); + ret = walk_buckets_to_copygc(c); + if (ret) { + bch2_fs_fatal_error(c, "error walking buckets to copygc!"); + return ret; } 
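walk_buckets_to_copygc() now builds the copygc heap from the alloc btree instead of the in-memory bucket array, and orders candidates by a fixed-point fragmentation value: dirty sectors scaled by 2^15 and divided by the bucket size, so buckets of different sizes compare by fullness rather than raw sector counts. The computation on its own, with illustrative numbers:

#include <stdint.h>
#include <stdio.h>

static uint32_t fragmentation(uint32_t dirty_sectors, uint32_t bucket_size)
{
        return (uint32_t) (((uint64_t) dirty_sectors << 15) / bucket_size);
}

int main(void)
{
        /* a quarter-full 2048-sector bucket vs a half-full 1024-sector one */
        printf("%u %u\n",
               fragmentation(512, 2048),
               fragmentation(512, 1024));
        return 0;
}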
- if (!sectors_reserved) { - bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); - return -1; + if (!h->used) { + bch_err_ratelimited(c, "copygc requested to run but found no buckets to move!"); + return 0; } /* * Our btree node allocations also come out of RESERVE_MOVINGGC: */ - sectors_to_move = (sectors_to_move * 3) / 4; + sectors_reserved = (sectors_reserved * 3) / 4; + if (!sectors_reserved) { + bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); + return -1; + } - for (i = h->data; i < h->data + h->used; i++) - sectors_to_move += i->sectors * i->replicas; + for (i = h->data; i < h->data + h->used; i++) { + sectors_to_move += i->sectors; + sectors_to_write += i->sectors * i->replicas; + } - while (sectors_to_move > sectors_reserved) { + while (sectors_to_write > sectors_reserved) { BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - sectors_to_move -= e.sectors * e.replicas; + sectors_to_write -= e.sectors * e.replicas; } buckets_to_move = h->used; - if (!buckets_to_move) + if (!buckets_to_move) { + bch_err_ratelimited(c, "copygc cannot run - sectors_reserved %llu!", + sectors_reserved); return 0; + } eytzinger0_sort(h->data, h->used, sizeof(h->data[0]), @@ -237,30 +318,18 @@ static int bch2_copygc(struct bch_fs *c) writepoint_ptr(&c->copygc_write_point), copygc_pred, NULL, &move_stats); + if (ret) { + bch_err(c, "error %i from bch2_move_data() in copygc", ret); + return ret; + } - for_each_rw_member(ca, c, dev_idx) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - for (i = h->data; i < h->data + h->used; i++) { - struct bucket_mark m; - size_t b; - - if (i->dev != dev_idx) - continue; - - b = sector_to_bucket(ca, i->offset); - m = READ_ONCE(buckets->b[b].mark); - - if (i->gen == m.gen && - bucket_sectors_used(m)) { - sectors_not_moved += bucket_sectors_used(m); - buckets_not_moved++; - } - } - up_read(&ca->bucket_lock); + ret = check_copygc_was_done(c, §ors_not_moved, &buckets_not_moved); + if (ret) { + bch_err(c, "error %i from check_copygc_was_done()", ret); + return ret; } - if (sectors_not_moved && !ret) + if (sectors_not_moved) bch_warn_ratelimited(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", sectors_not_moved, sectors_to_move, diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 5de2960..71bf26e 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -31,17 +31,32 @@ const char * const bch2_btree_ids[] = { NULL }; +const char * const bch2_csum_types[] = { + BCH_CSUM_TYPES() + NULL +}; + const char * const bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; +const char * const bch2_compression_types[] = { + BCH_COMPRESSION_TYPES() + NULL +}; + const char * const bch2_compression_opts[] = { BCH_COMPRESSION_OPTS() NULL }; const char * const bch2_str_hash_types[] = { + BCH_STR_HASH_TYPES() + NULL +}; + +const char * const bch2_str_hash_opts[] = { BCH_STR_HASH_OPTS() NULL }; @@ -51,19 +66,24 @@ const char * const bch2_data_types[] = { NULL }; -const char * const bch2_cache_replacement_policies[] = { - BCH_CACHE_REPLACEMENT_POLICIES() +const char * const bch2_member_states[] = { + BCH_MEMBER_STATES() NULL }; -const char * const bch2_member_states[] = { - BCH_MEMBER_STATES() +const char * const bch2_jset_entry_types[] = { + BCH_JSET_ENTRY_TYPES() + NULL +}; + +const char * const bch2_fs_usage_types[] = { + BCH_FS_USAGE_TYPES() NULL }; #undef x -const char * const bch2_d_types[DT_MAX] = { +const char * const bch2_d_types[BCH_DT_MAX] = { [DT_UNKNOWN] = 
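Once the heap is built, copygc scales the reserve to 3/4 (btree node allocations come out of the same RESERVE_MOVINGGC pool) and then discards candidates until the data it would actually write, counting replicas, fits in what is left. A sketch of that trimming step using a plain array in place of the real heap:

#include <stdint.h>
#include <stdio.h>

struct bucket { uint64_t sectors; unsigned replicas; };

static unsigned trim_to_reserve(struct bucket *b, unsigned nr,
                                uint64_t sectors_reserved)
{
        uint64_t sectors_to_write = 0;

        /* keep 1/4 of the reserve for btree node allocations */
        sectors_reserved = sectors_reserved * 3 / 4;

        for (unsigned i = 0; i < nr; i++)
                sectors_to_write += b[i].sectors * b[i].replicas;

        /* drop candidates from the end of the list (the real code pops
         * the least-fragmented entries off the heap) */
        while (nr && sectors_to_write > sectors_reserved) {
                nr--;
                sectors_to_write -= b[nr].sectors * b[nr].replicas;
        }
        return nr;
}

int main(void)
{
        struct bucket b[] = { { 100, 1 }, { 200, 2 }, { 300, 1 } };

        printf("buckets to move: %u\n", trim_to_reserve(b, 3, 600));
        return 0;
}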
"unknown", [DT_FIFO] = "fifo", [DT_CHR] = "chr", @@ -73,6 +93,7 @@ const char * const bch2_d_types[DT_MAX] = { [DT_LNK] = "lnk", [DT_SOCK] = "sock", [DT_WHT] = "whiteout", + [DT_SUBVOL] = "subvol", }; void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) @@ -125,41 +146,27 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) } } -/* - * Initial options from superblock - here we don't want any options undefined, - * any options the superblock doesn't specify are set to 0: - */ -struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) -{ - struct bch_opts opts = bch2_opts_empty(); - -#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ - if (_sb_opt != NO_SB_OPT) \ - opt_set(opts, _name, _sb_opt(sb)); - BCH_OPTS() -#undef x - - return opts; -} - const struct bch_option bch2_opt_table[] = { -#define OPT_BOOL() .type = BCH_OPT_BOOL -#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max -#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices +#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 +#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ + .min = _min, .max = _max +#define OPT_STR(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = ARRAY_SIZE(_choices),\ + .choices = _choices #define OPT_FN(_fn) .type = BCH_OPT_FN, \ .parse = _fn##_parse, \ .to_text = _fn##_to_text -#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ +#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ .attr = { \ .name = #_name, \ - .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \ + .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ }, \ - .mode = _mode, \ + .flags = _flags, \ .hint = _hint, \ .help = _help, \ + .get_sb = _sb_opt, \ .set_sb = SET_##_sb_opt, \ _type \ }, @@ -202,7 +209,41 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } -int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, +static int bch2_opt_validate(const struct bch_option *opt, const char *msg, u64 v) +{ + if (v < opt->min) { + if (msg) + pr_err("invalid %s%s: too small (min %llu)", + msg, opt->attr.name, opt->min); + return -ERANGE; + } + + if (opt->max && v >= opt->max) { + if (msg) + pr_err("invalid %s%s: too big (max %llu)", + msg, opt->attr.name, opt->max); + return -ERANGE; + } + + if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { + if (msg) + pr_err("invalid %s %s: not a multiple of 512", + msg, opt->attr.name); + return -EINVAL; + } + + if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { + if (msg) + pr_err("invalid %s%s: must be a power of two", + msg, opt->attr.name); + return -EINVAL; + } + + return 0; +} + +int bch2_opt_parse(struct bch_fs *c, const char *msg, + const struct bch_option *opt, const char *val, u64 *res) { ssize_t ret; @@ -212,30 +253,13 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, ret = kstrtou64(val, 10, res); if (ret < 0) return ret; - - if (*res > 1) - return -ERANGE; break; case BCH_OPT_UINT: - ret = kstrtou64(val, 10, res); - if (ret < 0) - return ret; - - if (*res < opt->min || *res >= opt->max) - return -ERANGE; - break; - case BCH_OPT_SECTORS: - ret = bch2_strtou64_h(val, res); + ret = opt->flags & OPT_HUMAN_READABLE + ? 
bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); if (ret < 0) return ret; - - if (*res & 511) - return -EINVAL; - - *res >>= 9; - - if (*res < opt->min || *res >= opt->max) - return -ERANGE; break; case BCH_OPT_STR: ret = match_string(opt->choices, -1, val); @@ -248,10 +272,12 @@ int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, if (!c) return 0; - return opt->parse(c, val, res); + ret = opt->parse(c, val, res); + if (ret < 0) + return ret; } - return 0; + return bch2_opt_validate(opt, msg, *res); } void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, @@ -272,10 +298,10 @@ void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: case BCH_OPT_UINT: - pr_buf(out, "%lli", v); - break; - case BCH_OPT_SECTORS: - bch2_hprint(out, v); + if (opt->flags & OPT_HUMAN_READABLE) + bch2_hprint(out, v); + else + pr_buf(out, "%lli", v); break; case BCH_OPT_STR: if (flags & OPT_SHOW_FULL_LIST) @@ -349,7 +375,8 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, if (id < 0) goto bad_opt; - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v); + ret = bch2_opt_parse(c, "mount option ", + &bch2_opt_table[id], val, &v); if (ret < 0) goto bad_val; } else { @@ -369,7 +396,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, goto no_val; } - if (!(bch2_opt_table[id].mode & OPT_MOUNT)) + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; if (id == Opt_acl && @@ -404,6 +431,65 @@ out: return ret; } +/* + * Initial options from superblock - here we don't want any options undefined, + * any options the superblock doesn't specify are set to 0: + */ +int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) +{ + unsigned id; + int ret; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + if (opt->get_sb == NO_SB_OPT) + continue; + + v = opt->get_sb(sb); + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = 1ULL << v; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v <<= 9; + + ret = bch2_opt_validate(opt, "superblock option ", v); + if (ret) + return ret; + + bch2_opt_set_by_id(opts, id, v); + } + + return 0; +} + +void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_NO_SB_OPT) + return; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v >>= 9; + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = ilog2(v); + + opt->set_sb(sb, v); +} + +void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_NO_SB_OPT) + return; + + mutex_lock(&c->sb_lock); + __bch2_opt_set_sb(c->disk_sb.sb, opt, v); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + /* io opts: */ struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 003c00f..affe923 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -12,14 +12,23 @@ extern const char * const bch2_error_actions[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const bch2_btree_ids[]; +extern const char * const bch2_csum_types[]; extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_str_hash_opts[]; extern const char * const bch2_data_types[]; -extern const char * const bch2_cache_replacement_policies[]; extern const char * 
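bch2_opt_parse() now funnels every value through bch2_opt_validate(), which applies the option's min/max plus the new flag-driven checks: a multiple of 512 for sector-valued superblock fields, and a power of two where required. A compact userspace sketch of those checks; the struct here is simplified and only the flag names follow the patch:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define OPT_SB_FIELD_SECTORS    (1 << 0)
#define OPT_MUST_BE_POW_2       (1 << 1)

struct option_desc {
        const char *name;
        uint64_t min, max;      /* max == 0: no upper bound */
        unsigned flags;
};

static int opt_validate(const struct option_desc *opt, uint64_t v)
{
        if (v < opt->min)
                return -ERANGE;
        if (opt->max && v >= opt->max)
                return -ERANGE;
        if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511))
                return -EINVAL;
        if ((opt->flags & OPT_MUST_BE_POW_2) && (v & (v - 1)))
                return -EINVAL;     /* not a power of two */
        return 0;
}

int main(void)
{
        struct option_desc block_size = {
                .name  = "block_size",
                .min   = 512,
                .max   = 1U << 16,
                .flags = OPT_SB_FIELD_SECTORS | OPT_MUST_BE_POW_2,
        };

        printf("%d %d\n",
               opt_validate(&block_size, 4096),
               opt_validate(&block_size, 4095));
        return 0;
}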
const bch2_member_states[]; +extern const char * const bch2_jset_entry_types[]; +extern const char * const bch2_fs_usage_types[]; extern const char * const bch2_d_types[]; +static inline const char *bch2_d_type_str(unsigned d_type) +{ + return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; +} + /* * Mount options; we also store defaults in the superblock. * @@ -36,18 +45,22 @@ extern const char * const bch2_d_types[]; LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); /* When can be set: */ -enum opt_mode { - OPT_FORMAT = (1 << 0), - OPT_MOUNT = (1 << 1), - OPT_RUNTIME = (1 << 2), - OPT_INODE = (1 << 3), - OPT_DEVICE = (1 << 4), +enum opt_flags { + OPT_FS = (1 << 0), /* Filesystem option */ + OPT_DEVICE = (1 << 1), /* Device option */ + OPT_INODE = (1 << 2), /* Inode option */ + OPT_FORMAT = (1 << 3), /* May be specified at format time */ + OPT_MOUNT = (1 << 4), /* May be specified at mount time */ + OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ + OPT_HUMAN_READABLE = (1 << 6), + OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ + OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ + OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ }; enum opt_type { BCH_OPT_BOOL, BCH_OPT_UINT, - BCH_OPT_SECTORS, BCH_OPT_STR, BCH_OPT_FN, }; @@ -72,223 +85,242 @@ enum opt_type { */ #ifdef __KERNEL__ -#define RATELIMIT_ERRORS true +#define RATELIMIT_ERRORS_DEFAULT true #else -#define RATELIMIT_ERRORS false +#define RATELIMIT_ERRORS_DEFAULT false #endif #define BCH_OPTS() \ x(block_size, u16, \ - OPT_FORMAT, \ - OPT_SECTORS(1, 128), \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 16), \ BCH_SB_BLOCK_SIZE, 8, \ "size", NULL) \ - x(btree_node_size, u16, \ - OPT_FORMAT, \ - OPT_SECTORS(1, 512), \ + x(btree_node_size, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 20), \ BCH_SB_BTREE_NODE_SIZE, 512, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ NULL, "Action to take on filesystem error") \ x(metadata_replicas, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_META_REPLICAS_WANT, 1, \ "#", "Number of metadata replicas") \ x(data_replicas, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_DATA_REPLICAS_WANT, 1, \ "#", "Number of data replicas") \ x(metadata_replicas_required, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_META_REPLICAS_REQ, 1, \ "#", NULL) \ x(data_replicas_required, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ BCH_SB_DATA_REPLICAS_REQ, 1, \ "#", NULL) \ + x(encoded_extent_max, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ + OPT_UINT(4096, 2U << 20), \ + BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ + "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ 
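The OPT_SB_FIELD_SECTORS and OPT_SB_FIELD_ILOG2 flags describe how an option's superblock encoding differs from its runtime value: stored in sectors (shifted down by 9) and/or as a log2. bch2_opts_from_sb() undoes those transforms when reading and __bch2_opt_set_sb() applies them when writing. A standalone sketch of both directions, using encoded_extent_max's default of 64k as the test value:

#include <stdint.h>
#include <stdio.h>

#define OPT_SB_FIELD_SECTORS    (1 << 0)
#define OPT_SB_FIELD_ILOG2      (1 << 1)

static uint64_t sb_field_encode(unsigned flags, uint64_t v)
{
        if (flags & OPT_SB_FIELD_SECTORS)
                v >>= 9;
        if (flags & OPT_SB_FIELD_ILOG2)
                v = 63 - __builtin_clzll(v);    /* ilog2() analogue */
        return v;
}

static uint64_t sb_field_decode(unsigned flags, uint64_t v)
{
        if (flags & OPT_SB_FIELD_ILOG2)
                v = 1ULL << v;
        if (flags & OPT_SB_FIELD_SECTORS)
                v <<= 9;
        return v;
}

int main(void)
{
        unsigned flags = OPT_SB_FIELD_SECTORS | OPT_SB_FIELD_ILOG2;
        uint64_t encoded = sb_field_encode(flags, 64 << 10);

        printf("encoded %llu decoded %llu\n",
               (unsigned long long) encoded,
               (unsigned long long) sb_field_decode(flags, encoded));
        return 0;
}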
x(data_checksum, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(compression, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_compression_opts), \ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(background_compression, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_compression_opts), \ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(str_hash, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_str_hash_types), \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_opts), \ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ "(target)", "Device or disk group for metadata writes") \ x(foreground_target, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_FOREGROUND_TARGET, 0, \ "(target)", "Device or disk group for foreground writes") \ x(background_target, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_BACKGROUND_TARGET, 0, \ "(target)", "Device or disk group to move data to in the background")\ x(promote_target, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0, \ "(target)", "Device or disk group to promote data to on read")\ x(erasure_code, u16, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ x(shard_inode_numbers, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH_SB_SHARD_INUMS, false, \ + BCH_SB_SHARD_INUMS, true, \ NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_INODES_USE_KEY_CACHE, true, \ NULL, "Use the btree key cache for the inodes btree") \ x(btree_node_mem_ptr_optimization, u8, \ - OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ x(gc_reserve_percent, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ BCH_SB_GC_RESERVE, 8, \ "%", "Percentage of disk space to reserve for copygc")\ x(gc_reserve_bytes, u64, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_SECTORS(0, U64_MAX), \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ + OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(0, U64_MAX), \ BCH_SB_GC_RESERVE_BYTES, 0, \ "%", "Amount of disk space to reserve for copygc\n" \ "Takes precedence over gc_reserve_percent if set")\ 
x(root_reserve_percent, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_UINT(0, 100), \ BCH_SB_ROOT_RESERVE, 0, \ "%", "Percentage of disk space to reserve for superuser")\ x(wide_macs, u8, \ - OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_128_BIT_MACS, false, \ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ x(inline_data, u8, \ - OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ NO_SB_OPT, true, \ NULL, "Enable inline data extents") \ x(acl, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_POSIX_ACL, true, \ NULL, "Enable POSIX acls") \ x(usrquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_USRQUOTA, false, \ NULL, "Enable user quotas") \ x(grpquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_GRPQUOTA, false, \ NULL, "Enable group quotas") \ x(prjquota, u8, \ - OPT_FORMAT|OPT_MOUNT, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ BCH_SB_PRJQUOTA, false, \ NULL, "Enable project quotas") \ x(degraded, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Allow mounting in degraded mode") \ x(very_degraded, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ x(discard, u8, \ - OPT_MOUNT|OPT_DEVICE, \ + OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_delay, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U32_MAX), \ + BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ + NULL, "Delay in milliseconds before automatic journal commits")\ x(journal_flush_disabled, u8, \ - OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ NULL, "Disable journal flush on sync/fsync\n" \ "If enabled, writes can be lost, but only since the\n"\ "last journal write (default 1 second)") \ + x(journal_reclaim_delay, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U32_MAX), \ + BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ + NULL, "Delay in milliseconds before automatic journal reclaim")\ x(fsck, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ x(fix_errors, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Fix errors during fsck without asking") \ x(ratelimit_errors, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, RATELIMIT_ERRORS, \ + NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ NULL, "Ratelimit error messages during fsck") \ x(nochanges, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Super read only mode - no writes at all will be issued,\n"\ "even if we have to replay the journal") \ x(norecovery, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ x(rebuild_replicas, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ - OPT_MOUNT, \ + 0, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't free journal entries/keys after startup")\ @@ -297,8 +329,13 @@ enum opt_type { OPT_BOOL(), \ 
NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ + NULL, "Log transaction function names in journal") \ x(noexcl, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't open device in exclusive mode") \ @@ -308,7 +345,7 @@ enum opt_type { NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ - 0, \ + OPT_FS, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, NULL) \ @@ -318,16 +355,21 @@ enum opt_type { NO_SB_OPT, false, \ NULL, "Don\'t start filesystem, only open devices") \ x(reconstruct_alloc, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ - OPT_MOUNT, \ + OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ + x(buckets_nouse, u8, \ + 0, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Allocate the buckets_nouse bitmap") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ @@ -335,12 +377,12 @@ enum opt_type { NULL, NULL) \ x(fs_size, u64, \ OPT_DEVICE, \ - OPT_SECTORS(0, S64_MAX), \ + OPT_UINT(0, S64_MAX), \ NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(bucket, u32, \ OPT_DEVICE, \ - OPT_SECTORS(0, S64_MAX), \ + OPT_UINT(0, S64_MAX), \ NO_SB_OPT, 0, \ "size", "Size of filesystem on device") \ x(durability, u8, \ @@ -399,13 +441,14 @@ struct printbuf; struct bch_option { struct attribute attr; + u64 (*get_sb)(const struct bch_sb *); void (*set_sb)(struct bch_sb *, u64); - enum opt_mode mode; enum opt_type type; + enum opt_flags flags; + u64 min, max; union { struct { - u64 min, max; }; struct { const char * const *choices; @@ -427,10 +470,13 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -struct bch_opts bch2_opts_from_sb(struct bch_sb *); +int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); +void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); +void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); -int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); +int bch2_opt_parse(struct bch_fs *, const char *, const struct bch_option *, + const char *, u64 *); #define OPT_SHOW_FULL_LIST (1 << 0) #define OPT_SHOW_MOUNT_STYLE (1 << 1) diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 7861781..6fb8224 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -3,17 +3,20 @@ #include "btree_update.h" #include "inode.h" #include "quota.h" +#include "subvolume.h" #include "super-io.h" -static const char *bch2_sb_validate_quota(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_quota(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_quota *q = field_to_type(f, quota); - if (vstruct_bytes(&q->field) != sizeof(*q)) - return "invalid field quota: wrong size"; + if (vstruct_bytes(&q->field) < sizeof(*q)) { + pr_buf(err, "wrong size (got %llu should be %zu)", + vstruct_bytes(&q->field), sizeof(*q)); + } - return NULL; + return 0; } const struct bch_sb_field_ops bch_sb_field_ops_quota = { @@ -357,7 +360,7 @@ static int __bch2_quota_set(struct bch_fs *c, struct 
bkey_s_c k) static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; @@ -372,9 +375,10 @@ static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) if (ret) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); + return ret; } void bch2_fs_quota_exit(struct bch_fs *c) @@ -414,14 +418,55 @@ static void bch2_sb_quota_read(struct bch_fs *c) } } +static int bch2_fs_quota_read_inode(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct bch_subvolume subvolume; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + return ret; + + if (!k.k) + return 1; + + ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); + if (ret) + return ret; + + /* + * We don't do quota accounting in snapshots: + */ + if (BCH_SUBVOLUME_SNAP(&subvolume)) + goto advance; + + if (!bkey_is_inode(k.k)) + goto advance; + + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + KEY_TYPE_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + KEY_TYPE_QUOTA_NOCHECK); +advance: + bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); + return 0; +} + int bch2_fs_quota_read(struct bch_fs *c) { unsigned i, qtypes = enabled_qtypes(c); struct bch_memquota_type *q; struct btree_trans trans; - struct btree_iter *iter; - struct bch_inode_unpacked u; - struct bkey_s_c k; + struct btree_iter iter; int ret; mutex_lock(&c->sb_lock); @@ -436,23 +481,18 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - switch (k.k->type) { - case KEY_TYPE_inode: - ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); - if (ret) - return ret; - - bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - KEY_TYPE_QUOTA_NOCHECK); - bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - KEY_TYPE_QUOTA_NOCHECK); - } - } - bch2_trans_iter_put(&trans, iter); - - return bch2_trans_exit(&trans) ?: ret; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + do { + ret = lockrestart_do(&trans, + bch2_fs_quota_read_inode(&trans, &iter)); + } while (!ret); + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; } /* Enable/disable/delete quotas for an entire filesystem: */ @@ -532,7 +572,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_USR, 0), POS(QTYP_USR + 1, 0), - NULL); + 0, NULL); if (ret) return ret; } @@ -544,7 +584,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_GRP, 0), POS(QTYP_GRP + 1, 0), - NULL); + 0, NULL); if (ret) return ret; } @@ -556,7 +596,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_PRJ, 0), POS(QTYP_PRJ + 1, 0), - NULL); + 0, NULL); if (ret) return ret; } @@ -717,13 +757,13 @@ static int bch2_set_quota_trans(struct btree_trans *trans, struct bkey_i_quota *new_quota, struct qc_dqblk *qdq) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (unlikely(ret)) @@ -742,8 +782,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans, if (qdq->d_fieldmask & QC_INO_HARD) new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0); - bch2_trans_iter_put(trans, iter); + ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index a0dbf41..a573fed 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -166,6 +166,7 @@ static int bch2_rebalance_thread(void *arg) struct bch_fs_rebalance *r = &c->rebalance; struct io_clock *clock = &c->io_clock[WRITE]; struct rebalance_work w, p; + struct bch_move_stats move_stats; unsigned long start, prev_start; unsigned long prev_run_time, prev_run_cputime; unsigned long cputime, prev_cputime; @@ -179,6 +180,7 @@ static int bch2_rebalance_thread(void *arg) prev_start = jiffies; prev_cputime = curr_cputime(); + bch_move_stats_init(&move_stats, "rebalance"); while (!kthread_wait_freezable(r->enabled)) { cond_resched(); @@ -235,7 +237,7 @@ static int bch2_rebalance_thread(void *arg) prev_cputime = cputime; r->state = REBALANCE_RUNNING; - memset(&r->move_stats, 0, sizeof(r->move_stats)); + memset(&move_stats, 0, sizeof(move_stats)); rebalance_work_reset(c); bch2_move_data(c, @@ -245,7 +247,7 @@ static int bch2_rebalance_thread(void *arg) NULL, /* &r->pd.rate, */ writepoint_ptr(&c->rebalance_write_point), rebalance_pred, NULL, - &r->move_stats); + &move_stats); } return 0; @@ -281,10 +283,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) h1); break; case REBALANCE_RUNNING: - pr_buf(out, "running\n" - "pos "); - bch2_bpos_to_text(out, r->move_stats.pos); - pr_buf(out, "\n"); + pr_buf(out, "running\n"); break; } } diff --git a/libbcachefs/rebalance_types.h b/libbcachefs/rebalance_types.h index 2f62a64..7462a92 100644 --- a/libbcachefs/rebalance_types.h +++ b/libbcachefs/rebalance_types.h @@ -19,7 +19,6 @@ struct bch_fs_rebalance { enum rebalance_state state; u64 throttled_until_iotime; unsigned long throttled_until_cputime; - struct bch_move_stats move_stats; unsigned enabled:1; }; diff --git a/libbcachefs/recovery.c 
b/libbcachefs/recovery.c index afb7264..543db58 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -20,6 +20,7 @@ #include "quota.h" #include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include "super-io.h" #include @@ -58,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys) static int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, - struct journal_key *r) + const struct journal_key *r) { return (cmp_int(l_btree_id, r->btree_id) ?: cmp_int(l_level, r->level) ?: bpos_cmp(l_pos, r->k->k.p)); } -static int journal_key_cmp(struct journal_key *l, struct journal_key *r) +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) { - return (cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p)); + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -static size_t journal_key_search(struct journal_keys *journal_keys, - enum btree_id id, unsigned level, - struct bpos pos) +size_t bch2_journal_key_search(struct journal_keys *journal_keys, + enum btree_id id, unsigned level, + struct bpos pos) { size_t l = 0, r = journal_keys->nr, m; @@ -108,18 +107,25 @@ static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsign iter->idx++; } -int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, - unsigned level, struct bkey_i *k) +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) { struct journal_key n = { .btree_id = id, .level = level, .k = k, - .allocated = true + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, }; struct journal_keys *keys = &c->journal_keys; struct journal_iter *iter; - unsigned idx = journal_key_search(keys, id, level, k->k.p); + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); if (idx < keys->nr && journal_key_cmp(&n, &keys->d[idx]) == 0) { @@ -156,38 +162,66 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, return 0; } -int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, - unsigned level, struct bpos pos) +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) { - struct bkey_i *whiteout = - kmalloc(sizeof(struct bkey), GFP_KERNEL); + struct bkey_i *n; int ret; - if (!whiteout) { - bch_err(c, "%s: error allocating new key", __func__); + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) return -ENOMEM; - } - - bkey_init(&whiteout->k); - whiteout->k.p = pos; - ret = bch2_journal_key_insert(c, id, level, whiteout); + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, n); if (ret) - kfree(whiteout); + kfree(n); return ret; } +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i whiteout; + + bkey_init(&whiteout.k); + whiteout.k.p = pos; + + return bch2_journal_key_insert(c, id, level, &whiteout); +} + +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->nr && + 
keys->d[idx].btree_id == btree && + keys->d[idx].level == level && + !bpos_cmp(keys->d[idx].k->k.p, pos)) + keys->d[idx].overwritten = true; +} + static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->idx - iter->keys->nr - ? iter->keys->d + iter->idx : NULL; + struct journal_key *k = iter->keys->d + iter->idx; + + while (k < iter->keys->d + iter->keys->nr && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return k->k; - if (k && - k->btree_id == iter->btree_id && - k->level == iter->level) - return k->k; + iter->idx++; + k = iter->keys->d + iter->idx; + } - iter->idx = iter->keys->nr; return NULL; } @@ -210,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c, iter->btree_id = id; iter->level = level; iter->keys = &c->journal_keys; - iter->idx = journal_key_search(&c->journal_keys, id, level, pos); - list_add(&iter->list, &c->journal_iters); + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); } static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) @@ -297,104 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, - struct btree *b) +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) { memset(iter, 0, sizeof(*iter)); iter->b = b; - bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); - bch2_journal_iter_init(c, &iter->journal, - b->c.btree_id, b->c.level, b->data->min_key); -} - -/* Walk btree, overlaying keys from the journal: */ - -static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b, - struct btree_and_journal_iter iter) -{ - unsigned i = 0, nr = b->c.level > 1 ? 
2 : 16; - struct bkey_s_c k; - struct bkey_buf tmp; - - BUG_ON(!b->c.level); - - bch2_bkey_buf_init(&tmp); - - while (i < nr && - (k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_bkey_buf_reassemble(&tmp, c, k); - - bch2_btree_node_prefetch(c, NULL, tmp.k, - b->c.btree_id, b->c.level - 1); - - bch2_btree_and_journal_iter_advance(&iter); - i++; - } - - bch2_bkey_buf_exit(&tmp, c); -} - -static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, - enum btree_id btree_id, - btree_walk_key_fn key_fn) -{ - struct btree_and_journal_iter iter; - struct bkey_s_c k; - struct bkey_buf tmp; - struct btree *child; - int ret = 0; - - bch2_bkey_buf_init(&tmp); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); - - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - if (b->c.level) { - bch2_bkey_buf_reassemble(&tmp, c, k); - - child = bch2_btree_node_get_noiter(c, tmp.k, - b->c.btree_id, b->c.level - 1, - false); - - ret = PTR_ERR_OR_ZERO(child); - if (ret) - break; - - btree_and_journal_iter_prefetch(c, b, iter); - - ret = bch2_btree_and_journal_walk_recurse(c, child, - btree_id, key_fn); - six_unlock_read(&child->c.lock); - } else { - ret = key_fn(c, k); - } - - if (ret) - break; - - bch2_btree_and_journal_iter_advance(&iter); - } - - bch2_btree_and_journal_iter_exit(&iter); - bch2_bkey_buf_exit(&tmp, c); - return ret; + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); } -int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id, - btree_walk_key_fn key_fn) +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) { - struct btree *b = c->btree_roots[btree_id].b; - int ret = 0; - - if (btree_node_fake(b)) - return 0; - - six_lock_read(&b->c.lock, NULL, NULL); - ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn); - six_unlock_read(&b->c.lock); + struct btree_node_iter node_iter; - return ret; + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); } /* sort and dedup all keys in the journal: */ @@ -419,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) const struct journal_key *l = _l; const struct journal_key *r = _r; - return cmp_int(l->btree_id, r->btree_id) ?: - cmp_int(l->level, r->level) ?: - bpos_cmp(l->k->k.p, r->k->k.p) ?: + return journal_key_cmp(l, r) ?: cmp_int(l->journal_seq, r->journal_seq) ?: cmp_int(l->journal_offset, r->journal_offset); } @@ -514,140 +474,84 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int __bch2_journal_replay_key(struct btree_trans *trans, - enum btree_id id, unsigned level, - struct bkey_i *k) +static int bch2_journal_replay_key(struct btree_trans *trans, + struct journal_key *k) { - struct btree_iter *iter; + struct btree_iter iter; + unsigned iter_flags = + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS; int ret; - iter = bch2_trans_get_node_iter(trans, id, k->k.p, - BTREE_MAX_DEPTH, level, - BTREE_ITER_INTENT| - BTREE_ITER_NOT_EXTENTS); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); - bch2_trans_iter_put(trans, iter); 
- return ret; -} - -static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k) -{ - unsigned commit_flags = BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW; - - if (!k->allocated) - commit_flags |= BTREE_INSERT_JOURNAL_REPLAY; + if (!k->level && k->btree_id == BTREE_ID_alloc) + iter_flags |= BTREE_ITER_CACHED; - return bch2_trans_do(c, NULL, NULL, commit_flags, - __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k)); -} + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + iter_flags); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; -static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) -{ - struct btree_iter *iter; - int ret; + /* Must be checked with btree locked: */ + if (k->overwritten) + goto out; - iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p, - BTREE_ITER_CACHED| - BTREE_ITER_CACHED_NOFILL| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); - bch2_trans_iter_put(trans, iter); + ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); +out: + bch2_trans_iter_exit(trans, &iter); return ret; } -static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - return bch2_trans_do(c, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY, - __bch2_alloc_replay_key(&trans, k)); -} - static int journal_sort_seq_cmp(const void *_l, const void *_r) { - const struct journal_key *l = _l; - const struct journal_key *r = _r; + const struct journal_key *l = *((const struct journal_key **)_l); + const struct journal_key *r = *((const struct journal_key **)_r); - return cmp_int(r->level, l->level) ?: - cmp_int(l->journal_seq, r->journal_seq) ?: - cmp_int(l->btree_id, r->btree_id) ?: - bpos_cmp(l->k->k.p, r->k->k.p); + return cmp_int(l->journal_seq, r->journal_seq); } -static int bch2_journal_replay(struct bch_fs *c, - struct journal_keys keys) +static int bch2_journal_replay(struct bch_fs *c) { + struct journal_keys *keys = &c->journal_keys; + struct journal_key **keys_sorted, *k; struct journal *j = &c->journal; - struct journal_key *i; - u64 seq; + size_t i; int ret; - sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); + keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); + if (!keys_sorted) + return -ENOMEM; - if (keys.nr) - replay_now_at(j, keys.journal_seq_base); + for (i = 0; i < keys->nr; i++) + keys_sorted[i] = &keys->d[i]; - seq = j->replay_journal_seq; + sort(keys_sorted, keys->nr, + sizeof(keys_sorted[0]), + journal_sort_seq_cmp, NULL); - /* - * First replay updates to the alloc btree - these will only update the - * btree key cache: - */ - for_each_journal_key(keys, i) { - cond_resched(); + if (keys->nr) + replay_now_at(j, keys->journal_seq_base); - if (!i->level && i->btree_id == BTREE_ID_alloc) { - j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_alloc_replay_key(c, i->k); - if (ret) - goto err; - } - } + for (i = 0; i < keys->nr; i++) { + k = keys_sorted[i]; - /* - * Next replay updates to interior btree nodes: - */ - for_each_journal_key(keys, i) { cond_resched(); - if (i->level) { - j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; - ret = bch2_journal_replay_key(c, i); - if (ret) - goto err; - } - } - - /* - * Now that the btree is in a consistent state, we can start journal - * reclaim (which will be flushing entries from the btree 
key cache back - * to the btree: - */ - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); - set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); - journal_reclaim_kick(j); - - j->replay_journal_seq = seq; - - /* - * Now replay leaf node updates: - */ - for_each_journal_key(keys, i) { - cond_resched(); + if (!k->allocated) + replay_now_at(j, keys->journal_seq_base + k->journal_seq); - if (i->level || i->btree_id == BTREE_ID_alloc) - continue; - - replay_now_at(j, keys.journal_seq_base + i->journal_seq); - - ret = bch2_journal_replay_key(c, i); - if (ret) + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RESERVED| + (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0), + bch2_journal_replay_key(&trans, k)); + if (ret) { + bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", + ret, bch2_btree_ids[k->btree_id], k->level); goto err; + } } replay_now_at(j, j->replay_journal_seq_end); @@ -655,10 +559,9 @@ static int bch2_journal_replay(struct bch_fs *c, bch2_journal_set_replay_done(j); bch2_journal_flush_all_pins(j); - return bch2_journal_error(j); + ret = bch2_journal_error(j); err: - bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", - ret, bch2_btree_ids[i->btree_id], i->level); + kvfree(keys_sorted); return ret; } @@ -696,15 +599,15 @@ static int journal_replay_entry_early(struct bch_fs *c, container_of(entry, struct jset_entry_usage, entry); switch (entry->btree_id) { - case FS_USAGE_RESERVED: + case BCH_FS_USAGE_reserved: if (entry->level < BCH_REPLICAS_MAX) c->usage_base->persistent_reserved[entry->level] = le64_to_cpu(u->v); break; - case FS_USAGE_INODES: + case BCH_FS_USAGE_inodes: c->usage_base->nr_inodes = le64_to_cpu(u->v); break; - case FS_USAGE_KEY_VERSION: + case BCH_FS_USAGE_key_version: atomic64_set(&c->key_version, le64_to_cpu(u->v)); break; @@ -724,10 +627,7 @@ static int journal_replay_entry_early(struct bch_fs *c, struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); - unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); - unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) / - sizeof(struct jset_entry_dev_usage_type); - unsigned i; + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable); @@ -961,6 +861,73 @@ fsck_err: return ret; } +static int bch2_fs_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; + root_snapshot.v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + ret = bch2_btree_insert(c, BTREE_ID_snapshots, + &root_snapshot.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_subvolumes, + &root_volume.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + return 0; +} + +static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct btree_iter iter; + 
struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch_err(trans->c, "root inode not found"); + ret = -ENOENT; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_fs_recovery(struct bch_fs *c) { const char *err = "cannot allocate memory"; @@ -979,6 +946,8 @@ int bch2_fs_recovery(struct bch_fs *c) if (c->sb.clean) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); + else + bch_info(c, "recovering from unclean shutdown"); if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); @@ -997,7 +966,6 @@ int bch2_fs_recovery(struct bch_fs *c) bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); ret = -EINVAL; goto err; - } if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { @@ -1012,16 +980,20 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } - if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { - bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); - c->opts.version_upgrade = true; - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } - - if (c->sb.version < bcachefs_metadata_version_btree_ptr_sectors_written) { - bch_info(c, "version prior to btree_ptr_sectors_written, upgrade required"); - c->opts.version_upgrade = true; + if (!c->opts.nochanges) { + if (c->sb.version < bcachefs_metadata_version_inode_backpointers) { + bch_info(c, "version prior to inode backpointers, upgrade and fsck required"); + c->opts.version_upgrade = true; + c->opts.fsck = true; + c->opts.fix_errors = FSCK_OPT_YES; + } else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) { + bch_info(c, "filesystem version is prior to subvol_dirent - upgrading"); + c->opts.version_upgrade = true; + c->opts.fsck = true; + } else if (c->sb.version < bcachefs_metadata_version_inode_v2) { + bch_info(c, "filesystem version is prior to inode_v2 - upgrading"); + c->opts.version_upgrade = true; + } } ret = bch2_blacklist_table_initialize(c); @@ -1033,6 +1005,7 @@ int bch2_fs_recovery(struct bch_fs *c) if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { struct journal_replay *i; + bch_verbose(c, "starting journal read"); ret = bch2_journal_read(c, &c->journal_entries, &blacklist_seq, &journal_seq); if (ret) @@ -1122,7 +1095,11 @@ use_clean: bch_verbose(c, "starting alloc read"); err = "error reading allocation information"; - ret = bch2_alloc_read(c); + + down_read(&c->gc_lock); + ret = bch2_alloc_read(c, false, false); + up_read(&c->gc_lock); + if (ret) goto err; bch_verbose(c, "alloc read done"); @@ -1136,18 +1113,25 @@ use_clean: set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + /* + * If we're not running fsck, this ensures bch2_fsck_err() calls are + * instead interpreted as bch2_inconsistent_err() calls: + */ + if (!c->opts.fsck) + set_bit(BCH_FS_FSCK_DONE, &c->flags); + if (c->opts.fsck || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) || !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) || test_bit(BCH_FS_REBUILD_REPLICAS, 
&c->flags)) { bool metadata_only = c->opts.norecovery; - bch_info(c, "starting mark and sweep"); + bch_info(c, "checking allocations"); err = "error in mark and sweep"; ret = bch2_gc(c, true, metadata_only); if (ret) goto err; - bch_verbose(c, "mark and sweep done"); + bch_verbose(c, "done checking allocations"); } bch2_stripes_heap_start(c); @@ -1165,29 +1149,37 @@ use_clean: if (c->opts.norecovery) goto out; - bch_verbose(c, "starting journal replay"); + bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); err = "journal replay failed"; - ret = bch2_journal_replay(c, c->journal_keys); + ret = bch2_journal_replay(c); if (ret) goto err; - bch_verbose(c, "journal replay done"); + if (c->opts.verbose || !c->sb.clean) + bch_info(c, "journal replay done"); - if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) && - !c->opts.nochanges) { - /* - * note that even when filesystem was clean there might be work - * to do here, if we ran gc (because of fsck) which recalculated - * oldest_gen: - */ - bch_verbose(c, "writing allocation info"); - err = "error writing out alloc info"; - ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: - bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error writing alloc info"); + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + bch2_fs_lazy_rw(c); + + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + } + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + + if (c->sb.version < bcachefs_metadata_version_snapshot_2) { + /* set bi_subvol on root inode */ + err = "error upgrade root inode for subvolumes"; + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_fs_upgrade_for_subvolumes(&trans)); + if (ret) goto err; - } - bch_verbose(c, "alloc write done"); } if (c->opts.fsck) { @@ -1214,21 +1206,6 @@ use_clean: bch_verbose(c, "quotas done"); } - if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || - !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { - struct bch_move_stats stats = { 0 }; - - bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c); - if (ret) - goto err; - - ret = bch2_scan_old_btree_nodes(c, &stats); - if (ret) - goto err; - bch_info(c, "scanning for old btree nodes done"); - } - mutex_lock(&c->sb_lock); if (c->opts.version_upgrade) { c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); @@ -1253,6 +1230,24 @@ use_clean: bch2_write_super(c); mutex_unlock(&c->sb_lock); + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || + le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + + bch_move_stats_init(&stats, "recovery"); + + bch_info(c, "scanning for old btree nodes"); + ret = bch2_fs_read_write(c); + if (ret) + goto err; + + ret = bch2_scan_old_btree_nodes(c, &stats); + if (ret) + goto err; + bch_info(c, "scanning for old btree nodes done"); + } + if (c->journal_seq_blacklist_table && c->journal_seq_blacklist_table->nr > 128) queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); @@ -1300,20 +1295,15 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); bch2_write_super(c); } - - for_each_online_member(ca, 
c, i) - bch2_mark_dev_superblock(c, ca, 0); mutex_unlock(&c->sb_lock); set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); - set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); - set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); - err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); @@ -1346,11 +1336,26 @@ int bch2_fs_initialize(struct bch_fs *c) percpu_ref_put(&ca->ref); goto err; } + + ca->new_fs_bucket_idx = 0; } + err = "error creating root snapshot node"; + ret = bch2_fs_initialize_subvolumes(c); + if (ret) + goto err; + + bch_verbose(c, "reading snapshots table"); + err = "error reading snapshots table"; + ret = bch2_fs_snapshots_start(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); - root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; bch2_inode_pack(c, &packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; @@ -1365,11 +1370,12 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating lost+found"; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + bch2_create_trans(&trans, + BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, &lostfound, 0, 0, S_IFDIR|0700, 0, - NULL, NULL)); + NULL, NULL, (subvol_inum) { 0 }, 0)); if (ret) { bch_err(c, "error creating lost+found"); goto err; @@ -1382,7 +1388,7 @@ int bch2_fs_initialize(struct bch_fs *c) } err = "error writing first journal entry"; - ret = bch2_journal_meta(&c->journal); + ret = bch2_journal_flush(&c->journal); if (ret) goto err; diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index e5565e4..21bdad9 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -31,24 +31,30 @@ struct btree_and_journal_iter { } last; }; +size_t bch2_journal_key_search(struct journal_keys *, enum btree_id, + unsigned, struct bpos); + +int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, + unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, struct bch_fs *, struct btree *); -typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k); - -int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn); - void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct list_head *); diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 3d9c5c5..c8d6d73 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -7,6 +7,7 @@ 
#include "inode.h" #include "io.h" #include "reflink.h" +#include "subvolume.h" #include @@ -31,6 +32,10 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) if (bkey_val_bytes(p.k) != sizeof(*p.v)) return "incorrect value size"; + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && + le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) + return "idx < front_pad"; + return NULL; } @@ -39,7 +44,10 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); + pr_buf(out, "idx %llu front_pad %u back_pad %u", + le64_to_cpu(p.v->idx), + le32_to_cpu(p.v->front_pad), + le32_to_cpu(p.v->back_pad)); } bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) @@ -116,7 +124,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, struct bkey_i *orig) { struct bch_fs *c = trans->c; - struct btree_iter *reflink_iter; + struct btree_iter reflink_iter = { NULL }; struct bkey_s_c k; struct bkey_i *r_v; struct bkey_i_reflink_p *r_p; @@ -126,11 +134,11 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (orig->k.type == KEY_TYPE_inline_data) bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink, + for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, POS(0, c->reflink_hint), BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { - if (reflink_iter->pos.inode) { - bch2_btree_iter_set_pos(reflink_iter, POS_MIN); + if (reflink_iter.pos.inode) { + bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); continue; } @@ -142,7 +150,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, goto err; /* rewind iter to start of hole, if necessary: */ - bch2_btree_iter_set_pos_to_extent_start(reflink_iter); + bch2_btree_iter_set_pos_to_extent_start(&reflink_iter); r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); @@ -151,7 +159,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bkey_init(&r_v->k); r_v->k.type = bkey_type_to_indirect(&orig->k); - r_v->k.p = reflink_iter->pos; + r_v->k.p = reflink_iter.pos; bch2_key_resize(&r_v->k, orig->k.size); r_v->k.version = orig->k.version; @@ -161,20 +169,26 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, *refcount = 0; memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); - ret = bch2_trans_update(trans, reflink_iter, r_v, 0); + ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); if (ret) goto err; + /* + * orig is in a bkey_buf which statically allocates 5 64s for the val, + * so we know it will be big enough: + */ orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + memset(&r_p->v, 0, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); - ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); err: - if (!IS_ERR(reflink_iter)) - c->reflink_hint = reflink_iter->pos.offset; - bch2_trans_iter_put(trans, reflink_iter); + c->reflink_hint = reflink_iter.pos.offset; + bch2_trans_iter_exit(trans, &reflink_iter); return ret; } @@ -184,7 +198,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_continue(iter, 0, k, ret) { + 
for_each_btree_key_continue_norestart(*iter, 0, k, ret) { if (bkey_cmp(iter->pos, end) >= 0) break; @@ -198,17 +212,21 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) } s64 bch2_remap_range(struct bch_fs *c, - struct bpos dst_start, struct bpos src_start, - u64 remap_sectors, u64 *journal_seq, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, + u64 remap_sectors, u64 new_i_size, s64 *i_sectors_delta) { struct btree_trans trans; - struct btree_iter *dst_iter, *src_iter; + struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; + struct bpos dst_start = POS(dst_inum.inum, dst_offset); + struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; struct bpos src_want; u64 dst_done; + u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; if (!percpu_ref_tryget(&c->writes)) @@ -223,13 +241,13 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_init(&new_src); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); - src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start, - BTREE_ITER_INTENT); - dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start, - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, + BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); while ((ret == 0 || ret == -EINTR) && - bkey_cmp(dst_iter->pos, dst_end) < 0) { + bkey_cmp(dst_iter.pos, dst_end) < 0) { struct disk_reservation disk_res = { 0 }; bch2_trans_begin(&trans); @@ -239,31 +257,45 @@ s64 bch2_remap_range(struct bch_fs *c, break; } - dst_done = dst_iter->pos.offset - dst_start.offset; + ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + &src_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + + ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, + &dst_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(src_iter, src_want); + bch2_btree_iter_set_pos(&src_iter, src_want); - src_k = get_next_src(src_iter, src_end); + src_k = get_next_src(&src_iter, src_end); ret = bkey_err(src_k); if (ret) continue; - if (bkey_cmp(src_want, src_iter->pos) < 0) { - ret = bch2_fpunch_at(&trans, dst_iter, - bpos_min(dst_end, - POS(dst_iter->pos.inode, dst_iter->pos.offset + - src_iter->pos.offset - src_want.offset)), - journal_seq, i_sectors_delta); + if (bkey_cmp(src_want, src_iter.pos) < 0) { + ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, + min(dst_end.offset, + dst_iter.pos.offset + + src_iter.pos.offset - src_want.offset), + i_sectors_delta); continue; } if (src_k.k->type != KEY_TYPE_reflink_p) { - bch2_btree_iter_set_pos_to_extent_start(src_iter); + bch2_btree_iter_set_pos_to_extent_start(&src_iter); bch2_bkey_buf_reassemble(&new_src, c, src_k); src_k = bkey_i_to_s_c(new_src.k); - ret = bch2_make_extent_indirect(&trans, src_iter, + ret = bch2_make_extent_indirect(&trans, &src_iter, new_src.k); if (ret) continue; @@ -286,46 +318,47 @@ s64 bch2_remap_range(struct bch_fs *c, BUG(); } - new_dst.k->k.p = dst_iter->pos; + new_dst.k->k.p = dst_iter.pos; bch2_key_resize(&new_dst.k->k, min(src_k.k->p.offset - src_want.offset, - dst_end.offset - dst_iter->pos.offset)); - ret = bch2_extent_update(&trans, dst_iter, new_dst.k, - 
&disk_res, journal_seq, + dst_end.offset - dst_iter.pos.offset)); + + ret = bch2_extent_update(&trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, NULL, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); } - bch2_trans_iter_put(&trans, dst_iter); - bch2_trans_iter_put(&trans, src_iter); + bch2_trans_iter_exit(&trans, &dst_iter); + bch2_trans_iter_exit(&trans, &src_iter); - BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end)); - BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); + BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end)); + BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0); - dst_done = dst_iter->pos.offset - dst_start.offset; - new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + dst_done = dst_iter.pos.offset - dst_start.offset; + new_i_size = min(dst_iter.pos.offset << 9, new_i_size); do { struct bch_inode_unpacked inode_u; - struct btree_iter *inode_iter; + struct btree_iter inode_iter = { NULL }; bch2_trans_begin(&trans); - inode_iter = bch2_inode_peek(&trans, &inode_u, - dst_start.inode, BTREE_ITER_INTENT); - ret2 = PTR_ERR_OR_ZERO(inode_iter); + ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, + dst_inum, BTREE_ITER_INTENT); if (!ret2 && inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; - ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, journal_seq, 0); + ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); } - bch2_trans_iter_put(&trans, inode_iter); + bch2_trans_iter_exit(&trans, &inode_iter); } while (ret2 == -EINTR); - ret = bch2_trans_exit(&trans) ?: ret; + bch2_trans_exit(&trans); bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index 68c5cb5..3745873 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) } } -s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, - u64, u64 *, u64, s64 *); +s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, + subvol_inum, u64, u64, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index dbbbcc6..96994b7 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -41,18 +41,19 @@ void bch2_replicas_entry_to_text(struct printbuf *out, { unsigned i; - pr_buf(out, "%s: %u/%u [", - bch2_data_types[e->data_type], - e->nr_required, - e->nr_devs); + if (e->data_type < BCH_DATA_NR) + pr_buf(out, "%s", bch2_data_types[e->data_type]); + else + pr_buf(out, "(invalid data type %u)", e->data_type); + pr_buf(out, ": %u/%u [", e->nr_required, e->nr_devs); for (i = 0; i < e->nr_devs; i++) pr_buf(out, i ? " %u" : "%u", e->devs[i]); pr_buf(out, "]"); } void bch2_cpu_replicas_to_text(struct printbuf *out, - struct bch_replicas_cpu *r) + struct bch_replicas_cpu *r) { struct bch_replicas_entry *e; bool first = true; @@ -413,75 +414,14 @@ err: goto out; } -static int __bch2_mark_replicas(struct bch_fs *c, - struct bch_replicas_entry *r, - bool check) -{ - return likely(bch2_replicas_marked(c, r)) ? 0 - : check ? 
-1 - : bch2_mark_replicas_slowpath(c, r); -} - int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) { - return __bch2_mark_replicas(c, r, false); -} - -static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, - bool check) -{ - struct bch_replicas_padded search; - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - int ret; - - memset(&search, 0, sizeof(search)); - - for (i = 0; i < cached.nr; i++) { - bch2_replicas_entry_cached(&search.e, cached.devs[i]); - - ret = __bch2_mark_replicas(c, &search.e, check); - if (ret) - return ret; - } - - bch2_bkey_to_replicas(&search.e, k); - - ret = __bch2_mark_replicas(c, &search.e, check); - if (ret) - return ret; - - if (search.e.data_type == BCH_DATA_parity) { - search.e.data_type = BCH_DATA_cached; - ret = __bch2_mark_replicas(c, &search.e, check); - if (ret) - return ret; - - search.e.data_type = BCH_DATA_user; - ret = __bch2_mark_replicas(c, &search.e, check); - if (ret) - return ret; - } - - return 0; + return likely(bch2_replicas_marked(c, r)) + ? 0 : bch2_mark_replicas_slowpath(c, r); } /* replicas delta list: */ -bool bch2_replicas_delta_list_marked(struct bch_fs *c, - struct replicas_delta_list *r) -{ - struct replicas_delta *d = r->d; - struct replicas_delta *top = (void *) r->d + r->used; - - percpu_rwsem_assert_held(&c->mark_lock); - - for (d = r->d; d != top; d = replicas_delta_next(d)) - if (bch2_replicas_entry_idx(c, &d->r) < 0) - return false; - return true; -} - int bch2_replicas_delta_list_mark(struct bch_fs *c, struct replicas_delta_list *r) { @@ -494,19 +434,6 @@ int bch2_replicas_delta_list_mark(struct bch_fs *c, return ret; } -/* bkey replicas: */ - -bool bch2_bkey_replicas_marked(struct bch_fs *c, - struct bkey_s_c k) -{ - return __bch2_mark_bkey_replicas(c, k, true) == 0; -} - -int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -{ - return __bch2_mark_bkey_replicas(c, k, false); -} - /* * Old replicas_gc mechanism: only used for journal replicas entries now, should * die at some point: @@ -874,67 +801,78 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, return 0; } -static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) +static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, + struct bch_sb *sb, + struct printbuf *err) { - unsigned i; + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + unsigned i, j; sort_cmp_size(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, memcmp, NULL); - for (i = 0; i + 1 < cpu_r->nr; i++) { - struct bch_replicas_entry *l = + for (i = 0; i < cpu_r->nr; i++) { + struct bch_replicas_entry *e = cpu_replicas_entry(cpu_r, i); - struct bch_replicas_entry *r = - cpu_replicas_entry(cpu_r, i + 1); - - BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); - if (!memcmp(l, r, cpu_r->entry_size)) - return "duplicate replicas entry"; - } + if (e->data_type >= BCH_DATA_NR) { + pr_buf(err, "invalid data type in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - return NULL; -} + if (!e->nr_devs) { + pr_buf(err, "no devices in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } -static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) -{ - struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu cpu_r = { .entries = NULL }; - struct bch_replicas_entry *e; - const char *err; - unsigned i; + if (e->nr_required > 1 && + e->nr_required >= 
e->nr_devs) { + pr_buf(err, "bad nr_required in entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - for_each_replicas_entry(sb_r, e) { - err = "invalid replicas entry: invalid data type"; - if (e->data_type >= BCH_DATA_NR) - goto err; + for (j = 0; j < e->nr_devs; j++) + if (!bch2_dev_exists(sb, mi, e->devs[j])) { + pr_buf(err, "invalid device %u in entry ", e->devs[j]); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } - err = "invalid replicas entry: no devices"; - if (!e->nr_devs) - goto err; + if (i + 1 < cpu_r->nr) { + struct bch_replicas_entry *n = + cpu_replicas_entry(cpu_r, i + 1); - err = "invalid replicas entry: bad nr_required"; - if (e->nr_required > 1 && - e->nr_required >= e->nr_devs) - goto err; + BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); - err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr_devs; i++) - if (!bch2_dev_exists(sb, mi, e->devs[i])) - goto err; + if (!memcmp(e, n, cpu_r->entry_size)) { + pr_buf(err, "duplicate replicas entry "); + bch2_replicas_entry_to_text(err, e); + return -EINVAL; + } + } } - err = "cannot allocate memory"; + return 0; +} + +static int bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); + struct bch_replicas_cpu cpu_r; + int ret; + if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) - goto err; + return -ENOMEM; - err = check_dup_replicas_entries(&cpu_r); -err: + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); - return err; + return ret; } static void bch2_sb_replicas_to_text(struct printbuf *out, @@ -959,38 +897,19 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas = { .to_text = bch2_sb_replicas_to_text, }; -static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) +static int bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); - struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu cpu_r = { .entries = NULL }; - struct bch_replicas_entry_v0 *e; - const char *err; - unsigned i; - - for_each_replicas_entry_v0(sb_r, e) { - err = "invalid replicas entry: invalid data type"; - if (e->data_type >= BCH_DATA_NR) - goto err; - - err = "invalid replicas entry: no devices"; - if (!e->nr_devs) - goto err; - - err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr_devs; i++) - if (!bch2_dev_exists(sb, mi, e->devs[i])) - goto err; - } + struct bch_replicas_cpu cpu_r; + int ret; - err = "cannot allocate memory"; if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) - goto err; + return -ENOMEM; - err = check_dup_replicas_entries(&cpu_r); -err: + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); - return err; + return ret; } const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { @@ -1010,6 +929,9 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; + if (e->data_type == BCH_DATA_cached) + continue; + for (i = 0; i < e->nr_devs; i++) { struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 72ac544..d237d7c 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -48,12 +48,9 @@ replicas_delta_next(struct replicas_delta *d) return 
(void *) d + replicas_entry_bytes(&d->r) + 8; } -bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *); int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); -int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, unsigned dev) diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 2360234..57d6367 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -8,6 +8,7 @@ #include "error.h" #include "inode.h" #include "siphash.h" +#include "subvolume.h" #include "super.h" #include @@ -19,13 +20,13 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) { switch (opt) { case BCH_STR_HASH_OPT_crc32c: - return BCH_STR_HASH_CRC32C; + return BCH_STR_HASH_crc32c; case BCH_STR_HASH_OPT_crc64: - return BCH_STR_HASH_CRC64; + return BCH_STR_HASH_crc64; case BCH_STR_HASH_OPT_siphash: return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) - ? BCH_STR_HASH_SIPHASH - : BCH_STR_HASH_SIPHASH_OLD; + ? BCH_STR_HASH_siphash + : BCH_STR_HASH_siphash_old; default: BUG(); } @@ -50,7 +51,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) .siphash_key = { .k0 = bi->bi_hash_seed } }; - if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { + if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; @@ -76,16 +77,16 @@ static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, const struct bch_hash_info *info) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: ctx->crc32c = crc32c(~0, &info->siphash_key.k0, sizeof(info->siphash_key.k0)); break; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, sizeof(info->siphash_key.k0)); break; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: SipHash24_Init(&ctx->siphash, &info->siphash_key); break; default: @@ -98,14 +99,14 @@ static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, const void *data, size_t len) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: ctx->crc32c = crc32c(ctx->crc32c, data, len); break; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: ctx->crc64 = crc64_be(ctx->crc64, data, len); break; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: SipHash24_Update(&ctx->siphash, data, len); break; default: @@ -117,12 +118,12 @@ static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, const struct bch_hash_info *info) { switch (info->type) { - case BCH_STR_HASH_CRC32C: + case BCH_STR_HASH_crc32c: return ctx->crc32c; - case BCH_STR_HASH_CRC64: + case BCH_STR_HASH_crc64: return ctx->crc64 >> 1; - case BCH_STR_HASH_SIPHASH_OLD: - case BCH_STR_HASH_SIPHASH: + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: return SipHash24_End(&ctx->siphash) >> 1; default: BUG(); @@ -137,28 +138,40 @@ struct bch_hash_desc { u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); bool (*cmp_key)(struct bkey_s_c, const void *); bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); + bool (*is_visible)(subvol_inum inum, struct bkey_s_c); }; -static __always_inline 
struct btree_iter * +static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) +{ + return k.k->type == desc.key_type && + (!desc.is_visible || desc.is_visible(inum, k)); +} + +static __always_inline int bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key, + subvol_inum inum, const void *key, unsigned flags) { - struct btree_iter *iter; struct bkey_s_c k; + u32 snapshot; int ret; - for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|flags, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; - if (k.k->type == desc.key_type) { + if (is_visible_key(desc, inum, k)) { if (!desc.cmp_key(k, key)) - return iter; + return 0; } else if (k.k->type == KEY_TYPE_hash_whiteout) { ; } else { @@ -166,35 +179,38 @@ bch2_hash_lookup(struct btree_trans *trans, break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, iter); - return ERR_PTR(ret ?: -ENOENT); + return ret ?: -ENOENT; } -static __always_inline struct btree_iter * +static __always_inline int bch2_hash_hole(struct btree_trans *trans, + struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { - struct btree_iter *iter; struct bkey_s_c k; + u32 snapshot; int ret; - for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; - if (k.k->type != desc.key_type) - return iter; + if (!is_visible_key(desc, inum, k)) + return 0; } + bch2_trans_iter_exit(trans, iter); - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; - bch2_trans_iter_put(trans, iter); - - return ERR_PTR(ret ?: -ENOSPC); + return ret ?: -ENOSPC; } static __always_inline @@ -203,28 +219,27 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, const struct bch_hash_info *info, struct btree_iter *start) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret; - iter = bch2_trans_copy_iter(trans, start); + bch2_trans_copy_iter(&iter, start); - bch2_btree_iter_advance(iter); + bch2_btree_iter_advance(&iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { + for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; if (k.k->type == desc.key_type && desc.hash_bkey(info, k) <= start->pos.offset) { - iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ret = 1; break; } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -232,20 +247,28 @@ static __always_inline int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, struct bkey_i *insert, int flags) + subvol_inum inum, + struct bkey_i *insert, int flags) { - struct btree_iter *iter, *slot = NULL; + 
struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; + u32 snapshot; int ret; - for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_norestart(trans, iter, desc.btree_id, + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inode) + if (iter.pos.inode != inum.inum) break; - if (k.k->type == desc.key_type) { + if (is_visible_key(desc, inum, k)) { if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) goto found; @@ -253,9 +276,9 @@ int bch2_hash_set(struct btree_trans *trans, continue; } - if (!slot && + if (!slot.path && !(flags & BCH_HASH_SET_MUST_REPLACE)) - slot = bch2_trans_copy_iter(trans, iter); + bch2_trans_copy_iter(&slot, &iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; @@ -264,8 +287,8 @@ int bch2_hash_set(struct btree_trans *trans, if (!ret) ret = -ENOSPC; out: - bch2_trans_iter_put(trans, slot); - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &slot); + bch2_trans_iter_exit(trans, &iter); return ret; found: @@ -277,11 +300,11 @@ not_found: } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { - if (!found && slot) + if (!found && slot.path) swap(iter, slot); - insert->k.p = iter->pos; - ret = bch2_trans_update(trans, iter, insert, 0); + insert->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, insert, 0); } goto out; @@ -291,7 +314,8 @@ static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - struct btree_iter *iter) + struct btree_iter *iter, + unsigned update_flags) { struct bkey_i *delete; int ret; @@ -309,25 +333,25 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? 
KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, 0); + return bch2_trans_update(trans, iter, delete, update_flags); } static __always_inline int bch2_hash_delete(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { - struct btree_iter *iter; + struct btree_iter iter; int ret; - iter = bch2_hash_lookup(trans, desc, info, inode, key, + ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); + if (ret) + return ret; - ret = bch2_hash_delete_at(trans, desc, info, iter); - bch2_trans_iter_put(trans, iter); + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c new file mode 100644 index 0000000..6960332 --- /dev/null +++ b/libbcachefs/subvolume.c @@ -0,0 +1,1089 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "error.h" +#include "fs.h" +#include "subvolume.h" + +/* Snapshot tree: */ + +static void bch2_delete_dead_snapshots_work(struct work_struct *); +static void bch2_delete_dead_snapshots(struct bch_fs *); + +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + + pr_buf(out, "is_subvol %llu deleted %llu parent %u children %u %u subvol %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), + le32_to_cpu(s.v->subvol)); +} + +const char *bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s; + u32 i, id; + + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || + bkey_cmp(k.k->p, POS(0, 1)) < 0) + return "bad pos"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) + return "bad val size"; + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); + if (id && id <= k.k->p.offset) + return "bad parent node"; + + if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) + return "children not normalized"; + + if (s.v->children[0] && + s.v->children[0] == s.v->children[1]) + return "duplicate child nodes"; + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + + if (id >= k.k->p.offset) + return "bad child node"; + } + + return NULL; +} + +int bch2_mark_snapshot(struct btree_trans *trans, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct snapshot_t *t; + + t = genradix_ptr_alloc(&c->snapshots, + U32_MAX - new.k->p.offset, + GFP_KERNEL); + if (!t) + return -ENOMEM; + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; + } else { + t->parent = 0; + t->children[0] = 0; + t->children[1] = 0; + t->subvol = 0; + } + + return 0; +} + +static int snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT; + + if (!ret) + *s = *bkey_s_c_to_snapshot(k).v; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int snapshot_live(struct btree_trans *trans, u32 id) +{ + struct bch_snapshot v; + int ret; + + if (!id) + return 0; + + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) + return ret; + + return !BCH_SNAPSHOT_DELETED(&v); +} + +static int bch2_snapshots_set_equiv(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + unsigned i; + int ret; + + for_each_btree_key(trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 id = k.k->p.offset, child[2]; + unsigned nr_live = 0, live_idx; + + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); + + for (i = 0; i < 2; i++) { + ret = snapshot_live(trans, child[i]); + if (ret < 0) + break; + + if (ret) + live_idx = i; + nr_live += ret; + } + + snapshot_t(c, id)->equiv = nr_live == 1 + ? snapshot_t(c, child[live_idx])->equiv + : id; + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + bch_err(c, "error walking snapshots: %i", ret); + + return ret; +} + +/* fsck: */ +static int bch2_snapshot_check(struct btree_trans *trans, + struct bkey_s_c_snapshot s) +{ + struct bch_subvolume subvol; + struct bch_snapshot v; + u32 i, id; + int ret; + + id = le32_to_cpu(s.v->subvol); + ret = lockrestart_do(trans, bch2_subvolume_get(trans, id, 0, false, &subvol)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent subvolume %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + bch_err(trans->c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + s.k->p.offset); + return -EINVAL; + } + + id = le32_to_cpu(s.v->parent); + if (id) { + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent parent %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (le32_to_cpu(v.children[0]) != s.k->p.offset && + le32_to_cpu(v.children[1]) != s.k->p.offset) { + bch_err(trans->c, "snapshot parent %u missing pointer to child %llu", + id, s.k->p.offset); + return -EINVAL; + } + } + + for (i = 0; i < 2 && s.v->children[i]; i++) { + id = le32_to_cpu(s.v->children[i]); + + ret = lockrestart_do(trans, snapshot_lookup(trans, id, &v)); + if (ret == -ENOENT) + bch_err(trans->c, "snapshot node %llu has nonexistent child %u", + s.k->p.offset, id); + if (ret) + return ret; + + if (le32_to_cpu(v.parent) != s.k->p.offset) { + bch_err(trans->c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), s.k->p.offset); + return -EINVAL; + } + } + + return 0; +} + +int bch2_fs_snapshots_check(struct bch_fs *c) +{ + struct btree_trans trans; + struct 
btree_iter iter; + struct bkey_s_c k; + struct bch_snapshot s; + unsigned id; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + ret = bch2_snapshot_check(&trans, bkey_s_c_to_snapshot(k)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error %i checking snapshots", ret); + goto err; + } + + for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; +again_2: + id = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + ret = snapshot_lookup(&trans, id, &s); + + if (ret == -EINTR) { + k = bch2_btree_iter_peek(&iter); + goto again_2; + } else if (ret == -ENOENT) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", + k.k->p.offset, id); + else if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + return ret; +} + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ + genradix_free(&c->snapshots); +} + +int bch2_fs_snapshots_start(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + bool have_deleted = false; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) + break; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(c, "found wrong key type %u in snapshot node table", + k.k->type); + continue; + } + + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + have_deleted = true; + + ret = bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + goto err; + + ret = bch2_snapshots_set_equiv(&trans); + if (ret) + goto err; +err: + bch2_trans_exit(&trans); + + if (!ret && have_deleted) { + bch_info(c, "restarting deletion of dead snapshots"); + if (c->opts.fsck) { + bch2_delete_dead_snapshots_work(&c->snapshot_delete_work); + } else { + bch2_delete_dead_snapshots(c); + } + } + + return ret; +} + +/* + * Mark a snapshot as deleted, for future cleanup: + */ +static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_snapshot *s; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); + ret = -ENOENT; + goto err; + } + + /* already deleted? 
*/ + if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) + goto err; + + s = bch2_trans_kmalloc(trans, sizeof(*s)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto err; + + bkey_reassemble(&s->k_i, k); + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + ret = bch2_trans_update(trans, &iter, &s->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct bkey_s_c k; + struct bkey_s_c_snapshot s; + struct bkey_i_snapshot *parent; + u32 parent_id; + unsigned i; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); + ret = -ENOENT; + goto err; + } + + s = bkey_s_c_to_snapshot(k); + + BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); + parent_id = le32_to_cpu(s.v->parent); + + if (parent_id) { + bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, + POS(0, parent_id), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&p_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); + ret = -ENOENT; + goto err; + } + + parent = bch2_trans_kmalloc(trans, sizeof(*parent)); + ret = PTR_ERR_OR_ZERO(parent); + if (ret) + goto err; + + bkey_reassemble(&parent->k_i, k); + + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (i == 2) + bch_err(trans->c, "snapshot %u missing child pointer to %u", + parent_id, id); + else + parent->v.children[i] = 0; + + if (le32_to_cpu(parent->v.children[0]) < + le32_to_cpu(parent->v.children[1])) + swap(parent->v.children[0], + parent->v.children[1]); + + ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); + if (ret) + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS_MIN, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -ENOSPC; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_snapshot_init(&n->k_i); + n->k.p = iter.pos; + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.pad = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: + bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) + goto err; + + new_snapids[i] = iter.pos.offset; + } + + if (parent) { + bch2_btree_iter_set_pos(&iter, POS(0, parent)); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_snapshot) { + bch_err(trans->c, "snapshot %u not found", parent); + ret = -ENOENT; + 
goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, k); + + if (n->v.children[0] || n->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + n->v.children[0] = cpu_to_le32(new_snapids[0]); + n->v.children[1] = cpu_to_le32(new_snapids[1]); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + if (ret) + goto err; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int snapshot_id_add(struct snapshot_id_list *s, u32 id) +{ + BUG_ON(snapshot_list_has_id(s, id)); + + if (s->nr == s->size) { + size_t new_size = max(8U, s->size * 2); + void *n = krealloc(s->d, + new_size * sizeof(s->d[0]), + GFP_KERNEL); + if (!n) { + pr_err("error allocating snapshot ID list"); + return -ENOMEM; + } + + s->d = n; + s->size = new_size; + }; + + s->d[s->nr++] = id; + return 0; +} + +static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, + struct snapshot_id_list *deleted, + enum btree_id btree_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct snapshot_id_list equiv_seen = { 0 }; + struct bpos last_pos = POS_MIN; + int ret = 0; + + /* + * XXX: We should also delete whiteouts that no longer overwrite + * anything + */ + + bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + + while ((bch2_trans_begin(trans), + (k = bch2_btree_iter_peek(&iter)).k) && + !(ret = bkey_err(k))) { + u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; + + if (bkey_cmp(k.k->p, last_pos)) + equiv_seen.nr = 0; + last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(&equiv_seen, equiv)) { + if (btree_id == BTREE_ID_inodes && + bch2_btree_key_cache_flush(trans, btree_id, iter.pos)) + continue; + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + if (ret) + break; + } else { + ret = snapshot_id_add(&equiv_seen, equiv); + if (ret) + break; + } + + bch2_btree_iter_advance(&iter); + } + bch2_trans_iter_exit(trans, &iter); + + kfree(equiv_seen.d); + + return ret; +} + +static void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + struct snapshot_id_list deleted = { 0 }; + u32 i, id, children[2]; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + continue; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = snapshot_live(&trans, children[0]) ?: + snapshot_live(&trans, children[1]); + if (ret < 0) + break; + if (ret) + continue; + + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_set_deleted(&trans, iter.pos.offset)); + if (ret) { + bch_err(c, 
"error deleting snapshot %llu: %i", iter.pos.offset, ret); + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %i", ret); + goto err; + } + + ret = bch2_snapshots_set_equiv(&trans); + if (ret) + goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_id_add(&deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %i", ret); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_snapshots(id)) + continue; + + ret = bch2_snapshot_delete_keys_btree(&trans, &deleted, id); + if (ret) { + bch_err(c, "error deleting snapshot keys: %i", ret); + goto err; + } + } + + for (i = 0; i < deleted.nr; i++) { + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, deleted.d[i])); + if (ret) { + bch_err(c, "error deleting snapshot %u: %i", + deleted.d[i], ret); + goto err; + } + } +err: + kfree(deleted.d); + bch2_trans_exit(&trans); + percpu_ref_put(&c->writes); +} + +static void bch2_delete_dead_snapshots(struct bch_fs *c) +{ + if (unlikely(!percpu_ref_tryget(&c->writes))) + return; + + if (!queue_work(system_long_wq, &c->snapshot_delete_work)) + percpu_ref_put(&c->writes); +} + +static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + bch2_delete_dead_snapshots(trans->c); + return 0; +} + +/* Subvolumes: */ + +const char *bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0) + return "invalid pos"; + + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + return "invalid pos"; + + if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) + return "bad val size"; + + return NULL; +} + +void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + pr_buf(out, "root %llu snapshot id %u", + le64_to_cpu(s.v->inode), + le32_to_cpu(s.v->snapshot)); +} + +int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), + iter_flags); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -ENOENT; + + if (ret == -ENOENT && inconsistent_if_not_found) + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); + if (!ret) + *s = *bkey_s_c_to_subvolume(k).v; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, + struct bch_subvolume *subvol) +{ + struct bch_snapshot snap; + + return snapshot_lookup(trans, snapshot, &snap) ?: + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); +} + +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, + u32 *snapid) +{ + struct bch_subvolume s; + int ret; + + ret = bch2_subvolume_get(trans, subvol, true, + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES, + &s); + + *snapid = le32_to_cpu(s.snapshot); + return ret; +} + +/* + * Delete subvolume, mark snapshot ID as deleted, queue up snapshot + * deletion/cleanup: + */ +int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; + struct btree_trans_commit_hook *h; + struct bkey_i *delete; + u32 snapid; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); + ret = -EIO; + goto err; + } + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) + goto err; + + bkey_init(&delete->k); + delete->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, delete, 0); + if (ret) + goto err; + + ret = bch2_snapshot_node_set_deleted(trans, snapid); + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->fn = bch2_delete_dead_snapshots_hook; + bch2_trans_commit_hook(trans, h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); + struct snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; + memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) + break; + + bch2_evict_subvolume_inodes(c, &s); + + for (id = s.d; id < s.d + s.nr; id++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { + bch_err(c, "error %i deleting subvolume %u", ret, *id); + break; + } + } + + kfree(s.d); + } + + percpu_ref_put(&c->writes); +} + +struct subvolume_unlink_hook { + struct btree_trans_commit_hook h; + u32 subvol; +}; + +int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *_h) +{ + struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); + struct bch_fs *c = trans->c; + int ret = 0; + + mutex_lock(&c->snapshots_unlinked_lock); + if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) + ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (ret) + return ret; + + if (unlikely(!percpu_ref_tryget(&c->writes))) + return -EROFS; 
+ + if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + percpu_ref_put(&c->writes); + return 0; +} + +int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_subvolume *n; + struct subvolume_unlink_hook *h; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); + ret = -EIO; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, k); + SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + if (ret) + goto err; + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; + h->subvol = subvolid; + bch2_trans_commit_hook(trans, &h->h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 src_subvolid, + u32 *new_subvolid, + u32 *new_snapshotid, + bool ro) +{ + struct bch_fs *c = trans->c; + struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct bkey_i_subvolume *new_subvol = NULL; + struct bkey_i_subvolume *src_subvol = NULL; + struct bkey_s_c k; + u32 parent = 0, new_nodes[2], snapshot_subvols[2]; + int ret = 0; + + for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) + break; + + /* + * bch2_subvolume_delete() doesn't flush the btree key cache - + * ideally it would but that's tricky + */ + if (bkey_deleted(k.k) && + !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos)) + goto found_slot; + } + + if (!ret) + ret = -ENOSPC; + goto err; +found_slot: + snapshot_subvols[0] = dst_iter.pos.offset; + snapshot_subvols[1] = src_subvolid; + + if (src_subvolid) { + /* Creating a snapshot: */ + src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); + ret = PTR_ERR_OR_ZERO(src_subvol); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, + POS(0, src_subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch_err(c, "subvolume %u not found", src_subvolid); + ret = -ENOENT; + goto err; + } + + bkey_reassemble(&src_subvol->k_i, k); + parent = le32_to_cpu(src_subvol->v.snapshot); + } + + ret = bch2_snapshot_node_create(trans, parent, new_nodes, + snapshot_subvols, + src_subvolid ? 
2 : 1); + if (ret) + goto err; + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); + ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); + if (ret) + goto err; + } + + new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + goto err; + + bkey_subvolume_init(&new_subvol->k_i); + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + new_subvol->k.p = dst_iter.pos; + ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); + if (ret) + goto err; + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; +err: + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; +} + +int bch2_fs_subvolumes_init(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, + bch2_subvolume_wait_for_pagecache_and_delete); + mutex_init(&c->snapshots_unlinked_lock); + return 0; +} diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h new file mode 100644 index 0000000..4abe53d --- /dev/null +++ b/libbcachefs/subvolume.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H + +#include "subvolume_types.h" + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_snapshot (struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ +} + +int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, + struct bkey_s_c, unsigned); + +static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return genradix_ptr(&c->snapshots, U32_MAX - id); +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + +static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s = snapshot_t(c, id); + + return s->children[0] || s->children[1]; +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ + struct snapshot_t *s; + u32 parent = bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + + s = snapshot_t(c, bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) + return s->children[0]; + return 0; +} + +static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + while (id && id < ancestor) + id = bch2_snapshot_parent(c, id); + + return id == ancestor; +} + +struct snapshots_seen { + struct bpos pos; + size_t nr; + size_t size; + u32 *d; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ + kfree(s->d); + s->d = NULL; +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + if (s->nr == s->size) { + size_t new_size = max(s->size, (size_t) 128) * 2; + u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); + + if (!d) { + bch_err(c, "error reallocating snapshots_seen table (new size %zu)", + new_size); + return -ENOMEM; + } + + s->size = new_size; + s->d = d; + } + + s->d[s->nr++] = 
id; + return 0; +} + +static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +{ + unsigned i; + + for (i = 0; i < s->nr; i++) + if (id == s->d[i]) + return true; + return false; +} + +int bch2_fs_snapshots_check(struct bch_fs *); +void bch2_fs_snapshots_exit(struct bch_fs *); +int bch2_fs_snapshots_start(struct bch_fs *); + +const char *bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_subvolume (struct bkey_ops) { \ + .key_invalid = bch2_subvolume_invalid, \ + .val_to_text = bch2_subvolume_to_text, \ +} + +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); +int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); +int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + +/* only exported for tests: */ +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + +int bch2_subvolume_delete(struct btree_trans *, u32); +int bch2_subvolume_unlink(struct btree_trans *, u32); +int bch2_subvolume_create(struct btree_trans *, u64, u32, + u32 *, u32 *, bool); + +int bch2_fs_subvolumes_init(struct bch_fs *); + +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h new file mode 100644 index 0000000..9410b95 --- /dev/null +++ b/libbcachefs/subvolume_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H + +struct snapshot_id_list { + u32 nr; + u32 size; + u32 *d; +}; + +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 3903b73..49dafda 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -27,8 +27,8 @@ const char * const bch2_sb_fields[] = { NULL }; -static const char *bch2_sb_field_validate(struct bch_sb *, - struct bch_sb_field *); +static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, + struct printbuf *); struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, enum bch_sb_field_type type) @@ -202,22 +202,31 @@ static inline void __bch2_sb_layout_size_assert(void) BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); } -static const char *validate_sb_layout(struct bch_sb_layout *layout) +static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) { u64 offset, prev_offset, max_sectors; unsigned i; - if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) - return "Not a bcachefs superblock layout"; + if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { + pr_buf(out, "Not a bcachefs superblock layout"); + return -EINVAL; + } - if (layout->layout_type != 0) - return "Invalid superblock layout type"; + if (layout->layout_type != 0) { + pr_buf(out, "Invalid superblock layout type %u", + layout->layout_type); + return -EINVAL; + } - if (!layout->nr_superblocks) - return "Invalid superblock layout: no superblocks"; + if (!layout->nr_superblocks) { + pr_buf(out, "Invalid superblock layout: no superblocks"); + return -EINVAL; + } - if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) - return "Invalid superblock layout: too many superblocks"; + if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { + pr_buf(out, "Invalid superblock layout: too many superblocks"); + return -EINVAL; + } max_sectors = 1 << layout->sb_max_size_bits; @@ -226,126 +235,134 @@ static const char *validate_sb_layout(struct bch_sb_layout 
*layout) for (i = 1; i < layout->nr_superblocks; i++) { offset = le64_to_cpu(layout->sb_offset[i]); - if (offset < prev_offset + max_sectors) - return "Invalid superblock layout: superblocks overlap"; + if (offset < prev_offset + max_sectors) { + pr_buf(out, "Invalid superblock layout: superblocks overlap\n" + " (sb %u ends at %llu next starts at %llu", + i - 1, prev_offset + max_sectors, offset); + return -EINVAL; + } prev_offset = offset; } - return NULL; + return 0; } -const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; struct bch_sb_field_members *mi; - const char *err; u32 version, version_min; u16 block_size; + int ret; version = le16_to_cpu(sb->version); version_min = version >= bcachefs_metadata_version_new_versioning ? le16_to_cpu(sb->version_min) : version; - if (version >= bcachefs_metadata_version_max || - version_min < bcachefs_metadata_version_min) - return "Unsupported superblock version"; + if (version >= bcachefs_metadata_version_max) { + pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } + + if (version_min < bcachefs_metadata_version_min) { + pr_buf(out, "Unsupported superblock version %u (min %u, max %u)", + version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } - if (version_min > version) - return "Bad minimum version"; + if (version_min > version) { + pr_buf(out, "Bad minimum version %u, greater than version field %u", + version_min, version); + return -EINVAL; + } if (sb->features[1] || - (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) - return "Filesystem has incompatible features"; + (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { + pr_buf(out, "Filesystem has incompatible features"); + return -EINVAL; + } block_size = le16_to_cpu(sb->block_size); - if (!is_power_of_2(block_size) || - block_size > PAGE_SECTORS) - return "Bad block size"; + if (block_size > PAGE_SECTORS) { + pr_buf(out, "Block size too big (got %u, max %u)", + block_size, PAGE_SECTORS); + return -EINVAL; + } - if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) - return "Bad user UUID"; + if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { + pr_buf(out, "Bad user UUID (got zeroes)"); + return -EINVAL; + } - if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) - return "Bad internal UUID"; + if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { + pr_buf(out, "Bad intenal UUID (got zeroes)"); + return -EINVAL; + } if (!sb->nr_devices || - sb->nr_devices <= sb->dev_idx || - sb->nr_devices > BCH_SB_MEMBERS_MAX) - return "Bad number of member devices"; - - if (!BCH_SB_META_REPLICAS_WANT(sb) || - BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_META_REPLICAS_REQ(sb) || - BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) - return "Invalid number of metadata replicas"; - - if (!BCH_SB_DATA_REPLICAS_WANT(sb) || - BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX) - return "Invalid number of data replicas"; - - if (!BCH_SB_DATA_REPLICAS_REQ(sb) || - BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX) - return "Invalid number of data replicas"; - - if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) - return "Invalid metadata checksum type"; - - if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) - return "Invalid metadata checksum type"; - 
- if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) - return "Invalid compression type"; - - if (!BCH_SB_BTREE_NODE_SIZE(sb)) - return "Btree node size not set"; - - if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) - return "Btree node size not a power of two"; + sb->nr_devices > BCH_SB_MEMBERS_MAX) { + pr_buf(out, "Bad number of member devices %u (max %u)", + sb->nr_devices, BCH_SB_MEMBERS_MAX); + return -EINVAL; + } - if (BCH_SB_GC_RESERVE(sb) < 5) - return "gc reserve percentage too small"; + if (sb->dev_idx >= sb->nr_devices) { + pr_buf(out, "Bad dev_idx (got %u, nr_devices %u)", + sb->dev_idx, sb->nr_devices); + return -EINVAL; + } if (!sb->time_precision || - le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) - return "invalid time precision"; + le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { + pr_buf(out, "Invalid time precision: %u (min 1, max %lu)", + le32_to_cpu(sb->time_precision), NSEC_PER_SEC); + return -EINVAL; + } /* validate layout */ - err = validate_sb_layout(&sb->layout); - if (err) - return err; + ret = validate_sb_layout(&sb->layout, out); + if (ret) + return ret; vstruct_for_each(sb, f) { - if (!f->u64s) - return "Invalid superblock: invalid optional field"; + if (!f->u64s) { + pr_buf(out, "Invalid superblock: optional with size 0 (type %u)", + le32_to_cpu(f->type)); + return -EINVAL; + } - if (vstruct_next(f) > vstruct_last(sb)) - return "Invalid superblock: invalid optional field"; + if (vstruct_next(f) > vstruct_last(sb)) { + pr_buf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", + le32_to_cpu(f->type)); + return -EINVAL; + } } /* members must be validated first: */ mi = bch2_sb_get_members(sb); - if (!mi) - return "Invalid superblock: member info area missing"; + if (!mi) { + pr_buf(out, "Invalid superblock: member info area missing"); + return -EINVAL; + } - err = bch2_sb_field_validate(sb, &mi->field); - if (err) - return err; + ret = bch2_sb_field_validate(sb, &mi->field, out); + if (ret) + return ret; vstruct_for_each(sb, f) { if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) continue; - err = bch2_sb_field_validate(sb, f); - if (err) - return err; + ret = bch2_sb_field_validate(sb, f, out); + if (ret) + return ret; } - return NULL; + return 0; } /* device open: */ @@ -366,7 +383,6 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); - c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; @@ -439,10 +455,8 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) __copy_super(&c->disk_sb, src); - if (BCH_SB_HAS_ERRORS(c->disk_sb.sb)) - set_bit(BCH_FS_ERROR, &c->flags); - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) - set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + if (BCH_SB_INITIALIZED(c->disk_sb.sb)) + set_bit(BCH_FS_INITIALIZED, &c->flags); ret = bch2_sb_replicas_to_cpu_replicas(c); if (ret) @@ -477,10 +491,12 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) /* read superblock: */ -static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) +static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) { struct bch_csum csum; + u32 version, version_min; size_t bytes; + int ret; reread: bio_reset(sb->bio); bio_set_dev(sb->bio, sb->bdev); @@ -488,40 +504,65 @@ reread: bio_set_op_attrs(sb->bio, REQ_OP_READ, 
REQ_SYNC|REQ_META); bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); - if (submit_bio_wait(sb->bio)) - return "IO error"; + ret = submit_bio_wait(sb->bio); + if (ret) { + pr_buf(err, "IO error: %i", ret); + return ret; + } - if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) - return "Not a bcachefs superblock"; + if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { + pr_buf(err, "Not a bcachefs superblock"); + return -EINVAL; + } - if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || - le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) - return "Unsupported superblock version"; + version = le16_to_cpu(sb->sb->version); + version_min = version >= bcachefs_metadata_version_new_versioning + ? le16_to_cpu(sb->sb->version_min) + : version; + + if (version >= bcachefs_metadata_version_max) { + pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } + + if (version_min < bcachefs_metadata_version_min) { + pr_buf(err, "Unsupported superblock version %u (min %u, max %u)", + version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); + return -EINVAL; + } bytes = vstruct_bytes(sb->sb); - if (bytes > 512 << sb->sb->layout.sb_max_size_bits) - return "Bad superblock: too big"; + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { + pr_buf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", + bytes, 512UL << sb->sb->layout.sb_max_size_bits); + return -EINVAL; + } if (bytes > sb->buffer_size) { if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) - return "cannot allocate memory"; + return -ENOMEM; goto reread; } - if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) - return "unknown csum type"; + if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { + pr_buf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); + return -EINVAL; + } /* XXX: verify MACs */ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), null_nonce(), sb->sb); - if (bch2_crc_cmp(csum, sb->sb->csum)) - return "bad checksum reading superblock"; + if (bch2_crc_cmp(csum, sb->sb->csum)) { + pr_buf(err, "bad checksum"); + return -EINVAL; + } sb->seq = le64_to_cpu(sb->sb->seq); - return NULL; + return 0; } int bch2_read_super(const char *path, struct bch_opts *opts, @@ -529,10 +570,16 @@ int bch2_read_super(const char *path, struct bch_opts *opts, { u64 offset = opt_get(*opts, sb); struct bch_sb_layout layout; - const char *err; + char *_err; + struct printbuf err; __le64 *i; int ret; + _err = kmalloc(4096, GFP_KERNEL); + if (!_err) + return -ENOMEM; + err = _PBUF(_err, 4096); + pr_verbose_init(*opts, ""); memset(sb, 0, sizeof(*sb)); @@ -561,25 +608,28 @@ int bch2_read_super(const char *path, struct bch_opts *opts, goto out; } - err = "cannot allocate memory"; ret = bch2_sb_realloc(sb, 0); - if (ret) + if (ret) { + pr_buf(&err, "error allocating memory for superblock"); goto err; + } - ret = -EFAULT; - err = "dynamic fault"; - if (bch2_fs_init_fault("read_super")) + if (bch2_fs_init_fault("read_super")) { + pr_buf(&err, "dynamic fault"); + ret = -EFAULT; goto err; + } - ret = -EINVAL; - err = read_one_super(sb, offset); - if (!err) + ret = read_one_super(sb, offset, &err); + if (!ret) goto got_super; if (opt_defined(*opts, sb)) goto err; - pr_err("error reading default superblock: %s", err); + printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", + path, _err); + err = _PBUF(_err, 4096); /* * Error reading primary superblock - read location of backup @@ -595,13 +645,15 @@ int 
bch2_read_super(const char *path, struct bch_opts *opts, */ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); - err = "IO error"; - if (submit_bio_wait(sb->bio)) + ret = submit_bio_wait(sb->bio); + if (ret) { + pr_buf(&err, "IO error: %i", ret); goto err; + } memcpy(&layout, sb->sb, sizeof(layout)); - err = validate_sb_layout(&layout); - if (err) + ret = validate_sb_layout(&layout, &err); + if (ret) goto err; for (i = layout.sb_offset; @@ -611,29 +663,41 @@ int bch2_read_super(const char *path, struct bch_opts *opts, if (offset == opt_get(*opts, sb)) continue; - err = read_one_super(sb, offset); - if (!err) + ret = read_one_super(sb, offset, &err); + if (!ret) goto got_super; } - ret = -EINVAL; goto err; got_super: - err = "Superblock block size smaller than device block size"; - ret = -EINVAL; if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev)) + bdev_logical_block_size(sb->bdev)) { + pr_buf(&err, "block size (%u) smaller than device block size (%u)", + le16_to_cpu(sb->sb->block_size) << 9, + bdev_logical_block_size(sb->bdev)); + ret = -EINVAL; goto err; + } ret = 0; sb->have_layout = true; + + ret = bch2_sb_validate(sb, &err); + if (ret) { + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", + path, _err); + goto err_no_print; + } out: pr_verbose_init(*opts, "ret %i", ret); + kfree(_err); return ret; err: + printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", + path, _err); +err_no_print: bch2_free_super(sb); - pr_err("error reading superblock: %s", err); goto out; } @@ -706,7 +770,6 @@ int bch2_write_super(struct bch_fs *c) struct closure *cl = &c->sb_write; struct bch_dev *ca; unsigned i, sb = 0, nr_wrote; - const char *err; struct bch_devs_mask sb_written; bool wrote, can_mount_without_written, can_mount_with_written; unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; @@ -733,10 +796,19 @@ int bch2_write_super(struct bch_fs *c) bch2_sb_from_fs(c, ca); for_each_online_member(ca, c, i) { - err = bch2_sb_validate(&ca->disk_sb); - if (err) { - bch2_fs_inconsistent(c, "sb invalid before write: %s", err); - ret = -1; + struct printbuf buf = { NULL, NULL }; + + ret = bch2_sb_validate(&ca->disk_sb, &buf); + if (ret) { + char *_buf = kmalloc(4096, GFP_NOFS); + if (_buf) { + buf = _PBUF(_buf, 4096); + bch2_sb_validate(&ca->disk_sb, &buf); + } + + bch2_fs_inconsistent(c, "sb invalid before write: %s", _buf); + kfree(_buf); + percpu_ref_put(&ca->io_ref); goto out; } } @@ -754,11 +826,24 @@ int bch2_write_super(struct bch_fs *c) closure_sync(cl); for_each_online_member(ca, c, i) { - if (!ca->sb_write_error && - ca->disk_sb.seq != - le64_to_cpu(ca->sb_read_scratch->seq)) { + if (ca->sb_write_error) + continue; + + if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { + bch2_fs_fatal_error(c, + "Superblock write was silently dropped! 
(seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -EROFS; + goto out; + } + + if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { bch2_fs_fatal_error(c, - "Superblock modified by another process"); + "Superblock modified by another process (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); ret = -EROFS; goto out; @@ -807,7 +892,8 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written || (can_mount_without_written && !can_mount_with_written), c, - "Unable to write superblock to sufficient devices")) + "Unable to write superblock to sufficient devices (from %ps)", + (void *) _RET_IP_)) ret = -1; out: /* Make new options visible after they're persistent: */ @@ -835,54 +921,57 @@ static int u64_cmp(const void *_l, const void *_r) return l < r ? -1 : l > r ? 1 : 0; } -static const char *bch2_sb_validate_journal(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_journal(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_journal *journal = field_to_type(f, journal); struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - const char *err; + int ret = -EINVAL; unsigned nr; unsigned i; u64 *b; - journal = bch2_sb_get_journal(sb); - if (!journal) - return NULL; - nr = bch2_nr_journal_buckets(journal); if (!nr) - return NULL; + return 0; b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); if (!b) - return "cannot allocate memory"; + return -ENOMEM; for (i = 0; i < nr; i++) b[i] = le64_to_cpu(journal->buckets[i]); sort(b, nr, sizeof(u64), u64_cmp, NULL); - err = "journal bucket at sector 0"; - if (!b[0]) + if (!b[0]) { + pr_buf(err, "journal bucket at sector 0"); goto err; + } - err = "journal bucket before first bucket"; - if (m && b[0] < le16_to_cpu(m->first_bucket)) + if (b[0] < le16_to_cpu(m->first_bucket)) { + pr_buf(err, "journal bucket %llu before first bucket %u", + b[0], le16_to_cpu(m->first_bucket)); goto err; + } - err = "journal bucket past end of device"; - if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) + if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1], le64_to_cpu(m->nbuckets)); goto err; + } - err = "duplicate journal buckets"; for (i = 0; i + 1 < nr; i++) - if (b[i] == b[i + 1]) + if (b[i] == b[i + 1]) { + pr_buf(err, "duplicate journal buckets %llu", b[i]); goto err; + } - err = NULL; + ret = 0; err: kfree(b); - return err; + return ret; } static const struct bch_sb_field_ops bch_sb_field_ops_journal = { @@ -891,39 +980,54 @@ static const struct bch_sb_field_ops bch_sb_field_ops_journal = { /* BCH_SB_FIELD_members: */ -static const char *bch2_sb_validate_members(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_members(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_members *mi = field_to_type(f, members); - struct bch_member *m; + unsigned i; if ((void *) (mi->members + sb->nr_devices) > - vstruct_end(&mi->field)) - return "Invalid superblock: bad member info"; + vstruct_end(&mi->field)) { + pr_buf(err, "too many devices for section size"); + return -EINVAL; + } + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; - for (m = mi->members; - m < mi->members + sb->nr_devices; - m++) { if (!bch2_member_exists(m)) continue; - if (le64_to_cpu(m->nbuckets) > LONG_MAX) - return 
"Too many buckets"; + if (le64_to_cpu(m->nbuckets) > LONG_MAX) { + pr_buf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m->nbuckets), LONG_MAX); + return -EINVAL; + } if (le64_to_cpu(m->nbuckets) - - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) - return "Not enough buckets"; + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { + pr_buf(err, "device %u: not enough buckets (got %llu, max %u)", + i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); + return -EINVAL; + } if (le16_to_cpu(m->bucket_size) < - le16_to_cpu(sb->block_size)) - return "bucket size smaller than block size"; + le16_to_cpu(sb->block_size)) { + pr_buf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); + return -EINVAL; + } if (le16_to_cpu(m->bucket_size) < - BCH_SB_BTREE_NODE_SIZE(sb)) - return "bucket size smaller than btree node size"; + BCH_SB_BTREE_NODE_SIZE(sb)) { + pr_buf(err, "device %u: bucket size %u smaller than btree node size %llu", + i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -EINVAL; + } } - return NULL; + return 0; } static const struct bch_sb_field_ops bch_sb_field_ops_members = { @@ -932,18 +1036,24 @@ static const struct bch_sb_field_ops bch_sb_field_ops_members = { /* BCH_SB_FIELD_crypt: */ -static const char *bch2_sb_validate_crypt(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_crypt(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); - if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) - return "invalid field crypt: wrong size"; + if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { + pr_buf(err, "wrong size (got %llu should be %zu)", + vstruct_bytes(&crypt->field), sizeof(*crypt)); + return -EINVAL; + } - if (BCH_CRYPT_KDF_TYPE(crypt)) - return "invalid field crypt: bad kdf type"; + if (BCH_CRYPT_KDF_TYPE(crypt)) { + pr_buf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + return -EINVAL; + } - return NULL; + return 0; } static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { @@ -1028,7 +1138,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_INODES; + u->entry.btree_id = BCH_FS_USAGE_inodes; u->v = cpu_to_le64(c->usage_base->nr_inodes); } @@ -1038,7 +1148,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_KEY_VERSION; + u->entry.btree_id = BCH_FS_USAGE_key_version; u->v = cpu_to_le64(atomic64_read(&c->key_version)); } @@ -1048,7 +1158,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry_usage, entry); u->entry.type = BCH_JSET_ENTRY_usage; - u->entry.btree_id = FS_USAGE_RESERVED; + u->entry.btree_id = BCH_FS_USAGE_reserved; u->entry.level = i; u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); } @@ -1152,15 +1262,19 @@ out: mutex_unlock(&c->sb_lock); } -static const char *bch2_sb_validate_clean(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_validate_clean(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) { struct bch_sb_field_clean *clean = field_to_type(f, clean); - if (vstruct_bytes(&clean->field) < sizeof(*clean)) - return "invalid field crypt: wrong size"; + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { + pr_buf(err, "wrong size (got 
%llu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -EINVAL; + } - return NULL; + return 0; } static const struct bch_sb_field_ops bch_sb_field_ops_clean = { @@ -1174,14 +1288,26 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #undef x }; -static const char *bch2_sb_field_validate(struct bch_sb *sb, - struct bch_sb_field *f) +static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *orig_err) { unsigned type = le32_to_cpu(f->type); + struct printbuf err = *orig_err; + int ret; - return type < BCH_SB_FIELD_NR - ? bch2_sb_field_ops[type]->validate(sb, f) - : NULL; + if (type >= BCH_SB_FIELD_NR) + return 0; + + pr_buf(&err, "Invalid superblock section %s: ", bch2_sb_fields[type]); + + ret = bch2_sb_field_ops[type]->validate(sb, f, &err); + if (ret) { + pr_buf(&err, "\n"); + bch2_sb_field_to_text(&err, sb, f); + *orig_err = err; + } + + return ret; } void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index b64ac2f..3b425be 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -38,9 +38,8 @@ BCH_SB_FIELDS() extern const char * const bch2_sb_fields[]; struct bch_sb_field_ops { - const char * (*validate)(struct bch_sb *, struct bch_sb_field *); - void (*to_text)(struct printbuf *, struct bch_sb *, - struct bch_sb_field *); + int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); }; static inline __le64 bch2_sb_magic(struct bch_fs *c) @@ -66,8 +65,6 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -const char *bch2_sb_validate(struct bch_sb_handle *); - int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); void __bch2_check_set_feature(struct bch_fs *, unsigned); @@ -110,7 +107,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .bucket_size = le16_to_cpu(mi->bucket_size), .group = BCH_MEMBER_GROUP(mi), .state = BCH_MEMBER_STATE(mi), - .replacement = BCH_MEMBER_REPLACEMENT(mi), .discard = BCH_MEMBER_DISCARD(mi), .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), .durability = BCH_MEMBER_DURABILITY(mi) diff --git a/libbcachefs/super.c b/libbcachefs/super.c index ce8e5d4..b36e621 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -16,6 +16,7 @@ #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" +#include "buckets_waiting_for_journal.h" #include "chardev.h" #include "checksum.h" #include "clock.h" @@ -39,6 +40,7 @@ #include "rebalance.h" #include "recovery.h" #include "replicas.h" +#include "subvolume.h" #include "super.h" #include "super-io.h" #include "sysfs.h" @@ -404,13 +406,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - /* - * We need to write out a journal entry before we start doing btree - * updates, to ensure that on unclean shutdown new journal blacklist - * entries are created: - */ - bch2_journal_meta(&c->journal); - clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); for_each_rw_member(ca, c, i) @@ -468,11 +463,13 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); bch2_fs_ec_exit(c); 
bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); + bch2_fs_buckets_waiting_for_journal_exit(c); bch2_fs_btree_interior_update_exit(c); bch2_fs_btree_iter_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); @@ -486,12 +483,12 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_journal_entries_free(&c->journal_entries); percpu_free_rwsem(&c->mark_lock); - if (c->btree_iters_bufs) + if (c->btree_paths_bufs) for_each_possible_cpu(cpu) - kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); + kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); free_percpu(c->online_reserved); - free_percpu(c->btree_iters_bufs); + free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); @@ -593,48 +590,53 @@ void bch2_fs_stop(struct bch_fs *c) bch2_fs_free(c); } -static const char *bch2_fs_online(struct bch_fs *c) +static int bch2_fs_online(struct bch_fs *c) { struct bch_dev *ca; - const char *err = NULL; unsigned i; - int ret; + int ret = 0; lockdep_assert_held(&bch_fs_list_lock); - if (!list_empty(&c->list)) - return NULL; - - if (__bch2_uuid_to_fs(c->sb.uuid)) - return "filesystem UUID already open"; + if (__bch2_uuid_to_fs(c->sb.uuid)) { + bch_err(c, "filesystem UUID already open"); + return -EINVAL; + } ret = bch2_fs_chardev_init(c); - if (ret) - return "error creating character device"; + if (ret) { + bch_err(c, "error creating character device"); + return ret; + } bch2_fs_debug_init(c); - if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || - kobject_add(&c->internal, &c->kobj, "internal") || - kobject_add(&c->opts_dir, &c->kobj, "options") || - kobject_add(&c->time_stats, &c->kobj, "time_stats") || - bch2_opts_create_sysfs_files(&c->opts_dir)) - return "error creating sysfs objects"; + ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + kobject_add(&c->internal, &c->kobj, "internal") ?: + kobject_add(&c->opts_dir, &c->kobj, "options") ?: + kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: + bch2_opts_create_sysfs_files(&c->opts_dir); + if (ret) { + bch_err(c, "error creating sysfs objects"); + return ret; + } down_write(&c->state_lock); - err = "error creating sysfs objects"; - for_each_member_device(ca, c, i) - if (bch2_dev_sysfs_online(c, ca)) { + for_each_member_device(ca, c, i) { + ret = bch2_dev_sysfs_online(c, ca); + if (ret) { + bch_err(c, "error creating sysfs objects"); percpu_ref_put(&ca->ref); goto err; } + } + BUG_ON(!list_empty(&c->list)); list_add(&c->list, &bch_fs_list); - err = NULL; err: up_write(&c->state_lock); - return err; + return ret; } static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) @@ -642,13 +644,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) struct bch_sb_field_members *mi; struct bch_fs *c; unsigned i, iter_size; - const char *err; + int ret = 0; pr_verbose_init(opts, ""); c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); - if (!c) + if (!c) { + c = ERR_PTR(-ENOMEM); goto out; + } __module_get(THIS_MODULE); @@ -670,6 +674,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); init_rwsem(&c->gc_lock); + mutex_init(&c->gc_gens_lock); for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); @@ -686,6 +691,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->usage_scratch_lock); mutex_init(&c->bio_bounce_pages_lock); + mutex_init(&c->snapshot_table_lock); 
spin_lock_init(&c->btree_write_error_lock); @@ -704,6 +710,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->ec_stripe_new_list); mutex_init(&c->ec_stripe_new_lock); + INIT_LIST_HEAD(&c->data_progress_list); + mutex_init(&c->data_progress_lock); + spin_lock_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); @@ -716,38 +725,59 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->rebalance.enabled = 1; c->promote_whole_extents = true; - c->journal.write_time = &c->times[BCH_TIME_journal_write]; - c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; - c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; - c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; + c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; + c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; + c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; bch2_fs_btree_cache_init_early(&c->btree_cache); mutex_init(&c->sectors_available_lock); - if (percpu_init_rwsem(&c->mark_lock)) + ret = percpu_init_rwsem(&c->mark_lock); + if (ret) goto err; mutex_lock(&c->sb_lock); + ret = bch2_sb_to_fs(c, sb); + mutex_unlock(&c->sb_lock); - if (bch2_sb_to_fs(c, sb)) { - mutex_unlock(&c->sb_lock); + if (ret) goto err; - } - mutex_unlock(&c->sb_lock); + uuid_unparse_lower(c->sb.user_uuid.b, c->name); + + /* Compat: */ + if (sb->version <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); + if (sb->version <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); c->opts = bch2_opts_default; - bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); + ret = bch2_opts_from_sb(&c->opts, sb); + if (ret) + goto err; + bch2_opts_apply(&c->opts, opts); - c->block_bits = ilog2(c->opts.block_size); + /* key cache currently disabled for inodes, because of snapshots: */ + c->opts.inodes_use_key_cache = 0; + + c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; + if (c->opts.inodes_use_key_cache) + c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; + + c->block_bits = ilog2(block_sectors(c)); c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); - if (bch2_fs_init_fault("fs_alloc")) + if (bch2_fs_init_fault("fs_alloc")) { + bch_err(c, "fs_alloc fault injected"); + ret = -EFAULT; goto err; + } iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * @@ -771,33 +801,45 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || - !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || + !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL)) || - bch2_io_clock_init(&c->io_clock[READ]) || - bch2_io_clock_init(&c->io_clock[WRITE]) || - bch2_fs_journal_init(&c->journal) || - bch2_fs_replicas_init(c) || - bch2_fs_btree_cache_init(c) || - bch2_fs_btree_key_cache_init(&c->btree_key_cache) || - 
bch2_fs_btree_iter_init(c) || - bch2_fs_btree_interior_update_init(c) || - bch2_fs_io_init(c) || - bch2_fs_encryption_init(c) || - bch2_fs_compress_init(c) || - bch2_fs_ec_init(c) || - bch2_fs_fsio_init(c)) + sizeof(u64), GFP_KERNEL))) { + ret = -ENOMEM; goto err; + } + + ret = bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: + bch2_fs_journal_init(&c->journal) ?: + bch2_fs_replicas_init(c) ?: + bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_buckets_waiting_for_journal_init(c) ?: + bch2_fs_subvolumes_init(c) ?: + bch2_fs_io_init(c) ?: + bch2_fs_encryption_init(c) ?: + bch2_fs_compress_init(c) ?: + bch2_fs_ec_init(c) ?: + bch2_fs_fsio_init(c); + if (ret) + goto err; + + if (c->opts.nochanges) + set_bit(JOURNAL_NOCHANGES, &c->journal.flags); mi = bch2_sb_get_members(c->disk_sb.sb); for (i = 0; i < c->sb.nr_devices; i++) if (bch2_dev_exists(c->disk_sb.sb, mi, i) && - bch2_dev_alloc(c, i)) + bch2_dev_alloc(c, i)) { + ret = -EEXIST; goto err; + } bch2_journal_entry_res_resize(&c->journal, &c->btree_root_journal_res, @@ -808,18 +850,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); mutex_lock(&bch_fs_list_lock); - err = bch2_fs_online(c); + ret = bch2_fs_online(c); mutex_unlock(&bch_fs_list_lock); - if (err) { - bch_err(c, "bch2_fs_online() error: %s", err); + + if (ret) goto err; - } out: - pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM); + pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err: bch2_fs_free(c); - c = NULL; + c = ERR_PTR(ret); goto out; } @@ -842,7 +883,7 @@ static void print_mount_opts(struct bch_fs *c) const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); - if (!(opt->mode & OPT_MOUNT)) + if (!(opt->flags & OPT_MOUNT)) continue; if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) @@ -859,7 +900,6 @@ static void print_mount_opts(struct bch_fs *c) int bch2_fs_start(struct bch_fs *c) { - const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); @@ -895,10 +935,11 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - err = "dynamic fault"; ret = -EINVAL; - if (bch2_fs_init_fault("fs_start")) + if (bch2_fs_init_fault("fs_start")) { + bch_err(c, "fs_start fault injected"); goto err; + } set_bit(BCH_FS_STARTED, &c->flags); @@ -919,7 +960,6 @@ int bch2_fs_start(struct bch_fs *c) if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { - err = "error going read write"; ret = !test_bit(BCH_FS_RW, &c->flags) ?
bch2_fs_read_write(c) : bch2_fs_read_write_late(c); @@ -937,25 +977,22 @@ err: case BCH_FSCK_ERRORS_NOT_FIXED: bch_err(c, "filesystem contains errors: please report this to the developers"); pr_cont("mount with -o fix_errors to repair\n"); - err = "fsck error"; break; case BCH_FSCK_REPAIR_UNIMPLEMENTED: bch_err(c, "filesystem contains errors: please report this to the developers"); pr_cont("repair unimplemented: inform the developers so that it can be added\n"); - err = "fsck error"; break; case BCH_FSCK_REPAIR_IMPOSSIBLE: bch_err(c, "filesystem contains errors, but repair impossible"); - err = "fsck error"; break; case BCH_FSCK_UNKNOWN_VERSION: - err = "unknown metadata version";; + bch_err(c, "unknown metadata version"); break; case -ENOMEM: - err = "cannot allocate memory"; + bch_err(c, "cannot allocate memory"); break; case -EIO: - err = "IO error"; + bch_err(c, "IO error"); break; } @@ -972,7 +1009,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) if (!sb_mi) return "Invalid superblock: member info area missing"; - if (le16_to_cpu(sb->block_size) != c->opts.block_size) + if (le16_to_cpu(sb->block_size) != block_sectors(c)) return "mismatched block size"; if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < @@ -1230,6 +1267,8 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ca->disk_sb.bdev->bd_holder = ca; memset(sb, 0, sizeof(*sb)); + ca->dev = ca->disk_sb.bdev->bd_dev; + percpu_ref_reinit(&ca->io_ref); return 0; @@ -1375,7 +1414,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) bch2_copygc_start(c); } -static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); @@ -1384,10 +1423,7 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - if (bch2_dev_allocator_start(ca)) - return "error starting allocator thread"; - - return NULL; + return bch2_dev_allocator_start(ca); } int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, @@ -1413,9 +1449,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (new_state == BCH_MEMBER_STATE_rw && - __bch2_dev_read_write(c, ca)) - ret = -ENOMEM; + if (new_state == BCH_MEMBER_STATE_rw) + ret = __bch2_dev_read_write(c, ca); rebalance_wakeup(c); @@ -1445,20 +1480,23 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < ca->mi.nbuckets; i++) { - ret = bch2_btree_key_cache_flush(&trans, - BTREE_ID_alloc, POS(ca->dev_idx, i)); + ret = lockrestart_do(&trans, + bch2_btree_key_cache_flush(&trans, + BTREE_ID_alloc, POS(ca->dev_idx, i))); if (ret) break; } bch2_trans_exit(&trans); - if (ret) + if (ret) { + bch_err(c, "error %i removing dev alloc info", ret); return ret; + } return bch2_btree_delete_range(c, BTREE_ID_alloc, POS(ca->dev_idx, 0), POS(ca->dev_idx + 1, 0), - NULL); + 0, NULL); } int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -1576,66 +1614,65 @@ int bch2_dev_add(struct bch_fs *c, const char *path) struct bch_sb_field_members *mi; struct bch_member dev_mi; unsigned dev_idx, nr_devices, u64s; + char *_errbuf; + struct printbuf errbuf; int ret; - ret = bch2_read_super(path, &opts, &sb); - if (ret) - return ret; + _errbuf = kmalloc(4096, GFP_KERNEL); + if (!_errbuf) + return -ENOMEM; - 
err = bch2_sb_validate(&sb); - if (err) - return -EINVAL; + errbuf = _PBUF(_errbuf, 4096); + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + bch_err(c, "device add error: error reading super: %i", ret); + goto err; + } dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; err = bch2_dev_may_add(sb.sb, c); - if (err) - return -EINVAL; + if (err) { + bch_err(c, "device add error: %s", err); + ret = -EINVAL; + goto err; + } ca = __bch2_dev_alloc(c, &dev_mi); if (!ca) { bch2_free_super(&sb); - return -ENOMEM; + ret = -ENOMEM; + goto err; } ret = __bch2_dev_attach_bdev(ca, &sb); if (ret) { bch2_dev_free(ca); - return ret; + goto err; } - /* - * We want to allocate journal on the new device before adding the new - * device to the filesystem because allocating after we attach requires - * spinning up the allocator thread, and the allocator thread requires - * doing btree writes, which if the existing devices are RO isn't going - * to work - * - * So we have to mark where the superblocks are, but marking allocated - * data normally updates the filesystem usage too, so we have to mark, - * allocate the journal, reset all the marks, then remark after we - * attach... - */ - bch2_mark_dev_superblock(NULL, ca, 0); - - err = "journal alloc failed"; ret = bch2_dev_journal_alloc(ca); - if (ret) + if (ret) { + bch_err(c, "device add error: journal alloc failed"); goto err; + } down_write(&c->state_lock); mutex_lock(&c->sb_lock); - err = "insufficient space in new superblock"; ret = bch2_sb_from_fs(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: new device superblock too small"); goto err_unlock; + } mi = bch2_sb_get_members(ca->disk_sb.sb); if (!bch2_sb_resize_members(&ca->disk_sb, le32_to_cpu(mi->field.u64s) + sizeof(dev_mi) / sizeof(u64))) { + bch_err(c, "device add error: new device superblock too small"); ret = -ENOSPC; goto err_unlock; } @@ -1648,7 +1685,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) goto have_slot; no_slot: - err = "no slots available in superblock"; + bch_err(c, "device add error: already have maximum number of devices"); ret = -ENOSPC; goto err_unlock; @@ -1657,12 +1694,12 @@ have_slot: u64s = (sizeof(struct bch_sb_field_members) + sizeof(struct bch_member) * nr_devices) / sizeof(u64); - err = "no space in superblock for member info"; - ret = -ENOSPC; - mi = bch2_sb_resize_members(&c->disk_sb, u64s); - if (!mi) + if (!mi) { + bch_err(c, "device add error: no room in superblock for member info"); + ret = -ENOSPC; goto err_unlock; + } /* success: */ @@ -1678,15 +1715,20 @@ have_slot: bch2_dev_usage_journal_reserve(c); - err = "error marking superblock"; ret = bch2_trans_mark_dev_sb(c, ca); - if (ret) + if (ret) { + bch_err(c, "device add error: error marking new superblock: %i", ret); goto err_late; + } + + ca->new_fs_bucket_idx = 0; if (ca->mi.state == BCH_MEMBER_STATE_rw) { - err = __bch2_dev_read_write(c, ca); - if (err) + ret = __bch2_dev_read_write(c, ca); + if (ret) { + bch_err(c, "device add error: error going RW on new device: %i", ret); goto err_late; + } } up_write(&c->state_lock); @@ -1699,12 +1741,12 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - bch_err(c, "Unable to add device: %s", err); + kfree(_errbuf); return ret; err_late: up_write(&c->state_lock); - bch_err(c, "Error going rw after adding device: %s", err); - return -EINVAL; + ca = NULL; + goto err; } /* Hot add existing device to running filesystem: */ @@ -1729,24 +1771,27 @@ int bch2_dev_online(struct 
bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); - if (err) + if (err) { + bch_err(c, "error bringing %s online: %s", path, err); goto err; + } - if (bch2_dev_attach_bdev(c, &sb)) { - err = "bch2_dev_attach_bdev() error"; + ret = bch2_dev_attach_bdev(c, &sb); + if (ret) goto err; - } ca = bch_dev_locked(c, dev_idx); - if (bch2_trans_mark_dev_sb(c, ca)) { - err = "bch2_trans_mark_dev_sb() error"; + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + bch_err(c, "error bringing %s online: error %i from bch2_trans_mark_dev_sb", + path, ret); goto err; } if (ca->mi.state == BCH_MEMBER_STATE_rw) { - err = __bch2_dev_read_write(c, ca); - if (err) + ret = __bch2_dev_read_write(c, ca); + if (ret) goto err; } @@ -1764,7 +1809,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path) err: up_write(&c->state_lock); bch2_free_super(&sb); - bch_err(c, "error bringing %s online: %s", path, err); return -EINVAL; } @@ -1836,20 +1880,14 @@ err: } /* return with ref on ca->ref: */ -struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) +struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { struct bch_dev *ca; - dev_t dev; unsigned i; - int ret; - - ret = lookup_bdev(path, &dev); - if (ret) - return ERR_PTR(ret); rcu_read_lock(); for_each_member_device_rcu(ca, c, i, NULL) - if (ca->disk_sb.bdev->bd_dev == dev) + if (!strcmp(name, ca->name)) goto found; ca = ERR_PTR(-ENOENT); found: @@ -1868,32 +1906,39 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_sb_field_members *mi; unsigned i, best_sb = 0; const char *err; - int ret = -ENOMEM; + char *_errbuf = NULL; + struct printbuf errbuf; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); pr_verbose_init(opts, ""); if (!nr_devices) { - c = ERR_PTR(-EINVAL); - goto out2; + ret = -EINVAL; + goto err; } - if (!try_module_get(THIS_MODULE)) { - c = ERR_PTR(-ENODEV); - goto out2; + _errbuf = kmalloc(4096, GFP_KERNEL); + if (!_errbuf) { + ret = -ENOMEM; + goto err; } + errbuf = _PBUF(_errbuf, 4096); + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); - if (!sb) + if (!sb) { + ret = -ENOMEM; goto err; + } for (i = 0; i < nr_devices; i++) { ret = bch2_read_super(devices[i], &opts, &sb[i]); if (ret) goto err; - err = bch2_sb_validate(&sb[i]); - if (err) - goto err_print; } for (i = 1; i < nr_devices; i++) @@ -1921,18 +1966,20 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, i++; } - ret = -ENOMEM; c = bch2_fs_alloc(sb[best_sb].sb, opts); - if (!c) + if (IS_ERR(c)) { + ret = PTR_ERR(c); goto err; + } - err = "bch2_dev_online() error"; down_write(&c->state_lock); - for (i = 0; i < nr_devices; i++) - if (bch2_dev_attach_bdev(c, &sb[i])) { + for (i = 0; i < nr_devices; i++) { + ret = bch2_dev_attach_bdev(c, &sb[i]); + if (ret) { up_write(&c->state_lock); - goto err_print; + goto err; } + } up_write(&c->state_lock); err = "insufficient devices"; @@ -1946,8 +1993,8 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } out: kfree(sb); + kfree(_errbuf); module_put(THIS_MODULE); -out2: pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err_print: @@ -1955,89 +2002,15 @@ err_print: devices[0], err); ret = -EINVAL; err: - if (c) + if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); - for (i = 0; i < nr_devices; i++) - bch2_free_super(&sb[i]); + if (sb) + for (i = 0; i < nr_devices; i++) + bch2_free_super(&sb[i]); c = ERR_PTR(ret); goto out; } -static const char 
*__bch2_fs_open_incremental(struct bch_sb_handle *sb, - struct bch_opts opts) -{ - const char *err; - struct bch_fs *c; - bool allocated_fs = false; - int ret; - - err = bch2_sb_validate(sb); - if (err) - return err; - - mutex_lock(&bch_fs_list_lock); - c = __bch2_uuid_to_fs(sb->sb->uuid); - if (c) { - closure_get(&c->cl); - - err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); - if (err) - goto err; - } else { - c = bch2_fs_alloc(sb->sb, opts); - err = "cannot allocate memory"; - if (!c) - goto err; - - allocated_fs = true; - } - - err = "bch2_dev_online() error"; - - mutex_lock(&c->sb_lock); - if (bch2_dev_attach_bdev(c, sb)) { - mutex_unlock(&c->sb_lock); - goto err; - } - mutex_unlock(&c->sb_lock); - - if (!c->opts.nostart && bch2_fs_may_start(c)) { - err = "error starting filesystem"; - ret = bch2_fs_start(c); - if (ret) - goto err; - } - - closure_put(&c->cl); - mutex_unlock(&bch_fs_list_lock); - - return NULL; -err: - mutex_unlock(&bch_fs_list_lock); - - if (allocated_fs) - bch2_fs_stop(c); - else if (c) - closure_put(&c->cl); - - return err; -} - -const char *bch2_fs_open_incremental(const char *path) -{ - struct bch_sb_handle sb; - struct bch_opts opts = bch2_opts_empty(); - const char *err; - - if (bch2_read_super(path, &opts, &sb)) - return "error reading superblock"; - - err = __bch2_fs_open_incremental(&sb, opts); - bch2_free_super(&sb); - - return err; -} - /* Global interfaces/init */ static void bcachefs_exit(void) diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 739e8fd..3f24ca5 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -194,6 +194,27 @@ static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) return devs; } +static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + return false; +} + struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(uuid_le); @@ -233,6 +254,5 @@ void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); -const char *bch2_fs_open_incremental(const char *path); #endif /* _BCACHEFS_SUPER_H */ diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 96023f3..d8b159a 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -29,7 +29,6 @@ struct bch_member_cpu { u16 bucket_size; /* sectors */ u16 group; u8 state; - u8 replacement; u8 discard; u8 data_allowed; u8 durability; diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 9b1ffbf..b727845 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -10,6 +10,7 @@ #include "bcachefs.h" #include "alloc_background.h" +#include "alloc_foreground.h" #include "sysfs.h" #include "btree_cache.h" #include "btree_io.h" @@ -131,7 +132,6 @@ do { \ return strtoi_h(buf, &var) ?: (ssize_t) size; \ } while (0) -write_attribute(trigger_journal_flush); write_attribute(trigger_gc); write_attribute(prune_cache); rw_attribute(btree_gc_periodic); @@ -140,8 +140,6 @@ rw_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); read_attribute(bucket_size); -read_attribute(block_size); -read_attribute(btree_node_size); 
read_attribute(first_bucket); read_attribute(nbuckets); read_attribute(durability); @@ -155,11 +153,6 @@ read_attribute(congested); read_attribute(btree_avg_write_size); -read_attribute(bucket_quantiles_last_read); -read_attribute(bucket_quantiles_last_write); -read_attribute(bucket_quantiles_fragmentation); -read_attribute(bucket_quantiles_oldest_gen); - read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); @@ -183,11 +176,7 @@ read_attribute(read_realloc_races); read_attribute(extent_migrate_done); read_attribute(extent_migrate_raced); -rw_attribute(journal_write_delay_ms); -rw_attribute(journal_reclaim_delay_ms); - rw_attribute(discard); -rw_attribute(cache_replacement_policy); rw_attribute(label); rw_attribute(copy_gc_enabled); @@ -203,6 +192,8 @@ read_attribute(new_stripes); read_attribute(io_timers_read); read_attribute(io_timers_write); +read_attribute(data_jobs); + #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); #endif /* CONFIG_BCACHEFS_TESTS */ @@ -239,28 +230,36 @@ static size_t bch2_btree_avg_write_size(struct bch_fs *c) return nr ? div64_u64(sectors, nr) : 0; } -static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) +static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) { - struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c); + long ret = 0; + struct bch_move_stats *stats; - if (!fs_usage) - return -ENOMEM; - - bch2_fs_usage_to_text(out, c, fs_usage); - - percpu_up_read(&c->mark_lock); + mutex_lock(&c->data_progress_lock); + list_for_each_entry(stats, &c->data_progress_list, list) { + pr_buf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + pr_buf(out, "%s", "\n"); + } - kfree(fs_usage); - return 0; + mutex_unlock(&c->data_progress_lock); + return ret; } static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; - u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, + enum btree_id id; + u64 nr_uncompressed_extents = 0, nr_compressed_extents = 0, + nr_incompressible_extents = 0, + uncompressed_sectors = 0, + incompressible_sectors = 0, compressed_sectors_compressed = 0, compressed_sectors_uncompressed = 0; int ret; @@ -270,46 +269,72 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret) - if (k.k->type == KEY_TYPE_extent) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + for (id = 0; id < BTREE_ID_NR; id++) { + if (!((1U << id) & BTREE_ID_HAS_PTRS)) + continue; + + for_each_btree_key(&trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - - extent_for_each_ptr_decode(e, p, entry) { - if (!crc_is_compressed(p.crc)) { - nr_uncompressed_extents++; - uncompressed_sectors += e.k->size; - } else { - nr_compressed_extents++; + bool compressed = false, uncompressed = false, incompressible = false; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + switch (p.crc.compression_type) { + case BCH_COMPRESSION_TYPE_none: + uncompressed = true; + uncompressed_sectors += k.k->size; + break; + case BCH_COMPRESSION_TYPE_incompressible: + incompressible = true; + 
incompressible_sectors += k.k->size; + break; + default: compressed_sectors_compressed += p.crc.compressed_size; compressed_sectors_uncompressed += p.crc.uncompressed_size; + compressed = true; + break; } - - /* only looking at the first ptr */ - break; } + + if (incompressible) + nr_incompressible_extents++; + else if (uncompressed) + nr_uncompressed_extents++; + else if (compressed) + nr_compressed_extents++; } + bch2_trans_iter_exit(&trans, &iter); + } + + bch2_trans_exit(&trans); - ret = bch2_trans_exit(&trans) ?: ret; if (ret) return ret; - pr_buf(out, - "uncompressed data:\n" - " nr extents: %llu\n" - " size (bytes): %llu\n" - "compressed data:\n" - " nr extents: %llu\n" - " compressed size (bytes): %llu\n" - " uncompressed size (bytes): %llu\n", - nr_uncompressed_extents, - uncompressed_sectors << 9, - nr_compressed_extents, - compressed_sectors_compressed << 9, - compressed_sectors_uncompressed << 9); + pr_buf(out, "uncompressed:\n"); + pr_buf(out, " nr extents: %llu\n", nr_uncompressed_extents); + pr_buf(out, " size: "); + bch2_hprint(out, uncompressed_sectors << 9); + pr_buf(out, "\n"); + + pr_buf(out, "compressed:\n"); + pr_buf(out, " nr extents: %llu\n", nr_compressed_extents); + pr_buf(out, " compressed size: "); + bch2_hprint(out, compressed_sectors_compressed << 9); + pr_buf(out, "\n"); + pr_buf(out, " uncompressed size: "); + bch2_hprint(out, compressed_sectors_uncompressed << 9); + pr_buf(out, "\n"); + + pr_buf(out, "incompressible:\n"); + pr_buf(out, " nr extents: %llu\n", nr_incompressible_extents); + pr_buf(out, " size: "); + bch2_hprint(out, incompressible_sectors << 9); + pr_buf(out, "\n"); return 0; } @@ -328,11 +353,6 @@ SHOW(bch2_fs) sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); - sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); - sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); - - sysfs_print(block_size, block_bytes(c)); - sysfs_print(btree_node_size, btree_bytes(c)); sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); @@ -367,9 +387,6 @@ SHOW(bch2_fs) /* Debugging: */ - if (attr == &sysfs_alloc_debug) - return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; - if (attr == &sysfs_journal_debug) { bch2_journal_debug_to_text(&out, &c->journal); return out.pos - buf; @@ -434,6 +451,11 @@ SHOW(bch2_fs) return out.pos - buf; } + if (attr == &sysfs_data_jobs) { + data_progress_to_text(&out, c); + return out.pos - buf; + } + return 0; } @@ -441,9 +463,6 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); - sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); - if (attr == &sysfs_btree_gc_periodic) { ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ?: (ssize_t) size; @@ -480,8 +499,16 @@ STORE(bch2_fs) /* Debugging: */ - if (attr == &sysfs_trigger_journal_flush) - bch2_journal_meta(&c->journal); + if (!test_bit(BCH_FS_RW, &c->flags)) + return -EROFS; + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); + } if (attr == &sysfs_trigger_gc) { /* @@ -496,14 +523,6 @@ STORE(bch2_fs) #endif } - if (attr == &sysfs_prune_cache) { - struct shrink_control sc; - - sc.gfp_mask = GFP_KERNEL; - sc.nr_to_scan = strtoul_or_return(buf); - 
c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); - } - #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -530,14 +549,9 @@ SYSFS_OPS(bch2_fs); struct attribute *bch2_fs_files[] = { &sysfs_minor, - &sysfs_block_size, - &sysfs_btree_node_size, &sysfs_btree_cache_size, &sysfs_btree_avg_write_size, - &sysfs_journal_write_delay_ms, - &sysfs_journal_reclaim_delay_ms, - &sysfs_promote_whole_extents, &sysfs_compression_stats, @@ -564,7 +578,6 @@ STORE(bch2_fs_internal) SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { - &sysfs_alloc_debug, &sysfs_journal_debug, &sysfs_journal_pins, &sysfs_btree_updates, @@ -572,17 +585,20 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_cache, &sysfs_btree_key_cache, &sysfs_btree_transactions, + &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, + &sysfs_io_timers_read, + &sysfs_io_timers_write, + + &sysfs_trigger_gc, + &sysfs_prune_cache, &sysfs_read_realloc_races, &sysfs_extent_migrate_done, &sysfs_extent_migrate_raced, - &sysfs_trigger_journal_flush, - &sysfs_trigger_gc, &sysfs_gc_gens_pos, - &sysfs_prune_cache, &sysfs_copy_gc_enabled, &sysfs_copy_gc_wait, @@ -591,10 +607,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - &sysfs_new_stripes, - - &sysfs_io_timers_read, - &sysfs_io_timers_write, + &sysfs_data_jobs, &sysfs_internal_uuid, NULL @@ -628,7 +641,7 @@ STORE(bch2_fs_opts_dir) if (!tmp) return -ENOMEM; - ret = bch2_opt_parse(c, opt, strim(tmp), &v); + ret = bch2_opt_parse(c, NULL, opt, strim(tmp), &v); kfree(tmp); if (ret < 0) @@ -638,13 +651,7 @@ STORE(bch2_fs_opts_dir) if (ret < 0) return ret; - if (opt->set_sb != SET_NO_SB_OPT) { - mutex_lock(&c->sb_lock); - opt->set_sb(c->disk_sb.sb, v); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - } - + bch2_opt_set_sb(c, opt, v); bch2_opt_set_by_id(&c->opts, id, v); if ((id == Opt_background_target || @@ -667,7 +674,7 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj) for (i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) { - if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) + if (!(i->flags & OPT_FS)) continue; ret = sysfs_create_file(kobj, &i->attr); @@ -710,76 +717,6 @@ struct attribute *bch2_fs_time_stats_files[] = { NULL }; -typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, - size_t, void *); - -static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - int rw = (private ? 
1 : 0); - - return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; -} - -static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - struct bucket *g = bucket(ca, b); - return bucket_sectors_used(g->mark); -} - -static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, - size_t b, void *private) -{ - return bucket_gc_gen(bucket(ca, b)); -} - -static int unsigned_cmp(const void *_l, const void *_r) -{ - const unsigned *l = _l; - const unsigned *r = _r; - - return cmp_int(*l, *r); -} - -static int quantiles_to_text(struct printbuf *out, - struct bch_fs *c, struct bch_dev *ca, - bucket_map_fn *fn, void *private) -{ - size_t i, n; - /* Compute 31 quantiles */ - unsigned q[31], *p; - - down_read(&ca->bucket_lock); - n = ca->mi.nbuckets; - - p = vzalloc(n * sizeof(unsigned)); - if (!p) { - up_read(&ca->bucket_lock); - return -ENOMEM; - } - - for (i = ca->mi.first_bucket; i < n; i++) - p[i] = fn(c, ca, i, private); - - sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); - up_read(&ca->bucket_lock); - - while (n && - !p[n - 1]) - --n; - - for (i = 0; i < ARRAY_SIZE(q); i++) - q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; - - vfree(p); - - for (i = 0; i < ARRAY_SIZE(q); i++) - pr_buf(out, "%u ", q[i]); - pr_buf(out, "\n"); - return 0; -} - static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) { enum alloc_reserve i; @@ -807,7 +744,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) memset(nr, 0, sizeof(nr)); for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) - nr[c->open_buckets[i].type]++; + nr[c->open_buckets[i].data_type]++; pr_buf(out, "\t\t buckets\t sectors fragmented\n" @@ -880,7 +817,6 @@ SHOW(bch2_dev) sysfs_printf(uuid, "%pU\n", ca->uuid.b); sysfs_print(bucket_size, bucket_bytes(ca)); - sysfs_print(block_size, block_bytes(c)); sysfs_print(first_bucket, ca->mi.first_bucket); sysfs_print(nbuckets, ca->mi.nbuckets); sysfs_print(durability, ca->mi.durability); @@ -905,14 +841,6 @@ SHOW(bch2_dev) return out.pos - buf; } - if (attr == &sysfs_cache_replacement_policy) { - bch2_string_opt_to_text(&out, - bch2_cache_replacement_policies, - ca->mi.replacement); - pr_buf(&out, "\n"); - return out.pos - buf; - } - if (attr == &sysfs_state_rw) { bch2_string_opt_to_text(&out, bch2_member_states, ca->mi.state); @@ -941,15 +869,6 @@ SHOW(bch2_dev) clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); - if (attr == &sysfs_bucket_quantiles_last_read) - return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_last_write) - return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_fragmentation) - return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_bucket_quantiles_oldest_gen) - return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_reserve_stats) { reserve_stats_to_text(&out, ca); return out.pos - buf; @@ -981,22 +900,6 @@ STORE(bch2_dev) mutex_unlock(&c->sb_lock); } - if (attr == &sysfs_cache_replacement_policy) { - ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); - - if (v < 0) - return v; - - mutex_lock(&c->sb_lock); - mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; - - if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { - SET_BCH_MEMBER_REPLACEMENT(mi, v); - 
bch2_write_super(c); - } - mutex_unlock(&c->sb_lock); - } - if (attr == &sysfs_label) { char *tmp; int ret; @@ -1021,14 +924,12 @@ SYSFS_OPS(bch2_dev); struct attribute *bch2_dev_files[] = { &sysfs_uuid, &sysfs_bucket_size, - &sysfs_block_size, &sysfs_first_bucket, &sysfs_nbuckets, &sysfs_durability, /* settings: */ &sysfs_discard, - &sysfs_cache_replacement_policy, &sysfs_state_rw, &sysfs_label, @@ -1041,12 +942,6 @@ struct attribute *bch2_dev_files[] = { &sysfs_io_latency_stats_write, &sysfs_congested, - /* alloc info - other stats: */ - &sysfs_bucket_quantiles_last_read, - &sysfs_bucket_quantiles_last_write, - &sysfs_bucket_quantiles_fragmentation, - &sysfs_bucket_quantiles_oldest_gen, - &sysfs_reserve_stats, /* debug: */ diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index 4d8d50f..de84ce8 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -4,6 +4,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "journal_reclaim.h" +#include "subvolume.h" #include "tests.h" #include "linux/kthread.h" @@ -14,12 +15,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_extents, - POS(0, 0), POS(0, U64_MAX), + POS_MIN, SPOS_MAX, + BTREE_ITER_ALL_SNAPSHOTS, NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS(0, 0), POS(0, U64_MAX), + POS_MIN, SPOS_MAX, + BTREE_ITER_ALL_SNAPSHOTS, NULL); BUG_ON(ret); } @@ -29,7 +32,7 @@ static void delete_test_keys(struct bch_fs *c) static int test_delete(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -37,13 +40,12 @@ static int test_delete(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &k.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in test_delete: %i", ret); goto err; @@ -51,8 +53,8 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting once"); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error (first) in test_delete: %i", ret); goto err; @@ -60,14 +62,14 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting twice"); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error (second) in test_delete: %i", ret); goto err; } err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -75,7 +77,7 @@ err: static int test_delete_written(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_i_cookie k; int ret; @@ -84,12 +86,12 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p, - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, 
+ BTREE_ITER_INTENT); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &k.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { bch_err(c, "update error in test_delete_written: %i", ret); goto err; @@ -99,14 +101,14 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_journal_flush_all_pins(&c->journal); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_btree_delete_at(&trans, iter, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { bch_err(c, "delete error in test_delete_written: %i", ret); goto err; } err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -114,7 +116,7 @@ err: static int test_iterate(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -145,7 +147,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - POS_MIN, 0, k, ret) { + SPOS(0, 0, U32_MAX), 0, k, ret) { if (k.k->p.inode) break; @@ -156,12 +158,12 @@ static int test_iterate(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) BUG_ON(k.k->p.offset != --i); BUG_ON(i); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -169,7 +171,7 @@ err: static int test_iterate_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter = NULL; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -201,7 +203,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS_MIN, 0, k, ret) { + SPOS(0, 0, U32_MAX), 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; } @@ -210,14 +212,14 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) pr_info("iterating backwards"); - while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { + while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(&iter)).k)) { BUG_ON(k.k->p.offset != i); i = bkey_start_offset(k.k); } BUG_ON(i); err: - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } @@ -225,7 +227,7 @@ err: static int test_iterate_slots(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -255,15 +257,15 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, - 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ret) { if (k.k->p.inode) break; BUG_ON(k.k->p.offset != i); i += 2; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr * 2); @@ -271,7 +273,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS, k, ret) { BUG_ON(k.k->p.offset != i); BUG_ON(bkey_deleted(k.k) != (i & 1)); @@ -280,7 +283,7 @@ static int test_iterate_slots(struct 
bch_fs *c, u64 nr) if (i == nr * 2) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); err: bch2_trans_exit(&trans); return ret; @@ -289,7 +292,7 @@ err: static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter = { NULL }; struct bkey_s_c k; u64 i; int ret = 0; @@ -320,13 +323,13 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, - 0, k, ret) { + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0, k, ret) { BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); BUG_ON(i != nr); @@ -334,7 +337,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS, k, ret) { BUG_ON(bkey_deleted(k.k) != !(i % 16)); @@ -345,7 +349,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) if (i == nr) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); err: bch2_trans_exit(&trans); return 0; @@ -358,21 +362,20 @@ err: static int test_peek_end(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); - - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - bch2_trans_iter_put(&trans, iter); - + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return 0; } @@ -380,21 +383,20 @@ static int test_peek_end(struct bch_fs *c, u64 nr) static int test_peek_end_extents(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0); - - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); BUG_ON(k.k); - bch2_trans_iter_put(&trans, iter); - + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return 0; } @@ -409,8 +411,6 @@ static int insert_test_extent(struct bch_fs *c, struct bkey_i_cookie k; int ret; - //pr_info("inserting %llu-%llu v %llu", start, end, test_version); - bkey_cookie_init(&k.k_i); k.k_i.k.p.offset = end; k.k_i.k.p.snapshot = U32_MAX; @@ -462,6 +462,70 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) __test_extent_overwrite(c, 32, 64, 32, 128); } +/* snapshot unit tests */ + +/* Test skipping over keys in unrelated snapshots: */ +static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie cookie; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = snapid_hi; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + bch2_trans_init(&trans, c, 0, 0); + 
bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, snapid_lo), 0); + k = bch2_btree_iter_peek(&iter); + + BUG_ON(k.k->p.snapshot != U32_MAX); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int test_snapshots(struct bch_fs *c, u64 nr) +{ + struct bkey_i_cookie cookie; + u32 snapids[2]; + u32 snapid_subvols[2] = { 1, 1 }; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = U32_MAX; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_snapshot_node_create(&trans, U32_MAX, + snapids, + snapid_subvols, + 2)); + if (ret) + return ret; + + if (snapids[0] > snapids[1]) + swap(snapids[0], snapids[1]); + + ret = test_snapshot_filter(c, snapids[0], snapids[1]); + if (ret) { + bch_err(c, "err %i from test_snapshot_filter", ret); + return ret; + } + + return 0; +} + /* perf tests */ static u64 test_rand(void) @@ -540,18 +604,19 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) static int rand_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; u64 i; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(iter, POS(0, test_rand())); + bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) { bch_err(c, "error in rand_lookup: %i", ret); @@ -559,63 +624,73 @@ static int rand_lookup(struct bch_fs *c, u64 nr) } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } +static int rand_mixed_trans(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i_cookie *cookie, + u64 i, u64 pos) +{ + struct bkey_s_c k; + int ret; + + bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret && ret != -EINTR) + bch_err(trans->c, "lookup error in rand_mixed: %i", ret); + if (ret) + return ret; + + if (!(i & 3) && k.k) { + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter->pos; + ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); + } + + return ret; +} + static int rand_mixed(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; + struct btree_iter iter; + struct bkey_i_cookie cookie; int ret = 0; - u64 i; + u64 i, rand; bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(iter, POS(0, test_rand())); - - k = bch2_btree_iter_peek(iter); - ret = bkey_err(k); + rand = test_rand(); + ret = __bch2_trans_do(&trans, NULL, NULL, 0, + rand_mixed_trans(&trans, &iter, &cookie, i, rand)); if (ret) { - bch_err(c, "lookup error in rand_mixed: %i", ret); + bch_err(c, "update error in rand_mixed: %i", ret); break; } - - if (!(i & 3) && k.k) { - struct bkey_i_cookie k; - - bkey_cookie_init(&k.k_i); - k.k.p = iter->pos; - - ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &k.k_i, 0)); - if (ret) { - bch_err(c, "update error in rand_mixed: %i", ret); - 
break; - } - } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; } static int __do_delete(struct btree_trans *trans, struct bpos pos) { - struct btree_iter *iter; - struct bkey_i delete; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos, - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; @@ -623,12 +698,9 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) if (!k.k) goto err; - bkey_init(&delete.k); - delete.k.p = k.k->p; - - ret = bch2_trans_update(trans, iter, &delete, 0); + ret = bch2_btree_delete_at(trans, &iter, 0); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -641,7 +713,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); for (i = 0; i < nr; i++) { - struct bpos pos = POS(0, test_rand()); + struct bpos pos = SPOS(0, test_rand(), U32_MAX); ret = __bch2_trans_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); @@ -658,7 +730,7 @@ static int rand_delete(struct bch_fs *c, u64 nr) static int seq_insert(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct bkey_i_cookie insert; int ret = 0; @@ -668,13 +740,13 @@ static int seq_insert(struct bch_fs *c, u64 nr) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - insert.k.p = iter->pos; + insert.k.p = iter.pos; ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &insert.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &insert.k_i, 0)); if (ret) { bch_err(c, "error in seq_insert: %i", ret); break; @@ -683,7 +755,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) if (++i == nr) break; } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -692,15 +764,16 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret) + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0, k, ret) ; - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -709,27 +782,28 @@ static int seq_lookup(struct bch_fs *c, u64 nr) static int seq_overwrite(struct bch_fs *c, u64 nr) { struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; int ret = 0; bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, + for_each_btree_key(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), BTREE_ITER_INTENT, k, ret) { struct bkey_i_cookie u; bkey_reassemble(&u.k_i, k); ret = __bch2_trans_do(&trans, NULL, NULL, 0, - bch2_btree_iter_traverse(iter) ?: - bch2_trans_update(&trans, iter, &u.k_i, 0)); + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &u.k_i, 0)); if (ret) { 
bch_err(c, "error in seq_overwrite: %i", ret); break; } } - bch2_trans_iter_put(&trans, iter); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return ret; @@ -740,7 +814,8 @@ static int seq_delete(struct bch_fs *c, u64 nr) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - POS(0, 0), POS(0, U64_MAX), + POS_MIN, SPOS_MAX, + BTREE_ITER_ALL_SNAPSHOTS, NULL); if (ret) bch_err(c, "error in seq_delete: %i", ret); @@ -778,9 +853,11 @@ static int btree_perf_test_thread(void *data) wait_event(j->ready_wait, !atomic_read(&j->ready)); } - ret = j->fn(j->c, j->nr / j->nr_threads); - if (ret) + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); + if (ret) { + bch_err(j->c, "%ps: error %i", j->fn, ret); j->ret = ret; + } if (atomic_dec_and_test(&j->done)) { j->finish = sched_clock(); @@ -833,6 +910,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_extent_overwrite_middle); perf_test(test_extent_overwrite_all); + perf_test(test_snapshots); + if (!j.fn) { pr_err("unknown test %s", testname); return -EINVAL; @@ -854,11 +933,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, scnprintf(name_buf, sizeof(name_buf), "%s:", testname); bch2_hprint(&PBUF(nr_buf), nr); - bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); + bch2_hprint(&PBUF(per_sec_buf), div64_u64(nr * NSEC_PER_SEC, time)); printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", name_buf, nr_buf, nr_threads, - time / NSEC_PER_SEC, - time * nr_threads / nr, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), per_sec_buf); return j.ret; } diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 463260c..0bbea33 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -114,7 +114,7 @@ void bch2_hprint(struct printbuf *buf, s64 v) * 103 is magic: t is in the range [-1023, 1023] and we want * to turn it into [-9, 9] */ - if (u && v < 100 && v > -100) + if (u && t && v < 100 && v > -100) pr_buf(buf, ".%i", t / 103); if (u) pr_buf(buf, "%c", si_units[u]); @@ -525,7 +525,11 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) if (!page) return -ENOMEM; - BUG_ON(!bio_add_page(bio, page, len, 0)); + if (unlikely(!bio_add_page(bio, page, len, 0))) { + __free_page(page); + break; + } + size -= len; } @@ -887,9 +891,14 @@ void eytzinger0_find_test(void) */ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) { - u64 *ret = this_cpu_ptr(p); + u64 *ret; int cpu; + /* access to pcpu vars has to be blocked by other locking */ + preempt_disable(); + ret = this_cpu_ptr(p); + preempt_enable(); + for_each_possible_cpu(cpu) { u64 *i = per_cpu_ptr(p, cpu); diff --git a/libbcachefs/util.h b/libbcachefs/util.h index 84ef4d6..e55407d 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -18,9 +18,6 @@ #include #include -#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) -#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) - struct closure; #ifdef CONFIG_BCACHEFS_DEBUG @@ -241,6 +238,7 @@ do { \ struct printbuf { char *pos; char *end; + unsigned indent; }; static inline size_t printbuf_remaining(struct printbuf *buf) @@ -262,6 +260,27 @@ do { \ __VA_ARGS__); \ } while (0) +static inline void printbuf_indent_push(struct printbuf *buf, unsigned spaces) +{ + buf->indent += spaces; + while (spaces--) + pr_buf(buf, " "); +} + +static inline void printbuf_indent_pop(struct printbuf *buf, unsigned spaces) +{ + buf->indent -= spaces; +} + +static inline void printbuf_newline(struct printbuf *buf) +{ + unsigned i; + + 
pr_buf(buf, "\n"); + for (i = 0; i < buf->indent; i++) + pr_buf(buf, " "); +} + void bch_scnmemcpy(struct printbuf *, const char *, size_t); int bch2_strtoint_h(const char *, int *); @@ -749,4 +768,13 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } +#ifdef __KERNEL__ +static inline void uuid_unparse_lower(u8 *uuid, char *out) +{ + sprintf(out, "%pUb", uuid); +} +#else +#include <uuid/uuid.h> +#endif + #endif /* _BCACHEFS_UTIL_H */ diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c index e6a0415..a2d6bb7 100644 --- a/libbcachefs/varint.c +++ b/libbcachefs/varint.c @@ -4,6 +4,10 @@ #include #include +#ifdef CONFIG_VALGRIND +#include <valgrind/memcheck.h> +#endif + #include "varint.h" /** @@ -95,8 +99,11 @@ int bch2_varint_encode_fast(u8 *out, u64 v) */ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) { +#ifdef CONFIG_VALGRIND + VALGRIND_MAKE_MEM_DEFINED(in, 8); +#endif u64 v = get_unaligned_le64(in); - unsigned bytes = ffz(v & 255) + 1; + unsigned bytes = ffz(*in) + 1; if (unlikely(in + bytes > end)) return -1; diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index e4d400b..4d7db64 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -122,23 +122,22 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info const char *name, void *buffer, size_t size, int type) { struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_s_c k; int ret; - iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, &hash, - inode->v.i_ino, - &X_SEARCH(type, name, strlen(name)), - 0); - ret = PTR_ERR_OR_ZERO(iter); + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, + inode_inum(inode), + &X_SEARCH(type, name, strlen(name)), + 0); if (ret) - goto err; + goto err1; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - goto err; + goto err2; xattr = bkey_s_c_to_xattr(k); ret = le16_to_cpu(xattr.v->x_val_len); @@ -148,8 +147,9 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info else memcpy(buffer, xattr_val(xattr.v), ret); } - bch2_trans_iter_put(trans, iter); -err: +err2: + bch2_trans_iter_exit(trans, &iter); +err1: return ret == -ENOENT ? -ENODATA : ret; } @@ -160,13 +160,29 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); } -int bch2_xattr_set(struct btree_trans *trans, u64 inum, +int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, int type, int flags) { + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; int ret; + /* + * We need to do an inode update so that bi_journal_sync gets updated + * and fsync works: + * + * Perhaps we should be updating bi_mtime too? 
+ */ + + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + bch2_inode_write(trans, &inode_iter, &inode_u); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) + return ret; + if (value) { struct bkey_i_xattr *xattr; unsigned namelen = strlen(name); @@ -279,16 +295,24 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct bch_fs *c = dentry->d_sb->s_fs_info; struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; - u64 inum = dentry->d_inode->i_ino; + u64 offset = 0, inum = inode->ei_inode.bi_inum; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; - for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - POS(inum, 0), 0, k, ret) { + for_each_btree_key_norestart(&trans, iter, BTREE_ID_xattrs, + SPOS(inum, offset, snapshot), 0, k, ret) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) @@ -301,9 +325,14 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) break; } - bch2_trans_iter_put(&trans, iter); - ret = bch2_trans_exit(&trans) ?: ret; + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; + + bch2_trans_exit(&trans); if (ret) return ret; @@ -339,8 +368,8 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, - bch2_xattr_set(&trans, inode->v.i_ino, &hash, + return bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); } @@ -496,7 +525,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, memcpy(buf, value, size); buf[size] = '\0'; - ret = bch2_opt_parse(c, opt, buf, &v); + ret = bch2_opt_parse(c, NULL, opt, buf, &v); kfree(buf); if (ret < 0) diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 4151065..f4f8965 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -39,7 +39,8 @@ struct bch_inode_info; int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, const char *, void *, size_t, int); -int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, +int bch2_xattr_set(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t); diff --git a/linux/blkdev.c b/linux/blkdev.c index 270d3c8..762e5aa 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -22,6 +22,14 @@ #include "tools-util.h" +struct fops { + void (*init)(void); + void (*cleanup)(void); + void (*read)(struct bio *bio, struct iovec * iov, unsigned i); + void (*write)(struct bio *bio, struct iovec * iov, unsigned i); +}; + +static struct fops *fops; static io_context_t aio_ctx; static atomic_t running_requests; @@ -66,35 +74,12 @@ void generic_make_request(struct bio *bio) #endif } - struct iocb iocb = { - .data = bio, - .aio_fildes = bio->bi_opf & REQ_FUA - ? 
bio->bi_bdev->bd_sync_fd - : bio->bi_bdev->bd_fd, - }, *iocbp = &iocb; - switch (bio_op(bio)) { case REQ_OP_READ: - iocb.aio_lio_opcode = IO_CMD_PREADV; - iocb.u.v.vec = iov; - iocb.u.v.nr = i; - iocb.u.v.offset = bio->bi_iter.bi_sector << 9; - - atomic_inc(&running_requests); - ret = io_submit(aio_ctx, 1, &iocbp); - if (ret != 1) - die("io_submit err: %s", strerror(-ret)); + fops->read(bio, iov, i); break; case REQ_OP_WRITE: - iocb.aio_lio_opcode = IO_CMD_PWRITEV; - iocb.u.v.vec = iov; - iocb.u.v.nr = i; - iocb.u.v.offset = bio->bi_iter.bi_sector << 9; - - atomic_inc(&running_requests); - ret = io_submit(aio_ctx, 1, &iocbp); - if (ret != 1) - die("io_submit err: %s", strerror(-ret)); + fops->write(bio, iov, i); break; case REQ_OP_FLUSH: ret = fsync(bio->bi_bdev->bd_fd); @@ -220,8 +205,8 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, bdev->bd_sync_fd = sync_fd; bdev->bd_holder = holder; bdev->bd_disk = &bdev->__bd_disk; - bdev->bd_bdi = &bdev->__bd_bdi; - bdev->queue.backing_dev_info = bdev->bd_bdi; + bdev->bd_disk->bdi = &bdev->bd_disk->__bdi; + bdev->queue.backing_dev_info = bdev->bd_disk->bdi; return bdev; } @@ -236,6 +221,55 @@ int lookup_bdev(const char *path, dev_t *dev) return -EINVAL; } +static void io_fallback(void) +{ + fops++; + if (fops->init == NULL) + die("no fallback possible, something is very wrong"); + fops->init(); +} + +static void sync_check(struct bio *bio, int ret) +{ + if (ret != bio->bi_iter.bi_size) { + die("IO error: %s\n", strerror(-ret)); + } + + if (bio->bi_opf & REQ_FUA) { + ret = fdatasync(bio->bi_bdev->bd_fd); + if (ret) + die("fsync error: %s\n", strerror(-ret)); + } + bio_endio(bio); +} + +static void sync_init(void) {} + +static void sync_cleanup(void) +{ + /* not necessary? */ + sync(); +} + +static void sync_read(struct bio *bio, struct iovec * iov, unsigned i) +{ + + int fd = bio->bi_opf & REQ_FUA + ? bio->bi_bdev->bd_sync_fd + : bio->bi_bdev->bd_fd; + ssize_t ret = preadv(fd, iov, i, bio->bi_iter.bi_sector << 9); + sync_check(bio, ret); +} + +static void sync_write(struct bio *bio, struct iovec * iov, unsigned i) +{ + int fd = bio->bi_opf & REQ_FUA + ? bio->bi_bdev->bd_sync_fd + : bio->bi_bdev->bd_fd; + ssize_t ret = pwritev(fd, iov, i, bio->bi_iter.bi_sector << 9); + sync_check(bio, ret); +} + static int aio_completion_thread(void *arg) { struct io_event events[8], *ev; @@ -274,22 +308,24 @@ static int aio_completion_thread(void *arg) static struct task_struct *aio_task = NULL; -__attribute__((constructor(102))) -static void blkdev_init(void) +static void aio_init(void) { struct task_struct *p; + long err = io_setup(256, &aio_ctx); + if (!err) { + p = kthread_run(aio_completion_thread, NULL, "aio_completion"); + BUG_ON(IS_ERR(p)); - if (io_setup(256, &aio_ctx)) - die("io_setup() error: %m"); - - p = kthread_run(aio_completion_thread, NULL, "aio_completion"); - BUG_ON(IS_ERR(p)); + aio_task = p; - aio_task = p; + } else if (err == -ENOSYS) { + io_fallback(); + } else { + die("io_setup() error: %s", strerror(err)); + } } -__attribute__((destructor(102))) -static void blkdev_cleanup(void) +static void aio_cleanup(void) { struct task_struct *p = NULL; swap(aio_task, p); @@ -322,3 +358,71 @@ static void blkdev_cleanup(void) close(fds[0]); close(fds[1]); } + +static void aio_op(struct bio *bio, struct iovec *iov, unsigned i, int opcode) +{ + ssize_t ret; + struct iocb iocb = { + .data = bio, + .aio_fildes = bio->bi_opf & REQ_FUA + ? 
bio->bi_bdev->bd_sync_fd + : bio->bi_bdev->bd_fd, + .aio_lio_opcode = opcode, + .u.c.buf = iov, + .u.c.nbytes = i, + .u.c.offset = bio->bi_iter.bi_sector << 9, + + }, *iocbp = &iocb; + + atomic_inc(&running_requests); + ret = io_submit(aio_ctx, 1, &iocbp); + if (ret != 1) + die("io_submit err: %s", strerror(-ret)); +} + +static void aio_read(struct bio *bio, struct iovec *iov, unsigned i) +{ + aio_op(bio, iov, i, IO_CMD_PREADV); +} + +static void aio_write(struct bio *bio, struct iovec * iov, unsigned i) +{ + aio_op(bio, iov, i, IO_CMD_PWRITEV); +} + + +/* not implemented */ +static void uring_init(void) { + io_fallback(); +} + +struct fops fops_list[] = { + { + .init = uring_init, + }, { + .init = aio_init, + .cleanup = aio_cleanup, + .read = aio_read, + .write = aio_write, + }, { + .init = sync_init, + .cleanup = sync_cleanup, + .read = sync_read, + .write = sync_write, + }, { + /* NULL */ + } +}; + +__attribute__((constructor(102))) +static void blkdev_init(void) +{ + fops = fops_list; + fops->init(); +} + +__attribute__((destructor(102))) +static void blkdev_cleanup(void) +{ + fops->cleanup(); +} diff --git a/linux/siphash.c b/linux/siphash.c new file mode 100644 index 0000000..f8dbece --- /dev/null +++ b/linux/siphash.c @@ -0,0 +1,552 @@ +/* Copyright (C) 2016 Jason A. Donenfeld . All Rights Reserved. + * + * This file is provided under a dual BSD/GPLv2 license. + * + * SipHash: a fast short-input PRF + * https://131002.net/siphash/ + * + * This implementation is specifically for SipHash2-4 for a secure PRF + * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for + * hashtables. + */ + +#include +#include +#include + +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 +#include +#include +#endif + +#define SIPROUND \ + do { \ + v0 += v1; v1 = rol64(v1, 13); v1 ^= v0; v0 = rol64(v0, 32); \ + v2 += v3; v3 = rol64(v3, 16); v3 ^= v2; \ + v0 += v3; v3 = rol64(v3, 21); v3 ^= v0; \ + v2 += v1; v1 = rol64(v1, 17); v1 ^= v2; v2 = rol64(v2, 32); \ + } while (0) + +#define PREAMBLE(len) \ + u64 v0 = 0x736f6d6570736575ULL; \ + u64 v1 = 0x646f72616e646f6dULL; \ + u64 v2 = 0x6c7967656e657261ULL; \ + u64 v3 = 0x7465646279746573ULL; \ + u64 b = ((u64)(len)) << 56; \ + v3 ^= key->key[1]; \ + v2 ^= key->key[0]; \ + v1 ^= key->key[1]; \ + v0 ^= key->key[0]; + +#define POSTAMBLE \ + v3 ^= b; \ + SIPROUND; \ + SIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + SIPROUND; \ + SIPROUND; \ + SIPROUND; \ + SIPROUND; \ + return (v0 ^ v1) ^ (v2 ^ v3); + +u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + PREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = le64_to_cpup(data); + v3 ^= m; + SIPROUND; + SIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; fallthrough; + case 6: b |= ((u64)end[5]) << 40; fallthrough; + case 5: b |= ((u64)end[4]) << 32; fallthrough; + case 4: b |= le32_to_cpup(data); break; + case 3: b |= ((u64)end[2]) << 16; fallthrough; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } +#endif + POSTAMBLE +} +EXPORT_SYMBOL(__siphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key) +{ + const u8 *end = data + len - (len % 
sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + PREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = get_unaligned_le64(data); + v3 ^= m; + SIPROUND; + SIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; fallthrough; + case 6: b |= ((u64)end[5]) << 40; fallthrough; + case 5: b |= ((u64)end[4]) << 32; fallthrough; + case 4: b |= get_unaligned_le32(end); break; + case 3: b |= ((u64)end[2]) << 16; fallthrough; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } +#endif + POSTAMBLE +} +EXPORT_SYMBOL(__siphash_unaligned); +#endif + +/** + * siphash_1u64 - compute 64-bit siphash PRF value of a u64 + * @first: first u64 + * @key: the siphash key + */ +u64 siphash_1u64(const u64 first, const siphash_key_t *key) +{ + PREAMBLE(8) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_1u64); + +/** + * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64 + * @first: first u64 + * @second: second u64 + * @key: the siphash key + */ +u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key) +{ + PREAMBLE(16) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_2u64); + +/** + * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64 + * @first: first u64 + * @second: second u64 + * @third: third u64 + * @key: the siphash key + */ +u64 siphash_3u64(const u64 first, const u64 second, const u64 third, + const siphash_key_t *key) +{ + PREAMBLE(24) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + v3 ^= third; + SIPROUND; + SIPROUND; + v0 ^= third; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_3u64); + +/** + * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64 + * @first: first u64 + * @second: second u64 + * @third: third u64 + * @forth: forth u64 + * @key: the siphash key + */ +u64 siphash_4u64(const u64 first, const u64 second, const u64 third, + const u64 forth, const siphash_key_t *key) +{ + PREAMBLE(32) + v3 ^= first; + SIPROUND; + SIPROUND; + v0 ^= first; + v3 ^= second; + SIPROUND; + SIPROUND; + v0 ^= second; + v3 ^= third; + SIPROUND; + SIPROUND; + v0 ^= third; + v3 ^= forth; + SIPROUND; + SIPROUND; + v0 ^= forth; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_4u64); + +u64 siphash_1u32(const u32 first, const siphash_key_t *key) +{ + PREAMBLE(4) + b |= first; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_1u32); + +u64 siphash_3u32(const u32 first, const u32 second, const u32 third, + const siphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + PREAMBLE(12) + v3 ^= combined; + SIPROUND; + SIPROUND; + v0 ^= combined; + b |= third; + POSTAMBLE +} +EXPORT_SYMBOL(siphash_3u32); + +#if BITS_PER_LONG == 64 +/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for + * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3. 
+ */ + +#define HSIPROUND SIPROUND +#define HPREAMBLE(len) PREAMBLE(len) +#define HPOSTAMBLE \ + v3 ^= b; \ + HSIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + HSIPROUND; \ + HSIPROUND; \ + HSIPROUND; \ + return (v0 ^ v1) ^ (v2 ^ v3); + +u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = le64_to_cpup(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; fallthrough; + case 6: b |= ((u64)end[5]) << 40; fallthrough; + case 5: b |= ((u64)end[4]) << 32; fallthrough; + case 4: b |= le32_to_cpup(data); break; + case 3: b |= ((u64)end[2]) << 16; fallthrough; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } +#endif + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, + const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u64)); + const u8 left = len & (sizeof(u64) - 1); + u64 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u64)) { + m = get_unaligned_le64(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } +#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64 + if (left) + b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) & + bytemask_from_count(left))); +#else + switch (left) { + case 7: b |= ((u64)end[6]) << 48; fallthrough; + case 6: b |= ((u64)end[5]) << 40; fallthrough; + case 5: b |= ((u64)end[4]) << 32; fallthrough; + case 4: b |= get_unaligned_le32(end); break; + case 3: b |= ((u64)end[2]) << 16; fallthrough; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } +#endif + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_unaligned); +#endif + +/** + * hsiphash_1u32 - compute 64-bit hsiphash PRF value of a u32 + * @first: first u32 + * @key: the hsiphash key + */ +u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) +{ + HPREAMBLE(4) + b |= first; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_1u32); + +/** + * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 + * @first: first u32 + * @second: second u32 + * @key: the hsiphash key + */ +u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(8) + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_2u32); + +/** + * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @key: the hsiphash key + */ +u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, + const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(12) + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + b |= third; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_3u32); + +/** + * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @forth: forth u32 + * @key: the hsiphash key + */ +u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, + const u32 forth, const hsiphash_key_t *key) +{ + u64 combined = (u64)second << 32 | first; + HPREAMBLE(16) + v3 ^= 
combined; + HSIPROUND; + v0 ^= combined; + combined = (u64)forth << 32 | third; + v3 ^= combined; + HSIPROUND; + v0 ^= combined; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_4u32); +#else +#define HSIPROUND \ + do { \ + v0 += v1; v1 = rol32(v1, 5); v1 ^= v0; v0 = rol32(v0, 16); \ + v2 += v3; v3 = rol32(v3, 8); v3 ^= v2; \ + v0 += v3; v3 = rol32(v3, 7); v3 ^= v0; \ + v2 += v1; v1 = rol32(v1, 13); v1 ^= v2; v2 = rol32(v2, 16); \ + } while (0) + +#define HPREAMBLE(len) \ + u32 v0 = 0; \ + u32 v1 = 0; \ + u32 v2 = 0x6c796765U; \ + u32 v3 = 0x74656462U; \ + u32 b = ((u32)(len)) << 24; \ + v3 ^= key->key[1]; \ + v2 ^= key->key[0]; \ + v1 ^= key->key[1]; \ + v0 ^= key->key[0]; + +#define HPOSTAMBLE \ + v3 ^= b; \ + HSIPROUND; \ + v0 ^= b; \ + v2 ^= 0xff; \ + HSIPROUND; \ + HSIPROUND; \ + HSIPROUND; \ + return v1 ^ v3; + +u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u32)); + const u8 left = len & (sizeof(u32) - 1); + u32 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u32)) { + m = le32_to_cpup(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } + switch (left) { + case 3: b |= ((u32)end[2]) << 16; fallthrough; + case 2: b |= le16_to_cpup(data); break; + case 1: b |= end[0]; + } + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_aligned); + +#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +u32 __hsiphash_unaligned(const void *data, size_t len, + const hsiphash_key_t *key) +{ + const u8 *end = data + len - (len % sizeof(u32)); + const u8 left = len & (sizeof(u32) - 1); + u32 m; + HPREAMBLE(len) + for (; data != end; data += sizeof(u32)) { + m = get_unaligned_le32(data); + v3 ^= m; + HSIPROUND; + v0 ^= m; + } + switch (left) { + case 3: b |= ((u32)end[2]) << 16; fallthrough; + case 2: b |= get_unaligned_le16(end); break; + case 1: b |= end[0]; + } + HPOSTAMBLE +} +EXPORT_SYMBOL(__hsiphash_unaligned); +#endif + +/** + * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32 + * @first: first u32 + * @key: the hsiphash key + */ +u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key) +{ + HPREAMBLE(4) + v3 ^= first; + HSIPROUND; + v0 ^= first; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_1u32); + +/** + * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32 + * @first: first u32 + * @second: second u32 + * @key: the hsiphash key + */ +u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key) +{ + HPREAMBLE(8) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_2u32); + +/** + * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @key: the hsiphash key + */ +u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third, + const hsiphash_key_t *key) +{ + HPREAMBLE(12) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + v3 ^= third; + HSIPROUND; + v0 ^= third; + HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_3u32); + +/** + * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32 + * @first: first u32 + * @second: second u32 + * @third: third u32 + * @forth: forth u32 + * @key: the hsiphash key + */ +u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third, + const u32 forth, const hsiphash_key_t *key) +{ + HPREAMBLE(16) + v3 ^= first; + HSIPROUND; + v0 ^= first; + v3 ^= second; + HSIPROUND; + v0 ^= second; + v3 ^= third; + HSIPROUND; + v0 ^= third; + v3 ^= forth; + HSIPROUND; + v0 ^= forth; + 
HPOSTAMBLE +} +EXPORT_SYMBOL(hsiphash_4u32); +#endif diff --git a/mount/build.rs b/mount/build.rs deleted file mode 100644 index 6542889..0000000 --- a/mount/build.rs +++ /dev/null @@ -1,67 +0,0 @@ -fn main() { - use std::path::PathBuf; - use std::process::Command; - - let out_dir: PathBuf = std::env::var_os("OUT_DIR").unwrap().into(); - let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR").unwrap().into(); - let libbcachefs_inc_dir = std::env::var("LIBBCACHEFS_INCLUDE") - .unwrap_or_else(|_| top_dir.join("libbcachefs").display().to_string()); - let libbcachefs_inc_dir = std::path::Path::new(&libbcachefs_inc_dir); - println!("{}", libbcachefs_inc_dir.display()); - - let libbcachefs_dir = top_dir.join("libbcachefs").join("libbcachefs"); - let bindings = bindgen::builder() - .header(top_dir - .join("src") - .join("libbcachefs_wrapper.h") - .display() - .to_string()) - .clang_arg(format!( - "-I{}", - libbcachefs_inc_dir.join("include").display() - )) - .clang_arg(format!("-I{}", libbcachefs_inc_dir.display())) - .clang_arg("-DZSTD_STATIC_LINKING_ONLY") - .clang_arg("-DNO_BCACHEFS_FS") - .clang_arg("-D_GNU_SOURCE") - .derive_debug(false) - .derive_default(true) - .default_enum_style(bindgen::EnumVariation::Rust { - non_exhaustive: true, - }) - .whitelist_function("bch2_read_super") - .whitelist_function("bch2_sb_field_.*") - .whitelist_function("bch2_chacha_encrypt_key") - .whitelist_function("derive_passphrase") - .whitelist_function("request_key") - .whitelist_function("add_key") - .whitelist_function("keyctl_search") - .whitelist_var("BCH_.*") - .whitelist_var("KEY_SPEC_.*") - .whitelist_type("bch_kdf_types") - .whitelist_type("bch_sb_field_.*") - .whitelist_type("bch_encrypted_key") - .whitelist_type("nonce") - .rustified_enum("bch_kdf_types") - .opaque_type("gendisk") - .opaque_type("bkey") - .generate() - .unwrap(); - bindings.write_to_file(out_dir.join("bcachefs.rs")).unwrap(); - - let keyutils = pkg_config::probe_library("libkeyutils").unwrap(); - let bindings = bindgen::builder() - .header(top_dir - .join("src") - .join("keyutils_wrapper.h") - .display() - .to_string()) - .clang_args( - keyutils.include_paths - .iter() - .map(|p| format!("-I{}", p.display())), - ) - .generate() - .unwrap(); - bindings.write_to_file(out_dir.join("keyutils.rs")).unwrap(); -} diff --git a/mount/src/filesystem.rs b/mount/src/filesystem.rs deleted file mode 100644 index 36af8c0..0000000 --- a/mount/src/filesystem.rs +++ /dev/null @@ -1,174 +0,0 @@ -extern "C" { - pub static stdout: *mut libc::FILE; -} - -use getset::{CopyGetters, Getters}; -use std::path::PathBuf; -#[derive(Getters, CopyGetters)] -pub struct FileSystem { - /// External UUID of the bcachefs - #[getset(get = "pub")] - uuid: uuid::Uuid, - /// Whether filesystem is encrypted - #[getset(get_copy = "pub")] - encrypted: bool, - /// Super block - #[getset(get = "pub")] - sb: bcachefs::bch_sb_handle, - /// Member devices for this filesystem - #[getset(get = "pub")] - devices: Vec, -} - -/// Parse a comma-separated mount options and split out mountflags and filesystem -/// specific options. 
-fn parse_mount_options(options: impl AsRef) -> (Option, u64) { - use either::Either::*; - let (opts, flags) = options - .as_ref() - .split(",") - .map(|o| match o { - "dirsync" => Left(libc::MS_DIRSYNC), - "lazytime" => Left(1 << 25), // MS_LAZYTIME - "mand" => Left(libc::MS_MANDLOCK), - "noatime" => Left(libc::MS_NOATIME), - "nodev" => Left(libc::MS_NODEV), - "nodiratime" => Left(libc::MS_NODIRATIME), - "noexec" => Left(libc::MS_NOEXEC), - "nosuid" => Left(libc::MS_NOSUID), - "ro" => Left(libc::MS_RDONLY), - "rw" => Left(0), - "relatime" => Left(libc::MS_RELATIME), - "strictatime" => Left(libc::MS_STRICTATIME), - "sync" => Left(libc::MS_SYNCHRONOUS), - "" => Left(0), - o @ _ => Right(o), - }) - .fold((Vec::new(), 0), |(mut opts, flags), next| match next { - Left(f) => (opts, flags | f), - Right(o) => { - opts.push(o); - (opts, flags) - } - }); - - use itertools::Itertools; - ( - if opts.len() == 0 { - None - } else { - Some(opts.iter().join(",")) - }, - flags, - ) -} - -impl FileSystem { - pub(crate) fn new(sb: bcachefs::bch_sb_handle) -> Self { - Self { - uuid: sb.sb().uuid(), - encrypted: sb.sb().crypt().is_some(), - sb: sb, - devices: vec![], - } - } - - pub fn mount( - &self, - target: impl AsRef, - options: impl AsRef, - ) -> anyhow::Result<()> { - use itertools::Itertools; - use std::ffi::c_void; - use std::os::raw::c_char; - use std::os::unix::ffi::OsStrExt; - let src = self.devices.iter().map(|d| d.display()).join(":"); - let (data, mountflags) = parse_mount_options(options); - let fstype = c_str!("bcachefs"); - - let src = std::ffi::CString::new(src)?; // bind the CString to keep it alive - let target = std::ffi::CString::new(target.as_ref().as_os_str().as_bytes())?; // ditto - let data = data.map(|data| std::ffi::CString::new(data)).transpose()?; // ditto - - let src = src.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; - let target = target.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; - let data = data.as_ref().map_or(std::ptr::null(), |data| { - data.as_c_str().to_bytes_with_nul().as_ptr() as *const c_void - }); - - let ret = unsafe { libc::mount(src, target, fstype, mountflags, data) }; - if ret == 0 { - Ok(()) - } else { - Err(crate::ErrnoError(errno::errno()).into()) - } - } -} - -use crate::bcachefs; -use std::collections::HashMap; -use uuid::Uuid; -pub fn probe_filesystems() -> anyhow::Result> { - use std::os::unix::ffi::OsStrExt; - let mut udev = udev::Enumerator::new()?; - let mut fss = HashMap::new(); - udev.match_subsystem("block")?; - - { - // Stop libbcachefs from spamming the output - let _gag = gag::Gag::stdout().unwrap(); - for dev in udev.scan_devices()? 
{ - if let Some(p) = dev.devnode() { - let path = - std::ffi::CString::new(p.as_os_str().as_bytes()).unwrap(); - let result = unsafe { - let mut opts = std::mem::MaybeUninit::zeroed(); - let mut sb = std::mem::MaybeUninit::zeroed(); - let ret = bcachefs::bch2_read_super( - path.as_ptr(), - opts.as_mut_ptr(), - sb.as_mut_ptr(), - ); - if ret == -libc::EACCES { - Err(std::io::Error::new( - std::io::ErrorKind::PermissionDenied, - "no permission", - )) - } else if ret != 0 { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - "failed to read super", - )) - } else { - Ok((opts.assume_init(), sb.assume_init())) - } - }; - match result { - Ok((_, sb)) => match fss.get_mut(&sb.sb().uuid()) { - None => { - let mut fs = FileSystem::new(sb); - fs.devices.push(p.to_owned()); - fss.insert(fs.uuid, fs); - } - Some(fs) => { - fs.devices.push(p.to_owned()); - } - }, - Err(e) if e.kind() - != std::io::ErrorKind::PermissionDenied => - { - () - } - e @ Err(_) => { - e?; - } - } - } - } - // Flush stdout so buffered output don't get printed after we remove the gag - unsafe { - libc::fflush(stdout); - } - } - Ok(fss) -} diff --git a/mount/src/lib.rs b/mount/src/lib.rs deleted file mode 100644 index 751eab3..0000000 --- a/mount/src/lib.rs +++ /dev/null @@ -1,190 +0,0 @@ -use structopt::StructOpt; -use anyhow::anyhow; - -#[macro_export] -macro_rules! c_str { - ($lit:expr) => { - unsafe { std::ffi::CStr::from_ptr(concat!($lit, "\0").as_ptr() as *const std::os::raw::c_char) - .to_bytes_with_nul() - .as_ptr() as *const std::os::raw::c_char } - }; -} - -#[derive(Debug)] -struct ErrnoError(errno::Errno); -impl std::fmt::Display for ErrnoError { - fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { - self.0.fmt(f) - } -} -impl std::error::Error for ErrnoError {} - -#[derive(Debug)] -pub(crate) enum KeyLocation { - Fail, - Wait, - Ask, -} - -impl std::str::FromStr for KeyLocation { - type Err = anyhow::Error; - fn from_str(s: &str) -> anyhow::Result { - use anyhow::anyhow; - match s { - "fail" => Ok(Self::Fail), - "wait" => Ok(Self::Wait), - "ask" => Ok(Self::Ask), - _ => Err(anyhow!("invalid password option")) - } - } -} - -#[derive(StructOpt, Debug)] -/// Mount a bcachefs filesystem by its UUID. -struct Options { - /// Where the password would be loaded from. - /// - /// Possible values are: - /// "fail" - don't ask for password, fail if filesystem is encrypted; - /// "wait" - wait for password to become available before mounting; - /// "ask" - prompt the user for password; - #[structopt(short, long, default_value = "fail")] - key_location: KeyLocation, - - /// External UUID of the bcachefs filesystem - uuid: uuid::Uuid, - - /// Where the filesystem should be mounted. If not set, then the filesystem - /// won't actually be mounted. But all steps preceeding mounting the - /// filesystem (e.g. asking for passphrase) will still be performed. - mountpoint: Option, - - /// Mount options - #[structopt(short, default_value = "")] - options: String, -} - -mod filesystem; -mod key; -mod keyutils { - #![allow(non_upper_case_globals)] - #![allow(non_camel_case_types)] - #![allow(non_snake_case)] - #![allow(unused)] - - include!(concat!(env!("OUT_DIR"), "/keyutils.rs")); -} - -mod bcachefs { - #![allow(non_upper_case_globals)] - #![allow(non_camel_case_types)] - #![allow(non_snake_case)] - #![allow(unused)] - - include!(concat!(env!("OUT_DIR"), "/bcachefs.rs")); - - use bitfield::bitfield; - bitfield! 
{ - pub struct bch_scrypt_flags(u64); - pub N, _: 15, 0; - pub R, _: 31, 16; - pub P, _: 47, 32; - } - bitfield! { - pub struct bch_crypt_flags(u64); - TYPE, _: 4, 0; - } - use memoffset::offset_of; - impl bch_sb_field_crypt { - pub fn scrypt_flags(&self) -> Option { - let t = bch_crypt_flags(self.flags); - if t.TYPE() != bch_kdf_types::BCH_KDF_SCRYPT as u64 { - None - } else { - Some(bch_scrypt_flags(self.kdf_flags)) - } - } - pub fn key(&self) -> &bch_encrypted_key { - &self.key - } - } - impl bch_sb { - pub fn crypt(&self) -> Option<&bch_sb_field_crypt> { - unsafe { - let ptr = bch2_sb_field_get( - self as *const _ as *mut _, - bch_sb_field_type::BCH_SB_FIELD_crypt, - ) as *const u8; - if ptr.is_null() { - None - } else { - let offset = offset_of!(bch_sb_field_crypt, field); - Some(&*((ptr.sub(offset)) as *const _)) - } - } - } - pub fn uuid(&self) -> uuid::Uuid { - uuid::Uuid::from_bytes(self.user_uuid.b) - } - - /// Get the nonce used to encrypt the superblock - pub fn nonce(&self) -> nonce { - use byteorder::{ReadBytesExt, LittleEndian}; - let mut internal_uuid = &self.uuid.b[..]; - let dword1 = internal_uuid.read_u32::().unwrap(); - let dword2 = internal_uuid.read_u32::().unwrap(); - nonce { d: [0, 0, dword1, dword2] } - } - } - impl bch_sb_handle { - pub fn sb(&self) -> &bch_sb { - unsafe { &*self.sb } - } - } -} - -fn main_inner() -> anyhow::Result<()> { - use itertools::Itertools; - use log::{info, trace}; - - env_logger::init(); - let opt = Options::from_args(); - trace!("{:?}", opt); - - let fss = filesystem::probe_filesystems()?; - info!("Found {} bcachefs filesystems: ", fss.len()); - for fs in fss.values() { - info!( - "{} ({}): {}", - fs.uuid(), - if fs.encrypted() { - "encrypted" - } else { - "unencrypted" - }, - fs.devices().iter().map(|d| d.display()).join(" ") - ); - } - - if let Some(fs) = fss.get(&opt.uuid) { - if fs.encrypted() { - info!("Making sure key is loaded for this filesystem"); - key::prepare_key(&fs, opt.key_location)?; - } - - if let Some(p) = opt.mountpoint { - fs.mount(&p, &opt.options) - } else { - Ok(()) - } - } else { - Err(anyhow!("Filesystem {} is not found", opt.uuid)) - } -} - -#[no_mangle] -pub extern "C" fn main() { - if let Err(e) = main_inner() { - println!("Error: {:?}", e); - } -} diff --git a/nix/bcachefs-kernel.nix b/nix/bcachefs-kernel.nix new file mode 100644 index 0000000..c937df4 --- /dev/null +++ b/nix/bcachefs-kernel.nix @@ -0,0 +1,34 @@ +{ lib +, fetchpatch +, fetchgit +, fetchFromGitHub +, buildLinux +, commit +, sha256 ? lib.fakeSha256 +, kernelVersion ? "5.13.0" +, kernelPatches ? [] # must always be defined in bcachefs' all-packages.nix entry because it's also a top-level attribute supplied by callPackage +, argsOverride ? {} +, versionString ? (builtins.substring 0 8 commit) +, ... 
+} @ args: + +buildLinux { + inherit kernelPatches; + + # pname = "linux"; + version = "${kernelVersion}-bcachefs-${versionString}"; + + modDirVersion = kernelVersion; + + + src = fetchFromGitHub { + name = "bcachefs-kernel-src"; + owner = "koverstreet"; + repo = "bcachefs"; + rev = commit; + inherit sha256; + }; + + extraConfig = "BCACHEFS_FS m"; + # NIX_DEBUG=5; +} \ No newline at end of file diff --git a/nix/bcachefs.rev.sha256 b/nix/bcachefs.rev.sha256 new file mode 100644 index 0000000..3f06215 --- /dev/null +++ b/nix/bcachefs.rev.sha256 @@ -0,0 +1 @@ +sha256-JsWrbuxrs047YKGES+r7mMfPdDWIMAGrg1fWi8qU4+A= \ No newline at end of file diff --git a/nix/overlay.nix b/nix/overlay.nix new file mode 100644 index 0000000..42d3fb2 --- /dev/null +++ b/nix/overlay.nix @@ -0,0 +1,29 @@ +{ filter, self, ... }: +final: prev: { + bcachefs = { + tools = final.callPackage ../default.nix { + testWithValgrind = false; + filter = filter.lib; + lastModified = builtins.substring 0 8 self.lastModifiedDate; + versionString = self.version; + }; + toolsValgrind = final.bcachefs.tools.override { + testWithValgrind = true; + }; + toolsDebug = final.bcachefs.toolsValgrind.override { + debugMode = true; + }; + + bch_bindgen = final.callPackage ../rust-src/bch_bindgen {}; + + mount = final.callPackage ../rust-src/mount {}; + + kernelPackages = final.recurseIntoAttrs (final.linuxPackagesFor final.bcachefs.kernel); + kernel = final.callPackage ./bcachefs-kernel.nix { + commit = final.bcachefs.tools.bcachefs_revision; + # This needs to be recalculated for every revision change + sha256 = builtins.readFile ./bcachefs.rev.sha256; + kernelPatches = []; + }; + }; +} diff --git a/packaging/bcachefs-tools.spec b/packaging/bcachefs-tools.spec index 4946cef..00d0fbb 100644 --- a/packaging/bcachefs-tools.spec +++ b/packaging/bcachefs-tools.spec @@ -15,7 +15,6 @@ BuildRequires: keyutils-libs-devel BuildRequires: libaio-devel BuildRequires: libattr-devel BuildRequires: libblkid-devel -BuildRequires: libscrypt-devel BuildRequires: libsodium-devel BuildRequires: libtool-ltdl-devel BuildRequires: libuuid-devel @@ -32,7 +31,6 @@ Requires: keyutils-libs Requires: libaio Requires: libattr Requires: libblkid -Requires: libscrypt Requires: libsodium Requires: libtool-ltdl Requires: libuuid diff --git a/qcow2.c b/qcow2.c index b7aa8c2..7cf4992 100644 --- a/qcow2.c +++ b/qcow2.c @@ -46,7 +46,8 @@ static void flush_l2(struct qcow2_image *img) if (img->l1_index != -1) { img->l1_table[img->l1_index] = cpu_to_be64(img->offset|QCOW_OFLAG_COPIED); - xpwrite(img->fd, img->l2_table, img->block_size, img->offset); + xpwrite(img->fd, img->l2_table, img->block_size, img->offset, + "qcow2 l2 table"); img->offset += img->block_size; memset(img->l2_table, 0, img->block_size); @@ -101,7 +102,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data, img.offset += img.block_size; xpread(infd, buf, block_size, src_offset); - xpwrite(outfd, buf, block_size, dst_offset); + xpwrite(outfd, buf, block_size, dst_offset, + "qcow2 data"); add_l2(&img, src_offset / block_size, dst_offset); } @@ -111,7 +113,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data, /* Write L1 table: */ dst_offset = img.offset; img.offset += round_up(l1_size * sizeof(u64), block_size); - xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset); + xpwrite(img.fd, img.l1_table, l1_size * sizeof(u64), dst_offset, + "qcow2 l1 table"); /* Write header: */ hdr.magic = cpu_to_be32(QCOW_MAGIC); @@ -123,7 +126,8 @@ void qcow2_write_image(int infd, int outfd, ranges *data, 
memset(buf, 0, block_size); memcpy(buf, &hdr, sizeof(hdr)); - xpwrite(img.fd, buf, block_size, 0); + xpwrite(img.fd, buf, block_size, 0, + "qcow2 header"); free(img.l2_table); free(img.l1_table); diff --git a/rust-src/bch_bindgen/.gitignore b/rust-src/bch_bindgen/.gitignore new file mode 100644 index 0000000..0aa133a --- /dev/null +++ b/rust-src/bch_bindgen/.gitignore @@ -0,0 +1,15 @@ +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +# Required By Nix +# Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb diff --git a/rust-src/bch_bindgen/Cargo.lock b/rust-src/bch_bindgen/Cargo.lock new file mode 100644 index 0000000..2138d33 --- /dev/null +++ b/rust-src/bch_bindgen/Cargo.lock @@ -0,0 +1,484 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "anyhow" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1" + +[[package]] +name = "autocfg" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" + +[[package]] +name = "bch_bindgen" +version = "0.1.0" +dependencies = [ + "anyhow", + "bindgen", + "bitfield", + "byteorder", + "gag", + "libc", + "memoffset", + "pkg-config", + "tracing", + "tracing-attributes", + "udev", + "uuid", +] + +[[package]] +name = "bindgen" +version = "0.59.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375" +dependencies = [ + "bitflags", + "cexpr", + "clang-sys", + "lazy_static", + "lazycell", + "peeking_take_while", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", +] + +[[package]] +name = "bitfield" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitvec" +version = "0.19.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + +[[package]] +name = "cexpr" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89" +dependencies = [ + "nom", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "clang-sys" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "10612c0ec0e0a1ff0e97980647cb058a6e7aedb913d01d009c406b8b7d0b26ee" +dependencies = [ + "glob", + "libc", +] + +[[package]] +name = "filedescriptor" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" +dependencies = [ + "libc", + "thiserror", + "winapi", +] + +[[package]] +name = "funty" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" + +[[package]] +name = "gag" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972" +dependencies = [ + "filedescriptor", + "tempfile", +] + +[[package]] +name = "getrandom" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "glob" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + +[[package]] +name = "libc" +version = "0.2.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8f7255a17a627354f321ef0055d63b898c6fb27eff628af4d1b66b7331edf6" + +[[package]] +name = "libudev-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c8469b4a23b962c1396b9b451dda50ef5b283e8dd309d69033475fa9b334324" +dependencies = [ + "libc", + "pkg-config", +] + +[[package]] +name = "memchr" +version = "2.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" + +[[package]] +name = "memoffset" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" +dependencies = [ + "autocfg", +] + +[[package]] +name = "nom" +version = "6.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c5c51b9083a3c620fa67a2a635d1ce7d95b897e957d6b28ff9a5da960a103a6" +dependencies = [ + "bitvec", + "funty", + "memchr", + "version_check", +] + +[[package]] +name = "peeking_take_while" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" + +[[package]] +name = "pin-project-lite" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" + +[[package]] +name = "pkg-config" +version = "0.3.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c9b1041b4387893b91ee6746cddfc28516aff326a3519fb2adf820932c5e6cb" + +[[package]] +name = "ppv-lite86" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" + +[[package]] +name = "proc-macro2" +version = "1.0.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "radium" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" + +[[package]] +name = "rand" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", + "rand_hc", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_hc" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" +dependencies = [ + "rand_core", +] + +[[package]] +name = "redox_syscall" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +dependencies = [ + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "remove_dir_all" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +dependencies = [ + "winapi", +] + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "shlex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "syn" +version = "1.0.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5239bc68e0fef57495900cfea4e8dc75596d9a319d7e16b1e0a440d24e6fe0a0" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + +[[package]] +name = "tempfile" +version = "3.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" +dependencies = [ + "cfg-if", + "libc", + "rand", + "redox_syscall", + "remove_dir_all", + "winapi", +] + +[[package]] +name = "thiserror" +version = "1.0.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84f96e095c0c82419687c20ddf5cb3eadb61f4e1405923c9dc8e53a1adacbda8" +dependencies = [ + "cfg-if", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98863d0dd09fa59a1b79c6750ad80dbda6b75f4e71c437a6a1a8cb91a8bcbd77" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46125608c26121c81b0c6d693eab5a420e416da7e43c426d2e8f7df8da8a3acf" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "udev" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24953d50a3bce0f5f5a9a2766567072dc9af8096f8c40ea81815da651066bc9f" +dependencies = [ + "libc", + "libudev-sys", +] + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "uuid" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" + +[[package]] +name = "version_check" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" + +[[package]] +name = "wasi" +version = "0.10.2+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "wyz" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" diff --git 
a/rust-src/bch_bindgen/Cargo.toml b/rust-src/bch_bindgen/Cargo.toml new file mode 100644 index 0000000..91cc77f --- /dev/null +++ b/rust-src/bch_bindgen/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "bch_bindgen" +version = "0.1.0" +authors = [ "Kayla Firestack ", "Yuxuan Shui " ] +edition = "2018" + +[lib] +crate-type = ["lib"] +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tracing = "0.1.26" +anyhow = "1.0" +udev = "0.4" +uuid = "0.8" +bitfield = "0.13" +memoffset = "0.5" +byteorder = "1.3" +tracing-attributes = "0.1.15" +libc = "0.2.69" +gag = "1.0.0" + + +[build-dependencies] +pkg-config = "0.3" +bindgen = { version = "0.59.1", default-features = false } diff --git a/rust-src/bch_bindgen/build.rs b/rust-src/bch_bindgen/build.rs new file mode 100644 index 0000000..fd570db --- /dev/null +++ b/rust-src/bch_bindgen/build.rs @@ -0,0 +1,74 @@ +fn main() { + use std::path::PathBuf; + // use std::process::Command; + + let out_dir: PathBuf = std::env::var_os("OUT_DIR").expect("ENV Var 'OUT_DIR' Expected").into(); + let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR") + .expect("ENV Var 'CARGO_MANIFEST_DIR' Expected") + .into(); + let libbcachefs_inc_dir = + std::env::var("LIBBCACHEFS_INCLUDE").unwrap_or_else(|_| top_dir.join("libbcachefs").display().to_string()); + let libbcachefs_inc_dir = std::path::Path::new(&libbcachefs_inc_dir); + println!("{}", libbcachefs_inc_dir.display()); + + println!("cargo:rustc-link-lib=dylib=bcachefs"); + println!("cargo:rustc-link-search={}", env!("LIBBCACHEFS_LIB")); + + let _libbcachefs_dir = top_dir.join("libbcachefs").join("libbcachefs"); + let bindings = bindgen::builder() + .header(top_dir.join("src").join("libbcachefs_wrapper.h").display().to_string()) + .clang_arg(format!("-I{}", libbcachefs_inc_dir.join("include").display())) + .clang_arg(format!("-I{}", libbcachefs_inc_dir.display())) + .clang_arg("-DZSTD_STATIC_LINKING_ONLY") + .clang_arg("-DNO_BCACHEFS_FS") + .clang_arg("-D_GNU_SOURCE") + .derive_debug(true) + .derive_default(true) + .derive_eq(true) + .layout_tests(true) + .default_enum_style(bindgen::EnumVariation::Rust { non_exhaustive: true }) + .allowlist_function(".*bch2_.*") + // .allowlist_function("bch2_read_super") + // .allowlist_function("bch2_sb_field_.*") + // .allowlist_function("bch2_super_write") + // .allowlist_function("bch2_chacha_encrypt_key") + // .allowlist_function("__bch2_super_read") + .allowlist_function("bio_.*") + .allowlist_function("bch2_super_write_fd") + .allowlist_function("derive_passphrase") + .allowlist_function("request_key") + .allowlist_function("add_key") + .allowlist_function("keyctl_search") + .blocklist_type("bch_extent_ptr") + .blocklist_type("btree_node") + .blocklist_type("bch_extent_crc32") + .blocklist_type("rhash_lock_head") + .blocklist_type("srcu_struct") + .allowlist_var("BCH_.*") + .allowlist_var("KEY_SPEC_.*") + .allowlist_type("bch_kdf_types") + .allowlist_type("bch_sb_field_.*") + .allowlist_type("bch_encrypted_key") + .allowlist_type("nonce") + .newtype_enum("bch_kdf_types") + .opaque_type("gendisk") + .opaque_type("bkey") + // .opaque_type("bch_extent_ptr") + // .opaque_type("bch_extent_crc32") + .opaque_type("open_bucket.*") + .generate() + .expect("BindGen Generation Failiure: [libbcachefs_wrapper]"); + bindings + .write_to_file(out_dir.join("bcachefs.rs")) + .expect("Writing to output file failed for: `bcachefs.rs`"); + + let keyutils = pkg_config::probe_library("libkeyutils").expect("Failed to find keyutils 
lib"); + let bindings = bindgen::builder() + .header(top_dir.join("src").join("keyutils_wrapper.h").display().to_string()) + .clang_args(keyutils.include_paths.iter().map(|p| format!("-I{}", p.display()))) + .generate() + .expect("BindGen Generation Failiure: [Keyutils]"); + bindings + .write_to_file(out_dir.join("keyutils.rs")) + .expect("Writing to output file failed for: `keyutils.rs`"); +} diff --git a/rust-src/bch_bindgen/default.nix b/rust-src/bch_bindgen/default.nix new file mode 100644 index 0000000..f6053d5 --- /dev/null +++ b/rust-src/bch_bindgen/default.nix @@ -0,0 +1,76 @@ +{ lib +, stdenv +, rustPlatform +, llvmPackages +, bcachefs +, pkg-config + +, udev +, liburcu +, zstd +, keyutils +, libaio + +, lz4 # liblz4 +, libsodium +, libuuid +, zlib # zlib1g +, libscrypt + +, rustfmt + +, glibc +, ... +}: let + include = { + glibc = "${glibc.dev}/include"; + clang = let libc = llvmPackages.libclang; in + "${libc.lib}/lib/clang/${libc.version}/include"; + urcu = "${liburcu}/include"; + zstd = "${zstd.dev}/include"; + }; + cargo = lib.trivial.importTOML ./Cargo.toml; +in rustPlatform.buildRustPackage { + pname = cargo.package.name; + version = cargo.package.version; + + src = builtins.path { path = ./.; name = "bch_bindgen"; }; + + cargoLock = { lockFile = ./Cargo.lock; }; + + nativeBuildInputs = [ rustfmt pkg-config ]; + buildInputs = [ + + # libaio + keyutils # libkeyutils + lz4 # liblz4 + libsodium + liburcu + libuuid + zstd # libzstd + zlib # zlib1g + udev + libscrypt + libaio + ]; + + LIBBCACHEFS_LIB ="${bcachefs.tools}/lib"; + LIBBCACHEFS_INCLUDE = bcachefs.tools.src; + LIBCLANG_PATH = "${llvmPackages.libclang.lib}/lib"; + BINDGEN_EXTRA_CLANG_ARGS = lib.replaceStrings ["\n" "\t"] [" " ""] '' + -std=gnu99 + -I${include.glibc} + -I${include.clang} + -I${include.urcu} + -I${include.zstd} + ''; + + postPatch = '' + cp ${./Cargo.lock} Cargo.lock + ''; + + + doCheck = true; + + # NIX_DEBUG = 4; +} \ No newline at end of file diff --git a/rust-src/bch_bindgen/rustfmt.toml b/rust-src/bch_bindgen/rustfmt.toml new file mode 100644 index 0000000..a2b7f32 --- /dev/null +++ b/rust-src/bch_bindgen/rustfmt.toml @@ -0,0 +1,2 @@ +max_width=120 +hard_tabs = true diff --git a/rust-src/bch_bindgen/src/bcachefs.rs b/rust-src/bch_bindgen/src/bcachefs.rs new file mode 100644 index 0000000..cc98ffc --- /dev/null +++ b/rust-src/bch_bindgen/src/bcachefs.rs @@ -0,0 +1,124 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(unused)] + +include!(concat!(env!("OUT_DIR"), "/bcachefs.rs")); + +use bitfield::bitfield; +bitfield! { + pub struct bch_scrypt_flags(u64); + pub N, _: 15, 0; + pub R, _: 31, 16; + pub P, _: 47, 32; +} +bitfield! { + pub struct bch_crypt_flags(u64); + pub TYPE, _: 4, 0; +} +use memoffset::offset_of; +impl bch_sb_field_crypt { + pub fn scrypt_flags(&self) -> Option { + use std::convert::TryInto; + match bch_kdf_types(bch_crypt_flags(self.flags).TYPE().try_into().ok()?) 
{ + bch_kdf_types::BCH_KDF_SCRYPT => Some(bch_scrypt_flags(self.kdf_flags)), + _ => None, + } + } + pub fn key(&self) -> &bch_encrypted_key { + &self.key + } +} +impl PartialEq for bch_sb { + fn eq(&self, other: &Self) -> bool { + self.magic.b == other.magic.b + && self.user_uuid.b == other.user_uuid.b + && self.block_size == other.block_size + && self.version == other.version + && self.uuid.b == other.uuid.b + && self.seq == other.seq + } +} + +impl std::fmt::Debug for bch_sb { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("bch_sb") + .field("uuid", &self.uuid()) + .field("version", &(self.version, self.version_min)) + .field("block_size", &self.block_size) + .field("device_idx", &self.dev_idx) + .field("seq", &self.seq) + .field("csum", &(self.csum.lo, self.csum.hi)) + .field("offset", &self.offset) + .finish_non_exhaustive() + } +} + + +impl bch_sb { + pub fn crypt(&self) -> Option<&bch_sb_field_crypt> { + unsafe { + let ptr = bch2_sb_field_get(self as *const _ as *mut _, bch_sb_field_type::BCH_SB_FIELD_crypt) as *const u8; + if ptr.is_null() { + None + } else { + let offset = offset_of!(bch_sb_field_crypt, field); + Some(&*((ptr.sub(offset)) as *const _)) + } + } + } + pub fn uuid(&self) -> uuid::Uuid { + uuid::Uuid::from_bytes(self.user_uuid.b) + } + + /// Get the nonce used to encrypt the superblock + pub fn nonce(&self) -> nonce { + use byteorder::{LittleEndian, ReadBytesExt}; + let mut internal_uuid = &self.uuid.b[..]; + let dword1 = internal_uuid.read_u32::().unwrap(); + let dword2 = internal_uuid.read_u32::().unwrap(); + nonce { + d: [0, 0, dword1, dword2], + } + } +} +impl bch_sb_handle { + pub fn sb(&self) -> &bch_sb { + unsafe { &*self.sb } + } + + pub fn bdev(&self) -> &block_device { + unsafe { &*self.bdev } + } +} + +#[repr(C)] +// #[repr(align(8))] +#[derive(Debug, Default, Copy, Clone)] +pub struct bch_extent_ptr { + pub _bitfield_1: __BindgenBitfieldUnit<[u8; 8usize]>, +} + +#[repr(C, packed(8))] +pub struct btree_node { + pub csum: bch_csum, + pub magic: __le64, + pub flags: __le64, + pub min_key: bpos, + pub max_key: bpos, + pub _ptr: bch_extent_ptr, + pub format: bkey_format, + pub __bindgen_anon_1: btree_node__bindgen_ty_1, +} + +#[repr(C, packed(8))] +// #[repr(align(8))] +#[derive(Debug, Default, Copy, Clone)] +pub struct bch_extent_crc32 { + pub _bitfield_1: __BindgenBitfieldUnit<[u8; 4usize]>, + pub csum: __u32, +} + +// #[repr(u8)] +pub enum rhash_lock_head {} +pub enum srcu_struct {} diff --git a/rust-src/bch_bindgen/src/keyutils.rs b/rust-src/bch_bindgen/src/keyutils.rs new file mode 100644 index 0000000..30fc56f --- /dev/null +++ b/rust-src/bch_bindgen/src/keyutils.rs @@ -0,0 +1,6 @@ +#![allow(non_upper_case_globals)] +#![allow(non_camel_case_types)] +#![allow(non_snake_case)] +#![allow(unused)] + +include!(concat!(env!("OUT_DIR"), "/keyutils.rs")); diff --git a/mount/src/keyutils_wrapper.h b/rust-src/bch_bindgen/src/keyutils_wrapper.h similarity index 100% rename from mount/src/keyutils_wrapper.h rename to rust-src/bch_bindgen/src/keyutils_wrapper.h diff --git a/rust-src/bch_bindgen/src/lib.rs b/rust-src/bch_bindgen/src/lib.rs new file mode 100644 index 0000000..c19b5a2 --- /dev/null +++ b/rust-src/bch_bindgen/src/lib.rs @@ -0,0 +1,7 @@ +pub mod bcachefs; +pub mod keyutils; +pub mod rs; + +pub mod c { + pub use crate::bcachefs::*; +} diff --git a/mount/src/libbcachefs_wrapper.h b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h similarity index 59% rename from mount/src/libbcachefs_wrapper.h rename to 
rust-src/bch_bindgen/src/libbcachefs_wrapper.h index 9d9754c..2a0e702 100644 --- a/mount/src/libbcachefs_wrapper.h +++ b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h @@ -1,4 +1,8 @@ #include "../libbcachefs/super-io.h" #include "../libbcachefs/checksum.h" #include "../libbcachefs/bcachefs_format.h" +#include "../libbcachefs/opts.h" +#include "../libbcachefs.h" #include "../crypto.h" +#include "../include/linux/bio.h" + diff --git a/rust-src/bch_bindgen/src/rs.rs b/rust-src/bch_bindgen/src/rs.rs new file mode 100644 index 0000000..4452f0b --- /dev/null +++ b/rust-src/bch_bindgen/src/rs.rs @@ -0,0 +1,58 @@ +use crate::bcachefs; + +pub const SUPERBLOCK_MAGIC: uuid::Uuid = uuid::Uuid::from_u128( + 0x_c68573f6_4e1a_45ca_8265_f57f48ba6d81 +); + +extern "C" { + pub static stdout: *mut libc::FILE; +} + +pub enum ReadSuperErr { + Io(std::io::Error), +} + +type RResult = std::io::Result>; + +#[tracing_attributes::instrument(skip(opts))] +pub fn read_super_opts(path: &std::path::Path, mut opts: bcachefs::bch_opts) -> RResult { + // let devp = camino::Utf8Path::from_path(devp).unwrap(); + + use std::os::unix::ffi::OsStrExt; + let path = std::ffi::CString::new(path.as_os_str().as_bytes())?; + + let mut sb = std::mem::MaybeUninit::zeroed(); + + // use gag::{BufferRedirect}; + // // Stop libbcachefs from spamming the output + // let gag = BufferRedirect::stderr().unwrap(); + // tracing::trace!("entering libbcachefs"); + + let ret = unsafe { crate::bcachefs::bch2_read_super(path.as_ptr(), &mut opts, sb.as_mut_ptr()) }; + tracing::trace!(%ret); + + match -ret { + libc::EACCES => Err(std::io::Error::new( + std::io::ErrorKind::PermissionDenied, + "Access Permission Denied", + )), + 0 => Ok(Ok(unsafe { sb.assume_init() })), + 22 => Ok(Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "Not a BCacheFS SuperBlock", + ))), + code => { + tracing::debug!(msg = "BCacheFS return error code", ?code); + Ok(Err(std::io::Error::new( + std::io::ErrorKind::Other, + "Failed to Read SuperBlock", + ))) + } + } +} + +#[tracing_attributes::instrument] +pub fn read_super(path: &std::path::Path) -> RResult { + let opts = bcachefs::bch_opts::default(); //unsafe {std::mem::MaybeUninit::zeroed().assume_init()}; + read_super_opts(path, opts) +} diff --git a/rust-src/mount/.gitignore b/rust-src/mount/.gitignore new file mode 100644 index 0000000..644cd42 --- /dev/null +++ b/rust-src/mount/.gitignore @@ -0,0 +1,15 @@ +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +# Needed by nix +# Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb diff --git a/mount/Cargo.lock b/rust-src/mount/Cargo.lock similarity index 68% rename from mount/Cargo.lock rename to rust-src/mount/Cargo.lock index 77ccbba..92d13cf 100644 --- a/mount/Cargo.lock +++ b/rust-src/mount/Cargo.lock @@ -1,5 +1,7 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
+version = 3 + [[package]] name = "aho-corasick" version = "0.7.10" @@ -18,6 +20,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anyhow" version = "1.0.28" @@ -43,39 +54,56 @@ checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" [[package]] name = "bcachefs-mount" -version = "0.1.0" +version = "0.3.1" dependencies = [ "anyhow", - "bindgen", - "bitfield", + "bch_bindgen", "byteorder", + "camino", "clap", "either", - "env_logger", "errno", "gag", "getset", "itertools", "libc", - "log", - "memoffset", "parse-display", - "pkg-config", "rpassword", "structopt", + "tracing", + "tracing-attributes", + "tracing-log", + "tracing-subscriber", + "udev", + "uuid", +] + +[[package]] +name = "bch_bindgen" +version = "0.1.0" +dependencies = [ + "anyhow", + "bindgen", + "bitfield", + "byteorder", + "gag", + "libc", + "memoffset", + "pkg-config", + "tracing", + "tracing-attributes", "udev", "uuid", ] [[package]] name = "bindgen" -version = "0.53.2" +version = "0.59.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb26d6a69a335b8cb0e7c7e9775cd5666611dc50a37177c3f2cedcfc040e8c8" +checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375" dependencies = [ "bitflags", "cexpr", - "cfg-if", "clang-sys", "lazy_static", "lazycell", @@ -99,17 +127,35 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +[[package]] +name = "bitvec" +version = "0.19.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + [[package]] name = "byteorder" version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +[[package]] +name = "camino" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52d74260d9bf6944e2208aa46841b4b8f0d7ffc0849a06837b2f510337f86b2b" + [[package]] name = "cexpr" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4aedb84272dbe89af497cf81375129abda4fc0a9e7c5d317498c15cc30c0d27" +checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89" dependencies = [ "nom", ] @@ -120,11 +166,29 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +dependencies = [ + "libc", + "num-integer", + "num-traits", + "winapi", +] + [[package]] name = "clang-sys" -version = "0.29.3" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe6837df1d5cba2397b835c8530f51723267e16abbf83892e9e5af4f0e5dd10a" 
+checksum = "10612c0ec0e0a1ff0e97980647cb058a6e7aedb913d01d009c406b8b7d0b26ee" dependencies = [ "glob", "libc", @@ -136,7 +200,7 @@ version = "2.33.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" dependencies = [ - "ansi_term", + "ansi_term 0.11.0", "atty", "bitflags", "strsim", @@ -152,15 +216,6 @@ version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" -[[package]] -name = "env_logger" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" -dependencies = [ - "log", -] - [[package]] name = "errno" version = "0.2.5" @@ -183,12 +238,29 @@ dependencies = [ ] [[package]] -name = "gag" -version = "0.1.10" +name = "filedescriptor" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cc0b9f53275dc5fada808f1d2f82e3688a6c14d735633d1590b7be8eb2307b5" +checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" dependencies = [ "libc", + "thiserror", + "winapi", +] + +[[package]] +name = "funty" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" + +[[package]] +name = "gag" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a713bee13966e9fbffdf7193af71d54a6b35a0bb34997cd6c9519ebeb5005972" +dependencies = [ + "filedescriptor", "tempfile", ] @@ -204,7 +276,7 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "libc", "wasi", ] @@ -254,6 +326,12 @@ dependencies = [ "either", ] +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + [[package]] name = "lazy_static" version = "1.4.0" @@ -288,7 +366,16 @@ version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", +] + +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + "regex-automata", ] [[package]] @@ -308,14 +395,35 @@ dependencies = [ [[package]] name = "nom" -version = "5.1.1" +version = "6.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b471253da97532da4b61552249c521e01e736071f71c1a4f7ebbfbf0a06aad6" +checksum = "9c5c51b9083a3c620fa67a2a635d1ce7d95b897e957d6b28ff9a5da960a103a6" dependencies = [ + "bitvec", + "funty", "memchr", "version_check", ] +[[package]] +name = "num-integer" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" 
+dependencies = [ + "autocfg", +] + [[package]] name = "parse-display" version = "0.1.1" @@ -347,6 +455,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +[[package]] +name = "pin-project-lite" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" + [[package]] name = "pkg-config" version = "0.3.17" @@ -429,6 +543,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "radium" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" + [[package]] name = "rand" version = "0.7.3" @@ -488,6 +608,15 @@ dependencies = [ "thread_local", ] +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax", +] + [[package]] name = "regex-syntax" version = "0.6.17" @@ -519,11 +648,49 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "ryu" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" + +[[package]] +name = "serde" +version = "1.0.130" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" + +[[package]] +name = "serde_json" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sharded-slab" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" +dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" -version = "0.1.1" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" + +[[package]] +name = "smallvec" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" +checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" [[package]] name = "strsim" @@ -533,9 +700,9 @@ checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" [[package]] name = "structopt" -version = "0.3.14" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef" +checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa" dependencies = [ "clap", "lazy_static", @@ -544,9 +711,9 @@ dependencies = [ [[package]] name = "structopt-derive" -version = "0.4.7" +version = "0.4.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a" +checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" dependencies = [ "heck", 
"proc-macro-error 1.0.2", @@ -577,13 +744,19 @@ dependencies = [ "syn", ] +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + [[package]] name = "tempfile" version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" dependencies = [ - "cfg-if", + "cfg-if 0.1.10", "libc", "rand", "redox_syscall", @@ -611,6 +784,26 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "thiserror" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "318234ffa22e0920fe9a40d7b8369b5f649d490980cf7aadcf1eb91594869b42" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cae2447b6282786c3493999f40a9be2a6ad20cb8bd268b0a0dbf5a065535c0ab" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "thread_local" version = "1.0.1" @@ -620,6 +813,81 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "tracing" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" +dependencies = [ + "cfg-if 1.0.0", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c42e6fa53307c8a17e4ccd4dc81cf5ec38db9209f59b222210375b54ee40d1e2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ca517f43f0fb96e0c3072ed5c275fe5eece87e8cb52f4a77b69226d3b1c9df8" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "tracing-log" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +dependencies = [ + "lazy_static", + "log", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9cbe87a2fa7e35900ce5de20220a582a9483a7063811defce79d7cbd59d4cfe" +dependencies = [ + "ansi_term 0.12.1", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + [[package]] name = "udev" version = "0.4.0" @@ -693,3 +961,9 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "wyz" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" diff --git a/mount/Cargo.toml b/rust-src/mount/Cargo.toml similarity index 54% rename from mount/Cargo.toml 
rename to rust-src/mount/Cargo.toml index 4fd0d49..d48d4f7 100644 --- a/mount/Cargo.toml +++ b/rust-src/mount/Cargo.toml @@ -1,34 +1,30 @@ [package] name = "bcachefs-mount" -version = "0.1.0" -authors = ["Yuxuan Shui "] +version = "0.3.1" +authors = ["Yuxuan Shui ", "Kayla Firestack "] edition = "2018" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -log = "0.4" +tracing = "0.1.26" +tracing-log = "0.1.2" +tracing-subscriber = "0.2.20" +tracing-attributes = "0.1.15" clap = { version = "2.33", features = [ "wrap_help" ] } -env_logger = { version = "0.7", default-features = false } anyhow = "1.0" -udev = "0.4" -uuid = "0.8" libc = "0.2.69" -gag = "0.1" -bitfield = "0.13" -memoffset = "0.5" +uuid = "0.8" +udev = "0.4" +gag = "1.0.0" getset = "0.1" itertools = "0.9" -structopt = "0.3" +structopt = "0.3.23" parse-display = "0.1" errno = "0.2" either = "1.5" rpassword = "4" +camino = "1.0.5" +bch_bindgen = { path = "../bch_bindgen" } byteorder = "1.3" -[lib] -crate-type = ["staticlib"] - -[build-dependencies] -pkg-config = "0.3" -bindgen = { version = "0.53", default-features = false } diff --git a/rust-src/mount/README.md b/rust-src/mount/README.md new file mode 100644 index 0000000..e4700f6 --- /dev/null +++ b/rust-src/mount/README.md @@ -0,0 +1,62 @@ +Usage +===== + +``` +bcachefs-mount 0.1.0 +Mount a bcachefs filesystem by its UUID + +USAGE: + bcachefs-mount [OPTIONS] + +FLAGS: + -h, --help + Prints help information + + -V, --version + Prints version information + + +OPTIONS: + -o + Mount options [default: ] + + -p, --password + Where the password would be loaded from. + + Possible values are: "fail" - don't ask for password, fail if filesystem is encrypted; "wait" - wait for + password to become available before mounting; "ask" - prompt the user for password; [default: fail] + +ARGS: + + External UUID of the bcachefs filesystem + + + Where the filesystem should be mounted +``` + +Caveats +======= + +* `--password ask` is not yet implemented, but you can use `--password wait`, and load the key with `bcachefs unlock`. + +Build +===== + +```sh +$ git submodule update --init --recursive +$ cargo build --release +``` + +Binary will be built in `target/release/bcachefs-mount` + +Dependencies: + +* rust +* blkid +* uuid +* liburcu +* libsodium +* zlib +* liblz4 +* libzstd +* libkeyutils diff --git a/rust-src/mount/default.nix b/rust-src/mount/default.nix new file mode 100644 index 0000000..dab7db7 --- /dev/null +++ b/rust-src/mount/default.nix @@ -0,0 +1,41 @@ +{ lib + +, stdenv +, glibc +, llvmPackages +, rustPlatform + +, bcachefs + +, ... 
+}: rustPlatform.buildRustPackage ( let + cargo = lib.trivial.importTOML ./Cargo.toml; +in { + pname = "mount.bcachefs"; + version = cargo.package.version; + + src = builtins.path { path = ../.; name = "rust-src"; }; + sourceRoot = "rust-src/mount"; + + cargoLock = { lockFile = ./Cargo.lock; }; + + nativeBuildInputs = bcachefs.bch_bindgen.nativeBuildInputs; + buildInputs = bcachefs.bch_bindgen.buildInputs; + inherit (bcachefs.bch_bindgen) + LIBBCACHEFS_INCLUDE + LIBBCACHEFS_LIB + LIBCLANG_PATH + BINDGEN_EXTRA_CLANG_ARGS; + + postInstall = '' + ln $out/bin/${cargo.package.name} $out/bin/mount.bcachefs + ln -s $out/bin $out/sbin + ''; + # -isystem ${llvmPackages.libclang.lib}/lib/clang/${lib.getVersion llvmPackages.libclang}/include"; + # CFLAGS = "-I${llvmPackages.libclang.lib}/include"; + # LDFLAGS = "-L${libcdev}"; + + doCheck = false; + + # NIX_DEBUG = 4; +}) \ No newline at end of file diff --git a/rust-src/mount/module.nix b/rust-src/mount/module.nix new file mode 100644 index 0000000..b62aa7d --- /dev/null +++ b/rust-src/mount/module.nix @@ -0,0 +1,54 @@ +## Mirrors: https://github.com/NixOS/nixpkgs/blob/nixos-unstable/nixos/modules/tasks/filesystems/bcachefs.nix +## with changes to use flakes and import mount.bcachefs +{ config, lib, pkgs, utils, ... }: + +with lib; + +let + + bootFs = filterAttrs (n: fs: (fs.fsType == "bcachefs") && (utils.fsNeededForBoot fs)) config.fileSystems; + cfg = config.filesystems.bcachefs; +in + +{ + options.filesystems.bcachefs.packages.tools = lib.mkOption { + description = "Which package to use to link in the bcachefs tools package"; + default = pkgs.bcachefs.tools; + type = lib.types.package; + }; + options.filesystems.bcachefs.packages.mount = lib.mkOption { + description = "Which package to use to link in the bcachefs mount package"; + default = pkgs.bcachefs.mount; + type = lib.types.package; + }; + options.filesystems.bcachefs.packages.kernelPackages = lib.mkOption { + description = "Which package to use to link in the kernel package to use"; + default = pkgs.bcachefs.kernelPackages; + type = lib.types.attrs; + + }; + + config = mkIf (elem "bcachefs" config.boot.supportedFilesystems) (mkMerge [ + { + system.fsPackages = [ cfg.packages.tools cfg.packages.mount ]; + + # use kernel package with bcachefs support until it's in mainline + boot.kernelPackages = cfg.packages.kernelPackages; + } + + (mkIf ((elem "bcachefs" config.boot.initrd.supportedFilesystems) || (bootFs != {})) { + # chacha20 and poly1305 are required only for decryption attempts + boot.initrd.availableKernelModules = [ "sha256" "chacha20" "poly1305" ]; + boot.initrd.kernelModules = [ "bcachefs" ]; + + boot.initrd.extraUtilsCommands = '' + copy_bin_and_libs ${cfg.packages.tools}/bin/bcachefs + copy_bin_and_libs ${cfg.packages.mount}/bin/mount.bcachefs + ''; + boot.initrd.extraUtilsCommandsTest = '' + $out/bin/bcachefs version + $out/bin/mount.bcachefs --version + ''; + }) + ]); +} diff --git a/rust-src/mount/rustfmt.toml b/rust-src/mount/rustfmt.toml new file mode 100644 index 0000000..a2b7f32 --- /dev/null +++ b/rust-src/mount/rustfmt.toml @@ -0,0 +1,2 @@ +max_width=120 +hard_tabs = true diff --git a/rust-src/mount/src/filesystem.rs b/rust-src/mount/src/filesystem.rs new file mode 100644 index 0000000..b1575c2 --- /dev/null +++ b/rust-src/mount/src/filesystem.rs @@ -0,0 +1,208 @@ +extern "C" { + pub static stdout: *mut libc::FILE; +} + +use getset::{CopyGetters, Getters}; +use std::path::PathBuf; +#[derive(Getters, CopyGetters)] +pub struct FileSystem { + /// External UUID of the 
bcachefs + #[getset(get = "pub")] + uuid: uuid::Uuid, + /// Whether filesystem is encrypted + #[getset(get_copy = "pub")] + encrypted: bool, + /// Super block + #[getset(get = "pub")] + sb: bcachefs::bch_sb_handle, + /// Member devices for this filesystem + #[getset(get = "pub")] + devices: Vec, +} +impl std::fmt::Debug for FileSystem { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("FileSystem") + .field("uuid", &self.uuid) + .field("encrypted", &self.encrypted) + .field("devices", &self.device_string()) + .finish() + } +} +use std::fmt; +impl std::fmt::Display for FileSystem { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let devs = self.device_string(); + write!( + f, + "{:?}: locked?={lock} ({}) ", + self.uuid, + devs, + lock = self.encrypted + ) + } +} + +impl FileSystem { + pub(crate) fn new(sb: bcachefs::bch_sb_handle) -> Self { + Self { + uuid: sb.sb().uuid(), + encrypted: sb.sb().crypt().is_some(), + sb: sb, + devices: Vec::new(), + } + } + + pub fn device_string(&self) -> String { + use itertools::Itertools; + self.devices.iter().map(|d| d.display()).join(":") + } + + pub fn mount( + &self, + target: impl AsRef, + options: impl AsRef, + ) -> anyhow::Result<()> { + tracing::info_span!("mount").in_scope(|| { + let src = self.device_string(); + let (data, mountflags) = parse_mount_options(options); + // let fstype = c_str!("bcachefs"); + + tracing::info!(msg="mounting bcachefs filesystem", target=%target.as_ref().display()); + mount_inner(src, target, "bcachefs", mountflags, data) + }) + } +} + +fn mount_inner( + src: String, + target: impl AsRef, + fstype: &str, + mountflags: u64, + data: Option, +) -> anyhow::Result<()> { + use std::{ + ffi::{c_void, CString}, + os::{raw::c_char, unix::ffi::OsStrExt}, + }; + + // bind the CStrings to keep them alive + let src = CString::new(src)?; + let target = CString::new(target.as_ref().as_os_str().as_bytes())?; + let data = data.map(CString::new).transpose()?; + let fstype = CString::new(fstype)?; + + // convert to pointers for ffi + let src = src.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; + let target = target.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; + let data = data.as_ref().map_or(std::ptr::null(), |data| { + data.as_c_str().to_bytes_with_nul().as_ptr() as *const c_void + }); + let fstype = fstype.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; + + let ret = {let _entered = tracing::info_span!("libc::mount").entered(); + tracing::info!("mounting filesystem"); + // REQUIRES: CAP_SYS_ADMIN + unsafe { libc::mount(src, target, fstype, mountflags, data) } + }; + match ret { + 0 => Ok(()), + _ => Err(crate::ErrnoError(errno::errno()).into()), + } +} + +/// Parse a comma-separated mount options and split out mountflags and filesystem +/// specific options. 
+#[tracing_attributes::instrument(skip(options))] +fn parse_mount_options(options: impl AsRef) -> (Option, u64) { + use either::Either::*; + tracing::debug!(msg="parsing mount options", options=?options.as_ref()); + let (opts, flags) = options + .as_ref() + .split(",") + .map(|o| match o { + "dirsync" => Left(libc::MS_DIRSYNC), + "lazytime" => Left(1 << 25), // MS_LAZYTIME + "mand" => Left(libc::MS_MANDLOCK), + "noatime" => Left(libc::MS_NOATIME), + "nodev" => Left(libc::MS_NODEV), + "nodiratime" => Left(libc::MS_NODIRATIME), + "noexec" => Left(libc::MS_NOEXEC), + "nosuid" => Left(libc::MS_NOSUID), + "ro" => Left(libc::MS_RDONLY), + "rw" => Left(0), + "relatime" => Left(libc::MS_RELATIME), + "strictatime" => Left(libc::MS_STRICTATIME), + "sync" => Left(libc::MS_SYNCHRONOUS), + "" => Left(0), + o @ _ => Right(o), + }) + .fold((Vec::new(), 0), |(mut opts, flags), next| match next { + Left(f) => (opts, flags | f), + Right(o) => { + opts.push(o); + (opts, flags) + } + }); + + use itertools::Itertools; + ( + if opts.len() == 0 { + None + } else { + Some(opts.iter().join(",")) + }, + flags, + ) +} + +use bch_bindgen::bcachefs; +use std::collections::HashMap; +use uuid::Uuid; + +#[tracing_attributes::instrument] +pub fn probe_filesystems() -> anyhow::Result> { + tracing::trace!("enumerating udev devices"); + let mut udev = udev::Enumerator::new()?; + + udev.match_subsystem("block")?; // find kernel block devices + + let mut fs_map = HashMap::new(); + let devresults = + udev.scan_devices()? + .into_iter() + .filter_map(|dev| dev.devnode().map(ToOwned::to_owned)); + + for pathbuf in devresults { + match get_super_block_uuid(&pathbuf)? { + + Ok((uuid_key, superblock)) => { + let fs = fs_map.entry(uuid_key).or_insert_with(|| { + tracing::info!(msg="found bcachefs pool", uuid=?uuid_key); + FileSystem::new(superblock) + }); + + fs.devices.push(pathbuf); + }, + + Err(e) => { tracing::debug!(inner2_error=?e);} + } + } + + + tracing::info!(msg = "found filesystems", count = fs_map.len()); + Ok(fs_map) +} + +// #[tracing_attributes::instrument(skip(dev, fs_map))] +fn get_super_block_uuid(path: &std::path::Path) -> std::io::Result> { + let sb = bch_bindgen::rs::read_super(&path)?; + let super_block = match sb { + Err(e) => { return Ok(Err(e)); } + Ok(sb) => sb, + }; + + let uuid = (&super_block).sb().uuid(); + tracing::debug!(found="bcachefs superblock", devnode=?path, ?uuid); + + Ok(Ok((uuid, super_block))) +} diff --git a/mount/src/key.rs b/rust-src/mount/src/key.rs similarity index 77% rename from mount/src/key.rs rename to rust-src/mount/src/key.rs index 6769f52..91c92d1 100644 --- a/mount/src/key.rs +++ b/rust-src/mount/src/key.rs @@ -1,12 +1,11 @@ -use log::info; +use tracing::info; fn check_for_key(key_name: &std::ffi::CStr) -> anyhow::Result { - use crate::keyutils::{self, keyctl_search}; + use bch_bindgen::keyutils::{self, keyctl_search}; let key_name = key_name.to_bytes_with_nul().as_ptr() as *const _; let key_type = c_str!("logon"); - let key_id = - unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) }; + let key_id = unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) }; if key_id > 0 { info!("Key has became avaiable"); Ok(true) @@ -31,9 +30,9 @@ fn wait_for_key(uuid: &uuid::Uuid) -> anyhow::Result<()> { const BCH_KEY_MAGIC: &str = "bch**key"; use crate::filesystem::FileSystem; fn ask_for_key(fs: &FileSystem) -> anyhow::Result<()> { - use crate::bcachefs::{self, bch2_chacha_encrypt_key, bch_encrypted_key, bch_key}; use anyhow::anyhow; use 
byteorder::{LittleEndian, ReadBytesExt}; + use bch_bindgen::bcachefs::{self, bch2_chacha_encrypt_key, bch_encrypted_key, bch_key}; use std::os::raw::c_char; let key_name = std::ffi::CString::new(format!("bcachefs:{}", fs.uuid())).unwrap(); @@ -62,19 +61,18 @@ fn ask_for_key(fs: &FileSystem) -> anyhow::Result<()> { ) }; if ret != 0 { - Err(anyhow!("chache decryption failure")) + Err(anyhow!("chacha decryption failure")) } else if key.magic != bch_key_magic { Err(anyhow!("failed to verify the password")) } else { let key_type = c_str!("logon"); let ret = unsafe { - crate::keyutils::add_key( + bch_bindgen::keyutils::add_key( key_type, - key_name.as_c_str().to_bytes_with_nul() as *const _ - as *const c_char, + key_name.as_c_str().to_bytes_with_nul() as *const _ as *const c_char, &output as *const _ as *const _, std::mem::size_of::() as u64, - crate::keyutils::KEY_SPEC_USER_KEYRING, + bch_bindgen::keyutils::KEY_SPEC_USER_KEYRING, ) }; if ret == -1 { @@ -85,9 +83,12 @@ fn ask_for_key(fs: &FileSystem) -> anyhow::Result<()> { } } -pub(crate) fn prepare_key(fs: &FileSystem, password: crate::KeyLocation) -> anyhow::Result<()> { +#[tracing_attributes::instrument] +pub fn prepare_key(fs: &FileSystem, password: crate::KeyLocation) -> anyhow::Result<()> { use crate::KeyLocation::*; use anyhow::anyhow; + + tracing::info!(msg = "checking if key exists for filesystem"); match password { Fail => Err(anyhow!("no key available")), Wait => Ok(wait_for_key(fs.uuid())?), diff --git a/rust-src/mount/src/lib.rs b/rust-src/mount/src/lib.rs new file mode 100644 index 0000000..4e918e1 --- /dev/null +++ b/rust-src/mount/src/lib.rs @@ -0,0 +1,91 @@ +use anyhow::anyhow; +use structopt::StructOpt; + +pub mod err { + pub enum GError { + Unknown{ + message: std::borrow::Cow<'static, String> + } + } + pub type GResult =::core::result::Result< ::core::result::Result, OE>; + pub type Result = GResult; +} + +#[macro_export] +macro_rules! c_str { + ($lit:expr) => { + unsafe { + std::ffi::CStr::from_ptr(concat!($lit, "\0").as_ptr() as *const std::os::raw::c_char) + .to_bytes_with_nul() + .as_ptr() as *const std::os::raw::c_char + } + }; +} + +#[derive(Debug)] +struct ErrnoError(errno::Errno); +impl std::fmt::Display for ErrnoError { + fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { + self.0.fmt(f) + } +} +impl std::error::Error for ErrnoError {} + +#[derive(Debug)] +pub enum KeyLocation { + Fail, + Wait, + Ask, +} + +#[derive(Debug)] +pub struct KeyLoc(pub Option); +impl std::ops::Deref for KeyLoc { + type Target = Option; + fn deref(&self) -> &Self::Target { + &self.0 + } +} +impl std::str::FromStr for KeyLoc { + type Err = anyhow::Error; + fn from_str(s: &str) -> anyhow::Result { + // use anyhow::anyhow; + match s { + "" => Ok(KeyLoc(None)), + "fail" => Ok(KeyLoc(Some(KeyLocation::Fail))), + "wait" => Ok(KeyLoc(Some(KeyLocation::Wait))), + "ask" => Ok(KeyLoc(Some(KeyLocation::Ask))), + _ => Err(anyhow!("invalid password option")), + } + } +} + +#[derive(StructOpt, Debug)] +/// Mount a bcachefs filesystem by its UUID. +pub struct Options { + /// Where the password would be loaded from. + /// + /// Possible values are: + /// "fail" - don't ask for password, fail if filesystem is encrypted; + /// "wait" - wait for password to become available before mounting; + /// "ask" - prompt the user for password; + #[structopt(short, long, default_value = "")] + pub key_location: KeyLoc, + + /// External UUID of the bcachefs filesystem + pub uuid: uuid::Uuid, + + /// Where the filesystem should be mounted. 
If not set, then the filesystem + /// won't actually be mounted. But all steps preceeding mounting the + /// filesystem (e.g. asking for passphrase) will still be performed. + pub mountpoint: Option, + + /// Mount options + #[structopt(short, default_value = "")] + pub options: String, +} + +pub mod filesystem; +pub mod key; + +// pub fn mnt_in_use() diff --git a/rust-src/mount/src/main.rs b/rust-src/mount/src/main.rs new file mode 100644 index 0000000..92b6917 --- /dev/null +++ b/rust-src/mount/src/main.rs @@ -0,0 +1,63 @@ +fn main() { + // convert existing log statements to tracing events + // tracing_log::LogTracer::init().expect("logtracer init failed!"); + // format tracing log data to env_logger like stdout + tracing_subscriber::fmt::init(); + + if let Err(e) = crate::main_inner() { + tracing::error!(fatal_error = ?e); + } +} + + + +#[tracing_attributes::instrument("main")] +pub fn main_inner() -> anyhow::Result<()> { + use structopt::StructOpt; + use bcachefs_mount::{Options, filesystem, key}; + unsafe { + libc::setvbuf( + filesystem::stdout, + std::ptr::null_mut(), + libc::_IONBF, + 0, + ); + // libc::fflush(filesystem::stdout); + } + let opt = Options::from_args(); + + + tracing::trace!(?opt); + + let fss = filesystem::probe_filesystems()?; + let fs = fss + .get(&opt.uuid) + .ok_or_else(|| anyhow::anyhow!("filesystem was not found"))?; + + tracing::info!(msg="found filesystem", %fs); + if fs.encrypted() { + let key = opt + .key_location + .0 + .ok_or_else(|| anyhow::anyhow!("no keyoption specified for locked filesystem"))?; + + key::prepare_key(&fs, key)?; + } + + let mountpoint = opt + .mountpoint + .ok_or_else(|| anyhow::anyhow!("mountpoint option was not specified"))?; + + fs.mount(&mountpoint, &opt.options)?; + + Ok(()) +} + +#[cfg(test)] +mod test { + // use insta::assert_debug_snapshot; + // #[test] + // fn snapshot_testing() { + // insta::assert_debug_snapshot!(); + // } +} diff --git a/smoke_test b/smoke_test index 15a9fce..1122808 100755 --- a/smoke_test +++ b/smoke_test @@ -21,7 +21,7 @@ set -e PYTEST="${PYTEST:-pytest-3}" -spam=$(tempfile) +spam=$(mktemp) unset BCACHEFS_FUSE BCACHEFS_TEST_USE_VALGRIND BCACHEFS_DEBUG trap "set +x; cat ${spam}; rm -f ${spam} ; echo; echo FAILED." EXIT @@ -44,7 +44,6 @@ function build() { function test() { echo Running tests. ( - cd tests ${PYTEST} -n${JOBS} ) > ${spam} 2>&1 } @@ -53,7 +52,6 @@ function test_vg() { echo Running tests with valgrind. ( export BCACHEFS_TEST_USE_VALGRIND=yes - cd tests ${PYTEST} -n${JOBS} ) > ${spam} 2>&1 } @@ -71,13 +69,13 @@ test echo -- Test: debug with valgrind -- test_vg -echo -- Test: fuse debug -- -export BCACHEFS_FUSE=1 -build -test +#echo -- Test: fuse debug -- +#export BCACHEFS_FUSE=1 +#build +#test -echo -- Test: fuse debug with valgrind -- -test_vg +#echo -- Test: fuse debug with valgrind -- +#test_vg rm -f ${spam} trap "set +x; echo; echo SUCCESS." EXIT diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py index 45c6273..ffc2bad 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,7 @@ # pytest fixture definitions. import pytest -import util +from tests import util @pytest.fixture def bfuse(tmpdir): diff --git a/tests/test_basic.py b/tests/test_basic.py index 6278102..a2e95c5 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -3,7 +3,7 @@ # Basic bcachefs functionality tests. 
import re -import util +from tests import util def test_help(): ret = util.run_bch(valgrind=True) @@ -38,9 +38,6 @@ def test_list(tmpdir): assert len(ret.stderr) == 0 assert "recovering from clean shutdown" in ret.stdout - # Totally arbitrary, feel free to update or remove after inspecting. - assert len(ret.stdout.splitlines()) == 97 - def test_list_inodes(tmpdir): dev = util.format_1g(tmpdir) diff --git a/tests/test_fixture.py b/tests/test_fixture.py index d8d3819..d96ce88 100644 --- a/tests/test_fixture.py +++ b/tests/test_fixture.py @@ -2,16 +2,15 @@ # # Tests of the functions in util.py -import pytest import signal import subprocess import time +import os +import pytest -import util -from pathlib import Path +from tests import util -#helper = Path('.') / 'test_helper' -helper = './test_helper' +helper = os.path.abspath(os.path.join(util.BASE_PATH, 'test_helper')) def test_sparse_file(tmpdir): dev = util.sparse_file(tmpdir / '1k', 1024) @@ -32,32 +31,32 @@ def test_segfault(): @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind") def test_check(): with pytest.raises(subprocess.CalledProcessError): - ret = util.run(helper, 'abort', check=True) + util.run(helper, 'abort', check=True) @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind") def test_leak(): with pytest.raises(util.ValgrindFailedError): - ret = util.run(helper, 'leak', valgrind=True) + util.run(helper, 'leak', valgrind=True) @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind") def test_undefined(): with pytest.raises(util.ValgrindFailedError): - ret = util.run(helper, 'undefined', valgrind=True) + util.run(helper, 'undefined', valgrind=True) @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind") def test_undefined_branch(): with pytest.raises(util.ValgrindFailedError): - ret = util.run(helper, 'undefined_branch', valgrind=True) + util.run(helper, 'undefined_branch', valgrind=True) @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind") def test_read_after_free(): with pytest.raises(util.ValgrindFailedError): - ret = util.run(helper, 'read_after_free', valgrind=True) + util.run(helper, 'read_after_free', valgrind=True) @pytest.mark.skipif(not util.ENABLE_VALGRIND, reason="no valgrind") def test_write_after_free(): with pytest.raises(util.ValgrindFailedError): - ret = util.run(helper, 'write_after_free', valgrind=True) + util.run(helper, 'write_after_free', valgrind=True) def test_mountpoint(tmpdir): path = util.mountpoint(tmpdir) diff --git a/tests/test_fuse.py b/tests/test_fuse.py index 69c512e..48288e6 100644 --- a/tests/test_fuse.py +++ b/tests/test_fuse.py @@ -4,7 +4,7 @@ import pytest import os -import util +from tests import util pytestmark = pytest.mark.skipif( not util.have_fuse(), reason="bcachefs not built with fuse support.") diff --git a/tests/util.py b/tests/util.py index b5e02c1..00314f4 100644 --- a/tests/util.py +++ b/tests/util.py @@ -2,18 +2,18 @@ import errno import os -import pytest import re import subprocess -import sys import tempfile import threading import time from pathlib import Path -DIR = Path('..') -BCH_PATH = DIR / 'bcachefs' +BASE_PATH= os.path.dirname(__file__) +BCH_PATH = os.path.abspath(os.path.join(BASE_PATH, '..', 'bcachefs')) +VALGRIND_PATH= os.path.abspath(os.path.join(BASE_PATH, + 'valgrind-suppressions.txt')) VPAT = re.compile(r'ERROR SUMMARY: (\d+) errors from (\d+) contexts') @@ -46,21 +46,22 @@ def run(cmd, *args, valgrind=False, check=False): cmds = [cmd] + list(args) valgrind = valgrind and ENABLE_VALGRIND + 
print("Running '{}'".format(cmds)) if valgrind: vout = tempfile.NamedTemporaryFile() vcmd = ['valgrind', '--leak-check=full', '--gen-suppressions=all', - '--suppressions=valgrind-suppressions.txt', + '--suppressions={}'.format(VALGRIND_PATH), '--log-file={}'.format(vout.name)] cmds = vcmd + cmds - print("Running '{}'".format(cmds)) - res = subprocess.run(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - encoding='utf-8', check=check) - - if valgrind: + res = subprocess.run(cmds, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, encoding='utf-8', check=check) check_valgrind(vout.read().decode('utf-8')) + else: + res = subprocess.run(cmds, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, encoding='utf-8', check=check) return res @@ -75,7 +76,7 @@ def sparse_file(lpath, size): This is typically used to create device files for bcachefs. """ path = Path(lpath) - f = path.touch(mode = 0o600, exist_ok = False) + path.touch(mode = 0o600, exist_ok = False) os.truncate(path, size) return path @@ -195,7 +196,8 @@ class BFuse: self.stdout = out1 + out2 self.stderr = err.read() - self.vout = vlog.read().decode('utf-8') + if vlog: + self.vout = vlog.read().decode('utf-8') def expect(self, pipe, regex): """Wait for the child process to mount.""" @@ -230,7 +232,8 @@ class BFuse: print("Waiting for thread to exit.") self.thread.join(timeout) if self.thread.is_alive(): - self.proc.kill() + if self.proc: + self.proc.kill() self.thread.join() else: print("Thread was already done.") @@ -242,6 +245,9 @@ class BFuse: check_valgrind(self.vout) def verify(self): + # avoid throwing exception in assertion + assert self.stdout is not None + assert self.stderr is not None assert self.returncode == 0 assert len(self.stdout) > 0 assert len(self.stderr) == 0 diff --git a/tests/valgrind-suppressions.txt b/tests/valgrind-suppressions.txt index 4ed4de3..d83e052 100644 --- a/tests/valgrind-suppressions.txt +++ b/tests/valgrind-suppressions.txt @@ -1,8 +1,17 @@ { - + call_rcu_memb Memcheck:Leak match-leak-kinds: possible,definite ... fun:get_default_call_rcu_data_memb fun:call_rcu_memb } +{ + call_rcu_data_init + Memcheck:Leak + match-leak-kinds: possible + fun:calloc + fun:_dl_allocate_tls + ... 
+ fun:call_rcu_data_init +} diff --git a/tools-util.c b/tools-util.c index 3cc0de4..9491779 100644 --- a/tools-util.c +++ b/tools-util.c @@ -94,12 +94,12 @@ void xpread(int fd, void *buf, size_t count, off_t offset) } } -void xpwrite(int fd, const void *buf, size_t count, off_t offset) +void xpwrite(int fd, const void *buf, size_t count, off_t offset, const char *msg) { ssize_t r = pwrite(fd, buf, count, offset); if (r != count) - die("write error (ret %zi err %m)", r); + die("error writing %s (ret %zi err %m)", msg, r); } struct stat xfstatat(int dirfd, const char *path, int flags) @@ -211,7 +211,7 @@ u64 read_file_u64(int dirfd, const char *path) { char *buf = read_file_str(dirfd, path); u64 v; - if (kstrtou64(buf, 10, &v)) + if (bch2_strtou64_h(buf, &v)) die("read_file_u64: error parsing %s (got %s)", path, buf); free(buf); return v; @@ -242,17 +242,17 @@ u64 get_size(const char *path, int fd) return ret; } -/* Returns blocksize in units of 512 byte sectors: */ +/* Returns blocksize, in bytes: */ unsigned get_blocksize(const char *path, int fd) { struct stat statbuf = xfstat(fd); if (!S_ISBLK(statbuf.st_mode)) - return statbuf.st_blksize >> 9; + return statbuf.st_blksize; unsigned ret; xioctl(fd, BLKPBSZGET, &ret); - return ret >> 9; + return ret; } /* Open a block device, do magic blkid stuff to probe for existing filesystems: */ @@ -262,7 +262,9 @@ int open_for_format(const char *dev, bool force) const char *fs_type = NULL, *fs_label = NULL; size_t fs_type_len, fs_label_len; - int fd = xopen(dev, O_RDWR|O_EXCL); + int fd = open(dev, O_RDWR|O_EXCL); + if (fd < 0) + die("Error opening device to format %s: %m", dev); if (force) return fd; @@ -401,24 +403,6 @@ char *strcmp_prefix(char *a, const char *a_prefix) return *a_prefix ? NULL : a; } -unsigned hatoi_validate(const char *s, const char *msg) -{ - u64 v; - - if (bch2_strtoull_h(s, &v)) - die("bad %s %s", msg, s); - - v /= 512; - - if (v > USHRT_MAX) - die("%s too large\n", msg); - - if (!v) - die("%s too small\n", msg); - - return v; -} - /* crc32c */ static u32 crc32c_default(u32 crc, const void *buf, size_t size) diff --git a/tools-util.h b/tools-util.h index 568707b..9468f07 100644 --- a/tools-util.h +++ b/tools-util.h @@ -22,14 +22,15 @@ #define noreturn __attribute__((noreturn)) -void die(const char *, ...) noreturn; +void die(const char *, ...) + __attribute__ ((format (printf, 1, 2))) noreturn; char *mprintf(const char *, ...) __attribute__ ((format (printf, 1, 2))); void *xcalloc(size_t, size_t); void *xmalloc(size_t); void *xrealloc(void *, size_t); void xpread(int, void *, size_t, off_t); -void xpwrite(int, const void *, size_t, off_t); +void xpwrite(int, const void *, size_t, off_t, const char *); struct stat xfstatat(int, const char *, int); struct stat xfstat(int); struct stat xstat(const char *); @@ -150,8 +151,6 @@ struct fiemap_extent fiemap_iter_next(struct fiemap_iter *); char *strcmp_prefix(char *, const char *); -unsigned hatoi_validate(const char *, const char *); - u32 crc32c(u32, const void *, size_t); char *dev_to_name(dev_t); -- 2.39.2