From d364661521d53cac21891bef4f406574d6332f15 Mon Sep 17 00:00:00 2001
From: Jonathan Carter
Date: Thu, 28 Sep 2023 20:05:31 +0200
Subject: [PATCH] New upstream release

---
 .gitignore | 7 +-
 .travis.yml | 36 -
 INSTALL | 75 -
 INSTALL.md | 77 +
 Makefile | 90 +-
 Makefile.compiler | 18 +-
 README | 12 -
 README.md | 27 +
 bcachefs.8 | 70 +-
 bcachefs.c | 29 +-
 build.nix | 72 +
 cmd_device.c | 11 +-
 cmd_dump.c | 8 +-
 cmd_format.c | 8 +-
 cmd_fs.c | 32 +-
 cmd_fsck.c | 32 +-
 cmd_fusemount.c | 3 +-
 cmd_key.c | 6 +-
 cmd_kill_btree_node.c | 5 +-
 cmd_list.c | 408 -----
 cmd_list_journal.c | 251 ++-
 cmd_migrate.c | 28 +-
 cmd_option.c | 3 +-
 cmds.h | 2 +-
 debian/changelog | 6 +
 debian/files | 2 +-
 default.nix | 138 +-
 flake.lock | 49 +-
 flake.nix | 114 +-
 include/linux/atomic.h | 21 +
 include/linux/bio.h | 2 +-
 include/linux/bit_spinlock.h | 1 +
 include/linux/bitmap.h | 8 +
 include/linux/blkdev.h | 3 +
 include/linux/closure.h | 8 +-
 include/linux/compiler.h | 4 +
 include/linux/freezer.h | 2 +
 include/linux/jiffies.h | 2 +-
 include/linux/kernel.h | 18 +-
 include/linux/list.h | 28 +
 include/linux/lz4.h | 5 +
 include/linux/math.h | 20 +
 include/linux/mean_and_variance.h | 162 +-
 include/linux/poison.h | 9 +
 include/linux/posix_acl_xattr.h | 4 +-
 include/linux/prandom.h | 6 +
 include/linux/random.h | 24 +
 include/linux/rcupdate.h | 1 +
 include/linux/rhashtable.h | 2 -
 include/linux/sched.h | 3 +-
 include/linux/seq_buf.h | 153 ++
 include/linux/shrinker.h | 4 +-
 include/linux/six.h | 426 +++--
 include/linux/slab.h | 85 +-
 include/linux/types.h | 1 +
 include/linux/uuid.h | 24 +-
 include/linux/wait.h | 8 +-
 include/linux/xattr.h | 2 +-
 include/trace/events/lock.h | 144 ++
 libbcachefs.c | 10 +-
 libbcachefs.h | 16 +-
 libbcachefs/acl.c | 56 +-
 libbcachefs/acl.h | 4 +-
 libbcachefs/alloc_background.c | 1301 ++++++++++----
 libbcachefs/alloc_background.h | 98 +-
 libbcachefs/alloc_foreground.c | 900 ++++++----
 libbcachefs/alloc_foreground.h | 74 +-
 libbcachefs/alloc_types.h | 73 +-
 libbcachefs/backpointers.c | 686 +++-----
 libbcachefs/backpointers.h | 91 +-
 libbcachefs/bcachefs.h | 210 ++-
 libbcachefs/bcachefs_format.h | 348 +++-
 libbcachefs/bcachefs_ioctl.h | 2 +-
 libbcachefs/bkey.c | 45 +-
 libbcachefs/bkey.h | 156 +-
 libbcachefs/bkey_methods.c | 212 +--
 libbcachefs/bkey_methods.h | 57 +-
 libbcachefs/bkey_sort.c | 10 +-
 libbcachefs/bset.c | 64 +-
 libbcachefs/bset.h | 21 +-
 libbcachefs/btree_cache.c | 314 ++--
 libbcachefs/btree_cache.h | 36 +-
 libbcachefs/btree_gc.c | 356 ++--
 libbcachefs/btree_gc.h | 3 +-
 libbcachefs/btree_io.c | 303 ++--
 libbcachefs/btree_io.h | 12 +-
 libbcachefs/btree_iter.c | 903 +++++-----
 libbcachefs/btree_iter.h | 395 ++++-
 libbcachefs/btree_key_cache.c | 215 ++-
 libbcachefs/btree_key_cache.h | 4 +-
 libbcachefs/btree_locking.c | 177 +-
 libbcachefs/btree_locking.h | 77 +-
 libbcachefs/btree_types.h | 158 +-
 libbcachefs/btree_update.h | 253 ++-
 libbcachefs/btree_update_interior.c | 285 +--
 libbcachefs/btree_update_interior.h | 15 +-
 libbcachefs/btree_update_leaf.c | 1313 +++++++++----
 libbcachefs/btree_write_buffer.c | 372 ++++
 libbcachefs/btree_write_buffer.h | 14 +
 libbcachefs/btree_write_buffer_types.h | 44 +
 libbcachefs/buckets.c | 471 +++--
 libbcachefs/buckets.h | 80 +-
 libbcachefs/buckets_types.h | 11 -
 libbcachefs/buckets_waiting_for_journal.c | 40 +-
 .../buckets_waiting_for_journal_types.h | 4 +-
 libbcachefs/chardev.c | 15 +-
 libbcachefs/checksum.c | 7 +-
 libbcachefs/checksum.h | 6 -
 libbcachefs/clock.c | 8 +-
 libbcachefs/compress.c | 228 ++-
 libbcachefs/compress.h | 37 +
 libbcachefs/counters.c | 6 +-
 libbcachefs/darray.h | 26 +-
 libbcachefs/data_update.c | 352 ++--
 libbcachefs/data_update.h | 3 +-
 libbcachefs/debug.c | 220 ++-
 libbcachefs/debug.h | 2 +
 libbcachefs/dirent.c | 42 +-
 libbcachefs/dirent.h | 5 +-
 libbcachefs/disk_groups.c | 88 +-
 libbcachefs/disk_groups.h | 17 +-
 libbcachefs/ec.c | 1104 +++++++-----
 libbcachefs/ec.h | 64 +-
 libbcachefs/ec_types.h | 7 +-
 libbcachefs/errcode.h | 164 +-
 libbcachefs/error.c | 105 +-
 libbcachefs/error.h | 13 +-
 libbcachefs/extent_update.c | 11 +-
 libbcachefs/extents.c | 261 +--
 libbcachefs/extents.h | 92 +-
 libbcachefs/fs-common.c | 2 +-
 libbcachefs/fs-io.c | 1555 ++++++++++-------
 libbcachefs/fs-io.h | 2 +-
 libbcachefs/fs-ioctl.c | 25 +-
 libbcachefs/fs.c | 211 ++-
 libbcachefs/fs.h | 8 +-
 libbcachefs/fsck.c | 1029 +++++------
 libbcachefs/fsck.h | 10 +-
 libbcachefs/inode.c | 396 ++++-
 libbcachefs/inode.h | 54 +-
 libbcachefs/io.c | 549 +++---
 libbcachefs/io.h | 55 +-
 libbcachefs/io_types.h | 8 +-
 libbcachefs/journal.c | 428 ++---
 libbcachefs/journal.h | 52 +-
 libbcachefs/journal_io.c | 290 +--
 libbcachefs/journal_io.h | 13 +-
 libbcachefs/journal_reclaim.c | 146 +-
 libbcachefs/journal_sb.c | 39 +-
 libbcachefs/journal_sb.h | 2 +-
 libbcachefs/journal_seq_blacklist.c | 8 +-
 libbcachefs/journal_types.h | 63 +-
 libbcachefs/keylist.c | 20 +-
 libbcachefs/keylist.h | 1 -
 libbcachefs/lru.c | 212 +--
 libbcachefs/lru.h | 58 +-
 libbcachefs/migrate.c | 14 +-
 libbcachefs/move.c | 473 +++--
 libbcachefs/move.h | 32 +-
 libbcachefs/move_types.h | 17 +
 libbcachefs/movinggc.c | 354 ++--
 libbcachefs/movinggc.h | 2 +
 libbcachefs/nocow_locking.c | 119 +-
 libbcachefs/nocow_locking.h | 58 +-
 libbcachefs/nocow_locking_types.h | 20 +
 libbcachefs/opts.c | 167 +-
 libbcachefs/opts.h | 103 +-
 {linux => libbcachefs}/printbuf.c | 129 +-
 {include/linux => libbcachefs}/printbuf.h | 86 +-
 libbcachefs/quota.c | 70 +-
 libbcachefs/quota.h | 5 +-
 libbcachefs/rebalance.c | 8 +-
 libbcachefs/recovery.c | 679 +++----
 libbcachefs/recovery.h | 2 +
 libbcachefs/recovery_types.h | 48 +
 libbcachefs/reflink.c | 78 +-
 libbcachefs/reflink.h | 11 +-
 libbcachefs/replicas.c | 90 +-
 libbcachefs/replicas.h | 16 -
 libbcachefs/replicas_types.h | 16 +
 libbcachefs/seqmutex.h | 48 +
 libbcachefs/str_hash.h | 4 +-
 libbcachefs/subvolume.c | 1347 ++++++++++----
 libbcachefs/subvolume.h | 165 +-
 libbcachefs/subvolume_types.h | 22 +
 libbcachefs/super-io.c | 394 +++--
 libbcachefs/super-io.h | 20 +-
 libbcachefs/super.c | 296 ++--
 libbcachefs/super.h | 5 +-
 libbcachefs/sysfs.c | 225 ++-
 libbcachefs/tests.c | 223 ++-
 libbcachefs/trace.c | 4 +-
 .../events/bcachefs.h => libbcachefs/trace.h | 295 +++-
 libbcachefs/two_state_shared_lock.c | 29 +-
 libbcachefs/two_state_shared_lock.h | 37 +-
 libbcachefs/util.c | 207 ++-
 libbcachefs/util.h | 121 +-
 libbcachefs/varint.c | 11 +-
 libbcachefs/xattr.c | 84 +-
 libbcachefs/xattr.h | 10 +-
 linux/blkdev.c | 30 +-
 linux/closure.c | 1 +
 linux/fs.c | 4 +-
 linux/kthread.c | 13 +-
 linux/mean_and_variance.c | 92 +-
 linux/pretty-printers.c | 60 -
 linux/printbuf_userspace.c | 34 -
 linux/seq_buf.c | 152 ++
 linux/shrinker.c | 29 +
 linux/six.c | 798 +++++---
 linux/string_helpers.c | 1 -
 mount.bcachefs | 4 +
 nix/bcachefs-kernel.nix | 34 -
 nix/bcachefs.rev.sha256 | 1 -
 nix/fetchnix.nix | 48 -
 nix/nixpkgs.json | 5 -
 nix/nixpkgs.nix | 9 -
 nix/overlay.nix | 28 -
 nix/update-nixpkgs.sh | 32 -
 packaging/bcachefs-tools.spec | 59 +-
 rust-src/{mount => }/.gitignore | 0
 rust-src/{mount => }/Cargo.lock | 915 +++++-----
 rust-src/Cargo.toml | 28 +
 rust-src/{mount => }/README.md | 0
 rust-src/bch_bindgen/Cargo.lock | 662 +++++--
 rust-src/bch_bindgen/Cargo.toml | 17 +-
 rust-src/bch_bindgen/build.rs | 171 +-
 rust-src/bch_bindgen/default.nix | 76 -
 rust-src/bch_bindgen/rustfmt.toml | 5 +-
 rust-src/bch_bindgen/src/bcachefs.rs | 146 +-
 rust-src/bch_bindgen/src/bkey.rs | 121 ++
 rust-src/bch_bindgen/src/btree.rs | 202 +++
 rust-src/bch_bindgen/src/errcode.rs | 40 +
 rust-src/bch_bindgen/src/fs.rs | 27 +
 rust-src/bch_bindgen/src/lib.rs | 163 +-
 .../bch_bindgen/src/libbcachefs_wrapper.h | 12 +
 rust-src/bch_bindgen/src/opts.rs | 35 +
 rust-src/bch_bindgen/src/rs.rs | 79 +-
 rust-src/mount/Cargo.toml | 30 -
 rust-src/mount/default.nix | 41 -
 rust-src/mount/module.nix | 54 -
 rust-src/mount/rustfmt.toml | 2 -
 rust-src/mount/src/filesystem.rs | 208 ---
 rust-src/mount/src/key.rs | 97 -
 rust-src/mount/src/lib.rs | 91 -
 rust-src/mount/src/main.rs | 63 -
 rust-src/rustfmt.toml | 3 +
 rust-src/src/cmd_list.rs | 171 ++
 rust-src/src/cmd_mount.rs | 245 +++
 rust-src/src/key.rs | 129 ++
 rust-src/src/lib.rs | 24 +
 rust-src/src/logger.rs | 28 +
 shell.nix | 18 -
 smoke_test | 81 -
 tools-util.c | 83 +-
 tools-util.h | 19 +-
 256 files changed, 20475 insertions(+), 12396 deletions(-)
 delete mode 100644 .travis.yml
 delete mode 100644 INSTALL
 create mode 100644 INSTALL.md
 delete mode 100644 README
 create mode 100644 README.md
 create mode 100644 build.nix
 delete mode 100644 cmd_list.c
 create mode 100644 include/linux/seq_buf.h
 create mode 100644 include/trace/events/lock.h
 create mode 100644 libbcachefs/btree_write_buffer.c
 create mode 100644 libbcachefs/btree_write_buffer.h
 create mode 100644 libbcachefs/btree_write_buffer_types.h
 create mode 100644 libbcachefs/nocow_locking_types.h
 rename {linux => libbcachefs}/printbuf.c (72%)
 rename {include/linux => libbcachefs}/printbuf.h (76%)
 create mode 100644 libbcachefs/recovery_types.h
 create mode 100644 libbcachefs/seqmutex.h
 rename include/trace/events/bcachefs.h => libbcachefs/trace.h (82%)
 delete mode 100644 linux/pretty-printers.c
 delete mode 100644 linux/printbuf_userspace.c
 create mode 100644 linux/seq_buf.c
 create mode 100755 mount.bcachefs
 delete mode 100644 nix/bcachefs-kernel.nix
 delete mode 100644 nix/bcachefs.rev.sha256
 delete mode 100644 nix/fetchnix.nix
 delete mode 100644 nix/nixpkgs.json
 delete mode 100644 nix/nixpkgs.nix
 delete mode 100644 nix/overlay.nix
 delete mode 100755 nix/update-nixpkgs.sh
 rename rust-src/{mount => }/.gitignore (100%)
 rename rust-src/{mount => }/Cargo.lock (54%)
 create mode 100644 rust-src/Cargo.toml
 rename rust-src/{mount => }/README.md (100%)
 delete mode 100644 rust-src/bch_bindgen/default.nix
 create mode 100644 rust-src/bch_bindgen/src/bkey.rs
 create mode 100644 rust-src/bch_bindgen/src/btree.rs
 create mode 100644 rust-src/bch_bindgen/src/errcode.rs
 create mode 100644 rust-src/bch_bindgen/src/fs.rs
 create mode 100644 rust-src/bch_bindgen/src/opts.rs
 delete mode 100644 rust-src/mount/Cargo.toml
 delete mode 100644 rust-src/mount/default.nix
 delete mode 100644 rust-src/mount/module.nix
 delete mode 100644 rust-src/mount/rustfmt.toml
 delete mode 100644 rust-src/mount/src/filesystem.rs
 delete mode 100644 rust-src/mount/src/key.rs
 delete mode 100644 rust-src/mount/src/lib.rs
 delete mode 100644 rust-src/mount/src/main.rs
 create mode 100644 rust-src/rustfmt.toml
 create mode 100644 rust-src/src/cmd_list.rs
 create mode 100644 rust-src/src/cmd_mount.rs
 create mode 100644 rust-src/src/key.rs
 create mode 100644 rust-src/src/lib.rs
 create mode 100644 rust-src/src/logger.rs
 delete mode 100644 shell.nix
 delete mode 100755 smoke_test
diff
--git a/.gitignore b/.gitignore index b1c03cd..efd7dc7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,19 +3,20 @@ bcachefs bcachefs.5 .* *.o +*.so *.d *.a tags TAGS cscope* bcachefs-tools +compile_commands.json tests/test_helper tests/__pycache__/ # dot-files that we don't want to ignore !.gitignore -!.travis.yml +!.github/dependabot.yml +!.editorconfig -mount/target -mount.bcachefs bcachefs-principles-of-operation.* diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index e66f0c2..0000000 --- a/.travis.yml +++ /dev/null @@ -1,36 +0,0 @@ -os: linux -dist: bionic -language: c -arch: - - amd64 -# - arm64 - -addons: - apt: - packages: - - valgrind - - python3-docutils - - python3-pytest - - python3-pytest-xdist - - meson - - ninja-build - - pkg-config - - libaio-dev - - libblkid-dev - - libkeyutils-dev - - liblz4-dev - - libsodium-dev - - liburcu-dev - - libzstd-dev - - libudev-dev - - uuid-dev - - zlib1g-dev - -before_install: - - wget https://github.com/libfuse/libfuse/archive/fuse-3.7.0.tar.gz -O /tmp/fuse.tar.gz - - tar -C /tmp -zxvf /tmp/fuse.tar.gz - - mkdir /tmp/libfuse-fuse-3.7.0/build - - pushd /tmp/libfuse-fuse-3.7.0/build && meson .. && ninja && sudo ninja install && popd - - sudo ldconfig - -script: ./smoke_test diff --git a/INSTALL b/INSTALL deleted file mode 100644 index b4d60bf..0000000 --- a/INSTALL +++ /dev/null @@ -1,75 +0,0 @@ --- Getting started -- - -Dependencies: - - * libaio - * libblkid - * libkeyutils - * liblz4 - * libsodium - * liburcu - * libuuid - * libzstd - * pkg-config - * zlib1g - * valgrind - -Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with - apt install -y pkg-config libaio-dev libblkid-dev libkeyutils-dev \ - liblz4-dev libsodium-dev liburcu-dev libzstd-dev \ - uuid-dev zlib1g-dev valgrind libudev-dev git build-essential \ - python3 python3-docutils - -Fedora: install the "Development tools" group along with: - dnf install -y libaio-devel libsodium-devel \ - libblkid-devel libzstd-devel zlib-devel userspace-rcu-devel \ - lz4-devel libuuid-devel valgrind-devel keyutils-libs-devel \ - findutils - -Arch: install bcachefs-tools-git from the AUR. -Or to build from source, install build dependencies with - pacman -S base-devel libaio keyutils libsodium liburcu zstd valgrind - -Then, just make && make install - - --- Experimental features -- - -Experimental fuse support is currently disabled by default. Fuse support is at -an early stage and may corrupt your filesystem, so it should only be used for -testing. To enable, you'll also need to add: - -* libfuse3 >= 3.7 - -On Debian/Ubuntu (Bullseye/20.04 or later needed for libfuse >= 3.7): - apt install -y libfuse3-dev - -On Fedora (32 or later needed for lbifuse >= 3.7): - dnf install -y fuse3-devel - -Arch: - pacman -S fuse3 - -Then, make using the BCACHEFS_FUSE environment variable (make clean first if -previously built without fuse support): - -BCACHEFS_FUSE=1 make && make install - - --- Tests -- - -Some tests are available to validate the "bcachefs" binary. The tests depend -on python3 pytest. 
- -On debian: - apt install -u python3-pytest - -Then, you can run the tests via: - - make check - # or if pytest has a different name - make check PYTEST=pytest - -Optionally, you may wish to run tests in parallel using python3-pytest-xdist: - - cd tests; pytest-3 -n4 diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000..94b2877 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,77 @@ +Getting started +--------------- + +Build dependencies: + + * libaio + * libblkid + * libkeyutils + * liblz4 + * libsodium + * liburcu + * libuuid + * libzstd + * pkg-config + * valgrind + * zlib1g + +In addition a recent Rust toolchain is required (rustc, cargo), either by using +[rustup](https://rustup.rs/) or make sure to use a distribution where rustc (>=1.64) +is available. + +Debian (Bullseye or later) and Ubuntu (20.04 or later): you can install these with + +``` shell +apt install -y pkg-config libaio-dev libblkid-dev libkeyutils-dev \ + liblz4-dev libsodium-dev liburcu-dev libzstd-dev \ + uuid-dev zlib1g-dev valgrind libudev-dev git build-essential \ + python3 python3-docutils +``` + +Fedora: install the "Development tools" group along with: +```shell +dnf install -y libaio-devel libsodium-devel \ + libblkid-devel libzstd-devel zlib-devel userspace-rcu-devel \ + lz4-devel libuuid-devel valgrind-devel keyutils-libs-devel \ + findutils +``` + +Arch: install bcachefs-tools-git from the AUR. +Or to build from source, install build dependencies with +```shell +pacman -S base-devel libaio keyutils libsodium liburcu zstd valgrind +``` + +Then, just `make && make install` + + +Experimental features +--------------------- + +Experimental fuse support is currently disabled by default. Fuse support is at +an early stage and may corrupt your filesystem, so it should only be used for +testing. To enable, you'll also need to add: + +* libfuse3 >= 3.7 + +On Debian/Ubuntu (Bullseye/20.04 or later needed for libfuse >= 3.7): +```shell +apt install -y libfuse3-dev +``` + +On Fedora (32 or later needed for lbifuse >= 3.7): +```shell +dnf install -y fuse3-devel +``` + +Arch: +```shell +pacman -S fuse3 +``` + +Then, make using the `BCACHEFS_FUSE` environment variable (make clean first if +previously built without fuse support): + +```shell +BCACHEFS_FUSE=1 make && make install +``` diff --git a/Makefile b/Makefile index 49f06cf..c77c0c5 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,7 @@ endif CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \ -Wno-pointer-sign \ + -Wno-deprecated-declarations \ -fno-strict-aliasing \ -fno-delete-null-pointer-checks \ -I. 
-Iinclude -Iraid \ @@ -33,14 +34,6 @@ CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC \ $(EXTRA_CFLAGS) LDFLAGS+=$(CFLAGS) $(EXTRA_LDFLAGS) -## Configure Tools -PYTEST_ARGS?= -PYTEST_CMD?=$(shell \ - command -v pytest-3 \ - || which pytest-3 2>/dev/null \ -) -PYTEST:=$(PYTEST_CMD) $(PYTEST_ARGS) - CARGO_ARGS= CARGO=cargo $(CARGO_ARGS) CARGO_PROFILE=release @@ -81,30 +74,23 @@ LDLIBS+=-lm -lpthread -lrt -lkeyutils -laio -ldl LDLIBS+=$(EXTRA_LDLIBS) ifeq ($(PREFIX),/usr) - ROOT_SBINDIR=/sbin + ROOT_SBINDIR?=/sbin INITRAMFS_DIR=$(PREFIX)/share/initramfs-tools else - ROOT_SBINDIR=$(PREFIX)/sbin + ROOT_SBINDIR?=$(PREFIX)/sbin INITRAMFS_DIR=/etc/initramfs-tools endif .PHONY: all -all: bcachefs lib +all: bcachefs -.PHONY: lib -lib: libbcachefs.so +.PHONY: debug +debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y +debug: bcachefs .PHONY: tests tests: tests/test_helper -.PHONY: check -check: tests bcachefs -ifneq (,$(PYTEST_CMD)) - $(PYTEST) -else - @echo "WARNING: pytest not found or specified, tests could not be run." -endif - .PHONY: TAGS tags TAGS: ctags -e -R . @@ -112,7 +98,7 @@ TAGS: tags: ctags -R . -SRCS=$(shell find . -type f ! -path '*/.*/*' -iname '*.c') +SRCS=$(sort $(shell find . -type f ! -path '*/.*/*' -iname '*.c')) DEPS=$(SRCS:.c=.d) -include $(DEPS) @@ -122,30 +108,25 @@ OBJS=$(SRCS:.c=.o) @echo " [CC] $@" $(Q)$(CC) $(CPPFLAGS) $(CFLAGS) -c -o $@ $< -bcachefs: $(filter-out ./tests/%.o, $(OBJS)) - @echo " [LD] $@" - $(Q)$(CC) $(LDFLAGS) $+ $(LOADLIBES) $(LDLIBS) -o $@ - -RUST_SRCS=$(shell find rust-src/ -type f -iname '*.rs') -MOUNT_SRCS=$(filter %mount, $(RUST_SRCS)) +BCACHEFS_DEPS=libbcachefs.a -debug: CFLAGS+=-Werror -DCONFIG_BCACHEFS_DEBUG=y -DCONFIG_VALGRIND=y -debug: bcachefs - -MOUNT_OBJ=$(filter-out ./bcachefs.o ./tests/%.o ./cmd_%.o , $(OBJS)) -libbcachefs.so: LDFLAGS+=-shared -libbcachefs.so: $(MOUNT_OBJ) - @echo " [CC] $@" - $(Q)$(CC) $(LDFLAGS) $+ -o $@ $(LDLIBS) +ifndef NO_RUST + BCACHEFS_DEPS+=rust-src/target/release/libbcachefs_rust.a +else + CFLAGS+=-DBCACHEFS_NO_RUST +endif -MOUNT_TOML=rust-src/mount/Cargo.toml -mount.bcachefs: lib $(MOUNT_SRCS) - LIBBCACHEFS_LIB=$(CURDIR) \ - LIBBCACHEFS_INCLUDE=$(CURDIR) \ - $(CARGO_BUILD) --manifest-path $(MOUNT_TOML) +bcachefs: $(BCACHEFS_DEPS) + @echo " [LD] $@" + $(Q)$(CC) $(LDFLAGS) -Wl,--whole-archive $+ $(LOADLIBES) -Wl,--no-whole-archive $(LDLIBS) -o $@ - ln -f rust-src/mount/target/$(CARGO_PROFILE)/bcachefs-mount $@ +libbcachefs.a: $(filter-out ./tests/%.o, $(OBJS)) + @echo " [AR] $@" + $(Q)ar -rc $@ $+ +RUST_SRCS=$(shell find rust-src/src rust-src/bch_bindgen/src -type f -iname '*.rs') +rust-src/target/release/libbcachefs_rust.a: $(RUST_SRCS) + $(CARGO_BUILD) --manifest-path rust-src/Cargo.toml tests/test_helper: $(filter ./tests/%.o, $(OBJS)) @echo " [LD] $@" @@ -165,15 +146,14 @@ cmd_version.o : .version .PHONY: install install: INITRAMFS_HOOK=$(INITRAMFS_DIR)/hooks/bcachefs install: INITRAMFS_SCRIPT=$(INITRAMFS_DIR)/scripts/local-premount/bcachefs -install: bcachefs lib +install: bcachefs $(INSTALL) -m0755 -D bcachefs -t $(DESTDIR)$(ROOT_SBINDIR) $(INSTALL) -m0755 fsck.bcachefs $(DESTDIR)$(ROOT_SBINDIR) $(INSTALL) -m0755 mkfs.bcachefs $(DESTDIR)$(ROOT_SBINDIR) + $(INSTALL) -m0755 mount.bcachefs $(DESTDIR)$(ROOT_SBINDIR) $(INSTALL) -m0644 -D bcachefs.8 -t $(DESTDIR)$(PREFIX)/share/man/man8/ $(INSTALL) -m0755 -D initramfs/script $(DESTDIR)$(INITRAMFS_SCRIPT) $(INSTALL) -m0755 -D initramfs/hook $(DESTDIR)$(INITRAMFS_HOOK) - $(INSTALL) -m0755 -D mount.bcachefs.sh $(DESTDIR)$(ROOT_SBINDIR) - $(INSTALL) 
-m0755 -D libbcachefs.so -t $(DESTDIR)$(PREFIX)/lib/ sed -i '/^# Note: make install replaces/,$$d' $(DESTDIR)$(INITRAMFS_HOOK) echo "copy_exec $(ROOT_SBINDIR)/bcachefs /sbin/bcachefs" >> $(DESTDIR)$(INITRAMFS_HOOK) @@ -181,13 +161,17 @@ install: bcachefs lib .PHONY: clean clean: @echo "Cleaning all" - $(Q)$(RM) bcachefs mount.bcachefs libbcachefs_mount.a tests/test_helper .version *.tar.xz $(OBJS) $(DEPS) $(DOCGENERATED) + $(Q)$(RM) bcachefs libbcachefs.a tests/test_helper .version *.tar.xz $(OBJS) $(DEPS) $(DOCGENERATED) $(Q)$(RM) -rf rust-src/*/target .PHONY: deb deb: all debuild -us -uc -nc -b -i -I +.PHONY: rpm +rpm: clean + rpmbuild --build-in-place -bb --define "_version $(subst -,_,$(VERSION))" packaging/bcachefs-tools.spec + bcachefs-principles-of-operation.pdf: doc/bcachefs-principles-of-operation.tex pdflatex doc/bcachefs-principles-of-operation.tex pdflatex doc/bcachefs-principles-of-operation.tex @@ -200,8 +184,10 @@ update-bcachefs-sources: test -d libbcachefs || mkdir libbcachefs cp $(LINUX_DIR)/fs/bcachefs/*.[ch] libbcachefs/ git add libbcachefs/*.[ch] - cp $(LINUX_DIR)/include/trace/events/bcachefs.h include/trace/events/ - git add include/trace/events/bcachefs.h + cp $(LINUX_DIR)/include/linux/closure.h include/linux/ + git add include/linux/closure.h + cp $(LINUX_DIR)/lib/closure.c linux/ + git add linux/closure.c cp $(LINUX_DIR)/include/linux/xxhash.h include/linux/ git add include/linux/xxhash.h cp $(LINUX_DIR)/lib/xxhash.c linux/ @@ -220,16 +206,12 @@ update-bcachefs-sources: git add linux/generic-radix-tree.c cp $(LINUX_DIR)/include/linux/kmemleak.h include/linux/ git add include/linux/kmemleak.h - cp $(LINUX_DIR)/include/linux/printbuf.h include/linux/ - git add include/linux/printbuf.h - cp $(LINUX_DIR)/lib/printbuf.c linux/ - git add linux/printbuf.c + cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/ + git add linux/int_sqrt.c cp $(LINUX_DIR)/lib/math/mean_and_variance.c linux/ git add linux/mean_and_variance.c cp $(LINUX_DIR)/include/linux/mean_and_variance.h include/linux/ git add include/linux/mean_and_variance.h - cp $(LINUX_DIR)/lib/math/int_sqrt.c linux/ - git add linux/int_sqrt.c cp $(LINUX_DIR)/scripts/Makefile.compiler ./ git add Makefile.compiler $(RM) libbcachefs/*.mod.c diff --git a/Makefile.compiler b/Makefile.compiler index 94d0d40..7aa1fbc 100644 --- a/Makefile.compiler +++ b/Makefile.compiler @@ -29,16 +29,16 @@ try-run = $(shell set -e; \ fi) # as-option -# Usage: cflags-y += $(call as-option,-Wa$(comma)-isa=foo,) +# Usage: aflags-y += $(call as-option,-Wa$(comma)-isa=foo,) as-option = $(call try-run,\ - $(CC) $(KBUILD_CFLAGS) $(1) -c -x assembler /dev/null -o "$$TMP",$(1),$(2)) + $(CC) -Werror $(KBUILD_AFLAGS) $(1) -c -x assembler-with-cpp /dev/null -o "$$TMP",$(1),$(2)) # as-instr -# Usage: cflags-y += $(call as-instr,instr,option1,option2) +# Usage: aflags-y += $(call as-instr,instr,option1,option2) as-instr = $(call try-run,\ - printf "%b\n" "$(1)" | $(CC) $(KBUILD_AFLAGS) -c -x assembler -o "$$TMP" -,$(2),$(3)) + printf "%b\n" "$(1)" | $(CC) -Werror $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3)) # __cc-option # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586) @@ -61,9 +61,13 @@ cc-option-yn = $(call try-run,\ cc-disable-warning = $(call try-run,\ $(CC) -Werror $(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) -# cc-ifversion -# Usage: EXTRA_CFLAGS += $(call cc-ifversion, -lt, 0402, -O1) -cc-ifversion = $(shell [ $(CONFIG_GCC_VERSION)0 $(1) 
$(2)000 ] && echo $(3) || echo $(4)) +# gcc-min-version +# Usage: cflags-$(call gcc-min-version, 70100) += -foo +gcc-min-version = $(call test-ge, $(CONFIG_GCC_VERSION), $1) + +# clang-min-version +# Usage: cflags-$(call clang-min-version, 110000) += -foo +clang-min-version = $(call test-ge, $(CONFIG_CLANG_VERSION), $1) # ld-option # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y) diff --git a/README b/README deleted file mode 100644 index 3d2641e..0000000 --- a/README +++ /dev/null @@ -1,12 +0,0 @@ -Userspace tools for bcachefs - -This builds the bcachefs tool, which has a number of subcommands for formatting -and managing bcachefs filesystems: - -bcachefs format -bcachefs unlock -bcachefs assemble -bcachefs incremental -etc. - -Run bcachefs --help for full list of commands. diff --git a/README.md b/README.md new file mode 100644 index 0000000..30149ab --- /dev/null +++ b/README.md @@ -0,0 +1,27 @@ +bcachefs-tools +============== +Userspace tools and docs for bcachefs + +Bcachefs is an advanced new filesystem for Linux, with an emphasis on reliability and robustness +and the complete set of features one would expect from a modern filesystem. + +This repo primarily consists of the following: + +- bcachefs tool, the reason this repo exists. +- {mkfs,mount,fsck}.bcachefs utils, which is just wrappers calling the corresponding subcommands + in the main tool +- docs in the form of man-pages and a user manual + +Please refer to the main site for [getting started](https://bcachefs.org/#Getting_started) +An in-depth user manual is (also) found on the [official website](https://bcachefs.org/#Documentation) + +Build and install +----------------- + +Refer to [INSTALL.md](./INSTALL.md) + +Bug reports and contributions +----------------------------- + +- The official mailing list, linux-bcachefs@vger.kernel.org +- IRC: #bcache on OFTC (irc.oftc.net). Although, note that it can be easily missed. diff --git a/bcachefs.8 b/bcachefs.8 index d5c4e89..994b50f 100644 --- a/bcachefs.8 +++ b/bcachefs.8 @@ -1,4 +1,4 @@ -.Dd May 26, 2018 +.Dd March 7, 2023 .Dt BCACHEFS 8 SMM .Os .Sh NAME @@ -21,22 +21,16 @@ Format one or a list of devices with bcachefs data structures. .It Ic show-super Dump superblock information to stdout. .El +.Ss Mount commands +.Bl -tag -width 18n -compact +.It Ic mount +Mount a filesystem. +.El .Ss Repair commands .Bl -tag -width 18n -compact .It Ic fsck Check an existing filesystem for errors. .El -.Ss Startup/shutdown, assembly of multi device filesystems -.Bl -tag -width 18n -compact -.It Ic assemble -Assemble an existing multi device filesystem -.It Ic incremental -Incrementally assemble an existing multi device filesystem -.It Ic run -Start a partially assembled filesystem. -.It Ic stop -Stop a running filesystem. -.El .Ss Commands for managing a running filesystem .Bl -tag -width 18n -compact .It Ic fs usage @@ -162,6 +156,47 @@ List of sections to print Print superblock layout .El .El +.Sh Mount commands +.Bl -tag -width Ds +.It Nm Ic mount Oo Ar options Oc Ar device mountpoint +Mount a filesystem. The +.Ar device +can be a device, a colon-separated list of devices, or UUID=. The +.Ar mountpoint +is the path where the filesystem should be mounted. If not set, then the filesystem won't actually be mounted +but all steps preceeding mounting the filesystem (e.g. asking for passphrase) will still be performed. +.Pp the options are as follows: +.Bl -tag -width Ds +.It Fl o Ar options +Mount options provided as a comma-separated list. See userguide for complete list. 
+.Bl -tag -width Ds -compact +.It Cm degraded +Allow mounting with data degraded +.It Cm verbose +Extra debugging info during mount/recovery +.It Cm fsck +Run fsck during mount +.It Cm fix_errors +Fix errors without asking during fsck +.It Cm read_only +Mount in read only mode +.It Cm version_upgrade +.El +.It Fl k , Fl -key-location Ns = Ns ( Cm fail | wait | ask ) +Where the password would be loaded from. (default: +.Cm ask ) . +.Bl -tag -width Ds -compact +.It Cm fail +don't ask for password, fail if filesystem is encrypted. +.It Cm wait +wait for password to become available before mounting. +.It Cm ask +prompt the user for password. +.El +.It Fl v +Be verbose. Can be specified more than once. +.El +.El .Sh Repair commands .Bl -tag -width Ds .It Nm Ic fsck Oo Ar options Oc Ar devices\ ... @@ -179,17 +214,6 @@ Force checking even if filesystem is marked clean Be verbose .El .El -.Sh Startup/shutdown, assembly of multi device filesystems -.Bl -tag -width Ds -.It Nm Ic assemble Ar devices\ ... -Assemble an existing multi device filesystem. -.It Nm Ic incremental Ar device -Incrementally assemble an existing multi device filesystem. -.It Nm Ic run -Start a partially assembled filesystem. -.It Nm Ic stop Ar filesystem -Stop a running filesystem. -.El .Sh Commands for managing a running filesystem .Bl -tag -width Ds .It Nm Ic fs Ic usage Oo Ar options Oc Op Ar filesystem diff --git a/bcachefs.c b/bcachefs.c index 31d9628..a3fe6d8 100644 --- a/bcachefs.c +++ b/bcachefs.c @@ -35,6 +35,11 @@ static void usage(void) " show-super Dump superblock information to stdout\n" " set-option Set a filesystem option\n" "\n" +#ifndef BCACHEFS_NO_RUST + "Mount:\n" + " mount Mount a filesystem\n" + "\n" +#endif "Repair:\n" " fsck Check an existing filesystem for errors\n" "\n" @@ -82,7 +87,9 @@ static void usage(void) "Debug:\n" "These commands work on offline, unmounted filesystems\n" " dump Dump filesystem metadata to a qcow2 image\n" +#ifndef BCACHEFS_NO_RUST " list List filesystem metadata in textual form\n" +#endif " list_journal List contents of journal\n" "\n" "Miscellaneous:\n" @@ -106,8 +113,10 @@ static int fs_cmds(int argc, char *argv[]) { char *cmd = pop_cmd(&argc, argv); - if (argc < 1) - return fs_usage(); + if (argc < 1) { + usage(); + exit(EXIT_FAILURE); + } if (!strcmp(cmd, "usage")) return cmd_fs_usage(argc, argv); @@ -178,7 +187,7 @@ int main(int argc, char *argv[]) setvbuf(stdout, NULL, _IOLBF, 0); char *cmd = pop_cmd(&argc, argv); - if (argc < 1) { + if (!cmd) { puts("missing command\n"); goto usage; } @@ -203,12 +212,6 @@ int main(int argc, char *argv[]) if (!strcmp(cmd, "set-option")) return cmd_set_option(argc, argv); - if (argc < 2) { - printf("%s: missing command\n", argv[0]); - usage(); - exit(EXIT_FAILURE); - } - #if 0 if (!strcmp(cmd, "assemble")) return cmd_assemble(argc, argv); @@ -234,8 +237,10 @@ int main(int argc, char *argv[]) if (!strcmp(cmd, "dump")) return cmd_dump(argc, argv); +#ifndef BCACHEFS_NO_RUST if (!strcmp(cmd, "list")) return cmd_list(argc, argv); +#endif if (!strcmp(cmd, "list_journal")) return cmd_list_journal(argc, argv); if (!strcmp(cmd, "kill_btree_node")) @@ -243,6 +248,12 @@ int main(int argc, char *argv[]) if (!strcmp(cmd, "setattr")) return cmd_setattr(argc, argv); +#ifndef BCACHEFS_NO_RUST + if (!strcmp(cmd, "mount")) { + cmd_mount(argc, argv); + return 0; + } +#endif #ifdef BCACHEFS_FUSE if (!strcmp(cmd, "fusemount")) diff --git a/build.nix b/build.nix new file mode 100644 index 0000000..5cf07de --- /dev/null +++ b/build.nix @@ -0,0 +1,72 @@ +{ lib +, 
stdenv +, pkg-config +, attr +, libuuid +, libsodium +, keyutils +, liburcu +, zlib +, libaio +, udev +, zstd +, lz4 +, nix-gitignore +, rustPlatform +, rustc +, cargo + }: + +let + src = nix-gitignore.gitignoreSource [] ./. ; + + commit = lib.strings.substring 0 7 (builtins.readFile ./.bcachefs_revision); + version = "git-${commit}"; + +in stdenv.mkDerivation { + inherit src version; + + pname = "bcachefs-tools"; + + nativeBuildInputs = [ + pkg-config + cargo + rustc + rustPlatform.cargoSetupHook + rustPlatform.bindgenHook + ]; + + buildInputs = [ + libaio + keyutils # libkeyutils + lz4 # liblz4 + + libsodium + liburcu + libuuid + zstd # libzstd + zlib # zlib1g + attr + udev + ]; + + cargoRoot = "rust-src"; + # when git-based crates are updated, run: + # nix run github:Mic92/nix-update -- --version=skip --flake default + # to update the hashes + cargoDeps = rustPlatform.importCargoLock { + lockFile = "${src}/rust-src/Cargo.lock"; + outputHashes = { + "bindgen-0.64.0" = "sha256-GNG8as33HLRYJGYe0nw6qBzq86aHiGonyynEM7gaEE4="; + }; + }; + + makeFlags = [ + "PREFIX=${placeholder "out"}" + "VERSION=${commit}" + ]; + + dontStrip = true; + checkPhase = "./bcachefs version"; + doCheck = true; +} diff --git a/cmd_device.c b/cmd_device.c index e3c5d51..c59d370 100644 --- a/cmd_device.c +++ b/cmd_device.c @@ -14,6 +14,7 @@ #include "libbcachefs/bcachefs.h" #include "libbcachefs/bcachefs_ioctl.h" +#include "libbcachefs/errcode.h" #include "libbcachefs/journal.h" #include "libbcachefs/super-io.h" #include "cmds.h" @@ -410,7 +411,7 @@ int cmd_device_set_state(int argc, char *argv[]) int ret = bch2_read_super(dev_str, &opts, &sb); if (ret) - die("error opening %s: %s", dev_str, strerror(-ret)); + die("error opening %s: %s", dev_str, bch2_err_str(ret)); struct bch_member *m = bch2_sb_get_members(sb.sb)->members + sb.sb->dev_idx; @@ -527,7 +528,7 @@ int cmd_device_resize(int argc, char *argv[]) struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty()); if (IS_ERR(c)) - die("error opening %s: %s", dev, strerror(-PTR_ERR(c))); + die("error opening %s: %s", dev, bch2_err_str(PTR_ERR(c))); struct bch_dev *ca, *resize = NULL; unsigned i; @@ -547,7 +548,7 @@ int cmd_device_resize(int argc, char *argv[]) printf("resizing %s to %llu buckets\n", dev, nbuckets); int ret = bch2_dev_resize(c, resize, nbuckets); if (ret) - fprintf(stderr, "resize error: %s\n", strerror(-ret)); + fprintf(stderr, "resize error: %s\n", bch2_err_str(ret)); percpu_ref_put(&resize->io_ref); bch2_fs_stop(c); @@ -630,7 +631,7 @@ int cmd_device_resize_journal(int argc, char *argv[]) struct bch_fs *c = bch2_fs_open(&dev, 1, bch2_opts_empty()); if (IS_ERR(c)) - die("error opening %s: %s", dev, strerror(-PTR_ERR(c))); + die("error opening %s: %s", dev, bch2_err_str(PTR_ERR(c))); struct bch_dev *ca, *resize = NULL; unsigned i; @@ -647,7 +648,7 @@ int cmd_device_resize_journal(int argc, char *argv[]) printf("resizing journal on %s to %llu buckets\n", dev, nbuckets); int ret = bch2_set_nr_journal_buckets(c, resize, nbuckets); if (ret) - fprintf(stderr, "resize error: %s\n", strerror(-ret)); + fprintf(stderr, "resize error: %s\n", bch2_err_str(ret)); percpu_ref_put(&resize->io_ref); bch2_fs_stop(c); diff --git a/cmd_dump.c b/cmd_dump.c index 4e3d721..cc25a6a 100644 --- a/cmd_dump.c +++ b/cmd_dump.c @@ -82,9 +82,9 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd, } if (ret) - die("error %s walking btree nodes", strerror(-ret)); + die("error %s walking btree nodes", bch2_err_str(ret)); - b = c->btree_roots[i].b; + b = 
bch2_btree_id_root(c, i)->b; if (!btree_node_fake(b)) { ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); @@ -117,7 +117,7 @@ int cmd_dump(int argc, char *argv[]) opt_set(opts, norecovery, true); opt_set(opts, degraded, true); opt_set(opts, errors, BCH_ON_ERROR_continue); - opt_set(opts, fix_errors, FSCK_OPT_NO); + opt_set(opts, fix_errors, FSCK_FIX_no); while ((opt = getopt(argc, argv, "o:fjvh")) != -1) switch (opt) { @@ -147,7 +147,7 @@ int cmd_dump(int argc, char *argv[]) struct bch_fs *c = bch2_fs_open(argv, argc, opts); if (IS_ERR(c)) - die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); + die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c))); down_read(&c->gc_lock); diff --git a/cmd_format.c b/cmd_format.c index 4debc28..26a1cd9 100644 --- a/cmd_format.c +++ b/cmd_format.c @@ -24,6 +24,7 @@ #include "libbcachefs.h" #include "crypto.h" #include "libbcachefs/darray.h" +#include "libbcachefs/errcode.h" #include "libbcachefs/opts.h" #include "libbcachefs/super-io.h" #include "libbcachefs/util.h" @@ -218,6 +219,9 @@ int cmd_format(int argc, char *argv[]) break; } + if (opts.version != bcachefs_metadata_version_current) + initialize = false; + if (!devices.nr) die("Please supply a device"); @@ -270,7 +274,7 @@ int cmd_format(int argc, char *argv[]) mount_opts); if (IS_ERR(c)) die("error opening %s: %s", device_paths.data[0], - strerror(-PTR_ERR(c))); + bch2_err_str(PTR_ERR(c))); bch2_fs_stop(c); } @@ -336,7 +340,7 @@ int cmd_show_super(int argc, char *argv[]) struct bch_sb_handle sb; int ret = bch2_read_super(dev, &opts, &sb); if (ret) - die("Error opening %s: %s", dev, strerror(-ret)); + die("Error opening %s: %s", dev, bch2_err_str(ret)); struct printbuf buf = PRINTBUF; diff --git a/cmd_fs.c b/cmd_fs.c index 007c8d8..d6e2b22 100644 --- a/cmd_fs.c +++ b/cmd_fs.c @@ -1,4 +1,4 @@ - +#include #include #include @@ -275,30 +275,40 @@ static void fs_usage_to_text(struct printbuf *out, const char *path) bcache_fs_close(fs); } -int fs_usage(void) +static void fs_usage_usage(void) { - puts("bcachefs fs - manage a running filesystem\n" - "Usage: bcachefs fs [OPTION]... path\n" - "\n" - "Commands:\n" - " usage show disk usage\n" - "\n" - "Report bugs to "); - return 0; + puts("bcachefs fs usage - display detailed filesystem usage\n" + "Usage: bcachefs fs usage [OPTION]... \n" + "\n" + "Options:\n" + " -h, --human-readable Human readable units\n" + " --help Display this help and exit\n" + "Report bugs to "); } int cmd_fs_usage(int argc, char *argv[]) { + static const struct option longopts[] = { + { "help", no_argument, NULL, 'H' }, + { NULL } + }; bool human_readable = false; struct printbuf buf = PRINTBUF; char *fs; int opt; - while ((opt = getopt(argc, argv, "h")) != -1) + while ((opt = getopt_long(argc, argv, "h", + longopts, NULL)) != -1) switch (opt) { case 'h': human_readable = true; break; + case 'H': + fs_usage_usage(); + exit(EXIT_SUCCESS); + default: + fs_usage_usage(); + exit(EXIT_FAILURE); } args_shift(optind); diff --git a/cmd_fsck.c b/cmd_fsck.c index 247e207..0954a83 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -12,20 +12,23 @@ static void usage(void) "Usage: bcachefs fsck [OPTION]... 
\n" "\n" "Options:\n" - " -p Automatic repair (no questions)\n" - " -n Don't repair, only check for errors\n" - " -y Assume \"yes\" to all questions\n" - " -f Force checking even if filesystem is marked clean\n" - " --reconstruct_alloc Reconstruct the alloc btree\n" - " -v Be verbose\n" - " -h Display this help and exit\n" + " -p Automatic repair (no questions)\n" + " -n Don't repair, only check for errors\n" + " -y Assume \"yes\" to all questions\n" + " -f Force checking even if filesystem is marked clean\n" + " -r, --ratelimit_errors Don't display more than 10 errors of a given type\n" + " -R, --reconstruct_alloc Reconstruct the alloc btree\n" + " -v Be verbose\n" + " -h, --help Display this help and exit\n" "Report bugs to "); } int cmd_fsck(int argc, char *argv[]) { static const struct option longopts[] = { + { "ratelimit_errors", no_argument, NULL, 'r' }, { "reconstruct_alloc", no_argument, NULL, 'R' }, + { "help", no_argument, NULL, 'h' }, { NULL } }; struct bch_opts opts = bch2_opts_empty(); @@ -34,22 +37,20 @@ int cmd_fsck(int argc, char *argv[]) opt_set(opts, degraded, true); opt_set(opts, fsck, true); - opt_set(opts, fix_errors, FSCK_OPT_ASK); + opt_set(opts, fix_errors, FSCK_FIX_ask); while ((opt = getopt_long(argc, argv, - "apynfo:vh", + "apynfo:rvh", longopts, NULL)) != -1) switch (opt) { case 'a': /* outdated alias for -p */ case 'p': - opt_set(opts, fix_errors, FSCK_OPT_YES); - break; case 'y': - opt_set(opts, fix_errors, FSCK_OPT_YES); + opt_set(opts, fix_errors, FSCK_FIX_yes); break; case 'n': opt_set(opts, nochanges, true); - opt_set(opts, fix_errors, FSCK_OPT_NO); + opt_set(opts, fix_errors, FSCK_FIX_no); break; case 'f': /* force check, even if filesystem marked clean: */ @@ -59,6 +60,9 @@ int cmd_fsck(int argc, char *argv[]) if (ret) return ret; break; + case 'r': + opt_set(opts, ratelimit_errors, true); + break; case 'R': opt_set(opts, reconstruct_alloc, true); break; @@ -89,7 +93,7 @@ int cmd_fsck(int argc, char *argv[]) struct bch_fs *c = bch2_fs_open(argv, argc, opts); if (IS_ERR(c)) { - fprintf(stderr, "error opening %s: %s\n", argv[0], strerror(-PTR_ERR(c))); + fprintf(stderr, "error opening %s: %s\n", argv[0], bch2_err_str(PTR_ERR(c))); exit(8); } diff --git a/cmd_fusemount.c b/cmd_fusemount.c index 216094f..4470f83 100644 --- a/cmd_fusemount.c +++ b/cmd_fusemount.c @@ -17,6 +17,7 @@ #include "libbcachefs/btree_iter.h" #include "libbcachefs/buckets.h" #include "libbcachefs/dirent.h" +#include "libbcachefs/errcode.h" #include "libbcachefs/error.h" #include "libbcachefs/fs-common.h" #include "libbcachefs/inode.h" @@ -1229,7 +1230,7 @@ int cmd_fusemount(int argc, char *argv[]) c = bch2_fs_open(ctx.devices, ctx.nr_devices, bch_opts); if (IS_ERR(c)) die("error opening %s: %s", ctx.devices_str, - strerror(-PTR_ERR(c))); + bch2_err_str(PTR_ERR(c))); /* Fuse */ struct fuse_session *se = diff --git a/cmd_key.c b/cmd_key.c index 63b0541..e8c3eea 100644 --- a/cmd_key.c +++ b/cmd_key.c @@ -55,7 +55,7 @@ int cmd_unlock(int argc, char *argv[]) struct bch_sb_handle sb; int ret = bch2_read_super(dev, &opts, &sb); if (ret) - die("Error opening %s: %s", dev, strerror(-ret)); + die("Error opening %s: %s", dev, bch2_err_str(ret)); if (!bch2_sb_is_encrypted(sb.sb)) die("%s is not encrypted", dev); @@ -90,7 +90,7 @@ int cmd_set_passphrase(int argc, char *argv[]) c = bch2_fs_open(argv + 1, argc - 1, opts); if (IS_ERR(c)) - die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c))); + die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c))); struct bch_sb_field_crypt *crypt = 
bch2_sb_get_crypt(c->disk_sb.sb); if (!crypt) @@ -127,7 +127,7 @@ int cmd_remove_passphrase(int argc, char *argv[]) opt_set(opts, nostart, true); c = bch2_fs_open(argv + 1, argc - 1, opts); if (IS_ERR(c)) - die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c))); + die("Error opening %s: %s", argv[1], bch2_err_str(PTR_ERR(c))); struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb); if (!crypt) diff --git a/cmd_kill_btree_node.c b/cmd_kill_btree_node.c index a0e0fc9..a8915a1 100644 --- a/cmd_kill_btree_node.c +++ b/cmd_kill_btree_node.c @@ -9,6 +9,7 @@ #include "libbcachefs/bcachefs.h" #include "libbcachefs/btree_iter.h" +#include "libbcachefs/errcode.h" #include "libbcachefs/error.h" #include "libbcachefs/super.h" @@ -60,7 +61,7 @@ int cmd_kill_btree_node(int argc, char *argv[]) struct bch_fs *c = bch2_fs_open(argv, argc, opts); if (IS_ERR(c)) - die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); + die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c))); struct btree_trans trans; struct btree_iter iter; @@ -70,7 +71,7 @@ int cmd_kill_btree_node(int argc, char *argv[]) ret = posix_memalign(&zeroes, c->opts.block_size, c->opts.block_size); if (ret) - die("error %s from posix_memalign", strerror(ret)); + die("error %s from posix_memalign", bch2_err_str(ret)); bch2_trans_init(&trans, c, 0, 0); diff --git a/cmd_list.c b/cmd_list.c deleted file mode 100644 index 382153d..0000000 --- a/cmd_list.c +++ /dev/null @@ -1,408 +0,0 @@ -#include -#include -#include -#include - -#include "cmds.h" -#include "libbcachefs.h" -#include "qcow2.h" -#include "tools-util.h" - -#include "libbcachefs/bcachefs.h" -#include "libbcachefs/btree_cache.h" -#include "libbcachefs/btree_io.h" -#include "libbcachefs/btree_iter.h" -#include "libbcachefs/checksum.h" -#include "libbcachefs/error.h" -#include "libbcachefs/extents.h" -#include "libbcachefs/super.h" - -static void list_keys(struct bch_fs *c, enum btree_id btree_id, - struct bpos start, struct bpos end) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, btree_id, start, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_PREFETCH, k, ret) { - if (bkey_cmp(k.k->p, end) > 0) - break; - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, k); - puts(buf.buf); - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - - printbuf_exit(&buf); -} - -static void list_btree_formats(struct bch_fs *c, enum btree_id btree_id, unsigned level, - struct bpos start, struct bpos end) -{ - struct btree_trans trans; - struct btree_iter iter; - struct btree *b; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) { - if (bkey_cmp(b->key.k.p, end) > 0) - break; - - printbuf_reset(&buf); - bch2_btree_node_to_text(&buf, c, b); - puts(buf.buf); - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) - die("error %s walking btree nodes", strerror(-ret)); - - bch2_trans_exit(&trans); - printbuf_exit(&buf); -} - -static void list_nodes(struct bch_fs *c, enum btree_id btree_id, unsigned level, - struct bpos start, struct bpos end) -{ - struct btree_trans trans; - struct btree_iter iter; - struct btree *b; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) { - if (bkey_cmp(b->key.k.p, end) > 0) 
- break; - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - fputs(buf.buf, stdout); - putchar('\n'); - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) - die("error %s walking btree nodes", strerror(-ret)); - - bch2_trans_exit(&trans); - printbuf_exit(&buf); -} - -static void print_node_ondisk(struct bch_fs *c, struct btree *b) -{ - struct btree_node *n_ondisk; - struct extent_ptr_decoded pick; - struct bch_dev *ca; - struct bio *bio; - unsigned offset = 0; - int ret; - - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { - printf("error getting device to read from\n"); - return; - } - - ca = bch_dev_bkey_exists(c, pick.ptr.dev); - if (!bch2_dev_get_ioref(ca, READ)) { - printf("error getting device to read from\n"); - return; - } - - n_ondisk = aligned_alloc(block_bytes(c), btree_bytes(c)); - - bio = bio_alloc_bioset(ca->disk_sb.bdev, - buf_pages(n_ondisk, btree_bytes(c)), - REQ_OP_READ|REQ_META, - GFP_NOIO, - &c->btree_bio); - bio->bi_iter.bi_sector = pick.ptr.offset; - bch2_bio_map(bio, n_ondisk, btree_bytes(c)); - - ret = submit_bio_wait(bio); - if (ret) - die("error reading btree node: %i", ret); - - bio_put(bio); - percpu_ref_put(&ca->io_ref); - - while (offset < btree_sectors(c)) { - struct bset *i; - struct nonce nonce; - struct bch_csum csum; - struct bkey_packed *k; - unsigned sectors; - - if (!offset) { - i = &n_ondisk->keys; - - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - die("unknown checksum type at offset %u: %llu", - offset, BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, offset << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); - - if (bch2_crc_cmp(csum, n_ondisk->csum)) - die("invalid checksum\n"); - - bset_encrypt(c, i, offset << 9); - - sectors = vstruct_sectors(n_ondisk, c->block_bits); - } else { - struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); - - i = &bne->keys; - - if (i->seq != n_ondisk->keys.seq) - break; - - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - die("unknown checksum type at offset %u: %llu", - offset, BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, offset << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - - if (bch2_crc_cmp(csum, bne->csum)) - die("invalid checksum"); - - bset_encrypt(c, i, offset << 9); - - sectors = vstruct_sectors(bne, c->block_bits); - } - - fprintf(stdout, " offset %u version %u, journal seq %llu\n", - offset, - le16_to_cpu(i->version), - le64_to_cpu(i->journal_seq)); - offset += sectors; - - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) { - struct bkey u; - struct printbuf buf = PRINTBUF; - - printbuf_indent_add(&buf, 4); - - bch2_bkey_val_to_text(&buf, c, bkey_disassemble(b, k, &u)); - fprintf(stdout, "%s\n", buf.buf); - - printbuf_exit(&buf); - } - } - - free(n_ondisk); -} - -static void list_nodes_ondisk(struct bch_fs *c, enum btree_id btree_id, unsigned level, - struct bpos start, struct bpos end) -{ - struct btree_trans trans; - struct btree_iter iter; - struct btree *b; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) { - if (bkey_cmp(b->key.k.p, end) > 0) - break; - - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - fputs(buf.buf, stdout); - putchar('\n'); - - print_node_ondisk(c, b); - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) - die("error %s walking btree nodes", strerror(-ret)); - - bch2_trans_exit(&trans); - 
printbuf_exit(&buf); -} - -static void list_nodes_keys(struct bch_fs *c, enum btree_id btree_id, unsigned level, - struct bpos start, struct bpos end) -{ - struct btree_trans trans; - struct btree_iter iter; - struct btree_node_iter node_iter; - struct bkey unpacked; - struct bkey_s_c k; - struct btree *b; - struct printbuf buf = PRINTBUF; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - __for_each_btree_node(&trans, iter, btree_id, start, 0, level, 0, b, ret) { - if (bkey_cmp(b->key.k.p, end) > 0) - break; - - printbuf_reset(&buf); - bch2_btree_node_to_text(&buf, c, b); - fputs(buf.buf, stdout); - - for_each_btree_node_key_unpack(b, k, &node_iter, &unpacked) { - printbuf_reset(&buf); - bch2_bkey_val_to_text(&buf, c, k); - putchar('\t'); - puts(buf.buf); - } - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) - die("error %s walking btree nodes", strerror(-ret)); - - bch2_trans_exit(&trans); - printbuf_exit(&buf); -} - -static void list_keys_usage(void) -{ - puts("bcachefs list - list filesystem metadata to stdout\n" - "Usage: bcachefs list [OPTION]... \n" - "\n" - "Options:\n" - " -b (extents|inodes|dirents|xattrs) Btree to list from\n" - " -l level Btree depth to descend to (0 == leaves)\n" - " -s inode:offset Start position to list from\n" - " -e inode:offset End position\n" - " -i inode List keys for a given inode number\n" - " -m (keys|formats|nodes|nodes_ondisk|nodes_keys)\n" - " List mode\n" - " -f Check (fsck) the filesystem first\n" - " -v Verbose mode\n" - " -h Display this help and exit\n" - "Report bugs to "); -} - -#define LIST_MODES() \ - x(keys) \ - x(formats) \ - x(nodes) \ - x(nodes_ondisk) \ - x(nodes_keys) - -enum list_modes { -#define x(n) LIST_MODE_##n, - LIST_MODES() -#undef x -}; - -static const char * const list_modes[] = { -#define x(n) #n, - LIST_MODES() -#undef x - NULL -}; - -int cmd_list(int argc, char *argv[]) -{ - struct bch_opts opts = bch2_opts_empty(); - enum btree_id btree_id_start = 0; - enum btree_id btree_id_end = BTREE_ID_NR; - enum btree_id btree_id; - unsigned level = 0; - struct bpos start = POS_MIN, end = POS_MAX; - u64 inum = 0; - int mode = 0, opt; - - opt_set(opts, nochanges, true); - opt_set(opts, norecovery, true); - opt_set(opts, degraded, true); - opt_set(opts, errors, BCH_ON_ERROR_continue); - - while ((opt = getopt(argc, argv, "b:l:s:e:i:m:fvh")) != -1) - switch (opt) { - case 'b': - btree_id_start = read_string_list_or_die(optarg, - bch2_btree_ids, "btree id"); - btree_id_end = btree_id_start + 1; - break; - case 'l': - if (kstrtouint(optarg, 10, &level) || level >= BTREE_MAX_DEPTH) - die("invalid level"); - break; - case 's': - start = bpos_parse(optarg); - break; - case 'e': - end = bpos_parse(optarg); - break; - case 'i': - if (kstrtoull(optarg, 10, &inum)) - die("invalid inode %s", optarg); - start = POS(inum, 0); - end = POS(inum + 1, 0); - break; - case 'm': - mode = read_string_list_or_die(optarg, - list_modes, "list mode"); - break; - case 'f': - opt_set(opts, fix_errors, FSCK_OPT_YES); - opt_set(opts, norecovery, false); - break; - case 'v': - opt_set(opts, verbose, true); - break; - case 'h': - list_keys_usage(); - exit(EXIT_SUCCESS); - } - args_shift(optind); - - if (!argc) - die("Please supply device(s)"); - - struct bch_fs *c = bch2_fs_open(argv, argc, opts); - if (IS_ERR(c)) - die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); - - - for (btree_id = btree_id_start; - btree_id < btree_id_end; - btree_id++) { - switch (mode) { - case LIST_MODE_keys: - list_keys(c, btree_id, start, end); - break; - case 
LIST_MODE_formats: - list_btree_formats(c, btree_id, level, start, end); - break; - case LIST_MODE_nodes: - list_nodes(c, btree_id, level, start, end); - break; - case LIST_MODE_nodes_ondisk: - list_nodes_ondisk(c, btree_id, level, start, end); - break; - case LIST_MODE_nodes_keys: - list_nodes_keys(c, btree_id, level, start, end); - break; - default: - die("Invalid mode"); - } - } - - bch2_fs_stop(c); - return 0; -} diff --git a/cmd_list_journal.c b/cmd_list_journal.c index 0836ebf..655bfe2 100644 --- a/cmd_list_journal.c +++ b/cmd_list_journal.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -9,21 +10,27 @@ #include "libbcachefs/bcachefs.h" #include "libbcachefs/btree_iter.h" +#include "libbcachefs/errcode.h" #include "libbcachefs/error.h" #include "libbcachefs/journal_io.h" #include "libbcachefs/journal_seq_blacklist.h" #include "libbcachefs/super.h" +static const char *NORMAL = "\x1B[0m"; +static const char *RED = "\x1B[31m"; + static void list_journal_usage(void) { puts("bcachefs list_journal - print contents of journal\n" "Usage: bcachefs list_journal [OPTION]... \n" "\n" "Options:\n" - " -a Read entire journal, not just dirty entries\n" - " -n Number of journal entries to print, starting from the most recent\n" - " -v Verbose mode\n" - " -h Display this help and exit\n" + " -a Read entire journal, not just dirty entries\n" + " -n, --nr-entries=nr Number of journal entries to print, starting from the most recent\n" + " -t, --transaction-filter=bbpos Filter transactions not updating \n" + " -k, --key-filter=btree Filter keys not updating btree\n" + " -v, --verbose Verbose mode\n" + " -h, --help Display this help and exit\n" "Report bugs to "); } @@ -38,48 +45,91 @@ static void star_start_of_lines(char *buf) p[1] = '*'; } -int cmd_list_journal(int argc, char *argv[]) +static inline bool entry_is_transaction_start(struct jset_entry *entry) { - struct bch_opts opts = bch2_opts_empty(); - u32 nr_entries = U32_MAX; - int opt; + return entry->type == BCH_JSET_ENTRY_log && !entry->level; +} - opt_set(opts, nochanges, true); - opt_set(opts, norecovery, true); - opt_set(opts, degraded, true); - opt_set(opts, errors, BCH_ON_ERROR_continue); - opt_set(opts, fix_errors, FSCK_OPT_YES); - opt_set(opts, keep_journal, true); - opt_set(opts, read_journal_only,true); +typedef DARRAY(struct bbpos) d_bbpos; +typedef DARRAY(enum btree_id) d_btree_id; - while ((opt = getopt(argc, argv, "an:vh")) != -1) - switch (opt) { - case 'a': - opt_set(opts, read_entire_journal, true); - break; - case 'n': - nr_entries = kstrtouint(optarg, 10, &nr_entries); - opt_set(opts, read_entire_journal, true); - break; - case 'v': - opt_set(opts, verbose, true); - break; - case 'h': - list_journal_usage(); - exit(EXIT_SUCCESS); - } - args_shift(optind); +static bool bkey_matches_filter(d_bbpos filter, struct jset_entry *entry, struct bkey_i *k) +{ + struct bbpos *i; - if (!argc) - die("Please supply device(s) to open"); + darray_for_each(filter, i) { + if (i->btree != entry->btree_id) + continue; - struct bch_fs *c = bch2_fs_open(argv, argc, opts); - if (IS_ERR(c)) - die("error opening %s: %s", argv[0], strerror(-PTR_ERR(c))); + if (bkey_eq(i->pos, k->k.p)) + return true; + + if (btree_node_type_is_extents(i->btree) && + bkey_ge(i->pos, bkey_start_pos(&k->k)) && + bkey_lt(i->pos, k->k.p)) + return true; + } + return false; +} + +static bool entry_matches_transaction_filter(struct jset_entry *entry, + d_bbpos filter) +{ + if (entry->type == BCH_JSET_ENTRY_btree_root || + entry->type == BCH_JSET_ENTRY_btree_keys 
|| + entry->type == BCH_JSET_ENTRY_overwrite) { + struct bkey_i *k; + + jset_entry_for_each_key(entry, k) + if (bkey_matches_filter(filter, entry, k)) + return true; + } + + return false; +} +static bool should_print_transaction(struct jset_entry *entry, struct jset_entry *end, + d_bbpos filter) +{ + if (!filter.nr) + return true; + + for (entry = vstruct_next(entry); + entry != end && !entry_is_transaction_start(entry); + entry = vstruct_next(entry)) + if (entry_matches_transaction_filter(entry, filter)) + return true; + + return false; +} + +static bool should_print_entry(struct jset_entry *entry, d_btree_id filter) +{ + struct bkey_i *k; + enum btree_id *id; + + if (!filter.nr) + return true; + + if (entry->type != BCH_JSET_ENTRY_btree_root && + entry->type != BCH_JSET_ENTRY_btree_keys && + entry->type != BCH_JSET_ENTRY_overwrite) + return true; + + jset_entry_for_each_key(entry, k) + darray_for_each(filter, id) + if (entry->btree_id == *id) + return true; + + return false; +} + +static void journal_entries_print(struct bch_fs *c, unsigned nr_entries, + d_bbpos transaction_filter, + d_btree_id key_filter) +{ struct journal_replay *p, **_p; struct genradix_iter iter; - struct jset_entry *entry; struct printbuf buf = PRINTBUF; genradix_for_each(&c->journal_entries, iter, _p) { @@ -94,46 +144,133 @@ int cmd_list_journal(int argc, char *argv[]) bch2_journal_seq_is_blacklisted(c, le64_to_cpu(p->j.seq), false); - if (blacklisted) - printf("blacklisted "); - - printf("journal entry %llu\n", le64_to_cpu(p->j.seq)); + if (!transaction_filter.nr) { + if (blacklisted) + printf("blacklisted "); - printbuf_reset(&buf); + printf("journal entry %llu\n", le64_to_cpu(p->j.seq)); - prt_printf(&buf, - " version %u\n" - " last seq %llu\n" - " flush %u\n" - " written at ", - le32_to_cpu(p->j.version), - le64_to_cpu(p->j.last_seq), - !JSET_NO_FLUSH(&p->j)); - bch2_journal_ptrs_to_text(&buf, c, p); + printbuf_reset(&buf); - if (blacklisted) - star_start_of_lines(buf.buf); - printf("%s\n", buf.buf); + prt_printf(&buf, + " version %u\n" + " last seq %llu\n" + " flush %u\n" + " written at ", + le32_to_cpu(p->j.version), + le64_to_cpu(p->j.last_seq), + !JSET_NO_FLUSH(&p->j)); + bch2_journal_ptrs_to_text(&buf, c, p); - vstruct_for_each(&p->j, entry) { + if (blacklisted) + star_start_of_lines(buf.buf); + printf("%s\n", buf.buf); printbuf_reset(&buf); + } + + struct jset_entry *entry = p->j.start; + struct jset_entry *end = vstruct_last(&p->j); + while (entry != end) { /* * log entries denote the start of a new transaction * commit: */ - if (entry->type == BCH_JSET_ENTRY_log && !entry->level) + if (entry_is_transaction_start(entry)) { + if (!should_print_transaction(entry, end, transaction_filter)) { + do { + entry = vstruct_next(entry); + } while (entry != end && !entry_is_transaction_start(entry)); + + continue; + } + prt_newline(&buf); + } + + if (!should_print_entry(entry, key_filter)) + goto next; + + bool highlight = entry_matches_transaction_filter(entry, transaction_filter); + if (highlight) + fputs(RED, stdout); + printbuf_indent_add(&buf, 4); bch2_journal_entry_to_text(&buf, c, entry); if (blacklisted) star_start_of_lines(buf.buf); printf("%s\n", buf.buf); + printbuf_reset(&buf); + + if (highlight) + fputs(NORMAL, stdout); +next: + entry = vstruct_next(entry); } } printbuf_exit(&buf); +} + +int cmd_list_journal(int argc, char *argv[]) +{ + static const struct option longopts[] = { + { "nr-entries", required_argument, NULL, 'n' }, + { "transaction-filter", required_argument, NULL, 't' }, + { "key-filter", 
required_argument, NULL, 'k' }, + { "verbose", no_argument, NULL, 'v' }, + { "help", no_argument, NULL, 'h' }, + { NULL } + }; + struct bch_opts opts = bch2_opts_empty(); + u32 nr_entries = U32_MAX; + d_bbpos transaction_filter = { 0 }; + d_btree_id key_filter = { 0 }; + int opt; + + opt_set(opts, nochanges, true); + opt_set(opts, norecovery, true); + opt_set(opts, degraded, true); + opt_set(opts, errors, BCH_ON_ERROR_continue); + opt_set(opts, fix_errors, FSCK_FIX_yes); + opt_set(opts, keep_journal, true); + opt_set(opts, read_journal_only,true); + + while ((opt = getopt_long(argc, argv, "an:t:k:vh", + longopts, NULL)) != -1) + switch (opt) { + case 'a': + opt_set(opts, read_entire_journal, true); + break; + case 'n': + if (kstrtouint(optarg, 10, &nr_entries)) + die("error parsing nr_entries"); + opt_set(opts, read_entire_journal, true); + break; + case 't': + darray_push(&transaction_filter, bbpos_parse(optarg)); + break; + case 'k': + darray_push(&key_filter, read_string_list_or_die(optarg, bch2_btree_ids, "btree id")); + break; + case 'v': + opt_set(opts, verbose, true); + break; + case 'h': + list_journal_usage(); + exit(EXIT_SUCCESS); + } + args_shift(optind); + + if (!argc) + die("Please supply device(s) to open"); + + struct bch_fs *c = bch2_fs_open(argv, argc, opts); + if (IS_ERR(c)) + die("error opening %s: %s", argv[0], bch2_err_str(PTR_ERR(c))); + + journal_entries_print(c, nr_entries, transaction_filter, key_filter); bch2_fs_stop(c); return 0; } diff --git a/cmd_migrate.c b/cmd_migrate.c index 707f13e..3958ba6 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -30,6 +30,7 @@ #include "libbcachefs/btree_update.h" #include "libbcachefs/buckets.h" #include "libbcachefs/dirent.h" +#include "libbcachefs/errcode.h" #include "libbcachefs/fs-common.h" #include "libbcachefs/inode.h" #include "libbcachefs/io.h" @@ -127,7 +128,7 @@ static void update_inode(struct bch_fs *c, ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i, NULL, NULL, 0); if (ret) - die("error updating inode: %s", strerror(-ret)); + die("error updating inode: %s", bch2_err_str(ret)); } static void create_link(struct bch_fs *c, @@ -143,7 +144,7 @@ static void create_link(struct bch_fs *c, (subvol_inum) { 1, parent->bi_inum }, &parent_u, (subvol_inum) { 1, inum }, &inode, &qstr)); if (ret) - die("error creating hardlink: %s", strerror(-ret)); + die("error creating hardlink: %s", bch2_err_str(ret)); } static struct bch_inode_unpacked create_file(struct bch_fs *c, @@ -164,7 +165,7 @@ static struct bch_inode_unpacked create_file(struct bch_fs *c, uid, gid, mode, rdev, NULL, NULL, (subvol_inum) {}, 0)); if (ret) - die("error creating %s: %s", name, strerror(-ret)); + die("error creating %s: %s", name, bch2_err_str(ret)); return new_inode; } @@ -228,14 +229,15 @@ static void copy_xattrs(struct bch_fs *c, struct bch_inode_unpacked *dst, die("error getting xattr val: %m"); const struct xattr_handler *h = xattr_resolve_name(&attr); + struct bch_inode_unpacked inode_u; int ret = bch2_trans_do(c, NULL, NULL, 0, bch2_xattr_set(&trans, (subvol_inum) { 1, dst->bi_inum }, - &hash_info, attr, + &inode_u, &hash_info, attr, val, val_size, h->flags, 0)); if (ret < 0) - die("error creating xattr: %s", strerror(-ret)); + die("error creating xattr: %s", bch2_err_str(ret)); } } @@ -270,7 +272,7 @@ static void write_data(struct bch_fs *c, int ret = bch2_disk_reservation_get(c, &op.res, len >> 9, c->opts.data_replicas, 0); if (ret) - die("error reserving space in new filesystem: %s", strerror(-ret)); + die("error reserving space in 
new filesystem: %s", bch2_err_str(ret)); closure_call(&op.cl, bch2_write, NULL, &cl); @@ -310,7 +312,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, while (length) { struct bkey_i_extent *e; - __BKEY_PADDED(k, BKEY_EXTENT_VAL_U64s_MAX) k; + BKEY_PADDED_ONSTACK(k, BKEY_EXTENT_VAL_U64s_MAX) k; u64 b = sector_to_bucket(ca, physical); struct disk_reservation res; unsigned sectors; @@ -335,12 +337,12 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, BCH_DISK_RESERVATION_NOFAIL); if (ret) die("error reserving space in new filesystem: %s", - strerror(-ret)); + bch2_err_str(ret)); ret = bch2_btree_insert(c, BTREE_ID_extents, &e->k_i, &res, NULL, 0); if (ret) - die("btree insert error %s", strerror(-ret)); + die("btree insert error %s", bch2_err_str(ret)); bch2_disk_reservation_put(c, &res); @@ -581,7 +583,7 @@ static void copy_fs(struct bch_fs *c, int src_fd, const char *src_path, int ret = bch2_inode_find_by_inum(c, (subvol_inum) { 1, BCACHEFS_ROOT_INO }, &root_inode); if (ret) - die("error looking up root directory: %s", strerror(-ret)); + die("error looking up root directory: %s", bch2_err_str(ret)); if (fchdir(src_fd)) die("chdir error: %m"); @@ -706,13 +708,13 @@ static int migrate_fs(const char *fs_path, c = bch2_fs_open(path, 1, opts); if (IS_ERR(c)) - die("Error opening new filesystem: %s", strerror(-PTR_ERR(c))); + die("Error opening new filesystem: %s", bch2_err_str(PTR_ERR(c))); mark_unreserved_space(c, extents); int ret = bch2_fs_start(c); if (ret) - die("Error starting new filesystem: %s", strerror(-ret)); + die("Error starting new filesystem: %s", bch2_err_str(ret)); copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents); @@ -724,7 +726,7 @@ static int migrate_fs(const char *fs_path, c = bch2_fs_open(path, 1, opts); if (IS_ERR(c)) - die("Error opening new filesystem: %s", strerror(-PTR_ERR(c))); + die("Error opening new filesystem: %s", bch2_err_str(PTR_ERR(c))); bch2_fs_stop(c); printf("fsck complete\n"); diff --git a/cmd_option.c b/cmd_option.c index 86768e5..6ce3401 100644 --- a/cmd_option.c +++ b/cmd_option.c @@ -20,6 +20,7 @@ #include "cmds.h" #include "libbcachefs.h" +#include "libbcachefs/errcode.h" #include "libbcachefs/opts.h" #include "libbcachefs/super-io.h" @@ -64,7 +65,7 @@ int cmd_set_option(int argc, char *argv[]) struct bch_fs *c = bch2_fs_open(argv, argc, open_opts); if (IS_ERR(c)) { - fprintf(stderr, "error opening %s: %s\n", argv[0], strerror(-PTR_ERR(c))); + fprintf(stderr, "error opening %s: %s\n", argv[0], bch2_err_str(PTR_ERR(c))); exit(EXIT_FAILURE); } diff --git a/cmds.h b/cmds.h index c18a87f..96216b2 100644 --- a/cmds.h +++ b/cmds.h @@ -20,7 +20,6 @@ int cmd_run(int argc, char *argv[]); int cmd_stop(int argc, char *argv[]); #endif -int fs_usage(void); int cmd_fs_usage(int argc, char *argv[]); int device_usage(void); @@ -61,5 +60,6 @@ int cmd_subvolume_delete(int argc, char *argv[]); int cmd_subvolume_snapshot(int argc, char *argv[]); int cmd_fusemount(int argc, char *argv[]); +void cmd_mount(int agc, char *argv[]); #endif /* _CMDS_H */ diff --git a/debian/changelog b/debian/changelog index 5d7601e..4351993 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +bcachefs-tools (24~really1.2-1) unstable; urgency=medium + + * New upstream release + + -- Jonathan Carter Thu, 28 Sep 2023 19:54:47 +0200 + bcachefs-tools (24-1) unstable; urgency=medium * New upstream release diff --git a/debian/files b/debian/files index a5d4aa2..2458325 100644 --- a/debian/files +++ b/debian/files @@ -1 +1 @@ 
-bcachefs-tools_24-1_source.buildinfo utils optional +bcachefs-tools_24~really1.2-1_source.buildinfo utils optional diff --git a/default.nix b/default.nix index a693194..2cccff2 100644 --- a/default.nix +++ b/default.nix @@ -1,128 +1,10 @@ -{ lib -, doCheck ? true -, stdenv -, pkg-config -, attr -, libuuid -, libsodium -, keyutils - -, liburcu -, zlib -, libaio -, udev -, zstd -, lz4 - -, python39 -, python39Packages -, docutils -, nixosTests - -, versionString ? "0.1" - -, inShell ? false -, debugMode ? inShell - -, testWithValgrind ? true -, valgrind - -, fuseSupport ? false -, fuse3 ? null }: - -assert fuseSupport -> fuse3 != null; -assert testWithValgrind -> valgrind != null; -stdenv.mkDerivation { - pname = "bcachefs-tools"; - - version = "v0.1-flake-${versionString}"; - VERSION = "v0.1-flake-${versionString}"; - - src = (lib.cleanSource (builtins.path { name = "bcachefs-tools-src"; path = ./. ;} )); - - postPatch = "patchShebangs --build doc/macro2rst.py"; - - nativeBuildInputs = [ - # used to find dependencies - ## see ./INSTALL - pkg-config - ]; - buildInputs = [ - # bcachefs explicit dependencies - ## see ./INSTALL - libaio - - # libblkid - keyutils # libkeyutils - lz4 # liblz4 - - libsodium - liburcu - libuuid - zstd # libzstd - zlib # zlib1g - valgrind - - # unspecified dependencies - attr - udev - - # documentation depenedencies - docutils - python39Packages.pygments - ] ++ (lib.optional fuseSupport fuse3) - ++ (lib.optional testWithValgrind valgrind) ; - - makeFlags = [ - "PREFIX=${placeholder "out"}" - ] ++ lib.optional debugMode "EXTRA_CFLAGS=-ggdb"; - - installFlags = [ - "INITRAMFS_DIR=${placeholder "out"}/etc/initramfs-tools" - ]; - - doCheck = doCheck; # needs bcachefs module loaded on builder - - checkInputs = [ - python39Packages.pytest - python39Packages.pytest-xdist - ] ++ lib.optional testWithValgrind valgrind; - - checkFlags = [ - "BCACHEFS_TEST_USE_VALGRIND=${if testWithValgrind then "yes" else "no"}" - # cannot escape spaces within make flags, quotes are stripped - "PYTEST_CMD=pytest" # "PYTEST_ARGS='-n4 --version'" - ]; - - preCheck = - '' - makeFlagsArray+=(PYTEST_ARGS="--verbose -n2") - '' + - lib.optionalString fuseSupport '' - rm tests/test_fuse.py - ''; - - dontStrip = debugMode; - passthru = { - bcachefs_revision = let - file = builtins.readFile ./.bcachefs_revision; - removeLineFeeds = str: lib.lists.foldr (lib.strings.removeSuffix) str ["\r" "\n"]; - in removeLineFeeds file; - - tests = { - smoke-test = nixosTests.bcachefs; - }; - }; - - enableParallelBuilding = true; - meta = with lib; { - description = "Userspace tools for bcachefs"; - homepage = http://bcachefs.org; - license = licenses.gpl2; - platforms = platforms.linux; - maintainers = - [ "Kent Overstreet " - ]; - - }; -} +(import + ( + let lock = builtins.fromJSON (builtins.readFile ./flake.lock); in + fetchTarball { + url = "https://github.com/edolstra/flake-compat/archive/${lock.nodes.flake-compat.locked.rev}.tar.gz"; + sha256 = lock.nodes.flake-compat.locked.narHash; + } + ) + { src = ./.; } +).defaultNix diff --git a/flake.lock b/flake.lock index 2c9c15b..899f297 100644 --- a/flake.lock +++ b/flake.lock @@ -1,27 +1,28 @@ { "nodes": { - "filter": { + "flake-compat": { + "flake": false, "locked": { - "lastModified": 1620202920, - "narHash": "sha256-BOkm3eKT45Dk4NNxJT0xL9NnyYeZcF+t79zPnJkggac=", - "owner": "numtide", - "repo": "nix-filter", - "rev": "3c9e33ed627e009428197b07216613206f06ed80", + "lastModified": 1673956053, + "narHash": "sha256-4gtG9iQuiKITOjNQQeQIpoIB6b16fm+504Ch3sNKLd8=", + 
"owner": "edolstra", + "repo": "flake-compat", + "rev": "35bb57c0c8d8b62bbfd284272c928ceb64ddbde9", "type": "github" }, "original": { - "owner": "numtide", - "repo": "nix-filter", + "owner": "edolstra", + "repo": "flake-compat", "type": "github" } }, "nixpkgs": { "locked": { - "lastModified": 1633351077, - "narHash": "sha256-z38JG4Bb0GtM1aF1pANVdp1dniMP23Yb3HnRoJRy2uU=", + "lastModified": 1686592866, + "narHash": "sha256-riGg89eWhXJcPNrQGcSwTEEm7CGxWC06oSX44hajeMw=", "owner": "nixos", "repo": "nixpkgs", - "rev": "14aef06d9b3ad1d07626bdbb16083b83f92dc6c1", + "rev": "0eeebd64de89e4163f4d3cf34ffe925a5cf67a05", "type": "github" }, "original": { @@ -33,18 +34,36 @@ }, "root": { "inputs": { - "filter": "filter", + "flake-compat": "flake-compat", "nixpkgs": "nixpkgs", "utils": "utils" } }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, "utils": { + "inputs": { + "systems": "systems" + }, "locked": { - "lastModified": 1629481132, - "narHash": "sha256-JHgasjPR0/J1J3DRm4KxM4zTyAj4IOJY8vIl75v/kPI=", + "lastModified": 1685518550, + "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=", "owner": "numtide", "repo": "flake-utils", - "rev": "997f7efcb746a9c140ce1f13c72263189225f482", + "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index b52bc7e..0f8e90d 100644 --- a/flake.nix +++ b/flake.nix @@ -1,96 +1,22 @@ { - description = "Userspace tools for bcachefs"; - - # Nixpkgs / NixOS version to use. - inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable"; - inputs.utils.url = "github:numtide/flake-utils"; - inputs.filter.url = "github:numtide/nix-filter"; - - outputs = { self, nixpkgs, utils, filter, ... }@inputs: - let - # System types to support. - supportedSystems = [ "x86_64-linux" ]; - in - { - version = "${builtins.substring 0 8 self.lastModifiedDate}-${self.shortRev or "dirty"}"; - - overlay = import ./nix/overlay.nix inputs; - nixosModule = self.nixosModules.bcachefs; - nixosModules.bcachefs = import ./rust-src/mount/module.nix; - nixosModules.bcachefs-enable-boot = ({config, pkgs, lib, ... }:{ - # Disable Upstream NixOS Module when this is in use - disabledModules = [ "tasks/filesystems/bcachefs.nix" ]; - # Import needed packages - nixpkgs.overlays = [ self.overlay ]; - - # Add bcachefs to boot and kernel - boot.initrd.supportedFilesystems = [ "bcachefs" ]; - boot.supportedFilesystems = [ "bcachefs" ]; - }); - - nixosConfigurations.netboot-bcachefs = self.systems.netboot-bcachefs "x86_64-linux"; - systems.netboot-bcachefs = system: (nixpkgs.lib.nixosSystem { - inherit system; modules = [ - self.nixosModule - self.nixosModules.bcachefs-enable-boot - ("${nixpkgs}/nixos/modules/installer/netboot/netboot-minimal.nix") - ({ lib, pkgs, config, ... 
}: { - # installation disk autologin - services.getty.autologinUser = lib.mkForce "root"; - users.users.root.initialPassword = "toor"; - - # Symlink everything together - system.build.netboot = pkgs.symlinkJoin { - name = "netboot"; - paths = with config.system.build; [ - netbootRamdisk - kernel - netbootIpxeScript - ]; - preferLocalBuild = true; - }; - }) - ]; - }); - } - // utils.lib.eachSystem supportedSystems (system: - let pkgs = import nixpkgs { - inherit system; - overlays = [ self.overlay ]; - }; - in rec { - - # A Nixpkgs overlay. - - # Provide some binary packages for selected system types. - defaultPackage = pkgs.bcachefs.tools; - packages = { - inherit (pkgs.bcachefs) - tools - toolsValgrind - toolsDebug - mount - bch_bindgen - kernel; - - tools-musl = pkgs.pkgsMusl.bcachefs.tools; - mount-musl = pkgs.pkgsMusl.bcachefs.mount; - }; - - checks = { - kernelSrc = packages.kernel.src; - inherit (packages) - mount - bch_bindgen - toolsValgrind; - - # Build and test initrd with bcachefs and bcachefs.mount installed - # Disabled Test because it takes a while to build the kernel - # bootStage1Module = self.nixosConfigurations.netboot-bcachefs.config.system.build.bootStage1; - }; - - devShell = devShells.tools; - devShells.tools = pkgs.bcachefs.tools.override { inShell = true; }; - devShells.mount = pkgs.bcachefs.mount.override { inShell = true; }; - }); + description = "Userspace tools for bcachefs"; + + # Nixpkgs / NixOS version to use. + inputs.nixpkgs.url = "github:nixos/nixpkgs/nixos-unstable"; + inputs.utils.url = "github:numtide/flake-utils"; + inputs.flake-compat = { + url = "github:edolstra/flake-compat"; + flake = false; + }; + + outputs = { self, nixpkgs, utils, ... }: + utils.lib.eachDefaultSystem (system: + let + pkgs = nixpkgs.legacyPackages.${system}; + bcachefs = pkgs.callPackage ./build.nix {}; + in { + packages = { + default = bcachefs; + }; + }); } diff --git a/include/linux/atomic.h b/include/linux/atomic.h index a9852fa..f4d047c 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -32,6 +32,8 @@ typedef struct { #define __ATOMIC_SUB(v, p) uatomic_sub(p, v) #define __ATOMIC_INC(p) uatomic_inc(p) #define __ATOMIC_DEC(p) uatomic_dec(p) +#define __ATOMIC_AND(v, p) uatomic_and(p, v) +#define __ATOMIC_OR(v, p) uatomic_or(p, v) #define xchg(p, v) uatomic_xchg(p, v) #define xchg_acquire(p, v) uatomic_xchg(p, v) @@ -56,6 +58,8 @@ typedef struct { #define __ATOMIC_SUB_RETURN(v, p) __atomic_sub_fetch(p, v, __ATOMIC_RELAXED) #define __ATOMIC_SUB_RETURN_RELEASE(v, p) \ __atomic_sub_fetch(p, v, __ATOMIC_RELEASE) +#define __ATOMIC_AND(p) __atomic_and_fetch(p, v, __ATOMIC_RELAXED) +#define __ATOMIC_OR(p) __atomic_or_fetch(p, v, __ATOMIC_RELAXED) #define xchg(p, v) __atomic_exchange_n(p, v, __ATOMIC_SEQ_CST) #define xchg_acquire(p, v) __atomic_exchange_n(p, v, __ATOMIC_ACQUIRE) @@ -244,6 +248,16 @@ static inline bool a_type##_inc_not_zero(a_type##_t *v) \ return a_type##_add_unless(v, 1, 0); \ } \ \ +static inline void a_type##_and(i_type a, a_type##_t *v) \ +{ \ + __ATOMIC_AND(a, v); \ +} \ + \ +static inline void a_type##_or(i_type a, a_type##_t *v) \ +{ \ + __ATOMIC_OR(a, v); \ +} \ + \ static inline i_type a_type##_xchg(a_type##_t *v, i_type i) \ { \ return xchg(&v->counter, i); \ @@ -257,6 +271,13 @@ static inline i_type a_type##_cmpxchg(a_type##_t *v, i_type old, i_type new)\ static inline i_type a_type##_cmpxchg_acquire(a_type##_t *v, i_type old, i_type new)\ { \ return cmpxchg_acquire(&v->counter, old, new); \ +} \ + \ +static inline bool 
a_type##_try_cmpxchg_acquire(a_type##_t *v, i_type *old, i_type new)\ +{ \ + i_type prev = *old; \ + *old = cmpxchg_acquire(&v->counter, *old, new); \ + return prev == *old; \ } DEF_ATOMIC_OPS(atomic, int) diff --git a/include/linux/bio.h b/include/linux/bio.h index 0ad5a87..1f8acca 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -234,7 +234,7 @@ enum { }; struct bio *bio_alloc_bioset(struct block_device *, unsigned, - unsigned, gfp_t, struct bio_set *); + blk_opf_t, gfp_t, struct bio_set *); extern void bio_put(struct bio *); int bio_add_page(struct bio *, struct page *, unsigned, unsigned); diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index ed47cc6..62b91af 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -4,6 +4,7 @@ #include #include #include +#include static inline void bit_spin_lock(int nr, unsigned long *_addr) { diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 80e8ecd..db2dfdb 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -135,4 +135,12 @@ static inline unsigned long find_next_zero_bit(const unsigned long *addr, unsign #define find_first_bit(addr, size) find_next_bit((addr), (size), 0) #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +static inline bool bitmap_empty(const unsigned long *src, unsigned nbits) +{ + if (small_const_nbits(nbits)) + return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); + + return find_first_bit(src, nbits) == nbits; +} + #endif /* _PERF_BITOPS_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 01b3d4a..7d378ab 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -9,6 +9,7 @@ #define BIO_MAX_VECS 256U typedef unsigned fmode_t; +typedef __u32 __bitwise blk_opf_t; struct bio; struct user_namespace; @@ -43,6 +44,7 @@ struct user_namespace; #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)0x400) +#define FMODE_BUFFERED ((__force fmode_t)0x800) struct inode { unsigned long i_ino; @@ -70,6 +72,7 @@ static inline void submit_bio(struct bio *bio) } int blkdev_issue_discard(struct block_device *, sector_t, sector_t, gfp_t); +int blkdev_issue_zeroout(struct block_device *, sector_t, sector_t, gfp_t, unsigned); #define bdev_get_queue(bdev) (&((bdev)->queue)) diff --git a/include/linux/closure.h b/include/linux/closure.h index d85ca86..722a586 100644 --- a/include/linux/closure.h +++ b/include/linux/closure.h @@ -3,7 +3,6 @@ #define _LINUX_CLOSURE_H #include -#include #include #include #include @@ -173,6 +172,11 @@ void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void __closure_sync(struct closure *cl); +static inline unsigned closure_nr_remaining(struct closure *cl) +{ + return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK; +} + /** * closure_sync - sleep until a closure a closure has nothing left to wait on * @@ -181,7 +185,7 @@ void __closure_sync(struct closure *cl); */ static inline void closure_sync(struct closure *cl) { - if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) + if (closure_nr_remaining(cl) != 1) __closure_sync(cl); } diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 6d039ea..39df1f1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -10,6 +10,10 @@ # define __always_inline inline __attribute__((always_inline)) #endif +#ifndef __attribute_const__ 
+#define __attribute_const__ __attribute__((__const__)) +#endif + #ifdef __ANDROID__ /* * FIXME: Big hammer to get rid of tons of: diff --git a/include/linux/freezer.h b/include/linux/freezer.h index cf485d7..d90373f 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -7,4 +7,6 @@ #define freezable_schedule() schedule() #define freezable_schedule_timeout(_t) schedule_timeout(_t) +static inline void __refrigerator(bool f) {} + #endif /* __TOOLS_LINUX_FREEZER_H */ diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 4fd3b68..d16ea76 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -71,7 +71,7 @@ static inline u64 sched_clock(void) { struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts); return ((s64) ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b2c1751..35a7207 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -13,6 +13,9 @@ #include #include +#define BIT(nr) (1UL << (nr)) +#define BIT_ULL(nr) (1ULL << (nr)) + #define __ARG_PLACEHOLDER_1 0, #define __take_second_arg(__ignored, val, ...) val @@ -96,6 +99,14 @@ (type *)((char *)__mptr - offsetof(type, member)); }) #endif +#define __struct_group(TAG, NAME, ATTRS, MEMBERS...) \ + union { \ + struct { MEMBERS } ATTRS; \ + struct TAG { MEMBERS } ATTRS NAME; \ + } +#define struct_group(NAME, MEMBERS...) \ + __struct_group(/* no tag */, NAME, /* no attrs */, MEMBERS) + #define max(x, y) ({ \ typeof(x) _max1 = (x); \ typeof(y) _max2 = (y); \ @@ -132,8 +143,8 @@ #define might_sleep() -#define cpu_relax() do {} while (0) -#define cpu_relax_lowlatency() do {} while (0) +#define cpu_relax() barrier() +#define cpu_relax_lowlatency() barrier() #define panic(fmt, ...) \ do { \ @@ -264,4 +275,7 @@ struct qstr { static inline void dump_stack(void) {} +#define unsafe_memcpy(dst, src, bytes, justification) \ + memcpy(dst, src, bytes) + #endif diff --git a/include/linux/list.h b/include/linux/list.h index dcc4745..bdd09ef 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -70,4 +70,32 @@ static inline void list_splice_init(struct list_head *list, #define hlist_head cds_hlist_head #define hlist_node cds_hlist_node +#define hlist_add_head(n, h) cds_hlist_add_head(n, h) +#define hlist_del(n) cds_hlist_del(n) +#define hlist_del_init(n) cds_hlist_del_init(n) + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->prev; +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + hlist_del(n); + n->prev = NULL; + n->next = NULL; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_entry_safe(ptr, type, member) \ + ({ typeof(ptr) ____ptr = (ptr); \ + ____ptr ? 
hlist_entry(____ptr, type, member) : NULL; \ + }) + +#define hlist_for_each_entry(pos, head, member) \ + for (pos = hlist_entry_safe((head)->next, typeof(*(pos)), member);\ + pos; \ + pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) + #endif /* _LIST_LIST_H */ diff --git a/include/linux/lz4.h b/include/linux/lz4.h index 86e1dde..f574964 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -2,4 +2,9 @@ #define LZ4_compress_destSize(src, dst, srclen, dstlen, workspace) \ LZ4_compress_destSize(src, dst, srclen, dstlen) + +#define LZ4_compress_HC(src, dst, srclen, dstlen, level, workspace) -1 + #define LZ4_MEM_COMPRESS 0 +#define LZ4HC_MEM_COMPRESS 0 +#define LZ4HC_MIN_CLEVEL 0 diff --git a/include/linux/math.h b/include/linux/math.h index 3cf6726..85c8c8a 100644 --- a/include/linux/math.h +++ b/include/linux/math.h @@ -2,6 +2,11 @@ #ifndef _LINUX_MATH_H #define _LINUX_MATH_H +#include + +/* abs() */ +#include + /* * This looks more complex than it should be. But we need to * get the type for the ~ right in round_down (it needs to be @@ -148,4 +153,19 @@ static inline u32 int_sqrt64(u64 x) } #endif +#define abs(x) __abs_choose_expr(x, long long, \ + __abs_choose_expr(x, long, \ + __abs_choose_expr(x, int, \ + __abs_choose_expr(x, short, \ + __abs_choose_expr(x, char, \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), char), \ + (char)({ signed char __x = (x); __x<0?-__x:__x; }), \ + ((void)0))))))) + +#define __abs_choose_expr(x, type, other) __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(x), signed type) || \ + __builtin_types_compatible_p(typeof(x), unsigned type), \ + ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other) + #endif /* _LINUX_MATH_H */ diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h index cbac6ac..6475050 100644 --- a/include/linux/mean_and_variance.h +++ b/include/linux/mean_and_variance.h @@ -4,98 +4,109 @@ #include #include +#include #include -#include #define SQRT_U64_MAX 4294967295ULL +/* + * u128_u: u128 user mode, because not all architectures support a real int128 + * type + */ -#if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) +#ifdef __SIZEOF_INT128__ -typedef unsigned __int128 u128; +typedef struct { + unsigned __int128 v; +} __aligned(16) u128_u; -static inline u128 u64_to_u128(u64 a) +static inline u128_u u64_to_u128(u64 a) { - return (u128)a; + return (u128_u) { .v = a }; } -static inline u64 u128_to_u64(u128 a) +static inline u64 u128_lo(u128_u a) { - return (u64)a; + return a.v; } -static inline u64 u128_shr64_to_u64(u128 a) +static inline u64 u128_hi(u128_u a) { - return (u64)(a >> 64); + return a.v >> 64; } -static inline u128 u128_add(u128 a, u128 b) +static inline u128_u u128_add(u128_u a, u128_u b) { - return a + b; + a.v += b.v; + return a; } -static inline u128 u128_sub(u128 a, u128 b) +static inline u128_u u128_sub(u128_u a, u128_u b) { - return a - b; + a.v -= b.v; + return a; } -static inline u128 u128_shl(u128 i, s8 shift) +static inline u128_u u128_shl(u128_u a, s8 shift) { - return i << shift; + a.v <<= shift; + return a; } -static inline u128 u128_shl64_add(u64 a, u64 b) +static inline u128_u u128_square(u64 a) { - return ((u128)a << 64) + b; -} + u128_u b = u64_to_u128(a); -static inline u128 u128_square(u64 i) -{ - return i*i; + b.v *= b.v; + return b; } #else typedef struct { u64 hi, lo; -} u128; +} __aligned(16) u128_u; -static inline u128 u64_to_u128(u64 a) +/* conversions */ + +static inline u128_u u64_to_u128(u64 a) { 
- return (u128){ .lo = a }; + return (u128_u) { .lo = a }; } -static inline u64 u128_to_u64(u128 a) +static inline u64 u128_lo(u128_u a) { return a.lo; } -static inline u64 u128_shr64_to_u64(u128 a) +static inline u64 u128_hi(u128_u a) { return a.hi; } -static inline u128 u128_add(u128 a, u128 b) +/* arithmetic */ + +static inline u128_u u128_add(u128_u a, u128_u b) { - u128 c; + u128_u c; c.lo = a.lo + b.lo; c.hi = a.hi + b.hi + (c.lo < a.lo); return c; } -static inline u128 u128_sub(u128 a, u128 b) +static inline u128_u u128_sub(u128_u a, u128_u b) { - u128 c; + u128_u c; c.lo = a.lo - b.lo; c.hi = a.hi - b.hi - (c.lo > a.lo); return c; } -static inline u128 u128_shl(u128 i, s8 shift) +static inline u128_u u128_shl(u128_u i, s8 shift) { - u128 r; + u128_u r; r.lo = i.lo << shift; if (shift < 64) @@ -107,15 +118,10 @@ static inline u128 u128_shl(u128 i, s8 shift) return r; } -static inline u128 u128_shl64_add(u64 a, u64 b) +static inline u128_u u128_square(u64 i) { - return u128_add(u128_shl(u64_to_u128(a), 64), u64_to_u128(b)); -} - -static inline u128 u128_square(u64 i) -{ - u128 r; - u64 h = i >> 32, l = i & (u64)U32_MAX; + u128_u r; + u64 h = i >> 32, l = i & U32_MAX; r = u128_shl(u64_to_u128(h*h), 64); r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); @@ -126,45 +132,67 @@ static inline u128 u128_square(u64 i) #endif -static inline u128 u128_div(u128 n, u64 d) +static inline u128_u u64s_to_u128(u64 hi, u64 lo) { - u128 r; - u64 rem; - u64 hi = u128_shr64_to_u64(n); - u64 lo = u128_to_u64(n); - u64 h = hi & ((u64)U32_MAX << 32); - u64 l = (hi & (u64)U32_MAX) << 32; + u128_u c = u64_to_u128(hi); - r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); - r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); - r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); - return r; + c = u128_shl(c, 64); + c = u128_add(c, u64_to_u128(lo)); + return c; } +u128_u u128_div(u128_u n, u64 d); + struct mean_and_variance { - s64 n; - s64 sum; - u128 sum_squares; + s64 n; + s64 sum; + u128_u sum_squares; }; /* expontentially weighted variant */ struct mean_and_variance_weighted { - bool init; - u8 w; - s64 mean; - u64 variance; + bool init; + u8 weight; /* base 2 logarithim */ + s64 mean; + u64 variance; }; -s64 fast_divpow2(s64 n, u8 d); +/** + * fast_divpow2() - fast approximation for n / (1 << d) + * @n: numerator + * @d: the power of 2 denominator. + * + * note: this rounds towards 0. + */ +static inline s64 fast_divpow2(s64 n, u8 d) +{ + return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; +} + +/** + * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 + * and return it. + * @s1: the mean_and_variance to update. + * @v1: the new sample. + * + * see linked pdf equation 12. 
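+ *
+ * Worked example (editor's illustration, assuming the usual
+ * E[x^2] - E[x]^2 variance formula in the getters below): starting
+ * from a zeroed struct, feeding the samples 3 and 5 gives n = 2,
+ * sum = 8, sum_squares = 3*3 + 5*5 = 34, so the reported mean is
+ * 8 / 2 = 4 and the variance is 34 / 2 - 4 * 4 = 1.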
+ */ +static inline void +mean_and_variance_update(struct mean_and_variance *s, s64 v) +{ + s->n++; + s->sum += v; + s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v))); +} + +s64 mean_and_variance_get_mean(struct mean_and_variance s); +u64 mean_and_variance_get_variance(struct mean_and_variance s1); +u32 mean_and_variance_get_stddev(struct mean_and_variance s); -struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1); - s64 mean_and_variance_get_mean(struct mean_and_variance s); - u64 mean_and_variance_get_variance(struct mean_and_variance s1); - u32 mean_and_variance_get_stddev(struct mean_and_variance s); +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); -struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, s64 v1); - s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); - u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); - u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); #endif // MEAN_AND_VAIRANCE_H_ diff --git a/include/linux/poison.h b/include/linux/poison.h index d62ef5a..851a855 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -81,4 +81,13 @@ /********** net/core/page_pool.c **********/ #define PP_SIGNATURE (0x40 + POISON_POINTER_DELTA) +/********** net/core/skbuff.c **********/ +#define SKB_LIST_POISON_NEXT ((void *)(0x800 + POISON_POINTER_DELTA)) + +/********** kernel/bpf/ **********/ +#define BPF_PTR_POISON ((void *)(0xeB9FUL + POISON_POINTER_DELTA)) + +/********** VFS **********/ +#define VFS_PTR_POISON ((void *)(0xF5 + POISON_POINTER_DELTA)) + #endif diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h index 65beeb1..a8dad16 100644 --- a/include/linux/posix_acl_xattr.h +++ b/include/linux/posix_acl_xattr.h @@ -28,7 +28,7 @@ typedef struct { posix_acl_xattr_entry a_entries[0]; } posix_acl_xattr_header; -extern const struct xattr_handler posix_acl_access_xattr_handler; -extern const struct xattr_handler posix_acl_default_xattr_handler; +extern const struct xattr_handler nop_posix_acl_access; +extern const struct xattr_handler nop_posix_acl_default; #endif /* _POSIX_ACL_XATTR_H */ diff --git a/include/linux/prandom.h b/include/linux/prandom.h index 6f177cd..9aea22d 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -23,5 +23,11 @@ prandom_type(u32); prandom_type(u64); #undef prandom_type +static inline u32 prandom_u32_max(u32 max) +{ + return prandom_u32() % max; + +} + #endif /* _LINUX_PRANDOM_H */ diff --git a/include/linux/random.h b/include/linux/random.h index ea101d5..3203d13 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -9,6 +9,7 @@ #include #include #include +#include #ifdef SYS_getrandom static inline int getrandom(void *buf, size_t buflen, unsigned int flags) @@ -40,7 +41,30 @@ static inline type get_random_##type(void) \ get_random_type(int); get_random_type(long); +get_random_type(u8); +get_random_type(u16); get_random_type(u32); get_random_type(u64); +static inline u32 get_random_u32_below(u32 ceil) +{ + if (ceil <= 1) + return 0; + for (;;) { + if (ceil <= 1U << 8) { + u32 mult = ceil * get_random_u8(); + if 
(likely(is_power_of_2(ceil) || (u8)mult >= (1U << 8) % ceil)) + return mult >> 8; + } else if (ceil <= 1U << 16) { + u32 mult = ceil * get_random_u16(); + if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil)) + return mult >> 16; + } else { + u64 mult = (u64)ceil * get_random_u32(); + if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil)) + return mult >> 32; + } + } +} + #endif /* _LINUX_RANDOM_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3db40cb..ef03253 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -12,6 +12,7 @@ #define rcu_access_pointer(p) READ_ONCE(p) #define kfree_rcu(ptr, rcu_head) kfree(ptr) /* XXX */ +#define kvfree_rcu(ptr) kfree(ptr) /* XXX */ #define RCU_INIT_POINTER(p, v) WRITE_ONCE(p, v) diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index c5e717b..1c6dbdc 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -27,8 +27,6 @@ #include #include -#define BIT(nr) (1UL << (nr)) - #include /* * Objects in an rhashtable have an embedded struct rhash_head diff --git a/include/linux/sched.h b/include/linux/sched.h index ac6d27b..c5c8e3a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,6 +28,7 @@ #define TASK_NEW 2048 #define TASK_IDLE_WORKER 4096 #define TASK_STATE_MAX 8192 +#define TASK_FREEZABLE (1U << 14) /* Convenience macros for the sake of set_task_state */ #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) @@ -116,7 +117,7 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -#define cond_resched() +static inline void cond_resched(void) {} #define need_resched() 0 void schedule(void); diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h new file mode 100644 index 0000000..8c9c0dd --- /dev/null +++ b/include/linux/seq_buf.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SEQ_BUF_H +#define _LINUX_SEQ_BUF_H + +#include +#include +#include + +/* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use. + */ + +/** + * seq_buf - seq buffer structure + * @buffer: pointer to the buffer + * @size: size of the buffer + * @len: the amount of data inside the buffer + * @readpos: The next position to read in the buffer. + */ +struct seq_buf { + char *buffer; + size_t size; + size_t len; + loff_t readpos; +}; + +static inline void seq_buf_clear(struct seq_buf *s) +{ + s->len = 0; + s->readpos = 0; +} + +static inline void +seq_buf_init(struct seq_buf *s, char *buf, unsigned int size) +{ + s->buffer = buf; + s->size = size; + seq_buf_clear(s); +} + +/* + * seq_buf have a buffer that might overflow. When this happens + * the len and size are set to be equal. + */ +static inline bool +seq_buf_has_overflowed(struct seq_buf *s) +{ + return s->len > s->size; +} + +static inline void +seq_buf_set_overflow(struct seq_buf *s) +{ + s->len = s->size + 1; +} + +/* + * How much buffer is left on the seq_buf? + */ +static inline unsigned int +seq_buf_buffer_left(struct seq_buf *s) +{ + if (seq_buf_has_overflowed(s)) + return 0; + + return s->size - s->len; +} + +/* How much buffer was written? */ +static inline unsigned int seq_buf_used(struct seq_buf *s) +{ + return min(s->len, s->size); +} + +/** + * seq_buf_terminate - Make sure buffer is nul terminated + * @s: the seq_buf descriptor to terminate. + * + * This makes sure that the buffer in @s is nul terminated and + * safe to read as a string. 
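+ *
+ * A minimal usage sketch (editor's illustration; buffer name and
+ * contents are arbitrary):
+ *
+ *   char tmp[128];
+ *   struct seq_buf s;
+ *
+ *   seq_buf_init(&s, tmp, sizeof(tmp));
+ *   seq_buf_printf(&s, "%u free", 16);
+ *   seq_buf_terminate(&s);
+ *   printf("%s\n", tmp);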
+ * + * Note, if this is called when the buffer has overflowed, then + * the last byte of the buffer is zeroed, and the len will still + * point passed it. + * + * After this function is called, s->buffer is safe to use + * in string operations. + */ +static inline void seq_buf_terminate(struct seq_buf *s) +{ + if (WARN_ON(s->size == 0)) + return; + + if (seq_buf_buffer_left(s)) + s->buffer[s->len] = 0; + else + s->buffer[s->size - 1] = 0; +} + +/** + * seq_buf_get_buf - get buffer to write arbitrary data to + * @s: the seq_buf handle + * @bufp: the beginning of the buffer is stored here + * + * Return the number of bytes available in the buffer, or zero if + * there's no space. + */ +static inline size_t seq_buf_get_buf(struct seq_buf *s, char **bufp) +{ + WARN_ON(s->len > s->size + 1); + + if (s->len < s->size) { + *bufp = s->buffer + s->len; + return s->size - s->len; + } + + *bufp = NULL; + return 0; +} + +/** + * seq_buf_commit - commit data to the buffer + * @s: the seq_buf handle + * @num: the number of bytes to commit + * + * Commit @num bytes of data written to a buffer previously acquired + * by seq_buf_get. To signal an error condition, or that the data + * didn't fit in the available space, pass a negative @num value. + */ +static inline void seq_buf_commit(struct seq_buf *s, int num) +{ + if (num < 0) { + seq_buf_set_overflow(s); + } else { + /* num must be negative on overflow */ + BUG_ON(s->len + num > s->size); + s->len += num; + } +} + +extern __printf(2, 3) +int seq_buf_printf(struct seq_buf *s, const char *fmt, ...); +extern __printf(2, 0) +int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args); +extern int seq_buf_to_user(struct seq_buf *s, char __user *ubuf, + int cnt); +extern int seq_buf_puts(struct seq_buf *s, const char *str); +extern int seq_buf_putc(struct seq_buf *s, unsigned char c); + +void seq_buf_human_readable_u64(struct seq_buf *, u64); + +#endif /* _LINUX_SEQ_BUF_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index ebbab7a..bca00d6 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -11,13 +11,13 @@ struct shrink_control { #define SHRINK_STOP (~0UL) -struct printbuf; +struct seq_buf; struct shrinker { unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); - void (*to_text)(struct printbuf *, struct shrinker *); + void (*to_text)(struct seq_buf *, struct shrinker *); int seeks; /* seeks to recreate an obj */ long batch; /* reclaim batch size, 0 = default */ diff --git a/include/linux/six.h b/include/linux/six.h index 362a577..394da42 100644 --- a/include/linux/six.h +++ b/include/linux/six.h @@ -3,99 +3,131 @@ #ifndef _LINUX_SIX_H #define _LINUX_SIX_H -/* - * Shared/intent/exclusive locks: sleepable read/write locks, much like rw - * semaphores, except with a third intermediate state, intent. Basic operations - * are: +/** + * DOC: SIX locks overview * - * six_lock_read(&foo->lock); - * six_unlock_read(&foo->lock); + * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores + * but with an additional state: read/shared, intent, exclusive/write * - * six_lock_intent(&foo->lock); - * six_unlock_intent(&foo->lock); + * The purpose of the intent state is to allow for greater concurrency on tree + * structures without deadlocking. 
In general, a read can't be upgraded to a + * write lock without deadlocking, so an operation that updates multiple nodes + * will have to take write locks for the full duration of the operation. * - * six_lock_write(&foo->lock); - * six_unlock_write(&foo->lock); + * But by adding an intent state, which is exclusive with other intent locks but + * not with readers, we can take intent locks at thte start of the operation, + * and then take write locks only for the actual update to each individual + * nodes, without deadlocking. * - * Intent locks block other intent locks, but do not block read locks, and you - * must have an intent lock held before taking a write lock, like so: + * Example usage: + * six_lock_read(&foo->lock); + * six_unlock_read(&foo->lock); * - * six_lock_intent(&foo->lock); - * six_lock_write(&foo->lock); - * six_unlock_write(&foo->lock); - * six_unlock_intent(&foo->lock); + * An intent lock must be held before taking a write lock: + * six_lock_intent(&foo->lock); + * six_lock_write(&foo->lock); + * six_unlock_write(&foo->lock); + * six_unlock_intent(&foo->lock); * * Other operations: - * * six_trylock_read() * six_trylock_intent() * six_trylock_write() * - * six_lock_downgrade(): convert from intent to read - * six_lock_tryupgrade(): attempt to convert from read to intent - * - * Locks also embed a sequence number, which is incremented when the lock is - * locked or unlocked for write. The current sequence number can be grabbed - * while a lock is held from lock->state.seq; then, if you drop the lock you can - * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock - * iff it hasn't been locked for write in the meantime. - * - * There are also operations that take the lock type as a parameter, where the - * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: - * - * six_lock_type(lock, type) - * six_unlock_type(lock, type) - * six_relock(lock, type, seq) - * six_trylock_type(lock, type) - * six_trylock_convert(lock, from, to) - * - * A lock may be held multiple types by the same thread (for read or intent, - * not write). However, the six locks code does _not_ implement the actual - * recursive checks itself though - rather, if your code (e.g. btree iterator - * code) knows that the current thread already has a lock held, and for the - * correct type, six_lock_increment() may be used to bump up the counter for - * that type - the only effect is that one more call to unlock will be required - * before the lock is unlocked. + * six_lock_downgrade() convert from intent to read + * six_lock_tryupgrade() attempt to convert from read to intent, may fail + * + * There are also interfaces that take the lock type as an enum: + * + * six_lock_type(&foo->lock, SIX_LOCK_read); + * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) + * six_lock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_intent); + * + * Lock sequence numbers - unlock(), relock(): + * + * Locks embed sequences numbers, which are incremented on write lock/unlock. + * This allows locks to be dropped and the retaken iff the state they protect + * hasn't changed; this makes it much easier to avoid holding locks while e.g. + * doing IO or allocating memory. + * + * Example usage: + * six_lock_read(&foo->lock); + * u32 seq = six_lock_seq(&foo->lock); + * six_unlock_read(&foo->lock); + * + * some_operation_that_may_block(); + * + * if (six_relock_read(&foo->lock, seq)) { ... 
} + * + * If the relock operation succeeds, it is as if the lock was never unlocked. + * + * Reentrancy: + * + * Six locks are not by themselves reentrent, but have counters for both the + * read and intent states that can be used to provide reentrency by an upper + * layer that tracks held locks. If a lock is known to already be held in the + * read or intent state, six_lock_increment() can be used to bump the "lock + * held in this state" counter, increasing the number of unlock calls that + * will be required to fully unlock it. + * + * Example usage: + * six_lock_read(&foo->lock); + * six_lock_increment(&foo->lock, SIX_LOCK_read); + * six_unlock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * foo->lock is now fully unlocked. + * + * Since the intent state supercedes read, it's legal to increment the read + * counter when holding an intent lock, but not the reverse. + * + * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) + * is not legal. + * + * should_sleep_fn: + * + * There is a six_lock() variant that takes a function pointer that is called + * immediately prior to schedule() when blocking, and may return an error to + * abort. + * + * One possible use for this feature is when objects being locked are part of + * a cache and may reused, and lock ordering is based on a property of the + * object that will change when the object is reused - i.e. logical key order. + * + * If looking up an object in the cache may race with object reuse, and lock + * ordering is required to prevent deadlock, object reuse may change the + * correct lock order for that object and cause a deadlock. should_sleep_fn + * can be used to check if the object is still the object we want and avoid + * this deadlock. + * + * Wait list entry interface: + * + * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a + * wait list entry. By embedding six_lock_waiter into another object, and by + * traversing lock waitlists, it is then possible for an upper layer to + * implement full cycle detection for deadlock avoidance. + * + * should_sleep_fn should be used for invoking the cycle detector, walking the + * graph of held locks to check for a deadlock. The upper layer must track + * held locks for each thread, and each thread's held locks must be reachable + * from its six_lock_waiter object. + * + * six_lock_waiter() will add the wait object to the waitlist re-trying taking + * the lock, and before calling should_sleep_fn, and the wait object will not + * be removed from the waitlist until either the lock has been successfully + * acquired, or we aborted because should_sleep_fn returned an error. + * + * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will + * have timestamps in strictly ascending order - this is so the timestamp can + * be used as a cursor for lock graph traverse. */ #include +#include #include #include -#define SIX_LOCK_SEPARATE_LOCKFNS - -union six_lock_state { - struct { - atomic64_t counter; - }; - - struct { - u64 v; - }; - - struct { - /* for waitlist_bitnr() */ - unsigned long l; - }; - - struct { - unsigned read_lock:27; - unsigned write_locking:1; - unsigned intent_lock:1; - unsigned waiters:3; - /* - * seq works much like in seqlocks: it's incremented every time - * we lock and unlock for write. - * - * If it's odd write lock is held, even unlocked. - * - * Thus readers can unlock, and then lock again later iff it - * hasn't been modified in the meantime. 
- */ - u32 seq; - }; -}; - enum six_lock_type { SIX_LOCK_read, SIX_LOCK_intent, @@ -103,11 +135,12 @@ enum six_lock_type { }; struct six_lock { - union six_lock_state state; + atomic_t state; + u32 seq; + unsigned intent_lock_recurse; struct task_struct *owner; unsigned __percpu *readers; - unsigned intent_lock_recurse; - unsigned long ip; + struct optimistic_spin_queue osq; raw_spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -125,82 +158,217 @@ struct six_lock_waiter { typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -static __always_inline void __six_lock_init(struct six_lock *lock, - const char *name, - struct lock_class_key *key) -{ - atomic64_set(&lock->state.counter, 0); - raw_spin_lock_init(&lock->wait_lock); - INIT_LIST_HEAD(&lock->wait_list); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - debug_check_no_locks_freed((void *) lock, sizeof(*lock)); - lockdep_init_map(&lock->dep_map, name, key, 0); -#endif -} +void six_lock_exit(struct six_lock *lock); + +enum six_lock_init_flags { + SIX_LOCK_INIT_PCPU = 1U << 0, +}; + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags); -#define six_lock_init(lock) \ +/** + * six_lock_init - initialize a six lock + * @lock: lock to initialize + * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU + */ +#define six_lock_init(lock, flags) \ do { \ static struct lock_class_key __key; \ \ - __six_lock_init((lock), #lock, &__key); \ + __six_lock_init((lock), #lock, &__key, flags); \ } while (0) -#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) +/** + * six_lock_seq - obtain current lock sequence number + * @lock: six_lock to obtain sequence number for + * + * @lock should be held for read or intent, and not write + * + * By saving the lock sequence number, we can unlock @lock and then (typically + * after some blocking operation) attempt to relock it: the relock will succeed + * if the sequence number hasn't changed, meaning no write locks have been taken + * and state corresponding to what @lock protects is still valid. + */ +static inline u32 six_lock_seq(const struct six_lock *lock) +{ + return lock->seq; +} -#define __SIX_LOCK(type) \ -bool six_trylock_##type(struct six_lock *); \ -bool six_relock_##type(struct six_lock *, u32); \ -int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ -int six_lock_waiter_##type(struct six_lock *, struct six_lock_waiter *, \ - six_lock_should_sleep_fn, void *); \ -void six_unlock_##type(struct six_lock *); +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); -__SIX_LOCK(read) -__SIX_LOCK(intent) -__SIX_LOCK(write) -#undef __SIX_LOCK +/** + * six_trylock_type - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * Return: true on success, false on failure. + */ +static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + return six_trylock_ip(lock, type, _THIS_IP_); +} -#define SIX_LOCK_DISPATCH(type, fn, ...) 
\ - switch (type) { \ - case SIX_LOCK_read: \ - return fn##_read(__VA_ARGS__); \ - case SIX_LOCK_intent: \ - return fn##_intent(__VA_ARGS__); \ - case SIX_LOCK_write: \ - return fn##_write(__VA_ARGS__); \ - default: \ - BUG(); \ - } +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip); -static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +/** + * six_lock_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * This is a convenience wrapper around six_lock_ip_waiter(), see that function + * for full documentation. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) { - SIX_LOCK_DISPATCH(type, six_trylock, lock); + return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); } -static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) +/** + * six_lock_ip - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { - SIX_LOCK_DISPATCH(type, six_relock, lock, seq); + struct six_lock_waiter wait; + + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); } +/** + * six_lock_type - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, six_lock_should_sleep_fn should_sleep_fn, void *p) { - SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); + struct six_lock_waiter wait; + + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); } -static inline int six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip); + +/** + * six_relock_type - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * + * Return: true on success, false on failure. 
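+ *
+ * A minimal sketch of the save/relock pattern with the type-based
+ * interface (editor's illustration; passing NULL for should_sleep_fn
+ * and its argument is an assumption):
+ *
+ *   six_lock_type(&foo->lock, SIX_LOCK_read, NULL, NULL);
+ *   u32 seq = six_lock_seq(&foo->lock);
+ *   six_unlock_type(&foo->lock, SIX_LOCK_read);
+ *
+ *   some_operation_that_may_block();
+ *
+ *   if (six_relock_type(&foo->lock, SIX_LOCK_read, seq)) {
+ *           // lock state unchanged; proceed as if never dropped
+ *   }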
+ */ +static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) { - SIX_LOCK_DISPATCH(type, six_lock_waiter, lock, wait, should_sleep_fn, p); + return six_relock_ip(lock, type, seq, _THIS_IP_); } +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); + +/** + * six_unlock_type - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. + * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) { - SIX_LOCK_DISPATCH(type, six_unlock, lock); + six_unlock_ip(lock, type, _THIS_IP_); +} + +#define __SIX_LOCK(type) \ +static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ +{ \ + return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ +} \ + \ +static inline bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ +} \ + \ +static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn should_sleep_fn, void *p,\ + unsigned long ip) \ +{ \ + return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ +} \ + \ +static inline int six_lock_ip_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn should_sleep_fn, void *p, \ + unsigned long ip) \ +{ \ + return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ +} \ + \ +static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ +{ \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ +} \ + \ +static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ +{ \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ +} \ + \ +static inline int six_lock_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn fn, void *p)\ +{ \ + return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ +} \ + \ +static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ +{ \ + six_unlock_ip(lock, SIX_LOCK_##type, ip); \ +} \ + \ +static inline void six_unlock_##type(struct six_lock *lock) \ +{ \ + six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ } +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) +#undef __SIX_LOCK + void six_lock_downgrade(struct six_lock *); bool six_lock_tryupgrade(struct six_lock *); bool six_trylock_convert(struct six_lock *, enum six_lock_type, @@ -210,13 +378,11 @@ void six_lock_increment(struct six_lock *, enum six_lock_type); void six_lock_wakeup_all(struct six_lock *); -void six_lock_pcpu_free(struct six_lock *); -void six_lock_pcpu_alloc(struct six_lock *); - struct six_lock_count { unsigned n[3]; }; struct six_lock_count six_lock_counts(struct six_lock *); +void six_lock_readers_add(struct six_lock *, int); #endif /* _LINUX_SIX_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 17fe235..25ccf1a 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -15,17 +15,17 @@ #include #include +#define alloc_hooks(_do, ...) 
_do + #define ARCH_KMALLOC_MINALIGN 16 #define KMALLOC_MAX_SIZE SIZE_MAX -static inline void *kmalloc(size_t size, gfp_t flags) +static inline void *kmalloc_noprof(size_t size, gfp_t flags) { - unsigned i = 0; + unsigned i; void *p; - do { - run_shrinkers(flags, i != 0); - + for (i = 0; i < 10; i++) { if (size) { size_t alignment = min(rounddown_pow_of_two(size), (size_t)PAGE_SIZE); alignment = max(sizeof(void *), alignment); @@ -34,12 +34,19 @@ static inline void *kmalloc(size_t size, gfp_t flags) } else { p = malloc(0); } - if (p && (flags & __GFP_ZERO)) - memset(p, 0, size); - } while (!p && i++ < 10); + + if (p) { + if (flags & __GFP_ZERO) + memset(p, 0, size); + break; + } + + run_shrinkers(flags, true); + } return p; } +#define kmalloc kmalloc_noprof static inline void *krealloc(void *old, size_t size, gfp_t flags) { @@ -73,9 +80,15 @@ static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t } #define kzalloc(size, flags) kmalloc(size, flags|__GFP_ZERO) -#define kmalloc_array(n, size, flags) \ - ((size) != 0 && (n) > SIZE_MAX / (size) \ - ? NULL : kmalloc((n) * (size), flags)) + +static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) +{ + size_t bytes; + + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + return kmalloc(bytes, flags); +} #define kvmalloc_array(n, size, flags) \ ((size) != 0 && (n) > SIZE_MAX / (size) \ @@ -90,26 +103,34 @@ static inline void *krealloc_array(void *p, size_t new_n, size_t new_size, gfp_t #define kvzalloc(size, flags) kzalloc(size, flags) #define kvfree(p) kfree(p) -static inline struct page *alloc_pages(gfp_t flags, unsigned int order) +static inline struct page *alloc_pages_noprof(gfp_t flags, unsigned int order) { size_t size = PAGE_SIZE << order; - unsigned i = 0; + unsigned i; void *p; - do { - run_shrinkers(flags, i != 0); - + for (i = 0; i < 10; i++) { p = aligned_alloc(PAGE_SIZE, size); - if (p && (flags & __GFP_ZERO)) - memset(p, 0, size); - } while (!p && i++ < 10); + + if (p) { + if (flags & __GFP_ZERO) + memset(p, 0, size); + break; + } + + run_shrinkers(flags, true); + } return p; } +#define alloc_pages alloc_pages_noprof #define alloc_page(gfp) alloc_pages(gfp, 0) +#define _get_free_pages(gfp, order) ((unsigned long) alloc_pages(gfp, order)) #define __get_free_pages(gfp, order) ((unsigned long) alloc_pages(gfp, order)) +#define get_free_pages_noprof(gfp, order) \ + ((unsigned long) alloc_pages(gfp, order)) #define __get_free_page(gfp) __get_free_pages(gfp, 0) #define __free_pages(page, order) \ @@ -166,6 +187,11 @@ static inline void *kmem_cache_alloc(struct kmem_cache *c, gfp_t gfp) return kmalloc(c->obj_size, gfp); } +static inline void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t gfp) +{ + return kzalloc(c->obj_size, gfp); +} + static inline void kmem_cache_free(struct kmem_cache *c, void *p) { kfree(p); @@ -193,23 +219,28 @@ static inline struct kmem_cache *kmem_cache_create(size_t obj_size) #define vfree(p) free(p) -static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask) +static inline void *__vmalloc_noprof(unsigned long size, gfp_t flags) { - unsigned i = 0; + unsigned i; void *p; size = round_up(size, PAGE_SIZE); - do { - run_shrinkers(gfp_mask, i != 0); - + for (i = 0; i < 10; i++) { p = aligned_alloc(PAGE_SIZE, size); - if (p && gfp_mask & __GFP_ZERO) - memset(p, 0, size); - } while (!p && i++ < 10); + + if (p) { + if (flags & __GFP_ZERO) + memset(p, 0, size); + break; + } + + run_shrinkers(flags, true); + } return p; } +#define __vmalloc __vmalloc_noprof static 
inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) { diff --git a/include/linux/types.h b/include/linux/types.h index fc05e23..ce454e2 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -82,5 +82,6 @@ typedef int (*cmp_func_t)(const void *a, const void *b); typedef unsigned int __bitwise slab_flags_t; typedef u64 phys_addr_t; struct vm_struct; +struct mnt_idmap; #endif /* _TOOLS_LINUX_TYPES_H_ */ diff --git a/include/linux/uuid.h b/include/linux/uuid.h index c8eeb70..a999090 100644 --- a/include/linux/uuid.h +++ b/include/linux/uuid.h @@ -18,32 +18,24 @@ #include #include +#include -typedef struct { - __u8 b[16]; -} uuid_le; +#define UUID_SIZE 16 typedef struct { - __u8 b[16]; -} uuid_be; - -#define UUID_LE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ -((uuid_le) \ -{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ - (b) & 0xff, ((b) >> 8) & 0xff, \ - (c) & 0xff, ((c) >> 8) & 0xff, \ - (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + __u8 b[UUID_SIZE]; +} __uuid_t; -#define UUID_BE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ -((uuid_be) \ +#define UUID_INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((__uuid_t) \ {{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \ ((b) >> 8) & 0xff, (b) & 0xff, \ ((c) >> 8) & 0xff, (c) & 0xff, \ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) -static inline int uuid_le_cmp(const uuid_le u1, const uuid_le u2) +static inline bool uuid_equal(const __uuid_t *u1, const __uuid_t *u2) { - return memcmp(&u1, &u2, sizeof(uuid_le)); + return memcmp(u1, u2, sizeof(__uuid_t)) == 0; } #endif diff --git a/include/linux/wait.h b/include/linux/wait.h index d30fb10..4b9cbf3 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -18,10 +18,12 @@ struct __wait_queue { struct list_head task_list; }; -typedef struct { +struct wait_queue_head { spinlock_t lock; struct list_head task_list; -} wait_queue_head_t; +}; + +typedef struct wait_queue_head wait_queue_head_t; void wake_up(wait_queue_head_t *); void wake_up_all(wait_queue_head_t *); @@ -42,7 +44,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int flags, void *ke .task_list = { &(name).task_list, &(name).task_list } } #define DECLARE_WAIT_QUEUE_HEAD(name) \ - wait_queue_head_t name = __WAIT_QUEUE_HEAD_INITIALIZER(name) + struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name) static inline void init_waitqueue_head(wait_queue_head_t *q) { diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 222c72f..dcdff6e 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -42,7 +42,7 @@ struct xattr_handler { struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, - struct user_namespace *mnt_userns, struct dentry *dentry, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags); }; diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h new file mode 100644 index 0000000..9ebd081 --- /dev/null +++ b/include/trace/events/lock.h @@ -0,0 +1,144 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#if !defined(_TRACE_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_LOCK_H + +#include +#include + +/* flags for lock:contention_begin */ +#define LCB_F_SPIN (1U << 0) +#define LCB_F_READ (1U << 1) +#define LCB_F_WRITE (1U << 2) +#define LCB_F_RT (1U << 3) +#define LCB_F_PERCPU (1U << 4) +#define LCB_F_MUTEX (1U 
<< 5) + + +#ifdef CONFIG_LOCKDEP + +#include + +TRACE_EVENT(lock_acquire, + + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __string(name, lock->name) + __field(void *, lockdep_addr) + ), + + TP_fast_assign( + __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0); + __assign_str(name, lock->name); + __entry->lockdep_addr = lock; + ), + + TP_printk("%p %s%s%s", __entry->lockdep_addr, + (__entry->flags & 1) ? "try " : "", + (__entry->flags & 2) ? "read " : "", + __get_str(name)) +); + +DECLARE_EVENT_CLASS(lock, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip), + + TP_STRUCT__entry( + __string( name, lock->name ) + __field( void *, lockdep_addr ) + ), + + TP_fast_assign( + __assign_str(name, lock->name); + __entry->lockdep_addr = lock; + ), + + TP_printk("%p %s", __entry->lockdep_addr, __get_str(name)) +); + +DEFINE_EVENT(lock, lock_release, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip) +); + +#ifdef CONFIG_LOCK_STAT + +DEFINE_EVENT(lock, lock_contended, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip) +); + +DEFINE_EVENT(lock, lock_acquired, + + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + + TP_ARGS(lock, ip) +); + +#endif /* CONFIG_LOCK_STAT */ +#endif /* CONFIG_LOCKDEP */ + +TRACE_EVENT(contention_begin, + + TP_PROTO(void *lock, unsigned int flags), + + TP_ARGS(lock, flags), + + TP_STRUCT__entry( + __field(void *, lock_addr) + __field(unsigned int, flags) + ), + + TP_fast_assign( + __entry->lock_addr = lock; + __entry->flags = flags; + ), + + TP_printk("%p (flags=%s)", __entry->lock_addr, + __print_flags(__entry->flags, "|", + { LCB_F_SPIN, "SPIN" }, + { LCB_F_READ, "READ" }, + { LCB_F_WRITE, "WRITE" }, + { LCB_F_RT, "RT" }, + { LCB_F_PERCPU, "PERCPU" }, + { LCB_F_MUTEX, "MUTEX" } + )) +); + +TRACE_EVENT(contention_end, + + TP_PROTO(void *lock, int ret), + + TP_ARGS(lock, ret), + + TP_STRUCT__entry( + __field(void *, lock_addr) + __field(int, ret) + ), + + TP_fast_assign( + __entry->lock_addr = lock; + __entry->ret = ret; + ), + + TP_printk("%p (ret=%d)", __entry->lock_addr, __entry->ret) +); + +#endif /* _TRACE_LOCK_H */ + +/* This part must be outside protection */ +#include diff --git a/libbcachefs.c b/libbcachefs.c index 4fe2c3d..bac772b 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -39,7 +39,7 @@ static void init_layout(struct bch_sb_layout *l, memset(l, 0, sizeof(*l)); - l->magic = BCACHE_MAGIC; + l->magic = BCHFS_MAGIC; l->layout_type = 0; l->nr_superblocks = 2; l->sb_max_size_bits = ilog2(sb_size); @@ -188,7 +188,7 @@ struct bch_sb *bch2_format(struct bch_opt_strs fs_opt_strs, sb.sb->version = le16_to_cpu(opts.version); sb.sb->version_min = le16_to_cpu(opts.version); - sb.sb->magic = BCACHE_MAGIC; + sb.sb->magic = BCHFS_MAGIC; sb.sb->user_uuid = opts.uuid; sb.sb->nr_devices = nr_devs; @@ -353,7 +353,8 @@ struct bch_sb *__bch2_super_read(int fd, u64 sector) xpread(fd, &sb, sizeof(sb), sector << 9); - if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic))) + if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)) && + memcmp(&sb.magic, &BCHFS_MAGIC, sizeof(sb.magic))) die("not a bcachefs superblock"); size_t bytes = vstruct_bytes(&sb); @@ -603,8 +604,7 @@ struct bch_opts bch2_parse_opts(struct bch_opt_strs strs) u64 v; for (i = 0; i < bch2_opts_nr; i++) { - if 
(!strs.by_id[i] || - bch2_opt_table[i].type == BCH_OPT_FN) + if (!strs.by_id[i]) continue; ret = bch2_opt_parse(NULL, diff --git a/libbcachefs.h b/libbcachefs.h index 17e8eef..ba5d380 100644 --- a/libbcachefs.h +++ b/libbcachefs.h @@ -32,7 +32,7 @@ void bch2_opts_usage(unsigned); struct format_opts { char *label; - uuid_le uuid; + __uuid_t uuid; unsigned version; unsigned superblock_size; bool encrypted; @@ -41,8 +41,12 @@ struct format_opts { static inline struct format_opts format_opts_default() { + unsigned version = !access( "/sys/module/bcachefs/parameters/version", R_OK) + ? read_file_u64(AT_FDCWD, "/sys/module/bcachefs/parameters/version") + : bcachefs_metadata_version_current; + return (struct format_opts) { - .version = bcachefs_metadata_version_current, + .version = version, .superblock_size = SUPERBLOCK_SIZE_DEFAULT, }; } @@ -84,9 +88,9 @@ struct bch_sb *__bch2_super_read(int, u64); int bcachectl_open(void); struct bchfs_handle { - uuid_le uuid; - int ioctl_fd; - int sysfs_fd; + __uuid_t uuid; + int ioctl_fd; + int sysfs_fd; }; void bcache_fs_close(struct bchfs_handle); @@ -235,7 +239,7 @@ struct dev_name { unsigned idx; char *dev; char *label; - uuid_le uuid; + uuid_t uuid; }; typedef DARRAY(struct dev_name) dev_names; diff --git a/libbcachefs/acl.c b/libbcachefs/acl.c index 9592541..b1a4888 100644 --- a/libbcachefs/acl.c +++ b/libbcachefs/acl.c @@ -35,12 +35,14 @@ static inline int acl_to_xattr_type(int type) /* * Convert from filesystem to in-memory representation. */ -static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) +static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, + const void *value, size_t size) { const void *p, *end = value + size; struct posix_acl *acl; struct posix_acl_entry *out; unsigned count = 0; + int ret; if (!value) return NULL; @@ -81,9 +83,14 @@ static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) if (!count) return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); + acl = allocate_dropping_locks(trans, ret, + posix_acl_alloc(count, _gfp)); if (!acl) return ERR_PTR(-ENOMEM); + if (ret) { + kfree(acl); + return ERR_PTR(ret); + } out = acl->a_entries; @@ -212,11 +219,13 @@ bch2_acl_to_xattr(struct btree_trans *trans, return xattr; } -struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) +struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) { - struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); struct btree_trans trans; struct btree_iter iter = { NULL }; struct bkey_s_c_xattr xattr; @@ -224,21 +233,14 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) struct bkey_s_c k; int ret; - if (rcu) - return ERR_PTR(-ECHILD); - bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, - &hash, inode_inum(inode), - &X_SEARCH(acl_to_xattr_type(type), "", 0), - 0); + &hash, inode_inum(inode), &search, 0); if (ret) { - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - if (ret != -ENOENT) + if (!bch2_err_matches(ret, ENOENT)) acl = ERR_PTR(ret); goto out; } @@ -251,12 +253,15 @@ retry: } xattr = bkey_s_c_to_xattr(k); - acl = bch2_acl_from_disk(xattr_val(xattr.v), + acl = 
bch2_acl_from_disk(&trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); if (!IS_ERR(acl)) set_cached_acl(&inode->v, type, acl); out: + if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); return acl; @@ -289,13 +294,14 @@ int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, inum, &search); } - return ret == -ENOENT ? 0 : ret; + return bch2_err_matches(ret, ENOENT) ? 0 : ret; } -int bch2_set_acl(struct user_namespace *mnt_userns, - struct inode *vinode, struct posix_acl *_acl, int type) +int bch2_set_acl(struct mnt_idmap *idmap, + struct dentry *dentry, + struct posix_acl *_acl, int type) { - struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct btree_trans trans; struct btree_iter inode_iter = { NULL }; @@ -318,7 +324,7 @@ retry: mode = inode_u.bi_mode; if (type == ACL_TYPE_ACCESS) { - ret = posix_acl_update_mode(mnt_userns, &inode->v, &mode, &acl); + ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); if (ret) goto btree_err; } @@ -357,6 +363,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct posix_acl **new_acl) { struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); + struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; @@ -365,24 +372,23 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inum, - &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), - BTREE_ITER_INTENT); + &hash_info, inum, &search, BTREE_ITER_INTENT); if (ret) - return ret == -ENOENT ? 0 : ret; + return bch2_err_matches(ret, ENOENT) ? 
0 : ret; k = bch2_btree_iter_peek_slot(&iter); xattr = bkey_s_c_to_xattr(k); if (ret) goto err; - acl = bch2_acl_from_disk(xattr_val(xattr.v), + acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), le16_to_cpu(xattr.v->x_val_len)); ret = PTR_ERR_OR_ZERO(acl); if (IS_ERR_OR_NULL(acl)) goto err; - ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); + ret = allocate_dropping_locks_errcode(trans, + __posix_acl_chmod(&acl, _gfp, mode)); if (ret) goto err; diff --git a/libbcachefs/acl.h b/libbcachefs/acl.h index 2d76a48..bb21d8d 100644 --- a/libbcachefs/acl.h +++ b/libbcachefs/acl.h @@ -26,12 +26,12 @@ typedef struct { __le32 a_version; } bch_acl_header; -struct posix_acl *bch2_get_acl(struct inode *, int, bool); +struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); int bch2_set_acl_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct posix_acl *, int); -int bch2_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); +int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int bch2_acl_chmod(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, umode_t, struct posix_acl **); diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 742313c..7bf2a50 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -9,6 +9,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "btree_gc.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "buckets_waiting_for_journal.h" #include "clock.h" @@ -17,6 +18,7 @@ #include "error.h" #include "lru.h" #include "recovery.h" +#include "trace.h" #include "varint.h" #include @@ -26,7 +28,6 @@ #include #include #include -#include /* Persistent alloc info: */ @@ -78,36 +79,6 @@ static inline u64 alloc_field_v1_get(const struct bch_alloc *a, return v; } -static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, - unsigned field, u64 v) -{ - unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; - - if (!v) - return; - - a->v.fields |= 1 << field; - - switch (bytes) { - case 1: - *((u8 *) *p) = v; - break; - case 2: - *((__le16 *) *p) = cpu_to_le16(v); - break; - case 4: - *((__le32 *) *p) = cpu_to_le32(v); - break; - case 8: - *((__le64 *) *p) = cpu_to_le64(v); - break; - default: - BUG(); - } - - *p += bytes; -} - static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, struct bkey_s_c k) { @@ -222,7 +193,8 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -230,60 +202,62 @@ int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { prt_printf(err, "incorrect value size (%zu < %u)", bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; } int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_alloc_unpacked u; if (bch2_alloc_unpack_v2(&u, k)) { prt_printf(err, "unpack error"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; } int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct 
bkey_alloc_unpacked u; if (bch2_alloc_unpack_v3(&u, k)) { prt_printf(err, "unpack error"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; } int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + int rw = flags & WRITE; - if (alloc_v4_u64s(a.v) != bkey_val_u64s(k.k)) { - prt_printf(err, "bad val size (%lu != %u)", - bkey_val_u64s(k.k), alloc_v4_u64s(a.v)); - return -EINVAL; + if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%u > %lu)", + alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); + return -BCH_ERR_invalid_bkey; } if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { prt_printf(err, "invalid backpointers_start"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } - /* - * XXX this is wrong, we'll be checking updates that happened from - * before BCH_FS_CHECK_BACKPOINTERS_DONE - */ - if (rw == WRITE && test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + if (rw == WRITE && + !(flags & BKEY_INVALID_JOURNAL) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { unsigned i, bp_len = 0; for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) @@ -291,7 +265,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, if (bp_len > a.v->dirty_sectors) { prt_printf(err, "too many backpointers"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } } @@ -299,7 +273,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { prt_printf(err, "invalid data type (got %u should be %u)", a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } switch (a.v->data_type) { @@ -310,7 +284,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, a.v->cached_sectors || a.v->stripe) { prt_printf(err, "empty data type free but have data"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } break; case BCH_DATA_sb: @@ -321,7 +295,7 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, if (!a.v->dirty_sectors) { prt_printf(err, "data_type %s but dirty_sectors==0", bch2_data_types[a.v->data_type]); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } break; case BCH_DATA_cached: @@ -329,20 +303,20 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, a.v->dirty_sectors || a.v->stripe) { prt_printf(err, "data type inconsistency"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (!a.v->io_time[READ] && - test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { prt_printf(err, "cached bucket with read_time == 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } break; case BCH_DATA_stripe: if (!a.v->stripe) { prt_printf(err, "data_type %s but stripe==0", bch2_data_types[a.v->data_type]); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } break; } @@ -385,20 +359,17 @@ void bch2_alloc_v4_swab(struct bkey_s k) void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = &_a; - const struct bch_backpointer *bps; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); unsigned i; - if (k.k->type == KEY_TYPE_alloc_v4) - a = bkey_s_c_to_alloc_v4(k).v; - else - bch2_alloc_to_v4(k, &_a); - prt_newline(out); 
printbuf_indent_add(out, 2); prt_printf(out, "gen %u oldest_gen %u data_type %s", - a->gen, a->oldest_gen, bch2_data_types[a->data_type]); + a->gen, a->oldest_gen, + a->data_type < BCH_DATA_NR + ? bch2_data_types[a->data_type] + : "(invalid data type)"); prt_newline(out); prt_printf(out, "journal_seq %llu", a->journal_seq); prt_newline(out); @@ -418,33 +389,44 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_newline(out); prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); prt_newline(out); - prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a)); - printbuf_indent_add(out, 2); + prt_printf(out, "fragmentation %llu", a->fragmentation_lru); + prt_newline(out); + prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_newline(out); + + if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { + struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); + const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); - bps = alloc_v4_backpointers_c(a); - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a); i++) { - prt_newline(out); - bch2_backpointer_to_text(out, &bps[i]); + prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); + printbuf_indent_add(out, 2); + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { + prt_newline(out); + bch2_backpointer_to_text(out, &bps[i]); + } + + printbuf_indent_sub(out, 2); } - printbuf_indent_sub(out, 4); + printbuf_indent_sub(out, 2); } -void bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) +void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { if (k.k->type == KEY_TYPE_alloc_v4) { - int d; + void *src, *dst; *out = *bkey_s_c_to_alloc_v4(k).v; - d = (int) BCH_ALLOC_V4_U64s - - (int) (BCH_ALLOC_V4_BACKPOINTERS_START(out) ?: BCH_ALLOC_V4_U64s_V0); - if (unlikely(d > 0)) { - memset((u64 *) out + BCH_ALLOC_V4_BACKPOINTERS_START(out), - 0, - d * sizeof(u64)); - SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); - } + src = alloc_v4_backpointers(out); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(out); + + if (src < dst) + memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); } else { struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); @@ -470,20 +452,13 @@ static noinline struct bkey_i_alloc_v4 * __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i_alloc_v4 *ret; - unsigned bytes = k.k->type == KEY_TYPE_alloc_v4 - ? bkey_bytes(k.k) - : sizeof(struct bkey_i_alloc_v4); - /* - * Reserve space for one more backpointer here: - * Not sketchy at doing it this way, nope... 
- */ - ret = bch2_trans_kmalloc(trans, bytes + sizeof(struct bch_backpointer)); + ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); if (IS_ERR(ret)) return ret; if (k.k->type == KEY_TYPE_alloc_v4) { - struct bch_backpointer *src, *dst; + void *src, *dst; bkey_reassemble(&ret->k_i, k); @@ -491,9 +466,10 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); dst = alloc_v4_backpointers(&ret->v); - memmove(dst, src, BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v) * - sizeof(struct bch_backpointer)); - memset(src, 0, dst - src); + if (src < dst) + memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); set_alloc_v4_u64s(ret); } else { bkey_alloc_v4_init(&ret->k_i); @@ -505,18 +481,12 @@ __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) { + struct bkey_s_c_alloc_v4 a; + if (likely(k.k->type == KEY_TYPE_alloc_v4) && - BCH_ALLOC_V4_BACKPOINTERS_START(bkey_s_c_to_alloc_v4(k).v) == BCH_ALLOC_V4_U64s) { - /* - * Reserve space for one more backpointer here: - * Not sketchy at doing it this way, nope... - */ - struct bkey_i_alloc_v4 *ret = - bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_backpointer)); - if (!IS_ERR(ret)) - bkey_reassemble(&ret->k_i, k); - return ret; - } + ((a = bkey_s_c_to_alloc_v4(k), true) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) + return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); return __bch2_alloc_to_v4_mut(trans, k); } @@ -534,30 +504,82 @@ bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter struct bkey_i_alloc_v4 *a; int ret; - bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, BTREE_ITER_WITH_UPDATES| BTREE_ITER_CACHED| BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); - if (ret) { - bch2_trans_iter_exit(trans, iter); + if (unlikely(ret)) return ERR_PTR(ret); - } a = bch2_alloc_to_v4_mut_inlined(trans, k); - if (IS_ERR(a)) - bch2_trans_iter_exit(trans, iter); + ret = PTR_ERR_OR_ZERO(a); + if (unlikely(ret)) + goto err; return a; +err: + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); } -int bch2_alloc_read(struct bch_fs *c) +static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) +{ + *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; + + pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; + return pos; +} + +static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) +{ + pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; + pos.offset += offset; + return pos; +} + +static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) +{ + return k.k->type == KEY_TYPE_bucket_gens + ? 
bkey_s_c_to_bucket_gens(k).v->gens[offset] + : 0; +} + +int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { + prt_printf(err, "bad val size (%lu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { + if (i) + prt_char(out, ' '); + prt_printf(out, "%u", g.v->gens[i]); + } +} + +int bch2_bucket_gens_init(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bch_alloc_v4 a; - struct bch_dev *ca; + struct bkey_i_bucket_gens g; + bool have_bucket_gens_key = false; + unsigned offset; + struct bpos pos; + u8 gen; int ret; bch2_trans_init(&trans, c, 0, 0); @@ -571,17 +593,106 @@ int bch2_alloc_read(struct bch_fs *c) if (!bch2_dev_bucket_exists(c, k.k->p)) continue; - ca = bch_dev_bkey_exists(c, k.k->p.inode); - bch2_alloc_to_v4(k, &a); + gen = bch2_alloc_to_v4(k, &a)->gen; + pos = alloc_gens_pos(iter.pos, &offset); - *bucket_gen(ca, k.k->p.offset) = a.gen; + if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + if (ret) + break; + have_bucket_gens_key = false; + } + + if (!have_bucket_gens_key) { + bkey_bucket_gens_init(&g.k_i); + g.k.p = pos; + have_bucket_gens_key = true; + } + + g.v.gens[offset] = gen; } bch2_trans_iter_exit(&trans, &iter); + if (have_bucket_gens_key && !ret) + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + + bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +int bch2_alloc_read(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + int ret; + + down_read(&c->gc_lock); + bch2_trans_init(&trans, c, 0, 0); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { + const struct bch_bucket_gens *g; + u64 b; + + for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + + if (k.k->type != KEY_TYPE_bucket_gens) + continue; + + g = bkey_s_c_to_bucket_gens(k).v; + + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_exists2(c, k.k->p.inode)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + for (b = max_t(u64, ca->mi.first_bucket, start); + b < min_t(u64, ca->mi.nbuckets, end); + b++) + *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; + } + bch2_trans_iter_exit(&trans, &iter); + } else { + struct bch_alloc_v4 a; + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; + 
} + bch2_trans_iter_exit(&trans, &iter); + } + bch2_trans_exit(&trans); + up_read(&c->gc_lock); if (ret) - bch_err(c, "error reading alloc info: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -608,7 +719,7 @@ static int bch2_bucket_do_index(struct btree_trans *trans, a->data_type != BCH_DATA_need_discard) return 0; - k = bch2_trans_kmalloc(trans, sizeof(*k)); + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); if (IS_ERR(k)) return PTR_ERR(k); @@ -629,20 +740,22 @@ static int bch2_bucket_do_index(struct btree_trans *trans, return 0; } - bch2_trans_iter_init(trans, &iter, btree, + old = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - old = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(old); if (ret) - goto err; + return ret; if (ca->mi.freespace_initialized && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && bch2_trans_inconsistent_on(old.k->type != old_type, trans, - "incorrect key when %s %s btree (got %s should be %s)\n" + "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" " for %s", set ? "setting" : "clearing", bch2_btree_ids[btree], + iter.pos.inode, + iter.pos.offset, bch2_bkey_types[old.k->type], bch2_bkey_types[old_type], (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { @@ -657,13 +770,50 @@ err: return ret; } +static noinline int bch2_bucket_gen_update(struct btree_trans *trans, + struct bpos bucket, u8 gen) +{ + struct btree_iter iter; + unsigned offset; + struct bpos pos = alloc_gens_pos(bucket, &offset); + struct bkey_i_bucket_gens *g; + struct bkey_s_c k; + int ret; + + g = bch2_trans_kmalloc(trans, sizeof(*g)); + ret = PTR_ERR_OR_ZERO(g); + if (ret) + return ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_bucket_gens) { + bkey_bucket_gens_init(&g->k_i); + g->k.p = iter.pos; + } else { + bkey_reassemble(&g->k_i, k); + } + + g->v.gens[offset] = gen; + + ret = bch2_trans_update(trans, &iter, &g->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_trans_mark_alloc(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a, *new_a; + struct bch_alloc_v4 old_a_convert, *new_a; + const struct bch_alloc_v4 *old_a; u64 old_lru, new_lru; int ret = 0; @@ -673,13 +823,13 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, */ BUG_ON(new->k.type != KEY_TYPE_alloc_v4); - bch2_alloc_to_v4(old, &old_a); + old_a = bch2_alloc_to_v4(old, &old_a_convert); new_a = &bkey_i_to_alloc_v4(new)->v; new_a->data_type = alloc_data_type(*new_a, new_a->data_type); - if (new_a->dirty_sectors > old_a.dirty_sectors || - new_a->cached_sectors > old_a.cached_sectors) { + if (new_a->dirty_sectors > old_a->dirty_sectors || + new_a->cached_sectors > old_a->cached_sectors) { new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); @@ -693,10 +843,10 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); } - if (old_a.data_type != new_a->data_type || + if (old_a->data_type != new_a->data_type || (new_a->data_type == BCH_DATA_free && - alloc_freespace_genbits(old_a) != alloc_freespace_genbits(*new_a))) { - ret = bch2_bucket_do_index(trans, old, 
&old_a, false) ?: + alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); if (ret) return ret; @@ -706,45 +856,161 @@ int bch2_trans_mark_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); - old_lru = alloc_lru_idx(old_a); - new_lru = alloc_lru_idx(*new_a); + old_lru = alloc_lru_idx_read(*old_a); + new_lru = alloc_lru_idx_read(*new_a); if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset, - old_lru, &new_lru, old); + ret = bch2_lru_change(trans, new->k.p.inode, + bucket_to_u64(new->k.p), + old_lru, new_lru); if (ret) return ret; + } + + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, + bch_dev_bkey_exists(c, new->k.p.inode)); + + if (old_a->fragmentation_lru != new_a->fragmentation_lru) { + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new->k.p), + old_a->fragmentation_lru, new_a->fragmentation_lru); + if (ret) + return ret; + } - if (new_a->data_type == BCH_DATA_cached) - new_a->io_time[READ] = new_lru; + if (old_a->gen != new_a->gen) { + ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); + if (ret) + return ret; } return 0; } -static int bch2_check_alloc_key(struct btree_trans *trans, - struct btree_iter *alloc_iter, - struct btree_iter *discard_iter, - struct btree_iter *freespace_iter) +/* + * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * extents style btrees, but works on non-extents btrees: + */ +static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +{ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + if (bkey_err(k)) + return k; + + if (k.k->type) { + return k; + } else { + struct btree_iter iter2; + struct bpos next; + + bch2_trans_copy_iter(&iter2, iter); + + if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + + end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); + + /* + * btree node min/max is a closed interval, upto takes a half + * open interval: + */ + k = bch2_btree_iter_peek_upto(&iter2, end); + next = iter2.pos; + bch2_trans_iter_exit(iter->trans, &iter2); + + BUG_ON(next.offset >= iter->pos.offset + U32_MAX); + + if (bkey_err(k)) + return k; + + bkey_init(hole); + hole->p = iter->pos; + + bch2_key_resize(hole, next.offset - iter->pos.offset); + return (struct bkey_s_c) { hole, NULL }; + } +} + +static bool next_bucket(struct bch_fs *c, struct bpos *bucket) +{ + struct bch_dev *ca; + unsigned iter; + + if (bch2_dev_bucket_exists(c, *bucket)) + return true; + + if (bch2_dev_exists2(c, bucket->inode)) { + ca = bch_dev_bkey_exists(c, bucket->inode); + + if (bucket->offset < ca->mi.first_bucket) { + bucket->offset = ca->mi.first_bucket; + return true; + } + + bucket->inode++; + bucket->offset = 0; + } + + rcu_read_lock(); + iter = bucket->inode; + ca = __bch2_next_dev(c, &iter, NULL); + if (ca) + *bucket = POS(ca->dev_idx, ca->mi.first_bucket); + rcu_read_unlock(); + + return ca != NULL; +} + +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +{ + struct bch_fs *c = iter->trans->c; + struct bkey_s_c k; +again: + k = bch2_get_key_or_hole(iter, POS_MAX, hole); + if (bkey_err(k)) + return k; + + if (!k.k->type) { + struct bpos bucket = 
bkey_start_pos(k.k); + + if (!bch2_dev_bucket_exists(c, bucket)) { + if (!next_bucket(c, &bucket)) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bucket); + goto again; + } + + if (!bch2_dev_bucket_exists(c, k.k->p)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); + } + } + + return k; +} + +static noinline_for_stack +int bch2_check_alloc_key(struct btree_trans *trans, + struct bkey_s_c alloc_k, + struct btree_iter *alloc_iter, + struct btree_iter *discard_iter, + struct btree_iter *freespace_iter, + struct btree_iter *bucket_gens_iter) { struct bch_fs *c = trans->c; struct bch_dev *ca; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; unsigned discard_key_type, freespace_key_type; - struct bkey_s_c alloc_k, k; + unsigned gens_offset; + struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - alloc_k = bch2_dev_bucket_exists(c, alloc_iter->pos) - ? bch2_btree_iter_peek_slot(alloc_iter) - : bch2_btree_iter_peek(alloc_iter); - if (!alloc_k.k) - return 1; - - ret = bkey_err(alloc_k); - if (ret) - return ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, "alloc key for invalid device:bucket %llu:%llu", alloc_k.k->p.inode, alloc_k.k->p.offset)) @@ -754,16 +1020,10 @@ static int bch2_check_alloc_key(struct btree_trans *trans, if (!ca->mi.freespace_initialized) return 0; - bch2_alloc_to_v4(alloc_k, &a); - - discard_key_type = a.data_type == BCH_DATA_need_discard - ? KEY_TYPE_set : 0; - freespace_key_type = a.data_type == BCH_DATA_free - ? KEY_TYPE_set : 0; + a = bch2_alloc_to_v4(alloc_k, &a_convert); + discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); - bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, a)); - k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) @@ -792,6 +1052,8 @@ static int bch2_check_alloc_key(struct btree_trans *trans, goto err; } + freespace_key_type = a->data_type == BCH_DATA_free ? 
KEY_TYPE_set : 0; + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) @@ -821,19 +1083,175 @@ static int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; } + + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); + k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (a->gen != alloc_gen(k, gens_offset) && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i_bucket_gens *g = + bch2_trans_kmalloc(trans, sizeof(*g)); + + ret = PTR_ERR_OR_ZERO(g); + if (ret) + goto err; + + if (k.k->type == KEY_TYPE_bucket_gens) { + bkey_reassemble(&g->k_i, k); + } else { + bkey_bucket_gens_init(&g->k_i); + g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); + } + + g->v.gens[gens_offset] = a->gen; + + ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); + if (ret) + goto err; + } err: fsck_err: printbuf_exit(&buf); return ret; } -static int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter) +static noinline_for_stack +int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *freespace_iter) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + ca = bch_dev_bkey_exists(c, start.inode); + if (!ca->mi.freespace_initialized) + return 0; + + bch2_btree_iter_set_pos(freespace_iter, start); + + k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + + *end = bkey_min(k.k->p, *end); + + if (k.k->type != KEY_TYPE_set && + (c->opts.reconstruct_alloc || + fsck_err(c, "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = KEY_TYPE_set; + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, + min_t(u64, U32_MAX, end->offset - + freespace_iter->pos.offset)); + + ret = bch2_trans_update(trans, freespace_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static noinline_for_stack +int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *bucket_gens_iter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + unsigned i, gens_offset, gens_end_offset; + int ret; + + if (c->sb.version < bcachefs_metadata_version_bucket_gens) + return 0; + + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); + + k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (bkey_cmp(alloc_gens_pos(start, &gens_offset), + alloc_gens_pos(*end, &gens_end_offset))) + gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; + + if (k.k->type == KEY_TYPE_bucket_gens) { + struct bkey_i_bucket_gens g; + bool need_update = false; + + bkey_reassemble(&g.k_i, k); + + for (i = gens_offset; i < gens_end_offset; i++) { + if 
(fsck_err_on(g.v.gens[i], c, + "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", + bucket_gens_pos_to_alloc(k.k->p, i).inode, + bucket_gens_pos_to_alloc(k.k->p, i).offset, + g.v.gens[i])) { + g.v.gens[i] = 0; + need_update = true; + } + } + + if (need_update) { + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g)); + + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + memcpy(k, &g, sizeof(g)); + + ret = bch2_trans_update(trans, bucket_gens_iter, k, 0); + if (ret) + goto err; + } + } + + *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter; struct bkey_s_c alloc_k; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; u64 genbits; struct bpos pos; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard @@ -846,45 +1264,140 @@ static int bch2_check_discard_freespace_key(struct btree_trans *trans, pos.offset &= ~(~0ULL << 56); genbits = iter->pos.offset & (~0ULL << 56); - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + ret = bkey_err(alloc_k); + if (ret) + return ret; if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, "entry in %s btree for nonexistant dev:bucket %llu:%llu", bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) goto delete; - alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(alloc_k); - if (ret) - goto err; - - bch2_alloc_to_v4(alloc_k, &a); + a = bch2_alloc_to_v4(alloc_k, &a_convert); - if (fsck_err_on(a.data_type != state || + if (fsck_err_on(a->data_type != state || (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(a)), c, - "%s\n incorrectly set in %s index (free %u, genbits %llu should be %llu)", + genbits != alloc_freespace_genbits(*a)), c, + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_ids[iter->btree_id], - a.data_type == state, - genbits >> 56, alloc_freespace_genbits(a) >> 56)) + iter->pos.inode, + iter->pos.offset, + a->data_type == state, + genbits >> 56, alloc_freespace_genbits(*a) >> 56)) goto delete; out: -err: fsck_err: + set_btree_iter_dontneed(&alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_delete_extent_at(trans, iter, - iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0); + ret = bch2_btree_delete_extent_at(trans, iter, + iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); goto out; } +static int bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end) +{ + if (!btree_id_is_extents(iter->btree_id)) { + return __bch2_check_discard_freespace_key(trans, iter); + } else { + int ret; + + while (!bkey_eq(iter->pos, end) && + !(ret = btree_trans_too_many_iters(trans) ?: + __bch2_check_discard_freespace_key(trans, iter))) + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + + return ret; + } +} + +/* + * We've already checked that generation numbers in the bucket_gens btree are + * valid for buckets that exist; this just checks for keys for nonexistent + * buckets. 
+ */ +static noinline_for_stack +int bch2_check_bucket_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_i_bucket_gens g; + struct bch_dev *ca; + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + u64 b; + bool need_update = false, dev_exists; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(k.k->type != KEY_TYPE_bucket_gens); + bkey_reassemble(&g.k_i, k); + + /* if no bch_dev, skip out whether we repair or not */ + dev_exists = bch2_dev_exists2(c, k.k->p.inode); + if (!dev_exists) { + if (fsck_err_on(!dev_exists, c, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + } + goto out; + } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (fsck_err_on(end <= ca->mi.first_bucket || + start >= ca->mi.nbuckets, c, + "bucket_gens key for invalid buckets:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } + + for (b = start; b < ca->mi.first_bucket; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; + } + + for (b = ca->mi.nbuckets; b < end; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; + } + + if (need_update) { + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(g)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto out; + + memcpy(k, &g, sizeof(g)); + ret = bch2_trans_update(trans, iter, k, 0); + } +out: +fsck_err: + printbuf_exit(&buf); + return ret; +} + int bch2_check_alloc_info(struct bch_fs *c) { struct btree_trans trans; - struct btree_iter iter, discard_iter, freespace_iter; + struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; + struct bkey hole; struct bkey_s_c k; int ret = 0; @@ -896,18 +1409,61 @@ int bch2_check_alloc_info(struct bch_fs *c) BTREE_ITER_PREFETCH); bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH); + while (1) { - ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW, - bch2_check_alloc_key(&trans, &iter, - &discard_iter, - &freespace_iter)); + struct bpos next; + + bch2_trans_begin(&trans); + + k = bch2_get_key_or_real_bucket_hole(&iter, &hole); + ret = bkey_err(k); if (ret) + goto bkey_err; + + if (!k.k) break; - bch2_btree_iter_advance(&iter); + if (k.k->type) { + next = bpos_nosnap_successor(k.k->p); + + ret = bch2_check_alloc_key(&trans, + k, &iter, + &discard_iter, + &freespace_iter, + &bucket_gens_iter); + if (ret) + goto bkey_err; + } else { + next = k.k->p; + + ret = bch2_check_alloc_hole_freespace(&trans, + bkey_start_pos(k.k), + &next, + &freespace_iter) ?: + bch2_check_alloc_hole_bucket_gens(&trans, + bkey_start_pos(k.k), + &next, + &bucket_gens_iter); + if (ret) + goto bkey_err; + } + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_pos(&iter, next); +bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if 
(ret) + break; } + bch2_trans_iter_exit(&trans, &bucket_gens_iter); bch2_trans_iter_exit(&trans, &freespace_iter); bch2_trans_iter_exit(&trans, &discard_iter); bch2_trans_iter_exit(&trans, &iter); @@ -915,19 +1471,24 @@ int bch2_check_alloc_info(struct bch_fs *c) if (ret < 0) goto err; - ret = for_each_btree_key_commit(&trans, iter, + ret = for_each_btree_key2(&trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)) ?: - for_each_btree_key_commit(&trans, iter, + bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_freespace, POS_MIN, BTREE_ITER_PREFETCH, k, + bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: + for_each_btree_key_commit(&trans, iter, + BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_discard_freespace_key(&trans, &iter)); + bch2_check_bucket_gens_key(&trans, &iter, k)); err: bch2_trans_exit(&trans); - return ret < 0 ? ret : 0; + if (ret) + bch_err_fn(c, ret); + return ret; } static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, @@ -935,10 +1496,10 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter lru_iter; - struct bch_alloc_v4 a; - struct bkey_s_c alloc_k, k; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct bkey_s_c alloc_k, lru_k; struct printbuf buf = PRINTBUF; - struct printbuf buf2 = PRINTBUF; int ret; alloc_k = bch2_btree_iter_peek(alloc_iter); @@ -949,52 +1510,47 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (ret) return ret; - bch2_alloc_to_v4(alloc_k, &a); + a = bch2_alloc_to_v4(alloc_k, &a_convert); - if (a.data_type != BCH_DATA_cached) + if (a->data_type != BCH_DATA_cached) return 0; - bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru, - POS(alloc_k.k->p.inode, a.io_time[READ]), 0); - - k = bch2_btree_iter_peek_slot(&lru_iter); - ret = bkey_err(k); + lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ]), 0); + ret = bkey_err(lru_k); if (ret) - goto err; + return ret; - if (fsck_err_on(!a.io_time[READ], c, + if (fsck_err_on(!a->io_time[READ], c, "cached bucket with read_time 0\n" " %s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || - fsck_err_on(k.k->type != KEY_TYPE_lru || - le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != alloc_k.k->p.offset, c, - "incorrect/missing lru entry\n" - " %s\n" + fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + "missing lru entry\n" " %s", (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { - u64 read_time = a.io_time[READ]; - - if (!a.io_time[READ]) - a.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + u64 read_time = a->io_time[READ] ?: + atomic64_read(&c->io_clock[READ].now); ret = bch2_lru_set(trans, alloc_k.k->p.inode, - alloc_k.k->p.offset, - &a.io_time[READ]); + bucket_to_u64(alloc_k.k->p), + read_time); if (ret) goto err; - if (a.io_time[READ] != read_time) { + if (a->io_time[READ] != read_time) { struct bkey_i_alloc_v4 *a_mut = bch2_alloc_to_v4_mut(trans, alloc_k); ret = PTR_ERR_OR_ZERO(a_mut); if (ret) goto err; - a_mut->v.io_time[READ] = a.io_time[READ]; + a_mut->v.io_time[READ] = read_time; ret = 
bch2_trans_update(trans, alloc_iter, &a_mut->k_i, BTREE_TRIGGER_NORUN); if (ret) @@ -1004,27 +1560,24 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &lru_iter); - printbuf_exit(&buf2); printbuf_exit(&buf); return ret; } int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, - POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_alloc_to_lru_ref(&trans, &iter)); - - bch2_trans_exit(&trans); - return ret < 0 ? ret : 0; + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(&trans, &iter))); + if (ret) + bch_err_fn(c, ret); + return ret; } static int bch2_discard_one_bucket(struct btree_trans *trans, @@ -1042,7 +1595,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; - bool did_discard = false; int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); @@ -1063,10 +1615,9 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - need_discard_iter->pos, - BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + need_discard_iter->pos, + BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) goto out; @@ -1082,25 +1633,32 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto write; } - if (bch2_trans_inconsistent_on(a->v.journal_seq > c->journal.flushed_seq_ondisk, trans, - "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" - "%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + bch2_trans_inconsistent(trans, + "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" + "%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + } goto out; } - if (bch2_trans_inconsistent_on(a->v.data_type != BCH_DATA_need_discard, trans, - "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + if (a->v.data_type != BCH_DATA_need_discard) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + bch2_trans_inconsistent(trans, + "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + } + goto out; } - if (bkey_cmp(*discard_pos_done, iter.pos) && + if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { /* * This works without any other locks because this is the only @@ -1111,29 +1669,27 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, k.k->p.offset * ca->mi.bucket_size, ca->mi.bucket_size, GFP_KERNEL); + *discard_pos_done = iter.pos; - ret = bch2_trans_relock(trans); + ret = bch2_trans_relock_notrace(trans); if (ret) goto out; } - *discard_pos_done = iter.pos; - did_discard = true; - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); a->v.data_type = alloc_data_type(a->v, 
a->v.data_type); write: ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); if (ret) goto out; - if (did_discard) { - this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); - (*discarded)++; - } + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); + (*discarded)++; out: + (*seen)++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); printbuf_exit(&buf); @@ -1170,7 +1726,7 @@ static void bch2_do_discards_work(struct work_struct *work) if (need_journal_commit * 2 > seen) bch2_journal_flush_async(&c->journal, NULL); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_discard); trace_discard_buckets(c, seen, open, need_journal_commit, discarded, bch2_err_str(ret)); @@ -1178,62 +1734,45 @@ static void bch2_do_discards_work(struct work_struct *work) void bch2_do_discards(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && - !queue_work(system_long_wq, &c->discard_work)) - percpu_ref_put(&c->writes); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && + !queue_work(c->write_ref_wq, &c->discard_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_discard); } static int invalidate_one_bucket(struct btree_trans *trans, - struct btree_iter *lru_iter, struct bkey_s_c k, - unsigned dev_idx, s64 *nr_to_invalidate) + struct btree_iter *lru_iter, + struct bkey_s_c lru_k, + s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; - struct bkey_i_alloc_v4 *a; - struct bpos bucket; + struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; + struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); unsigned cached_sectors; int ret = 0; - if (*nr_to_invalidate <= 0 || k.k->p.inode != dev_idx) + if (*nr_to_invalidate <= 0) return 1; - if (k.k->type != KEY_TYPE_lru) { - prt_printf(&buf, "non lru key in lru btree:\n "); - bch2_bkey_val_to_text(&buf, c, k); - - if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { - bch_err(c, "%s", buf.buf); - } else { - bch2_trans_inconsistent(trans, "%s", buf.buf); - ret = -EINVAL; - } - - goto out; + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "lru entry points to invalid bucket"); + goto err; } - bucket = POS(dev_idx, le64_to_cpu(bkey_s_c_to_lru(k).v->idx)); + if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) + return 0; a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); ret = PTR_ERR_OR_ZERO(a); if (ret) goto out; - if (k.k->p.offset != alloc_lru_idx(a->v)) { - prt_printf(&buf, "alloc key does not point back to lru entry when invalidating bucket:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - - if (!test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { - bch_err(c, "%s", buf.buf); - } else { - bch2_trans_inconsistent(trans, "%s", buf.buf); - ret = -EINVAL; - } - + /* We expect harmless races here due to the btree write buffer: */ + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) goto out; - } + + BUG_ON(a->v.data_type != BCH_DATA_cached); if (!a->v.cached_sectors) bch_err(c, "invalidating empty bucket, confused"); @@ -1251,7 +1790,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, BTREE_TRIGGER_BUCKET_INVALIDATE) ?: bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_USE_RESERVE|BTREE_INSERT_NOFAIL); + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); if (ret) goto out; @@ 
-1261,6 +1801,26 @@ out: bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; +err: + prt_str(&buf, "\n lru key: "); + bch2_bkey_val_to_text(&buf, c, lru_k); + + prt_str(&buf, "\n lru entry: "); + bch2_lru_pos_to_text(&buf, lru_iter->pos); + + prt_str(&buf, "\n alloc key: "); + if (!a) + bch2_bpos_to_text(&buf, bucket); + else + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + + bch_err(c, "%s", buf.buf); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { + bch2_inconsistent_error(c); + ret = -EINVAL; + } + + goto out; } static void bch2_do_invalidates_work(struct work_struct *work) @@ -1275,59 +1835,122 @@ static void bch2_do_invalidates_work(struct work_struct *work) bch2_trans_init(&trans, c, 0, 0); + ret = bch2_btree_write_buffer_flush(&trans); + if (ret) + goto err; + for_each_member_device(ca, c, i) { s64 nr_to_invalidate = should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); - ret = for_each_btree_key2(&trans, iter, BTREE_ID_lru, - POS(ca->dev_idx, 0), BTREE_ITER_INTENT, k, - invalidate_one_bucket(&trans, &iter, k, ca->dev_idx, &nr_to_invalidate)); + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, 0), + lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), + BTREE_ITER_INTENT, k, + invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); if (ret < 0) { percpu_ref_put(&ca->ref); break; } } - +err: bch2_trans_exit(&trans); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) { - if (percpu_ref_tryget_live(&c->writes) && - !queue_work(system_long_wq, &c->invalidate_work)) - percpu_ref_put(&c->writes); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && + !queue_work(c->write_ref_wq, &c->invalidate_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); } -static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, struct bch_dev *ca) -{ - struct bch_alloc_v4 a; - - if (iter->pos.offset >= ca->mi.nbuckets) - return 1; - - bch2_alloc_to_v4(k, &a); - return bch2_bucket_do_index(trans, k, &a, true); -} - -static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca) +static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + unsigned long *last_updated) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bkey hole; + struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets); struct bch_member *m; int ret; bch2_trans_init(&trans, c, 0, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, - POS(ca->dev_idx, ca->mi.first_bucket), - BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_LAZY_RW, - bucket_freespace_init(&trans, &iter, k, ca)); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_PREFETCH); + /* + * Scan the alloc btree for every bucket on @ca, and add buckets to the + * freespace/need_discard/need_gc_gens btrees as needed: + */ + while (1) { + if (*last_updated + HZ * 10 < jiffies) { + bch_info(ca, "%s: currently at %llu/%llu", + __func__, iter.pos.offset, ca->mi.nbuckets); + *last_updated = jiffies; + } + + bch2_trans_begin(&trans); + if (bkey_ge(iter.pos, end)) { + ret = 0; + break; + } + + k = bch2_get_key_or_hole(&iter, end, &hole); + ret = bkey_err(k); + if (ret) + goto bkey_err; + + if (k.k->type) { + /* + * We process live keys in the alloc btree one at a + * time: + */ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = 
bch2_alloc_to_v4(k, &a_convert); + + ret = bch2_bucket_do_index(&trans, k, a, true) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL); + if (ret) + goto bkey_err; + + bch2_btree_iter_advance(&iter); + } else { + struct bkey_i *freespace; + + freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace)); + ret = PTR_ERR_OR_ZERO(freespace); + if (ret) + goto bkey_err; + + bkey_init(&freespace->k); + freespace->k.type = KEY_TYPE_set; + freespace->k.p = k.k->p; + freespace->k.size = k.k->size; + + ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_pos(&iter, k.k->p); + } +bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + } + + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret < 0) { @@ -1349,6 +1972,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) unsigned i; int ret = 0; bool doing_init = false; + unsigned long last_updated = jiffies; /* * We can crash during the device add path, so we need to check this on @@ -1364,9 +1988,10 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca); + ret = bch2_dev_freespace_init(c, ca, &last_updated); if (ret) { percpu_ref_put(&ca->ref); + bch_err_fn(c, ret); return ret; } } @@ -1375,11 +2000,10 @@ int bch2_fs_freespace_init(struct bch_fs *c) mutex_lock(&c->sb_lock); bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch_verbose(c, "done initializing freespace"); } - return ret; + return 0; } /* Bucket IO clocks: */ @@ -1518,40 +2142,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) */ bch2_recalc_capacity(c); - /* Next, close write points that point to this device... 
*/ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_writepoint_stop(c, ca, &c->write_points[i]); - - bch2_writepoint_stop(c, ca, &c->copygc_write_point); - bch2_writepoint_stop(c, ca, &c->rebalance_write_point); - bch2_writepoint_stop(c, ca, &c->btree_write_point); - - mutex_lock(&c->btree_reserve_cache_lock); - while (c->btree_reserve_cache_nr) { - struct btree_alloc *a = - &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - - bch2_open_buckets_put(c, &a->ob); - } - mutex_unlock(&c->btree_reserve_cache_lock); - - while (1) { - struct open_bucket *ob; - - spin_lock(&c->freelist_lock); - if (!ca->open_buckets_partial_nr) { - spin_unlock(&c->freelist_lock); - break; - } - ob = c->open_buckets + - ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - ob->on_partial_list = false; - spin_unlock(&c->freelist_lock); - - bch2_open_bucket_put(c, ob); - } - - bch2_ec_stop_dev(c, ca); + bch2_open_buckets_stop(c, ca, false); /* * Wake up threads that were blocked on allocation, so they can notice diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index ee683bd..c0914fe 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -8,6 +8,8 @@ #include "debug.h" #include "super.h" +enum bkey_invalid_flags; + /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -23,6 +25,16 @@ static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) pos.offset < ca->mi.nbuckets; } +static inline u64 bucket_to_u64(struct bpos bucket) +{ + return (bucket.inode << 48) | bucket.offset; +} + +static inline struct bpos u64_to_bucket(u64 bucket) +{ + return POS(bucket >> 48, bucket & ~(~0ULL << 48)); +} + static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) { return a.gen - a.oldest_gen; @@ -34,10 +46,10 @@ static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, struct bch_alloc_v4 a, enum bch_data_type data_type) { + if (stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; if (dirty_sectors) return data_type; - if (stripe) - return BCH_DATA_stripe; if (cached_sectors) return BCH_DATA_cached; if (BCH_ALLOC_V4_NEED_DISCARD(&a)) @@ -54,11 +66,36 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, a.stripe, a, data_type); } -static inline u64 alloc_lru_idx(struct bch_alloc_v4 a) +static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) +{ + return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; +} + +static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? 
a.io_time[READ] : 0; } +#define DATA_TYPES_MOVABLE \ + ((1U << BCH_DATA_btree)| \ + (1U << BCH_DATA_user)| \ + (1U << BCH_DATA_stripe)) + +static inline bool data_type_movable(enum bch_data_type type) +{ + return (1U << type) & DATA_TYPES_MOVABLE; +} + +static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, + struct bch_dev *ca) +{ + if (!data_type_movable(a.data_type) || + a.dirty_sectors >= ca->mi.bucket_size) + return 0; + + return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); +} + static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) { return ((u64) alloc_gc_gen(a) >> 4) << 56; @@ -89,17 +126,37 @@ static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); -void bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); +void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); + +static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) +{ + const struct bch_alloc_v4 *ret; + + if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) + goto slowpath; + + ret = bkey_s_c_to_alloc_v4(k).v; + if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) + goto slowpath; + + return ret; +slowpath: + __bch2_alloc_to_v4(k, convert); + return convert; +} + struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) - -int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -108,6 +165,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ @@ -115,6 +173,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ @@ -122,6 +181,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_alloc_to_text, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 16, \ }) #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ @@ -130,8 +190,20 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .swab = 
bch2_alloc_v4_swab, \ .trans_trigger = bch2_trans_mark_alloc, \ .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 48, \ +}) + +int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ + .key_invalid = bch2_bucket_gens_invalid, \ + .val_to_text = bch2_bucket_gens_to_text, \ }) +int bch2_bucket_gens_init(struct bch_fs *); + static inline bool bkey_is_alloc(const struct bkey *k) { return k->type == KEY_TYPE_alloc || @@ -154,7 +226,7 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, u64 free = max_t(s64, 0, u.d[BCH_DATA_free].buckets + u.d[BCH_DATA_need_discard].buckets - - bch2_dev_buckets_reserved(ca, RESERVE_none)); + - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); } @@ -163,7 +235,9 @@ void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) { - return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); + return (void *) ((u64 *) &a->v + + (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0)); } static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index c4f971c..1f4c5b3 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -28,15 +28,25 @@ #include "io.h" #include "journal.h" #include "movinggc.h" +#include "nocow_locking.h" +#include "trace.h" #include #include #include -#include -const char * const bch2_alloc_reserves[] = { +static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, + struct mutex *lock) +{ + if (!mutex_trylock(lock)) { + bch2_trans_unlock(trans); + mutex_lock(lock); + } +} + +const char * const bch2_watermarks[] = { #define x(t) #t, - BCH_ALLOC_RESERVES() + BCH_WATERMARKS() #undef x NULL }; @@ -57,6 +67,17 @@ const char * const bch2_alloc_reserves[] = { * reference _after_ doing the index update that makes its allocation reachable. 
*/ +void bch2_reset_alloc_cursors(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + ca->alloc_cursor = 0; + rcu_read_unlock(); +} + static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) { open_bucket_idx_t idx = ob - c->open_buckets; @@ -85,7 +106,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); if (ob->ec) { - bch2_ec_bucket_written(c, ob); + ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); return; } @@ -138,30 +159,19 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } -static void open_bucket_free_unused(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob) +static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - bool may_realloc = wp->data_type == BCH_DATA_user; - - BUG_ON(ca->open_buckets_partial_nr > - ARRAY_SIZE(ca->open_buckets_partial)); - - if (ca->open_buckets_partial_nr < - ARRAY_SIZE(ca->open_buckets_partial) && - may_realloc) { - spin_lock(&c->freelist_lock); - ob->on_partial_list = true; - ca->open_buckets_partial[ca->open_buckets_partial_nr++] = - ob - c->open_buckets; - spin_unlock(&c->freelist_lock); + BUG_ON(c->open_buckets_partial_nr >= + ARRAY_SIZE(c->open_buckets_partial)); - closure_wake_up(&c->open_buckets_wait); - closure_wake_up(&c->freelist_wait); - } else { - bch2_open_bucket_put(c, ob); - } + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + c->open_buckets_partial[c->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); } /* _only_ for allocating the journal on a new device: */ @@ -178,14 +188,16 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca) return -1; } -static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { - switch (reserve) { - case RESERVE_btree: - case RESERVE_btree_movinggc: + switch (watermark) { + case BCH_WATERMARK_reclaim: return 0; - case RESERVE_movinggc: + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: return OPEN_BUCKETS_COUNT / 4; + case BCH_WATERMARK_copygc: + return OPEN_BUCKETS_COUNT / 3; default: return OPEN_BUCKETS_COUNT / 2; } @@ -193,8 +205,8 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, u64 bucket, - enum alloc_reserve reserve, - struct bch_alloc_v4 *a, + enum bch_watermark watermark, + const struct bch_alloc_v4 *a, struct bucket_alloc_state *s, struct closure *cl) { @@ -223,7 +235,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -247,7 +259,6 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ob->valid = true; ob->sectors_free = ca->mi.bucket_size; - ob->alloc_reserve = reserve; ob->dev = ca->dev_idx; ob->gen = a->gen; ob->bucket = bucket; @@ -271,12 +282,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * } spin_unlock(&c->freelist_lock); - 
return ob; } static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum alloc_reserve reserve, u64 free_entry, + enum bch_watermark watermark, u64 free_entry, struct bucket_alloc_state *s, struct bkey_s_c freespace_k, struct closure *cl) @@ -285,7 +295,8 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc struct btree_iter iter = { NULL }; struct bkey_s_c k; struct open_bucket *ob; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; u64 b = free_entry & ~(~0ULL << 56); unsigned genbits = free_entry >> 56; struct printbuf buf = PRINTBUF; @@ -301,32 +312,38 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc goto err; } - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_alloc, POS(ca->dev_idx, b), + BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) { ob = ERR_PTR(ret); goto err; } - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); - if (genbits != (alloc_freespace_genbits(a) >> 56)) { - prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" - " freespace key ", - genbits, alloc_freespace_genbits(a) >> 56); + if (a->data_type != BCH_DATA_free) { + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + ob = NULL; + goto err; + } + + prt_printf(&buf, "non free bucket in freespace btree\n" + " freespace key "); bch2_bkey_val_to_text(&buf, c, freespace_k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); bch2_trans_inconsistent(trans, "%s", buf.buf); ob = ERR_PTR(-EIO); goto err; - } - if (a.data_type != BCH_DATA_free) { - prt_printf(&buf, "non free bucket in freespace btree\n" - " freespace key "); + if (genbits != (alloc_freespace_genbits(*a) >> 56) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " freespace key ", + genbits, alloc_freespace_genbits(*a) >> 56); bch2_bkey_val_to_text(&buf, c, freespace_k); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); @@ -335,19 +352,19 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc goto err; } - if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { struct bch_backpointer bp; - u64 bp_offset = 0; + struct bpos bp_pos = POS_MIN; ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, - &bp_offset, &bp, + &bp_pos, &bp, BTREE_ITER_NOPRESERVE); if (ret) { ob = ERR_PTR(ret); goto err; } - if (bp_offset != U64_MAX) { + if (!bkey_eq(bp_pos, POS_MAX)) { /* * Bucket may have data in it - we don't call * bc2h_trans_inconnsistent() because fsck hasn't @@ -358,42 +375,17 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc } } - ob = __try_alloc_bucket(c, ca, b, reserve, &a, s, cl); + ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); if (!ob) iter.path->preserve = false; err: - set_btree_iter_dontneed(&iter); + if (iter.trans && iter.path) + set_btree_iter_dontneed(&iter); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ob; } -static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve) -{ - struct open_bucket *ob; - int i; - - spin_lock(&c->freelist_lock); - - for (i 
= ca->open_buckets_partial_nr - 1; i >= 0; --i) { - ob = c->open_buckets + ca->open_buckets_partial[i]; - - if (reserve <= ob->alloc_reserve) { - array_remove_item(ca->open_buckets_partial, - ca->open_buckets_partial_nr, - i); - ob->on_partial_list = false; - ob->alloc_reserve = reserve; - spin_unlock(&c->freelist_lock); - return ob; - } - } - - spin_unlock(&c->freelist_lock); - return NULL; -} - /* * This path is for before the freespace btree is initialized: * @@ -403,83 +395,92 @@ static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, struct bch_dev *ca, - enum alloc_reserve reserve, + enum bch_watermark watermark, struct bucket_alloc_state *s, struct closure *cl) { struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); int ret; - - s->cur_bucket = max_t(u64, s->cur_bucket, ca->mi.first_bucket); - s->cur_bucket = max_t(u64, s->cur_bucket, ca->new_fs_bucket_idx); - - for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, s->cur_bucket), +again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), BTREE_ITER_SLOTS, k, ret) { - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; - if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; if (ca->new_fs_bucket_idx && is_superblock_bucket(ca, k.k->p.offset)) continue; - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); - if (a.data_type != BCH_DATA_free) + if (a->data_type != BCH_DATA_free) continue; s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a, s, cl); + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); if (ob) break; } bch2_trans_iter_exit(trans, &iter); - s->cur_bucket = iter.pos.offset; + ca->alloc_cursor = alloc_cursor; - return ob ?: ERR_PTR(ret ?: -BCH_ERR_no_buckets_found); + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_cursor > alloc_start) { + alloc_cursor = alloc_start; + goto again; + } + + return ob; } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, struct bch_dev *ca, - enum alloc_reserve reserve, + enum bch_watermark watermark, struct bucket_alloc_state *s, struct closure *cl) { struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; int ret; BUG_ON(ca->new_fs_bucket_idx); - - /* - * XXX: - * On transaction restart, we'd like to restart from the bucket we were - * at previously - */ +again: for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, s->cur_bucket), 0, k, ret) { + POS(ca->dev_idx, alloc_cursor), 0, k, ret) { if (k.k->p.inode != ca->dev_idx) break; - for (s->cur_bucket = max(s->cur_bucket, bkey_start_offset(k.k)); - s->cur_bucket < k.k->p.offset; - s->cur_bucket++) { + for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); + alloc_cursor < k.k->p.offset; + alloc_cursor++) { ret = btree_trans_too_many_iters(trans); - if (ret) + if (ret) { + ob = ERR_PTR(ret); break; + } s->buckets_seen++; - ob = try_alloc_bucket(trans, ca, reserve, - s->cur_bucket, s, k, cl); - if (ob) + ob = 
try_alloc_bucket(trans, ca, watermark, + alloc_cursor, s, k, cl); + if (ob) { + iter.path->preserve = false; break; + } } if (ob || ret) @@ -487,7 +488,17 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &iter); - return ob ?: ERR_PTR(ret); + ca->alloc_cursor = alloc_cursor; + + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_start > ca->mi.first_bucket) { + alloc_cursor = alloc_start = ca->mi.first_bucket; + goto again; + } + + return ob; } /** @@ -497,21 +508,19 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, + enum bch_watermark watermark, struct closure *cl, struct bch_dev_usage *usage) { struct bch_fs *c = trans->c; struct open_bucket *ob = NULL; - bool freespace_initialized = READ_ONCE(ca->mi.freespace_initialized); - u64 start = freespace_initialized ? 0 : ca->bucket_alloc_trans_early_cursor; + bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { .cur_bucket = start }; + struct bucket_alloc_state s = { 0 }; bool waiting = false; again: bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, reserve); + avail = dev_buckets_free(ca, *usage, watermark); if (usage->d[BCH_DATA_need_discard].buckets > avail) bch2_do_discards(c); @@ -538,37 +547,37 @@ again: if (waiting) closure_wake_up(&c->freelist_wait); - - if (may_alloc_partial) { - ob = try_alloc_partial_bucket(c, ca, reserve); - if (ob) - return ob; - } - - ob = likely(ca->mi.freespace_initialized) - ? bch2_bucket_alloc_freelist(trans, ca, reserve, &s, cl) - : bch2_bucket_alloc_early(trans, ca, reserve, &s, cl); +alloc: + ob = likely(freespace) + ? 
bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) + : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); if (s.skipped_need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); - if (!ob && !freespace_initialized && start) { - start = s.cur_bucket = 0; - goto again; + if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + freespace = false; + goto alloc; } - - if (!freespace_initialized) - ca->bucket_alloc_trans_early_cursor = s.cur_bucket; err: if (!ob) ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - trace_and_count(c, bucket_alloc, ca, bch2_alloc_reserves[reserve], - may_alloc_partial, ob->bucket); + trace_and_count(c, bucket_alloc, ca, + bch2_watermarks[watermark], + ob->bucket, + usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + &s, + cl == NULL, + ""); else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) - trace_and_count(c, bucket_alloc_fail, - ca, bch2_alloc_reserves[reserve], + trace_and_count(c, bucket_alloc_fail, ca, + bch2_watermarks[watermark], + 0, usage->d[BCH_DATA_free].buckets, avail, bch2_copygc_wait_amount(c), @@ -581,16 +590,15 @@ err: } struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve, - bool may_alloc_partial, + enum bch_watermark watermark, struct closure *cl) { struct bch_dev_usage usage; struct open_bucket *ob; bch2_trans_do(c, NULL, NULL, 0, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve, - may_alloc_partial, cl, &usage))); + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark, + cl, &usage))); return ob; } @@ -622,7 +630,7 @@ static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct bch_dev_usage *usage) { u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca, RESERVE_none); + u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) : 1ULL << 48; @@ -647,12 +655,10 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, bch2_dev_stripe_increment_inlined(ca, stripe, &usage); } -#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) -#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) - -static void add_new_bucket(struct bch_fs *c, +static int add_new_bucket(struct bch_fs *c, struct open_buckets *ptrs, struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, unsigned flags, @@ -661,23 +667,33 @@ static void add_new_bucket(struct bch_fs *c, unsigned durability = bch_dev_bkey_exists(c, ob->dev)->mi.durability; + BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); + __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) + *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? 
durability : 1; *have_cache |= !durability; ob_push(c, ptrs, ob); + + if (*nr_effective >= nr_replicas) + return 1; + if (ob->ec) + return 1; + return 0; } -static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, +int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct open_buckets *ptrs, struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - enum alloc_reserve reserve, unsigned flags, + enum bch_data_type data_type, + enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; @@ -710,8 +726,7 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, reserve, - flags & BUCKET_MAY_ALLOC_PARTIAL, cl, &usage); + ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); percpu_ref_put(&ca->ref); @@ -723,10 +738,11 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, flags, ob); + ob->data_type = data_type; - if (*nr_effective >= nr_replicas) { + if (add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob)) { ret = 0; break; } @@ -735,24 +751,6 @@ static int bch2_bucket_alloc_set_trans(struct btree_trans *trans, return ret; } -int bch2_bucket_alloc_set(struct bch_fs *c, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) -{ - return bch2_trans_do(c, NULL, NULL, 0, - bch2_bucket_alloc_set_trans(&trans, ptrs, stripe, - devs_may_alloc, nr_replicas, - nr_effective, have_cache, reserve, - flags, cl)); -} - /* Allocate from stripes: */ /* @@ -761,26 +759,25 @@ int bch2_bucket_alloc_set(struct bch_fs *c, * it's to a device we don't want: */ -static int bucket_alloc_from_stripe(struct bch_fs *c, +static int bucket_alloc_from_stripe(struct btree_trans *trans, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_mask *devs_may_alloc, u16 target, - unsigned erasure_code, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, + enum bch_watermark watermark, unsigned flags, struct closure *cl) { + struct bch_fs *c = trans->c; struct dev_alloc_list devs_sorted; struct ec_stripe_head *h; struct open_bucket *ob; struct bch_dev *ca; unsigned i, ec_idx; - - if (!erasure_code) - return 0; + int ret = 0; if (nr_replicas < 2) return 0; @@ -788,11 +785,9 @@ static int bucket_alloc_from_stripe(struct bch_fs *c, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1, - wp == &c->copygc_write_point, - cl); + h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); if (IS_ERR(h)) - return -PTR_ERR(h); + return PTR_ERR(h); if (!h) return 0; @@ -814,59 +809,130 @@ got_bucket: ob->ec_idx = ec_idx; ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, flags, ob); - atomic_inc(&h->s->pin); + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); out_put_head: bch2_ec_stripe_head_put(c, h); - return 0; + return ret; } /* Sector allocator */ -static void get_buckets_from_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct 
write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - unsigned flags, - bool need_ec) +static bool want_bucket(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + bool *have_cache, bool ec, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (!test_bit(ob->dev, devs_may_alloc->d)) + return false; + + if (ob->data_type != wp->data_type) + return false; + + if (!ca->mi.durability && + (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + return false; + + if (ec != (ob->ec != NULL)) + return false; + + return true; +} + +static int bucket_alloc_set_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + bool ec, unsigned flags) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; + int ret = 0; open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - - if (*nr_effective < nr_replicas && - test_bit(ob->dev, devs_may_alloc->d) && - (ca->mi.durability || - (wp->data_type == BCH_DATA_user && !*have_cache)) && - (ob->ec || !need_ec)) { - add_new_bucket(c, ptrs, devs_may_alloc, - nr_effective, have_cache, - flags, ob); - } else { + if (!ret && want_bucket(c, wp, devs_may_alloc, + have_cache, ec, ob)) + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + else ob_push(c, &ptrs_skip, ob); - } } wp->ptrs = ptrs_skip; + + return ret; } -static int open_bucket_add_buckets(struct btree_trans *trans, +static int bucket_alloc_set_partial(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, bool ec, + enum bch_watermark watermark, + unsigned flags) +{ + int i, ret = 0; + + if (!c->open_buckets_partial_nr) + return 0; + + spin_lock(&c->freelist_lock); + + if (!c->open_buckets_partial_nr) + goto unlock; + + for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { + struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; + + if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev_usage usage; + u64 avail; + + bch2_dev_usage_read_fast(ca, &usage); + avail = dev_buckets_free(ca, usage, watermark); + if (!avail) + continue; + + array_remove_item(c->open_buckets_partial, + c->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + if (ret) + break; + } + } +unlock: + spin_unlock(&c->freelist_lock); + return ret; +} + +static int __open_bucket_add_buckets(struct btree_trans *trans, struct open_buckets *ptrs, struct write_point *wp, struct bch_devs_list *devs_have, u16 target, - unsigned erasure_code, + bool erasure_code, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, - enum alloc_reserve reserve, + enum bch_watermark watermark, unsigned flags, struct closure *_cl) { @@ -874,12 +940,10 @@ static int open_bucket_add_buckets(struct btree_trans *trans, struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; - int ret; unsigned i; + int ret; - rcu_read_lock(); devs = target_rw_devs(c, wp->data_type, target); - rcu_read_unlock(); /* Don't 
allocate from devices we already have pointers to: */ for (i = 0; i < devs_have->nr; i++) @@ -888,93 +952,179 @@ static int open_bucket_add_buckets(struct btree_trans *trans, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->dev, devs.d); + if (erasure_code && ec_open_bucket(c, ptrs)) + return 0; + + ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, flags); + if (ret) + return ret; + + ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, watermark, flags); + if (ret) + return ret; + if (erasure_code) { - if (!ec_open_bucket(c, ptrs)) { - get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, flags, true); - if (*nr_effective >= nr_replicas) - return 0; + ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, + target, + nr_replicas, nr_effective, + have_cache, + watermark, flags, _cl); + } else { +retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ + ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + flags, wp->data_type, watermark, cl); + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && + !cl && _cl) { + cl = _cl; + goto retry_blocking; } - if (!ec_open_bucket(c, ptrs)) { - ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs, - target, erasure_code, - nr_replicas, nr_effective, - have_cache, flags, _cl); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - bch2_err_matches(ret, BCH_ERR_freelist_empty) || - bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - return ret; - if (*nr_effective >= nr_replicas) - return 0; - } } - get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, flags, false); - if (*nr_effective >= nr_replicas) - return 0; + return ret; +} -retry_blocking: - /* - * Try nonblocking first, so that if one device is full we'll try from - * other devices: - */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, +static int open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum bch_watermark watermark, + unsigned flags, + struct closure *cl) +{ + int ret; + + if (erasure_code) { + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, erasure_code, nr_replicas, nr_effective, have_cache, - reserve, flags, cl); - if (ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && - !cl && _cl) { - cl = _cl; - goto retry_blocking; + watermark, flags, cl); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_operation_blocked) || + bch2_err_matches(ret, BCH_ERR_freelist_empty) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + return ret; + if (*nr_effective >= nr_replicas) + return 0; } - return ret; + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, false, + nr_replicas, nr_effective, have_cache, + watermark, flags, cl); + return ret < 0 ? 
ret : 0; } -void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, - struct open_buckets *obs) +static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, + struct bch_dev *ca, bool ec) { - struct open_buckets ptrs = { .nr = 0 }; - struct open_bucket *ob, *ob2; - unsigned i, j; - - open_bucket_for_each(c, obs, ob, i) { - bool drop = !ca || ob->dev == ca->dev_idx; + if (ec) { + return ob->ec != NULL; + } else if (ca) { + bool drop = ob->dev == ca->dev_idx; + struct open_bucket *ob2; + unsigned i; if (!drop && ob->ec) { + unsigned nr_blocks; + mutex_lock(&ob->ec->lock); - for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) { - if (!ob->ec->blocks[j]) + nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; + + for (i = 0; i < nr_blocks; i++) { + if (!ob->ec->blocks[i]) continue; - ob2 = c->open_buckets + ob->ec->blocks[j]; + ob2 = c->open_buckets + ob->ec->blocks[i]; drop |= ob2->dev == ca->dev_idx; } mutex_unlock(&ob->ec->lock); } - if (drop) - bch2_open_bucket_put(c, ob); - else - ob_push(c, &ptrs, ob); + return drop; + } else { + return true; } - - *obs = ptrs; } -void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec, struct write_point *wp) { + struct open_buckets ptrs = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + mutex_lock(&wp->lock); - bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (should_drop_bucket(ob, c, ca, ec)) + bch2_open_bucket_put(c, ob); + else + ob_push(c, &ptrs, ob); + wp->ptrs = ptrs; mutex_unlock(&wp->lock); } +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec) +{ + unsigned i; + + /* Next, close write points that point to this device... 
*/ + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_writepoint_stop(c, ca, ec, &c->write_points[i]); + + bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point); + bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); + bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); + + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + bch2_open_buckets_put(c, &a->ob); + } + mutex_unlock(&c->btree_reserve_cache_lock); + + spin_lock(&c->freelist_lock); + i = 0; + while (i < c->open_buckets_partial_nr) { + struct open_bucket *ob = + c->open_buckets + c->open_buckets_partial[i]; + + if (should_drop_bucket(ob, c, ca, ec)) { + --c->open_buckets_partial_nr; + swap(c->open_buckets_partial[i], + c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + bch2_open_bucket_put(c, ob); + spin_lock(&c->freelist_lock); + } else { + i++; + } + } + spin_unlock(&c->freelist_lock); + + bch2_ec_stop_dev(c, ca); +} + static inline struct hlist_head *writepoint_hash(struct bch_fs *c, unsigned long write_point) { @@ -1020,10 +1170,12 @@ static bool try_increase_writepoints(struct bch_fs *c) return true; } -static bool try_decrease_writepoints(struct bch_fs *c, - unsigned old_nr) +static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) { + struct bch_fs *c = trans->c; struct write_point *wp; + struct open_bucket *ob; + unsigned i; mutex_lock(&c->write_points_hash_lock); if (c->write_points_nr < old_nr) { @@ -1042,19 +1194,14 @@ static bool try_decrease_writepoints(struct bch_fs *c, hlist_del_rcu(&wp->node); mutex_unlock(&c->write_points_hash_lock); - bch2_writepoint_stop(c, NULL, wp); + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, ob); + wp->ptrs.nr = 0; + mutex_unlock(&wp->lock); return true; } -static void bch2_trans_mutex_lock(struct btree_trans *trans, - struct mutex *lock) -{ - if (!mutex_trylock(lock)) { - bch2_trans_unlock(trans); - mutex_lock(lock); - } -} - static struct write_point *writepoint_find(struct btree_trans *trans, unsigned long write_point) { @@ -1064,7 +1211,7 @@ static struct write_point *writepoint_find(struct btree_trans *trans, if (!(write_point & 1UL)) { wp = (struct write_point *) write_point; - bch2_trans_mutex_lock(trans, &wp->lock); + bch2_trans_mutex_lock_norelock(trans, &wp->lock); return wp; } @@ -1073,7 +1220,7 @@ restart_find: wp = __writepoint_find(head, write_point); if (wp) { lock_wp: - bch2_trans_mutex_lock(trans, &wp->lock); + bch2_trans_mutex_lock_norelock(trans, &wp->lock); if (wp->write_point == write_point) goto out; mutex_unlock(&wp->lock); @@ -1086,8 +1233,8 @@ restart_find_oldest: if (!oldest || time_before64(wp->last_used, oldest->last_used)) oldest = wp; - bch2_trans_mutex_lock(trans, &oldest->lock); - bch2_trans_mutex_lock(trans, &c->write_points_hash_lock); + bch2_trans_mutex_lock_norelock(trans, &oldest->lock); + bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock); if (oldest >= c->write_points + c->write_points_nr || try_increase_writepoints(c)) { mutex_unlock(&c->write_points_hash_lock); @@ -1116,29 +1263,27 @@ out: * Get us an open_bucket we can allocate from, return with it locked: */ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, - unsigned target, - unsigned erasure_code, - struct write_point_specifier write_point, - struct bch_devs_list 
*devs_have, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl, - struct write_point **wp_ret) + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum bch_watermark watermark, + unsigned flags, + struct closure *cl, + struct write_point **wp_ret) { struct bch_fs *c = trans->c; struct write_point *wp; struct open_bucket *ob; struct open_buckets ptrs; unsigned nr_effective, write_points_nr; - unsigned ob_flags = 0; bool have_cache; int ret; int i; - if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) - ob_flags |= BUCKET_ALLOC_USE_DURABILITY; + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); BUG_ON(!nr_replicas || !nr_replicas_required); retry: @@ -1149,34 +1294,42 @@ retry: *wp_ret = wp = writepoint_find(trans, write_point.v); - if (wp->data_type == BCH_DATA_user) - ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; - /* metadata may not allocate on cache devices: */ if (wp->data_type != BCH_DATA_user) have_cache = true; - if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, reserve, - ob_flags, cl); - } else { + if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, target, erasure_code, nr_replicas, &nr_effective, - &have_cache, reserve, - ob_flags, NULL); + &have_cache, watermark, + flags, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; + /* Don't retry from all devices if we're out of open buckets: */ + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + goto allocate_blocking; + + /* + * Only try to allocate cache (durability = 0 devices) from the + * specified target: + */ + have_cache = true; + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, 0, erasure_code, nr_replicas, &nr_effective, - &have_cache, reserve, - ob_flags, cl); + &have_cache, watermark, + flags, cl); + } else { +allocate_blocking: + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, cl); } alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); @@ -1193,7 +1346,7 @@ alloc_done: /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob); wp->ptrs = ptrs; @@ -1210,24 +1363,21 @@ err: if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ob_push(c, &ptrs, ob); else - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob); wp->ptrs = ptrs; mutex_unlock(&wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && - try_decrease_writepoints(c, write_points_nr)) + try_decrease_writepoints(trans, write_points_nr)) goto retry; if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || bch2_err_matches(ret, BCH_ERR_freelist_empty)) return cl - ? -EAGAIN + ? 
-BCH_ERR_bucket_alloc_blocked : -BCH_ERR_ENOSPC_bucket_alloc; - if (bch2_err_matches(ret, BCH_ERR_insufficient_devices)) - return -EROFS; - return ret; } @@ -1245,34 +1395,11 @@ struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) }; } -/* - * Append pointers to the space we just allocated to @k, and mark @sectors space - * as allocated out of @ob - */ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bkey_i *k, unsigned sectors, bool cached) - { - struct open_bucket *ob; - unsigned i; - - BUG_ON(sectors > wp->sectors_free); - wp->sectors_free -= sectors; - - open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); - struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); - - ptr.cached = cached || - (!ca->mi.durability && - wp->data_type == BCH_DATA_user); - - bch2_bkey_append_ptr(k, ptr); - - BUG_ON(sectors > ob->sectors_free); - ob->sectors_free -= sectors; - } + bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached); } /* @@ -1281,17 +1408,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, */ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) { - struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; - - open_bucket_for_each(c, &wp->ptrs, ob, i) - ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); - wp->ptrs = keep; - - mutex_unlock(&wp->lock); - - bch2_open_buckets_put(c, &ptrs); + bch2_alloc_sectors_done_inlined(c, wp); } static inline void writepoint_init(struct write_point *wp, @@ -1340,21 +1457,84 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) } } +static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + unsigned data_type = ob->data_type; + barrier(); /* READ_ONCE() doesn't work on bitfields */ + + prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + ob - c->open_buckets, + atomic_read(&ob->pin), + data_type < BCH_DATA_NR ? 
bch2_data_types[data_type] : "invalid data type", + ob->dev, ob->bucket, ob->gen, + ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); + if (ob->ec) + prt_printf(out, " ec idx %llu", ob->ec->idx); + if (ob->on_partial_list) + prt_str(out, " partial"); + prt_newline(out); +} + void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) { struct open_bucket *ob; + out->atomic++; + for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list) { - prt_printf(out, "%zu ref %u type %s %u:%llu:%u\n", - ob - c->open_buckets, - atomic_read(&ob->pin), - bch2_data_types[ob->data_type], - ob->dev, ob->bucket, ob->gen); - } + if (ob->valid && !ob->on_partial_list) + bch2_open_bucket_to_text(out, c, ob); spin_unlock(&ob->lock); } + + --out->atomic; +} + +void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned i; + + out->atomic++; + spin_lock(&c->freelist_lock); + + for (i = 0; i < c->open_buckets_partial_nr; i++) + bch2_open_bucket_to_text(out, c, + c->open_buckets + c->open_buckets_partial[i]); + + spin_unlock(&c->freelist_lock); + --out->atomic; +} + +static const char * const bch2_write_point_states[] = { +#define x(n) #n, + WRITE_POINT_STATES() +#undef x + NULL +}; + +void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct write_point *wp; + unsigned i; + + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); + wp++) { + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); + + prt_printf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + prt_printf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + prt_newline(out); + } } diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index 16490ff..fee195f 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -4,6 +4,8 @@ #include "bcachefs.h" #include "alloc_types.h" +#include "extents.h" +#include "super.h" #include @@ -12,7 +14,9 @@ struct bch_dev; struct bch_fs; struct bch_devs_List; -extern const char * const bch2_alloc_reserves[]; +extern const char * const bch2_watermarks[]; + +void bch2_reset_alloc_cursors(struct bch_fs *); struct dev_alloc_list { unsigned nr; @@ -27,8 +31,7 @@ void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, - enum alloc_reserve, bool, - struct closure *); + enum bch_watermark, struct closure *); static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, struct open_bucket *ob) @@ -79,6 +82,21 @@ static inline void bch2_open_buckets_put(struct bch_fs *c, ptrs->nr = 0; } +static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp) +{ + struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + ob_push(c, !ob->sectors_free ? 
&ptrs : &keep, ob); + wp->ptrs = keep; + + mutex_unlock(&wp->lock); + + bch2_open_buckets_put(c, &ptrs); +} + static inline void bch2_open_bucket_get(struct bch_fs *c, struct write_point *wp, struct open_buckets *ptrs) @@ -131,31 +149,60 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 return ret; } -int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum alloc_reserve, - unsigned, struct closure *); + unsigned, unsigned *, bool *, unsigned, + enum bch_data_type, enum bch_watermark, + struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, struct write_point_specifier, struct bch_devs_list *, unsigned, unsigned, - enum alloc_reserve, + enum bch_watermark, unsigned, struct closure *, struct write_point **); struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +static inline void +bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, + struct bkey_i *k, unsigned sectors, + bool cached) +{ + struct open_bucket *ob; + unsigned i; + + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; + wp->sectors_allocated += sectors; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); + + ptr.cached = cached || + (!ca->mi.durability && + wp->data_type == BCH_DATA_user); + + bch2_bkey_append_ptr(k, ptr); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; + } +} + void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, struct bkey_i *, unsigned, bool); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, - struct open_buckets *); - -void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, - struct write_point *); +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); static inline struct write_point_specifier writepoint_hashed(unsigned long v) { @@ -170,5 +217,8 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp void bch2_fs_allocator_foreground_init(struct bch_fs *); void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); + +void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 271b4bf..804a843 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -9,7 +9,6 @@ #include "fifo.h" struct bucket_alloc_state { - u64 cur_bucket; u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; @@ -17,20 +16,24 @@ struct bucket_alloc_state { u64 skipped_nouse; }; -struct ec_bucket_buf; - -#define BCH_ALLOC_RESERVES() \ - x(btree_movinggc) \ +#define BCH_WATERMARKS() \ + x(stripe) \ + x(normal) \ + x(copygc) \ x(btree) \ - x(movinggc) \ - x(none) + x(btree_copygc) \ + x(reclaim) -enum alloc_reserve { -#define x(name) RESERVE_##name, - BCH_ALLOC_RESERVES() +enum bch_watermark { +#define x(name) BCH_WATERMARK_##name, + BCH_WATERMARKS() #undef x + BCH_WATERMARK_NR, }; +#define 
BCH_WATERMARK_BITS 3 +#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) + #define OPEN_BUCKETS_COUNT 1024 #define WRITE_POINT_HASH_NR 32 @@ -52,10 +55,9 @@ struct open_bucket { * the block in the stripe this open_bucket corresponds to: */ u8 ec_idx; - enum bch_data_type data_type:8; + enum bch_data_type data_type:6; unsigned valid:1; unsigned on_partial_list:1; - unsigned alloc_reserve:3; u8 dev; u8 gen; @@ -75,23 +77,46 @@ struct dev_stripe_state { u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; +#define WRITE_POINT_STATES() \ + x(stopped) \ + x(waiting_io) \ + x(waiting_work) \ + x(running) + +enum write_point_state { +#define x(n) WRITE_POINT_##n, + WRITE_POINT_STATES() +#undef x + WRITE_POINT_STATE_NR +}; + struct write_point { - struct hlist_node node; - struct mutex lock; - u64 last_used; - unsigned long write_point; - enum bch_data_type data_type; + struct { + struct hlist_node node; + struct mutex lock; + u64 last_used; + unsigned long write_point; + enum bch_data_type data_type; + + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; + + struct open_buckets ptrs; + struct dev_stripe_state stripe; - /* calculated based on how many pointers we're actually going to use: */ - unsigned sectors_free; + u64 sectors_allocated; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); - struct open_buckets ptrs; - struct dev_stripe_state stripe; + struct { + struct work_struct index_update_work; - struct work_struct index_update_work; + struct list_head writes; + spinlock_t writes_lock; - struct list_head writes; - spinlock_t writes_lock; + enum write_point_state state; + u64 last_state_change; + u64 time[WRITE_POINT_STATE_NR]; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); }; struct write_point_specifier { diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 614811e..8747c5e 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -5,42 +5,11 @@ #include "backpointers.h" #include "btree_cache.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "error.h" #include -/* - * Convert from pos in backpointer btree to pos of corresponding bucket in alloc - * btree: - */ -static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, - struct bpos bp_pos) -{ - struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); - u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - - return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); -} - -/* - * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: - */ -static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, - struct bpos bucket, - u64 bucket_offset) -{ - struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); - struct bpos ret; - - ret = POS(bucket.inode, - (bucket_to_sector(ca, bucket.offset) << - MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); - - BUG_ON(bkey_cmp(bucket, bp_pos_to_bucket(c, ret))); - - return ret; -} - static bool extent_matches_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, @@ -60,7 +29,7 @@ static bool extent_matches_bp(struct bch_fs *c, bch2_extent_ptr_to_bp(c, btree_id, level, k, p, &bucket2, &bp2); - if (!bpos_cmp(bucket, bucket2) && + if (bpos_eq(bucket, bucket2) && !memcmp(&bp, &bp2, sizeof(bp))) return true; } @@ -69,19 +38,15 @@ static bool extent_matches_bp(struct bch_fs *c, } int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct 
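The BCH_WATERMARK_MASK definition above (~(~0U << BCH_WATERMARK_BITS), i.e. 0x7 for three bits) is sized so that every BCH_WATERMARK_NR value fits in the low bits of a larger flags word. A minimal sketch of that bit arithmetic, assuming a hypothetical caller-owned flags variable rather than any field added by this patch:

	unsigned flags = 0;
	enum bch_watermark w;

	/* store a watermark in the low BCH_WATERMARK_BITS of flags */
	flags = (flags & ~BCH_WATERMARK_MASK) | BCH_WATERMARK_copygc;

	/* ...and read it back out later */
	w = flags & BCH_WATERMARK_MASK;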
printbuf *err) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); - if (bkey_val_bytes(bp.k) < sizeof(*bp.v)) { - prt_str(err, "incorrect value size"); - return -EINVAL; - } - - if (bpos_cmp(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { + if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { prt_str(err, "backpointer at wrong pos"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -100,6 +65,10 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { + prt_str(out, "bucket="); + bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + prt_str(out, " "); + bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); } @@ -112,136 +81,30 @@ void bch2_backpointer_swab(struct bkey_s k) bch2_bpos_swab(&bp.v->pos); } -#define BACKPOINTER_OFFSET_MAX ((1ULL << 40) - 1) - -static inline int backpointer_cmp(struct bch_backpointer l, struct bch_backpointer r) -{ - return cmp_int(l.bucket_offset, r.bucket_offset); -} - -static int bch2_backpointer_del_by_offset(struct btree_trans *trans, - struct bpos bucket, - u64 bp_offset, - struct bch_backpointer bp) +static noinline int backpointer_mod_err(struct btree_trans *trans, + struct bch_backpointer bp, + struct bkey_s_c bp_k, + struct bkey_s_c orig_k, + bool insert) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - if (bp_offset < BACKPOINTER_OFFSET_MAX) { - struct bch_backpointer *bps; - struct bkey_i_alloc_v4 *a; - unsigned i, nr; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_alloc_v4) { - ret = -ENOENT; - goto err; - } - - a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; - bps = alloc_v4_backpointers(&a->v); - nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); - - for (i = 0; i < nr; i++) { - if (bps[i].bucket_offset == bp_offset) - goto found; - if (bps[i].bucket_offset > bp_offset) - break; - } - - ret = -ENOENT; - goto err; -found: - if (memcmp(&bps[i], &bp, sizeof(bp))) { - ret = -ENOENT; - goto err; - } - array_remove_item(bps, nr, i); - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); - set_alloc_v4_u64s(a); - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); - } else { - bp_offset -= BACKPOINTER_OFFSET_MAX; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket, bp_offset), - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { - ret = -ENOENT; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_bucket_backpointer_del(struct btree_trans *trans, - struct bkey_i_alloc_v4 *a, - struct bch_backpointer bp, - struct bkey_s_c orig_k) -{ - struct bch_fs *c = trans->c; - struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); - unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); - struct btree_iter bp_iter; - struct bkey_s_c k; - int ret; + struct printbuf buf = PRINTBUF; - for (i = 0; i < nr; i++) { - int cmp = 
backpointer_cmp(bps[i], bp) ?: - memcmp(&bps[i], &bp, sizeof(bp)); - if (!cmp) - goto found; - if (cmp >= 0) - break; - } + if (insert) { + prt_printf(&buf, "existing backpointer found when inserting "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); - goto btree; -found: - array_remove_item(bps, nr, i); - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); - set_alloc_v4_u64s(a); - return 0; -btree: - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, a->k.p, bp.bucket_offset), - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&bp_iter); - ret = bkey_err(k); - if (ret) - goto err; + prt_printf(&buf, "found "); + bch2_bkey_val_to_text(&buf, c, bp_k); + prt_newline(&buf); - if (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp))) { - struct printbuf buf = PRINTBUF; + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + bch_err(c, "%s", buf.buf); + } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { prt_printf(&buf, "backpointer not found when deleting"); prt_newline(&buf); printbuf_indent_add(&buf, 2); @@ -251,132 +114,51 @@ btree: prt_newline(&buf); prt_printf(&buf, "got "); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_str(&buf, "alloc "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + bch2_bkey_val_to_text(&buf, c, bp_k); prt_newline(&buf); prt_printf(&buf, "for "); bch2_bkey_val_to_text(&buf, c, orig_k); - if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { - bch_err(c, "%s", buf.buf); - } else { - ret = -EIO; - bch2_trans_inconsistent(trans, "%s", buf.buf); - } - printbuf_exit(&buf); - goto err; + bch_err(c, "%s", buf.buf); } - ret = bch2_btree_delete_at(trans, &bp_iter, 0); -err: - bch2_trans_iter_exit(trans, &bp_iter); - return ret; + printbuf_exit(&buf); + + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + bch2_inconsistent_error(c); + return -EIO; + } else { + return 0; + } } -int bch2_bucket_backpointer_add(struct btree_trans *trans, - struct bkey_i_alloc_v4 *a, +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bkey_i_backpointer *bp_k, struct bch_backpointer bp, - struct bkey_s_c orig_k) + struct bkey_s_c orig_k, + bool insert) { - struct bch_fs *c = trans->c; - struct bch_dev *ca; - struct bch_backpointer *bps = alloc_v4_backpointers(&a->v); - unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v); - struct bkey_i_backpointer *bp_k; struct btree_iter bp_iter; struct bkey_s_c k; int ret; - /* Check for duplicates: */ - for (i = 0; i < nr; i++) { - int cmp = backpointer_cmp(bps[i], bp); - if (cmp >= 0) - break; - } - - if ((i && - (bps[i - 1].bucket_offset + - bps[i - 1].bucket_len > bp.bucket_offset)) || - (i < nr && - (bp.bucket_offset + bp.bucket_len > bps[i].bucket_offset))) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "overlapping backpointer found when inserting "); - bch2_backpointer_to_text(&buf, &bp); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "into "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - prt_newline(&buf); - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); - - if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) - bch_err(c, "%s", buf.buf); - else { - bch2_trans_inconsistent(trans, "%s", buf.buf); - printbuf_exit(&buf); - return -EIO; - } - } - - if (nr < 
BCH_ALLOC_V4_NR_BACKPOINTERS_MAX) { - array_insert_item(bps, nr, i, bp); - SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&a->v, nr); - set_alloc_v4_u64s(a); - return 0; - } - - /* Overflow: use backpointer btree */ - bp_k = bch2_trans_kmalloc(trans, sizeof(*bp_k)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; - - ca = bch_dev_bkey_exists(c, a->k.p.inode); - - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, a->k.p, bp.bucket_offset); - bp_k->v = bp; - - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, - BTREE_ITER_INTENT| - BTREE_ITER_SLOTS| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&bp_iter); + k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bp_k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); ret = bkey_err(k); if (ret) goto err; - if (k.k->type) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "existing btree backpointer key found when inserting "); - bch2_backpointer_to_text(&buf, &bp); - prt_newline(&buf); - printbuf_indent_add(&buf, 2); - - prt_printf(&buf, "found "); - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - - prt_printf(&buf, "for "); - bch2_bkey_val_to_text(&buf, c, orig_k); - - if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) - bch_err(c, "%s", buf.buf); - else { - bch2_trans_inconsistent(trans, "%s", buf.buf); - printbuf_exit(&buf); - ret = -EIO; + if (insert + ? k.k->type + : (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { + ret = backpointer_mod_err(trans, bp, k, orig_k, insert); + if (ret) goto err; - } } ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); @@ -390,62 +172,44 @@ err: */ int bch2_get_next_backpointer(struct btree_trans *trans, struct bpos bucket, int gen, - u64 *bp_offset, - struct bch_backpointer *dst, + struct bpos *bp_pos, + struct bch_backpointer *bp, unsigned iter_flags) { struct bch_fs *c = trans->c; - struct bpos bp_pos, bp_end_pos; - struct btree_iter alloc_iter, bp_iter = { NULL }; + struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; struct bkey_s_c k; - struct bkey_s_c_alloc_v4 a; - size_t i; - int ret; - - if (*bp_offset == U64_MAX) - return 0; - - bp_pos = bucket_pos_to_bp(c, bucket, - max(*bp_offset, BACKPOINTER_OFFSET_MAX) - BACKPOINTER_OFFSET_MAX); - bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); - - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(k); - if (ret) - goto out; - - if (k.k->type != KEY_TYPE_alloc_v4) - goto done; + int ret = 0; - a = bkey_s_c_to_alloc_v4(k); - if (gen >= 0 && a.v->gen != gen) + if (bpos_ge(*bp_pos, bp_end_pos)) goto done; - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) { - if (alloc_v4_backpointers_c(a.v)[i].bucket_offset < *bp_offset) - continue; + if (gen >= 0) { + k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED|iter_flags); + ret = bkey_err(k); + if (ret) + goto out; - *dst = alloc_v4_backpointers_c(a.v)[i]; - *bp_offset = dst->bucket_offset; - goto out; + if (k.k->type != KEY_TYPE_alloc_v4 || + bkey_s_c_to_alloc_v4(k).v->gen != gen) + goto done; } + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, - bp_pos, 0, k, ret) { - if (bpos_cmp(k.k->p, bp_end_pos) >= 0) + *bp_pos, iter_flags, 
k, ret) { + if (bpos_ge(k.k->p, bp_end_pos)) break; - if (k.k->type != KEY_TYPE_backpointer) - continue; - - *dst = *bkey_s_c_to_backpointer(k).v; - *bp_offset = dst->bucket_offset + BACKPOINTER_OFFSET_MAX; + *bp_pos = k.k->p; + *bp = *bkey_s_c_to_backpointer(k).v; goto out; } done: - *bp_offset = U64_MAX; + *bp_pos = SPOS_MAX; out: bch2_trans_iter_exit(trans, &bp_iter); bch2_trans_iter_exit(trans, &alloc_iter); @@ -453,14 +217,17 @@ out: } static void backpointer_not_found(struct btree_trans *trans, - struct bpos bucket, - u64 bp_offset, + struct bpos bp_pos, struct bch_backpointer bp, struct bkey_s_c k, const char *thing_it_points_to) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + + if (likely(!bch2_backpointers_no_use_write_buffer)) + return; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", thing_it_points_to); @@ -468,19 +235,14 @@ static void backpointer_not_found(struct btree_trans *trans, bch2_bpos_to_text(&buf, bucket); prt_printf(&buf, "\n "); - if (bp_offset >= BACKPOINTER_OFFSET_MAX) { - struct bpos bp_pos = - bucket_pos_to_bp(c, bucket, - bp_offset - BACKPOINTER_OFFSET_MAX); - prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp_pos); - prt_printf(&buf, "\n "); - } + prt_printf(&buf, "backpointer pos: "); + bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); bch2_backpointer_to_text(&buf, &bp); prt_printf(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, k); - if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) + if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) bch_err_ratelimited(c, "%s", buf.buf); else bch2_trans_inconsistent(trans, "%s", buf.buf); @@ -490,64 +252,69 @@ static void backpointer_not_found(struct btree_trans *trans, struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct btree_iter *iter, - struct bpos bucket, - u64 bp_offset, - struct bch_backpointer bp) + struct bpos bp_pos, + struct bch_backpointer bp, + unsigned iter_flags) { struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, bp.btree_id); + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); struct bkey_s_c k; bch2_trans_node_iter_init(trans, iter, bp.btree_id, bp.pos, 0, - min(bp.level, c->btree_roots[bp.btree_id].level), - 0); + min(bp.level, r->level), + iter_flags); k = bch2_btree_iter_peek_slot(iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; } - if (bp.level == c->btree_roots[bp.btree_id].level + 1) - k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); + if (bp.level == r->level + 1) + k = bkey_i_to_s_c(&r->key); if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) return k; bch2_trans_iter_exit(trans, iter); - if (bp.level) { - struct btree *b; - - /* - * If a backpointer for a btree node wasn't found, it may be - * because it was overwritten by a new btree node that hasn't - * been written out yet - backpointer_get_node() checks for - * this: - */ - b = bch2_backpointer_get_node(trans, iter, bucket, bp_offset, bp); - if (!IS_ERR_OR_NULL(b)) - return bkey_i_to_s_c(&b->key); - - bch2_trans_iter_exit(trans, iter); + if (unlikely(bch2_backpointers_no_use_write_buffer)) { + if (bp.level) { + struct btree *b; + + /* + * If a backpointer for a btree node wasn't found, it may be + * because it was overwritten by a new btree node that hasn't + * been written out yet - backpointer_get_node() checks for + * this: + */ + b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + if 
(!IS_ERR_OR_NULL(b)) + return bkey_i_to_s_c(&b->key); + + bch2_trans_iter_exit(trans, iter); + + if (IS_ERR(b)) + return bkey_s_c_err(PTR_ERR(b)); + return bkey_s_c_null; + } - if (IS_ERR(b)) - return bkey_s_c_err(PTR_ERR(b)); - return bkey_s_c_null; + backpointer_not_found(trans, bp_pos, bp, k, "extent"); } - backpointer_not_found(trans, bucket, bp_offset, bp, k, "extent"); return bkey_s_c_null; } struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct btree_iter *iter, - struct bpos bucket, - u64 bp_offset, + struct bpos bp_pos, struct bch_backpointer bp) { struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); struct btree *b; BUG_ON(!bp.level); @@ -570,7 +337,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, if (b && btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bucket, bp_offset, bp, + backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key), "btree node"); b = NULL; } @@ -598,10 +365,8 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ ca = bch_dev_bkey_exists(c, k.k->p.inode); - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - bp_pos_to_bucket(c, k.k->p), 0); - - alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, + bp_pos_to_bucket(c, k.k->p), 0); ret = bkey_err(alloc_k); if (ret) goto out; @@ -625,88 +390,81 @@ int bch2_check_btree_backpointers(struct bch_fs *c) { struct btree_iter iter; struct bkey_s_c k; + int ret; - return bch2_trans_run(c, + ret = bch2_trans_run(c, for_each_btree_key_commit(&trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, bch2_check_btree_backpointer(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; } +struct bpos_level { + unsigned level; + struct bpos pos; +}; + static int check_bp_exists(struct btree_trans *trans, - struct bpos bucket_pos, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, struct bpos bucket_start, - struct bpos bucket_end) + struct bpos bucket_end, + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter, bp_iter = { NULL }; + struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; - struct bkey_s_c alloc_k, bp_k; + struct bkey_s_c bp_k; int ret; - if (bpos_cmp(bucket_pos, bucket_start) < 0 || - bpos_cmp(bucket_pos, bucket_end) > 0) + if (bpos_lt(bucket, bucket_start) || + bpos_gt(bucket, bucket_end)) return 0; - bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, bucket_pos, 0); - alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); - ret = bkey_err(alloc_k); - if (ret) - goto err; - - if (alloc_k.k->type == KEY_TYPE_alloc_v4) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(alloc_k); - const struct bch_backpointer *bps = alloc_v4_backpointers_c(a.v); - unsigned i, nr = BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); - - for (i = 0; i < nr; i++) { - int cmp = backpointer_cmp(bps[i], bp) ?: - memcmp(&bps[i], &bp, sizeof(bp)); - if (!cmp) - goto out; - if (cmp >= 0) - break; - } - } else { + if (!bch2_dev_bucket_exists(c, bucket)) goto missing; - } - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(c, bucket_pos, bp.bucket_offset), - 0); - bp_k = bch2_btree_iter_peek_slot(&bp_iter); + bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket, bp.bucket_offset), + 
0); ret = bkey_err(bp_k); if (ret) goto err; if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { + if (last_flushed->level != bp.level || + !bpos_eq(last_flushed->pos, orig_k.k->p)) { + last_flushed->level = bp.level; + last_flushed->pos = orig_k.k->p; + + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } goto missing; + } out: err: fsck_err: bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; missing: prt_printf(&buf, "missing backpointer for btree=%s l=%u ", bch2_btree_ids[bp.btree_id], bp.level); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\nin alloc key "); - bch2_bkey_val_to_text(&buf, c, alloc_k); + prt_printf(&buf, "\nbp pos "); + bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->sb.version < bcachefs_metadata_version_backpointers || + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || c->opts.reconstruct_alloc || - fsck_err(c, "%s", buf.buf)) { - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, alloc_k); - - ret = PTR_ERR_OR_ZERO(a) ?: - bch2_bucket_backpointer_add(trans, a, bp, orig_k) ?: - bch2_trans_update(trans, &alloc_iter, &a->k_i, 0); - } + fsck_err(c, "%s", buf.buf)) + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; } @@ -714,7 +472,8 @@ missing: static int check_extent_to_backpointers(struct btree_trans *trans, struct btree_iter *iter, struct bpos bucket_start, - struct bpos bucket_end) + struct bpos bucket_end, + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs; @@ -741,7 +500,9 @@ static int check_extent_to_backpointers(struct btree_trans *trans, bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + ret = check_bp_exists(trans, bucket_pos, bp, k, + bucket_start, bucket_end, + last_flushed); if (ret) return ret; } @@ -752,9 +513,11 @@ static int check_extent_to_backpointers(struct btree_trans *trans, static int check_btree_root_to_backpointers(struct btree_trans *trans, enum btree_id btree_id, struct bpos bucket_start, - struct bpos bucket_end) + struct bpos bucket_end, + struct bpos_level *last_flushed) { struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, btree_id); struct btree_iter iter; struct btree *b; struct bkey_s_c k; @@ -763,8 +526,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, const union bch_extent_entry *entry; int ret; - bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, - c->btree_roots[btree_id].level, 0); + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); b = bch2_btree_iter_peek_node(&iter); ret = PTR_ERR_OR_ZERO(b); if (ret) @@ -781,10 +543,12 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, if (p.ptr.cached) continue; - bch2_extent_ptr_to_bp(c, iter.btree_id, iter.path->level + 1, + bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, k, p, &bucket_pos, &bp); - ret = check_bp_exists(trans, bucket_pos, bp, k, bucket_start, bucket_end); + ret = check_bp_exists(trans, bucket_pos, bp, k, + bucket_start, bucket_end, + last_flushed); if (ret) goto err; } @@ -808,13 +572,13 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c) si_meminfo(&i); mem_bytes = i.totalram * 
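In check_bp_exists() above, a lookup miss may only mean that the matching backpointer update is still queued in the btree write buffer, so the first miss for a given extent flushes the write buffer and restarts the transaction, and only a repeated miss at the same position is reported as a genuinely missing backpointer. A condensed sketch of that flush-once-then-recheck pattern, with hypothetical lookup()/flush() callbacks and a plain struct standing in for the btree iterator and write-buffer machinery:

	#include <errno.h>
	#include <stdbool.h>

	struct probe_pos { unsigned level; long long pos; };

	static int check_exists_once(struct probe_pos want,
				     struct probe_pos *last_flushed,
				     bool (*lookup)(struct probe_pos),
				     int (*flush)(void))
	{
		if (lookup(want))
			return 0;			/* backpointer present */

		if (last_flushed->level != want.level ||
		    last_flushed->pos   != want.pos) {
			/* first miss for this key: flush pending updates, retry */
			*last_flushed = want;
			return flush() ?: -EAGAIN;	/* caller restarts the check */
		}

		return -ENOENT;				/* still missing after a flush */
	}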
i.mem_unit; - return (mem_bytes >> 1) / btree_bytes(c); + return div_u64(mem_bytes >> 1, btree_bytes(c)); } -int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, - struct bbpos start, struct bbpos *end) +static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, + unsigned btree_leaf_mask, + unsigned btree_interior_mask, + struct bbpos start, struct bbpos *end) { struct btree_iter iter; struct bkey_s_c k; @@ -862,11 +626,13 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct bpos bucket_start, struct bpos bucket_end) { + struct bch_fs *c = trans->c; struct btree_iter iter; enum btree_id btree_id; + struct bpos_level last_flushed = { UINT_MAX }; int ret = 0; - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1; bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, @@ -879,7 +645,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, check_extent_to_backpointers(trans, &iter, - bucket_start, bucket_end)); + bucket_start, bucket_end, + &last_flushed)); if (ret) break; } while (!bch2_btree_iter_advance(&iter)); @@ -893,15 +660,24 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL, check_btree_root_to_backpointers(trans, btree_id, - bucket_start, bucket_end)); + bucket_start, bucket_end, + &last_flushed)); if (ret) break; } return ret; } -int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) +static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, + struct bpos bucket) +{ + return bch2_dev_exists2(c, bucket.inode) + ? bucket_pos_to_bp(c, bucket, 0) + : bucket; +} + +static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, + struct bpos start, struct bpos *end) { struct btree_iter alloc_iter; struct btree_iter bp_iter; @@ -913,7 +689,7 @@ int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, start, 0, 1, 0); bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(trans->c, start, 0), 0, 1, 0); + bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); while (1) { alloc_k = !alloc_end ? 
__bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) @@ -934,8 +710,8 @@ int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, break; } - if (bpos_cmp(alloc_iter.pos, SPOS_MAX) && - bpos_cmp(bucket_pos_to_bp(trans->c, alloc_iter.pos, 0), bp_iter.pos) < 0) { + if (bpos_lt(alloc_iter.pos, SPOS_MAX) && + bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { if (!bch2_btree_iter_advance(&alloc_iter)) alloc_end = true; } else { @@ -960,11 +736,11 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) if (ret) break; - if (!bpos_cmp(start, POS_MIN) && bpos_cmp(end, SPOS_MAX)) + if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (bpos_cmp(start, POS_MIN) || bpos_cmp(end, SPOS_MAX)) { + if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); @@ -977,56 +753,58 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) } ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); - if (ret || !bpos_cmp(end, SPOS_MAX)) + if (ret || bpos_eq(end, SPOS_MAX)) break; start = bpos_successor(end); } bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } static int check_one_backpointer(struct btree_trans *trans, - struct bpos bucket, - u64 *bp_offset, struct bbpos start, - struct bbpos end) + struct bbpos end, + struct bkey_s_c_backpointer bp, + struct bpos *last_flushed_pos) { + struct bch_fs *c = trans->c; struct btree_iter iter; - struct bch_backpointer bp; - struct bbpos pos; + struct bbpos pos = bp_to_bbpos(*bp.v); struct bkey_s_c k; struct printbuf buf = PRINTBUF; int ret; - ret = bch2_get_next_backpointer(trans, bucket, -1, bp_offset, &bp, 0); - if (ret || *bp_offset == U64_MAX) - return ret; - - pos = bp_to_bbpos(bp); if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) return 0; - k = bch2_backpointer_get_key(trans, &iter, bucket, *bp_offset, bp); + k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; if (ret) return ret; - if (fsck_err_on(!k.k, trans->c, - "%s backpointer points to missing extent\n%s", - *bp_offset < BACKPOINTER_OFFSET_MAX ? 
"alloc" : "btree", - (bch2_backpointer_to_text(&buf, &bp), buf.buf))) { - ret = bch2_backpointer_del_by_offset(trans, bucket, *bp_offset, bp); - if (ret == -ENOENT) - bch_err(trans->c, "backpointer at %llu not found", *bp_offset); + if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { + *last_flushed_pos = bp.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; } - bch2_trans_iter_exit(trans, &iter); + if (fsck_err_on(!k.k, c, + "backpointer for missing extent\n %s", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; + } +out: fsck_err: + bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; } @@ -1037,24 +815,14 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, { struct btree_iter iter; struct bkey_s_c k; - int ret = 0; - - for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - u64 bp_offset = 0; - - while (!(ret = commit_do(trans, NULL, NULL, - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_NOFAIL, - check_one_backpointer(trans, iter.pos, &bp_offset, start, end))) && - bp_offset < U64_MAX) - bp_offset++; - - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - return ret < 0 ? ret : 0; + struct bpos last_flushed_pos = SPOS_MAX; + + return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_one_backpointer(trans, start, end, + bkey_s_c_to_backpointer(k), + &last_flushed_pos)); } int bch2_check_backpointers_to_extents(struct bch_fs *c) @@ -1099,5 +867,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h index 48a48b7..547e061 100644 --- a/libbcachefs/backpointers.h +++ b/libbcachefs/backpointers.h @@ -2,11 +2,13 @@ #ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H #define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#include "btree_iter.h" +#include "btree_update.h" #include "buckets.h" #include "super.h" int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, - int, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); @@ -15,16 +17,90 @@ void bch2_backpointer_swab(struct bkey_s); .key_invalid = bch2_backpointer_invalid, \ .val_to_text = bch2_backpointer_k_to_text, \ .swab = bch2_backpointer_swab, \ + .min_val_size = 32, \ }) #define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 +/* + * Convert from pos in backpointer btree to pos of corresponding bucket in alloc + * btree: + */ +static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, + struct bpos bp_pos) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); +} + +/* + * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: + */ +static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, + struct bpos bucket, + u64 bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + struct bpos ret; + + ret = POS(bucket.inode, + 
(bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + + EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); + + return ret; +} + +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *, + struct bch_backpointer, struct bkey_s_c, bool); + +static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, + struct bpos bucket, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct bkey_i_backpointer *bp_k; + int ret; + + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + + if (unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert); + + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); +} + +static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p) +{ + return level ? BCH_DATA_btree : + p.has_ec ? BCH_DATA_stripe : + BCH_DATA_user; +} + static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, struct bpos *bucket_pos, struct bch_backpointer *bp) { - enum bch_data_type data_type = level ? BCH_DATA_btree : BCH_DATA_user; + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); s64 sectors = level ? btree_sectors(c) : k.k->size; u32 bucket_offset; @@ -40,16 +116,13 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, }; } -int bch2_bucket_backpointer_del(struct btree_trans *, struct bkey_i_alloc_v4 *, - struct bch_backpointer, struct bkey_s_c); -int bch2_bucket_backpointer_add(struct btree_trans *, struct bkey_i_alloc_v4 *, - struct bch_backpointer, struct bkey_s_c); int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, - u64 *, struct bch_backpointer *, unsigned); + struct bpos *, struct bch_backpointer *, unsigned); struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, - struct bpos, u64, struct bch_backpointer); + struct bpos, struct bch_backpointer, + unsigned); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, - struct bpos, u64, struct bch_backpointer); + struct bpos, struct bch_backpointer); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 3334100..e1f1e8e 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -206,12 +206,21 @@ #include "bcachefs_format.h" #include "errcode.h" #include "fifo.h" -#include "nocow_locking.h" +#include "nocow_locking_types.h" #include "opts.h" +#include "recovery_types.h" +#include "seqmutex.h" #include "util.h" +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_WRITE_REF_DEBUG +#endif + +#ifndef dynamic_fault #define dynamic_fault(...) 0 -#define race_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") #define trace_and_count(_c, _name, ...) \ do { \ @@ -283,6 +292,11 @@ do { \ #define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) 
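bp_pos_to_bucket() and bucket_pos_to_bp() above are exact inverses: a backpointer key's offset is the bucket's starting sector shifted left by MAX_EXTENT_COMPRESS_RATIO_SHIFT plus an offset within the bucket, so every bucket owns a contiguous, non-overlapping range of backpointer positions. A small worked example of the arithmetic, assuming a hypothetical 512-sector bucket size and treating bucket_to_sector()/sector_to_bucket() as the plain multiply/divide by bucket size:

	#include <assert.h>
	#include <stdint.h>

	#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10

	int main(void)
	{
		const uint64_t bucket_size = 512;	/* sectors, hypothetical */
		uint64_t bucket = 10, bucket_offset = 3;

		/* bucket_pos_to_bp(): bucket 10 starts at sector 5120, so its
		 * backpointers occupy offsets [5120 << 10, 5632 << 10) */
		uint64_t bp_offset = ((bucket * bucket_size)
				      << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset;
		assert(bp_offset == 5242883);

		/* bp_pos_to_bucket(): shift right to recover the sector, divide
		 * by the bucket size to recover the bucket index */
		assert((bp_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) / bucket_size == bucket);
		return 0;
	}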
\ printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) +#define bch_err_fn(_c, _ret) \ + bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) +#define bch_err_msg(_c, _ret, _msg) \ + bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret)) + #define bch_verbose(c, fmt, ...) \ do { \ if ((c)->opts.verbose) \ @@ -312,7 +326,10 @@ do { \ "done in memory") \ BCH_DEBUG_PARAM(verify_all_btree_replicas, \ "When reading btree nodes, read all replicas and " \ - "compare them") + "compare them") \ + BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ + "Don't use the write buffer for backpointers, enabling "\ + "extra runtime checks") /* Parameters that should only be compiled in debug mode: */ #define BCH_DEBUG_PARAMS_DEBUG() \ @@ -389,6 +406,7 @@ enum bch_time_stats { #include "alloc_types.h" #include "btree_types.h" +#include "btree_write_buffer_types.h" #include "buckets_types.h" #include "buckets_waiting_for_journal_types.h" #include "clock_types.h" @@ -433,6 +451,9 @@ enum gc_phase { GC_PHASE_BTREE_freespace, GC_PHASE_BTREE_need_discard, GC_PHASE_BTREE_backpointers, + GC_PHASE_BTREE_bucket_gens, + GC_PHASE_BTREE_snapshot_trees, + GC_PHASE_BTREE_deleted_inodes, GC_PHASE_PENDING_DELETE, }; @@ -470,7 +491,7 @@ struct bch_dev { * Committed by bch2_write_super() -> bch_fs_mi_update() */ struct bch_member_cpu mi; - uuid_le uuid; + __uuid_t uuid; char name[BDEVNAME_SIZE]; struct bch_sb_handle disk_sb; @@ -502,14 +523,11 @@ struct bch_dev { /* Allocator: */ u64 new_fs_bucket_idx; - u64 bucket_alloc_trans_early_cursor; + u64 alloc_cursor; unsigned nr_open_buckets; unsigned nr_btree_reserve; - open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; - open_bucket_idx_t open_buckets_partial_nr; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; size_t buckets_waiting_on_journal; @@ -523,7 +541,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct time_stats io_latency[2]; + struct bch2_time_stats io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; @@ -542,15 +560,11 @@ enum { /* shutdown: */ BCH_FS_STOPPING, BCH_FS_EMERGENCY_RO, + BCH_FS_GOING_RO, BCH_FS_WRITE_DISABLE_COMPLETE, BCH_FS_CLEAN_SHUTDOWN, /* fsck passes: */ - BCH_FS_TOPOLOGY_REPAIR_DONE, - BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ - BCH_FS_CHECK_LRUS_DONE, - BCH_FS_CHECK_BACKPOINTERS_DONE, - BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, BCH_FS_FSCK_DONE, BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ BCH_FS_NEED_ANOTHER_GC, @@ -571,9 +585,10 @@ struct btree_debug { #define BCH_TRANSACTIONS_NR 128 struct btree_transaction_stats { + struct bch2_time_stats lock_hold_times; struct mutex lock; - struct time_stats lock_hold_times; unsigned nr_max_paths; + unsigned wb_updates_size; unsigned max_mem; char *max_paths_text; }; @@ -617,21 +632,31 @@ struct btree_path_buf { #define REPLICAS_DELTA_LIST_MAX (1U << 16) -struct snapshot_t { - u32 parent; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 equiv; -}; - -typedef struct { - u32 subvol; - u64 inum; -} subvol_inum; - #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) +#define BCH_WRITE_REFS() \ + x(trans) \ + x(write) \ + x(promote) \ + x(node_rewrite) \ + x(stripe_create) \ + x(stripe_delete) \ + x(reflink) \ + x(fallocate) \ + x(discard) \ + x(invalidate) \ + x(delete_dead_snapshots) \ + x(snapshot_delete_pagecache) \ + x(sysfs) + +enum bch_write_ref { 
+#define x(n) BCH_WRITE_REF_##n, + BCH_WRITE_REFS() +#undef x + BCH_WRITE_REF_NR, +}; + struct bch_fs { struct closure cl; @@ -653,7 +678,11 @@ struct bch_fs { struct rw_semaphore state_lock; /* Counts outstanding writes, for clean transition to read-only */ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_t writes[BCH_WRITE_REF_NR]; +#else struct percpu_ref writes; +#endif struct work_struct read_only_work; struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; @@ -674,11 +703,12 @@ struct bch_fs { /* Updated by bch2_sb_update():*/ struct { - uuid_le uuid; - uuid_le user_uuid; + __uuid_t uuid; + __uuid_t user_uuid; u16 version; u16 version_min; + u16 version_upgrade_complete; u8 nr_devices; u8 clean; @@ -704,9 +734,10 @@ struct bch_fs { struct mutex sb_lock; /* snapshot.c: */ - GENRADIX(struct snapshot_t) snapshots; - struct bch_snapshot_table __rcu *snapshot_table; + struct snapshot_table __rcu *snapshots; + size_t snapshot_table_size; struct mutex snapshot_table_lock; + struct work_struct snapshot_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work; snapshot_id_list snapshots_unlinked; @@ -716,7 +747,8 @@ struct bch_fs { struct bio_set btree_bio; struct workqueue_struct *io_complete_wq; - struct btree_root btree_roots[BTREE_ID_NR]; + struct btree_root btree_roots_known[BTREE_ID_NR]; + DARRAY(struct btree_root) btree_roots_extra; struct mutex btree_root_lock; struct btree_cache btree_cache; @@ -741,6 +773,9 @@ struct bch_fs { struct workqueue_struct *btree_interior_update_worker; struct work_struct btree_interior_update_work; + struct list_head pending_node_rewrites; + struct mutex pending_node_rewrites_lock; + /* btree_io.c: */ spinlock_t btree_write_error_lock; struct btree_write_stats { @@ -749,7 +784,7 @@ struct bch_fs { } btree_write_stats[BTREE_WRITE_TYPE_NR]; /* btree_iter.c: */ - struct mutex btree_trans_lock; + struct seqmutex btree_trans_lock; struct list_head btree_trans_list; mempool_t btree_paths_pool; mempool_t btree_trans_mem_pool; @@ -761,10 +796,18 @@ struct bch_fs { struct btree_key_cache btree_key_cache; unsigned btree_key_cache_btrees; + struct btree_write_buffer btree_write_buffer; + struct workqueue_struct *btree_update_wq; struct workqueue_struct *btree_io_complete_wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; + /* + * Use a dedicated wq for write ref holder tasks. Required to avoid + * dependency problems with other wq tasks that can block on ref + * draining, such as read-only transition. 
+ */ + struct workqueue_struct *write_ref_wq; /* ALLOCATION */ struct bch_devs_mask rw_devs[BCH_DATA_NR]; @@ -815,6 +858,9 @@ struct bch_fs { struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; + struct write_point btree_write_point; struct write_point rebalance_write_point; @@ -855,6 +901,7 @@ struct bch_fs { struct mutex gc_gens_lock; /* IO PATH */ + struct semaphore io_in_flight; struct bio_set bio_read; struct bio_set bio_read_split; struct bio_set bio_write; @@ -877,27 +924,33 @@ struct bch_fs { mempool_t large_bkey_pool; + /* MOVE.C */ + struct list_head moving_context_list; + struct mutex moving_context_lock; + + struct list_head data_progress_list; + struct mutex data_progress_lock; + /* REBALANCE */ struct bch_fs_rebalance rebalance; /* COPYGC */ struct task_struct *copygc_thread; - copygc_heap copygc_heap; struct write_point copygc_write_point; + s64 copygc_wait_at; s64 copygc_wait; bool copygc_running; wait_queue_head_t copygc_running_wq; - /* DATA PROGRESS STATS */ - struct list_head data_progress_list; - struct mutex data_progress_lock; - /* STRIPES: */ GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; + struct hlist_head ec_stripes_new[32]; + spinlock_t ec_stripes_new_lock; + ec_stripes_heap ec_stripes_heap; - spinlock_t ec_stripes_heap_lock; + struct mutex ec_stripes_heap_lock; /* ERASURE CODING */ struct list_head ec_stripe_head_list; @@ -905,20 +958,23 @@ struct bch_fs { struct list_head ec_stripe_new_list; struct mutex ec_stripe_new_lock; + wait_queue_head_t ec_stripe_new_wait; struct work_struct ec_stripe_create_work; u64 ec_stripe_hint; - struct bio_set ec_bioset; - struct work_struct ec_stripe_delete_work; - struct llist_head ec_stripe_delete_list; + + struct bio_set ec_bioset; /* REFLINK */ - u64 reflink_hint; reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; + /* fs.c */ + struct list_head vfs_inodes_list; + struct mutex vfs_inodes_lock; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; @@ -933,6 +989,13 @@ struct bch_fs { /* QUOTAS */ struct bch_memquota_type quotas[QTYP_NR]; + /* RECOVERY */ + u64 journal_replay_seq_start; + u64 journal_replay_seq_end; + enum bch_recovery_pass curr_recovery_pass; + /* bitmap of explicitly enabled recovery passes: */ + u64 recovery_passes_explicit; + /* DEBUG JUNK */ struct dentry *fs_debug_dir; struct dentry *btree_debug_dir; @@ -967,11 +1030,51 @@ struct bch_fs { unsigned copy_gc_enabled:1; bool promote_whole_extents; - struct time_stats times[BCH_TIME_STAT_NR]; + struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; }; +extern struct wait_queue_head bch2_read_only_wait; + +static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_inc(&c->writes[ref]); +#else + percpu_ref_get(&c->writes); +#endif +} + +static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_GOING_RO, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget_live(&c->writes); +#endif +} + +static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + long v = atomic_long_dec_return(&c->writes[ref]); + + BUG_ON(v < 0); + if (v) 
+ return; + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + if (atomic_long_read(&c->writes[i])) + return; + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch2_read_only_wait); +#else + percpu_ref_put(&c->writes); +#endif +} + static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) { #ifndef NO_BCACHEFS_FS @@ -1036,4 +1139,23 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) return dev < c->sb.nr_devices && c->devs[dev]; } +/* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + c->recovery_passes_explicit |= BIT_ULL(pass); + + if (c->curr_recovery_pass >= pass) { + c->curr_recovery_pass = pass; + return -BCH_ERR_restart_recovery; + } else { + return 0; + } +} + +#define BKEY_PADDED_ONSTACK(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + #endif /* _BCACHEFS_H */ diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 0aa522b..5ec218e 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -78,6 +78,10 @@ #include #include "vstructs.h" +#ifdef __KERNEL__ +typedef uuid_t __uuid_t; +#endif + #define BITMASK(name, type, field, offset, end) \ static const unsigned name##_OFFSET = offset; \ static const unsigned name##_BITS = (end - offset); \ @@ -250,6 +254,11 @@ struct bkey_packed { __u8 pad[sizeof(struct bkey) - 3]; } __packed __aligned(8); +typedef struct { + __le64 lo; + __le64 hi; +} bch_le128; + #define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) #define BKEY_U64s_MAX U8_MAX #define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) @@ -290,16 +299,8 @@ enum bch_bkey_fields { struct bkey_i { __u64 _data[0]; - union { - struct { - /* Size of combined key and value, in u64s */ - __u8 u64s; - }; - struct { - struct bkey k; - struct bch_val v; - }; - }; + struct bkey k; + struct bch_val v; }; #define KEY(_inode, _offset, _size) \ @@ -318,7 +319,7 @@ static inline void bkey_init(struct bkey *k) #define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) #define __BKEY_PADDED(key, pad) \ - struct { struct bkey_i key; __u64 key ## _pad[pad]; } + struct bkey_i key; __u64 key ## _pad[pad] /* * - DELETED keys are used internally to mark keys that should be ignored but @@ -367,7 +368,9 @@ static inline void bkey_init(struct bkey *k) x(lru, 26) \ x(alloc_v4, 27) \ x(backpointer, 28) \ - x(inode_v3, 29) + x(inode_v3, 29) \ + x(bucket_gens, 30) \ + x(snapshot_tree, 31) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -485,8 +488,9 @@ struct bch_csum { x(crc32, 1) \ x(crc64, 2) \ x(crc128, 3) \ - x(stripe_ptr, 4) -#define BCH_EXTENT_ENTRY_MAX 5 + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -621,6 +625,20 @@ struct bch_extent_reservation { #endif }; +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:7, + unused:33, + compression:8, + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:33, + type:7; +#endif +}; + union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 unsigned long type; @@ -677,7 +695,7 @@ struct bch_reservation { /* Maximum size (in u64s) a single pointer could be: */ #define BKEY_EXTENT_PTR_U64s_MAX\ ((sizeof(struct bch_extent_crc128) + \ - sizeof(struct bch_extent_ptr)) / sizeof(u64)) + sizeof(struct 
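With BCH_WRITE_REF_DEBUG set, the write refs above become one counter per BCH_WRITE_REFS() entry so a leaked reference can be attributed to a specific subsystem; without it they all map onto the single percpu ref, and callers use the same tryget/put pair either way. A hedged usage sketch, assuming a hypothetical operation that must not start once a read-only transition has begun:

	static int do_discard_work(struct bch_fs *c)
	{
		/* refuse to start new work once the fs is going read-only */
		if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard))
			return -EROFS;

		/* ... issue writes while the ref is held ... */

		/* dropping the last ref of every type completes the RO transition */
		bch2_write_ref_put(c, BCH_WRITE_REF_discard);
		return 0;
	}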
bch_extent_ptr)) / sizeof(__u64)) /* Maximum possible size of an entire extent value: */ #define BKEY_EXTENT_VAL_U64s_MAX \ @@ -689,7 +707,7 @@ struct bch_reservation { /* Btree pointers don't carry around checksums: */ #define BKEY_BTREE_PTR_VAL_U64s_MAX \ ((sizeof(struct bch_btree_ptr_v2) + \ - sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) #define BKEY_BTREE_PTR_U64s_MAX \ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) @@ -731,7 +749,7 @@ struct bch_inode_v3 { } __packed __aligned(8); #define INODEv3_FIELDS_START_INITIAL 6 -#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64)) +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) struct bch_inode_generation { struct bch_val v; @@ -898,7 +916,7 @@ struct bch_dirent { #define DT_SUBVOL 16 #define BCH_DT_MAX 17 -#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \ +#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \ sizeof(struct bkey) - \ offsetof(struct bch_dirent, d_name))) @@ -987,10 +1005,11 @@ struct bch_alloc_v4 { __u64 io_time[2]; __u32 stripe; __u32 nr_external_backpointers; + __u64 fragmentation_lru; } __packed __aligned(8); #define BCH_ALLOC_V4_U64s_V0 6 -#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) @@ -1009,6 +1028,15 @@ struct bch_backpointer { struct bpos pos; } __packed __aligned(8); +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + /* Quotas: */ enum quota_types { @@ -1098,6 +1126,9 @@ struct bch_subvolume { __le32 flags; __le32 snapshot; __le64 inode; + __le32 parent; + __le32 pad; + bch_le128 otime; }; LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) @@ -1116,7 +1147,9 @@ struct bch_snapshot { __le32 parent; __le32 children[2]; __le32 subvol; - __le32 pad; + __le32 tree; + __le32 depth; + __le32 skip[3]; }; LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) @@ -1124,6 +1157,19 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + /* LRU btree: */ struct bch_lru { @@ -1190,7 +1236,7 @@ struct bch_sb_field_journal_v2 { #define BCH_MIN_NR_NBUCKETS (1 << 6) struct bch_member { - uuid_le uuid; + __uuid_t uuid; __le64 nbuckets; /* device size */ __le16 first_bucket; /* index of first bucket used */ __le16 bucket_size; /* sectors */ @@ -1243,10 +1289,10 @@ struct bch_key { }; #define BCH_KEY_MAGIC \ - (((u64) 'b' << 0)|((u64) 'c' << 8)| \ - ((u64) 'h' << 16)|((u64) '*' << 24)| \ - ((u64) '*' << 32)|((u64) 'k' << 40)| \ - ((u64) 'e' << 48)|((u64) 'y' << 56)) + (((__u64) 'b' << 0)|((__u64) 'c' << 
8)| \ + ((__u64) 'h' << 16)|((__u64) '*' << 24)| \ + ((__u64) '*' << 32)|((__u64) 'k' << 40)| \ + ((__u64) 'e' << 48)|((__u64) 'y' << 56)) struct bch_encrypted_key { __le64 magic; @@ -1327,19 +1373,19 @@ static inline bool data_type_is_hidden(enum bch_data_type type) struct bch_replicas_entry_v0 { __u8 data_type; __u8 nr_devs; - __u8 devs[]; + __u8 devs[0]; } __packed; struct bch_sb_field_replicas_v0 { struct bch_sb_field field; - struct bch_replicas_entry_v0 entries[]; + struct bch_replicas_entry_v0 entries[0]; } __packed __aligned(8); struct bch_replicas_entry { __u8 data_type; __u8 nr_devs; __u8 nr_required; - __u8 devs[]; + __u8 devs[0]; } __packed; #define replicas_entry_bytes(_i) \ @@ -1426,7 +1472,7 @@ struct bch_sb_field_disk_groups { x(move_extent_read, 35) \ x(move_extent_write, 36) \ x(move_extent_finish, 37) \ - x(move_extent_race, 38) \ + x(move_extent_fail, 38) \ x(move_extent_alloc_mem_fail, 39) \ x(copygc, 40) \ x(copygc_wait, 41) \ @@ -1462,7 +1508,9 @@ struct bch_sb_field_disk_groups { x(trans_traverse_all, 71) \ x(transaction_commit, 72) \ x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, @@ -1528,41 +1576,80 @@ struct bch_sb_field_journal_seq_blacklist { * One common version number for all on disk data structures - superblock, btree * nodes, journal entries */ -#define BCH_JSET_VERSION_OLD 2 -#define BCH_BSET_VERSION_OLD 3 - -#define BCH_METADATA_VERSIONS() \ - x(bkey_renumber, 10) \ - x(inode_btree_change, 11) \ - x(snapshot, 12) \ - x(inode_backpointers, 13) \ - x(btree_ptr_sectors_written, 14) \ - x(snapshot_2, 15) \ - x(reflink_p_fix, 16) \ - x(subvol_dirent, 17) \ - x(inode_v2, 18) \ - x(freespace, 19) \ - x(alloc_v4, 20) \ - x(new_data_types, 21) \ - x(backpointers, 22) \ - x(inode_v3, 23) \ - x(unwritten_extents, 24) +#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10)) +#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) +#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) + +#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) + +#define BCH_METADATA_VERSIONS() \ + x(bkey_renumber, BCH_VERSION(0, 10), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_btree_change, BCH_VERSION(0, 11), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot, BCH_VERSION(0, 12), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_backpointers, BCH_VERSION(0, 13), \ + RECOVERY_PASS_ALL_FSCK) \ + x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_2, BCH_VERSION(0, 15), \ + BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ + BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(reflink_p_fix, BCH_VERSION(0, 16), \ + BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ + x(subvol_dirent, BCH_VERSION(0, 17), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v2, BCH_VERSION(0, 18), \ + RECOVERY_PASS_ALL_FSCK) \ + x(freespace, BCH_VERSION(0, 19), \ + RECOVERY_PASS_ALL_FSCK) \ + x(alloc_v4, BCH_VERSION(0, 20), \ + RECOVERY_PASS_ALL_FSCK) \ + x(new_data_types, BCH_VERSION(0, 21), \ + RECOVERY_PASS_ALL_FSCK) \ + x(backpointers, BCH_VERSION(0, 22), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v3, BCH_VERSION(0, 23), \ + RECOVERY_PASS_ALL_FSCK) \ + x(unwritten_extents, BCH_VERSION(0, 24), \ + RECOVERY_PASS_ALL_FSCK) \ + x(bucket_gens, BCH_VERSION(0, 25), \ + BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ + RECOVERY_PASS_ALL_FSCK) \ + 
x(lru_v2, BCH_VERSION(0, 26), \ + RECOVERY_PASS_ALL_FSCK) \ + x(fragmentation_lru, BCH_VERSION(0, 27), \ + RECOVERY_PASS_ALL_FSCK) \ + x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_trees, BCH_VERSION(0, 29), \ + RECOVERY_PASS_ALL_FSCK) \ + x(major_minor, BCH_VERSION(1, 0), \ + 0) \ + x(snapshot_skiplists, BCH_VERSION(1, 1), \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ + x(deleted_inodes, BCH_VERSION(1, 2), \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, -#define x(t, n) bcachefs_metadata_version_##t = n, +#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, BCH_METADATA_VERSIONS() #undef x bcachefs_metadata_version_max }; +static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; + #define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) #define BCH_SB_SECTOR 8 #define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ struct bch_sb_layout { - uuid_le magic; /* bcachefs superblock UUID */ + __uuid_t magic; /* bcachefs superblock UUID */ __u8 layout_type; __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ __u8 nr_superblocks; @@ -1578,7 +1665,7 @@ struct bch_sb_layout { * @version_min - Oldest metadata version this filesystem contains; so we can * safely drop compatibility code and refuse to mount filesystems * we'd need it for - * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) + * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC) * @seq - incremented each time superblock is written * @uuid - used for generating various magic numbers and identifying * member devices, never changes @@ -1593,9 +1680,9 @@ struct bch_sb { __le16 version; __le16 version_min; __le16 pad[2]; - uuid_le magic; - uuid_le uuid; - uuid_le user_uuid; + __uuid_t magic; + __uuid_t uuid; + __uuid_t user_uuid; __u8 label[BCH_SB_LABEL_SIZE]; __le64 offset; __le64 seq; @@ -1662,7 +1749,7 @@ LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); @@ -1682,7 +1769,7 @@ LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); -LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, struct bch_sb, flags[2], 0, 4); LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); @@ -1693,9 +1780,40 @@ LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); -/* Obsolete, always enabled: */ LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); +LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); +LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 
54, 56); + +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, + struct bch_sb, flags[4], 60, 64); + +LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, + struct bch_sb, flags[5], 0, 16); + +static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) +{ + return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); +} + +static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) +{ + SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); + SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); +} + +static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) +{ + return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | + (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); +} + +static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) +{ + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); +} /* * Features: @@ -1763,6 +1881,17 @@ enum bch_sb_compat { /* options: */ +#define BCH_VERSION_UPGRADE_OPTS() \ + x(compatible, 0) \ + x(incompatible, 1) \ + x(none, 2) + +enum bch_version_upgrade_opts { +#define x(t, n) BCH_VERSION_UPGRADE_##t = n, + BCH_VERSION_UPGRADE_OPTS() +#undef x +}; + #define BCH_REPLICAS_MAX 4U #define BCH_BKEY_PTRS_MAX 16U @@ -1892,8 +2021,11 @@ enum bch_compression_opts { */ #define BCACHE_MAGIC \ - UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ - 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) + UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \ + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) +#define BCHFS_MAGIC \ + UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) #define BCACHEFS_STATFS_MAGIC 0xca451a4e @@ -2008,7 +2140,7 @@ struct jset_entry_dev_usage { __le64 _buckets_unavailable; /* No longer used */ struct jset_entry_dev_usage_type d[]; -} __packed; +}; static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) { @@ -2064,24 +2196,69 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); /* Btree: */ -#define BCH_BTREE_IDS() \ - x(extents, 0) \ - x(inodes, 1) \ - x(dirents, 2) \ - x(xattrs, 3) \ - x(alloc, 4) \ - x(quotas, 5) \ - x(stripes, 6) \ - x(reflink, 7) \ - x(subvolumes, 8) \ - x(snapshots, 9) \ - x(lru, 10) \ - x(freespace, 11) \ - x(need_discard, 12) \ - x(backpointers, 13) +enum btree_id_flags { + BTREE_ID_EXTENTS = BIT(0), + BTREE_ID_SNAPSHOTS = BIT(1), + BTREE_ID_DATA = BIT(2), +}; + +#define BCH_BTREE_IDS() \ + x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_error)| \ + BIT_ULL(KEY_TYPE_cookie)| \ + BIT_ULL(KEY_TYPE_extent)| \ + BIT_ULL(KEY_TYPE_reservation)| \ + BIT_ULL(KEY_TYPE_reflink_p)| \ + BIT_ULL(KEY_TYPE_inline_data)) \ + x(inodes, 1, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_inode)| \ + BIT_ULL(KEY_TYPE_inode_v2)| \ + BIT_ULL(KEY_TYPE_inode_v3)| \ + BIT_ULL(KEY_TYPE_inode_generation)) \ + x(dirents, 2, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_hash_whiteout)| \ + BIT_ULL(KEY_TYPE_dirent)) \ + x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_cookie)| \ + BIT_ULL(KEY_TYPE_hash_whiteout)| \ + BIT_ULL(KEY_TYPE_xattr)) \ + x(alloc, 4, 0, \ + BIT_ULL(KEY_TYPE_alloc)| \ + BIT_ULL(KEY_TYPE_alloc_v2)| \ + BIT_ULL(KEY_TYPE_alloc_v3)| \ + BIT_ULL(KEY_TYPE_alloc_v4)) \ + x(quotas, 5, 0, \ + BIT_ULL(KEY_TYPE_quota)) \ + x(stripes, 6, 0, \ + 
BIT_ULL(KEY_TYPE_stripe)) \ + x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ + BIT_ULL(KEY_TYPE_reflink_v)| \ + BIT_ULL(KEY_TYPE_indirect_inline_data)) \ + x(subvolumes, 8, 0, \ + BIT_ULL(KEY_TYPE_subvolume)) \ + x(snapshots, 9, 0, \ + BIT_ULL(KEY_TYPE_snapshot)) \ + x(lru, 10, 0, \ + BIT_ULL(KEY_TYPE_set)) \ + x(freespace, 11, BTREE_ID_EXTENTS, \ + BIT_ULL(KEY_TYPE_set)) \ + x(need_discard, 12, 0, \ + BIT_ULL(KEY_TYPE_set)) \ + x(backpointers, 13, 0, \ + BIT_ULL(KEY_TYPE_backpointer)) \ + x(bucket_gens, 14, 0, \ + BIT_ULL(KEY_TYPE_bucket_gens)) \ + x(snapshot_trees, 15, 0, \ + BIT_ULL(KEY_TYPE_snapshot_tree)) \ + x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ + BIT_ULL(KEY_TYPE_set)) enum btree_id { -#define x(kwd, val) BTREE_ID_##kwd = val, +#define x(name, nr, ...) BTREE_ID_##name = nr, BCH_BTREE_IDS() #undef x BTREE_ID_NR @@ -2152,13 +2329,25 @@ struct btree_node { }; } __packed __aligned(8); -LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4); LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, struct btree_node, flags, 8, 9); -/* 9-32 unused */ +LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25); +/* 25-32 unused */ LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); +static inline __u64 BTREE_NODE_ID(struct btree_node *n) +{ + return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); +} + +static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) +{ + SET_BTREE_NODE_ID_LO(n, v); + SET_BTREE_NODE_ID_HI(n, v >> 4); +} + struct btree_node_entry { struct bch_csum csum; @@ -2168,7 +2357,6 @@ struct btree_node_entry { __u8 pad[22]; __le16 u64s; __u64 _data[0]; - }; }; } __packed __aligned(8); diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index ad47a50..f05881f 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -93,7 +93,7 @@ struct bch_ioctl_incremental { * this UUID. 
*/ struct bch_ioctl_query_uuid { - uuid_le uuid; + __uuid_t uuid; }; #if 0 diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index 630df06..ee7ba70 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -262,14 +262,6 @@ bool bch2_bkey_transform(const struct bkey_format *out_f, return true; } -#define bkey_fields() \ - x(BKEY_FIELD_INODE, p.inode) \ - x(BKEY_FIELD_OFFSET, p.offset) \ - x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ - x(BKEY_FIELD_SIZE, size) \ - x(BKEY_FIELD_VERSION_HI, version.hi) \ - x(BKEY_FIELD_VERSION_LO, version.lo) - struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, const struct bkey_packed *in) { @@ -331,15 +323,6 @@ bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, #define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; bkey_fields() #undef x - - /* - * Extents - we have to guarantee that if an extent is packed, a trimmed - * version will also pack: - */ - if (bkey_start_offset(in) < - le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) - return false; - pack_state_finish(&state, out); out->u64s = format->key_u64s + in->u64s - BKEY_U64s; out->format = KEY_FORMAT_LOCAL_BTREE; @@ -377,7 +360,7 @@ bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, memmove_u64s((u64 *) out + format->key_u64s, &in->v, bkey_val_u64s(&in->k)); - memcpy_u64s(out, &tmp, format->key_u64s); + memcpy_u64s_small(out, &tmp, format->key_u64s); return true; } @@ -505,18 +488,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) return BKEY_PACK_POS_FAIL; - if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) { in.offset = KEY_OFFSET_MAX; in.snapshot = KEY_SNAPSHOT_MAX; exact = false; } - if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) { in.snapshot = KEY_SNAPSHOT_MAX; exact = false; } - if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))) exact = false; pack_state_finish(&state, out); @@ -553,24 +536,6 @@ void bch2_bkey_format_init(struct bkey_format_state *s) s->field_min[BKEY_FIELD_SIZE] = 0; } -static void __bkey_format_add(struct bkey_format_state *s, - unsigned field, u64 v) -{ - s->field_min[field] = min(s->field_min[field], v); - s->field_max[field] = max(s->field_max[field], v); -} - -/* - * Changes @format so that @k can be successfully packed with @format - */ -void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -{ -#define x(id, field) __bkey_format_add(s, id, k->field); - bkey_fields() -#undef x - __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); -} - void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) { unsigned field = 0; @@ -759,7 +724,7 @@ unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) return 0; } -#ifdef CONFIG_X86_64 +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK #define I(_x) (*(out)++ = (_x)) #define I1(i0) I(i0) diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 19b59ff..e81fb3e 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -9,9 +9,17 @@ #include "util.h" #include "vstructs.h" +#if 0 + +/* + * compiled unpack functions are disabled, pending a new interface for + * dynamically allocating executable memory: + */ + #ifdef CONFIG_X86_64 #define 
HAVE_BCACHEFS_COMPILED_UNPACK 1 #endif +#endif void bch2_bkey_packed_to_binary_text(struct printbuf *, const struct bkey_format *, @@ -34,7 +42,12 @@ struct bkey_s { }; }; -#define bkey_next(_k) vstruct_next(_k) +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) (k->_data + k->k.u64s); +} #define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) @@ -89,17 +102,6 @@ do { \ struct btree; -struct bkey_format_state { - u64 field_min[BKEY_NR_FIELDS]; - u64 field_max[BKEY_NR_FIELDS]; -}; - -void bch2_bkey_format_init(struct bkey_format_state *); -void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); -void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -const char *bch2_bkey_format_validate(struct bkey_format *); - __pure unsigned bch2_bkey_greatest_differing_bit(const struct btree *, const struct bkey_packed *, @@ -147,6 +149,37 @@ static inline int bkey_cmp_left_packed_byval(const struct btree *b, return bkey_cmp_left_packed(b, l, &r); } +static __always_inline bool bpos_eq(struct bpos l, struct bpos r) +{ + return !((l.inode ^ r.inode) | + (l.offset ^ r.offset) | + (l.snapshot ^ r.snapshot)); +} + +static __always_inline bool bpos_lt(struct bpos l, struct bpos r) +{ + return l.inode != r.inode ? l.inode < r.inode : + l.offset != r.offset ? l.offset < r.offset : + l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; +} + +static __always_inline bool bpos_le(struct bpos l, struct bpos r) +{ + return l.inode != r.inode ? l.inode < r.inode : + l.offset != r.offset ? l.offset < r.offset : + l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; +} + +static __always_inline bool bpos_gt(struct bpos l, struct bpos r) +{ + return bpos_lt(r, l); +} + +static __always_inline bool bpos_ge(struct bpos l, struct bpos r) +{ + return bpos_le(r, l); +} + static __always_inline int bpos_cmp(struct bpos l, struct bpos r) { return cmp_int(l.inode, r.inode) ?: @@ -154,20 +187,60 @@ static __always_inline int bpos_cmp(struct bpos l, struct bpos r) cmp_int(l.snapshot, r.snapshot); } +static inline struct bpos bpos_min(struct bpos l, struct bpos r) +{ + return bpos_lt(l, r) ? l : r; +} + +static inline struct bpos bpos_max(struct bpos l, struct bpos r) +{ + return bpos_gt(l, r) ? l : r; +} + +static __always_inline bool bkey_eq(struct bpos l, struct bpos r) +{ + return !((l.inode ^ r.inode) | + (l.offset ^ r.offset)); +} + +static __always_inline bool bkey_lt(struct bpos l, struct bpos r) +{ + return l.inode != r.inode + ? l.inode < r.inode + : l.offset < r.offset; +} + +static __always_inline bool bkey_le(struct bpos l, struct bpos r) +{ + return l.inode != r.inode + ? l.inode < r.inode + : l.offset <= r.offset; +} + +static __always_inline bool bkey_gt(struct bpos l, struct bpos r) +{ + return bkey_lt(r, l); +} + +static __always_inline bool bkey_ge(struct bpos l, struct bpos r) +{ + return bkey_le(r, l); +} + static __always_inline int bkey_cmp(struct bpos l, struct bpos r) { return cmp_int(l.inode, r.inode) ?: cmp_int(l.offset, r.offset); } -static inline struct bpos bpos_min(struct bpos l, struct bpos r) +static inline struct bpos bkey_min(struct bpos l, struct bpos r) { - return bpos_cmp(l, r) < 0 ? l : r; + return bkey_lt(l, r) ? l : r; } -static inline struct bpos bpos_max(struct bpos l, struct bpos r) +static inline struct bpos bkey_max(struct bpos l, struct bpos r) { - return bpos_cmp(l, r) > 0 ? 
l : r; + return bkey_gt(l, r) ? l : r; } void bch2_bpos_swab(struct bpos *); @@ -432,7 +505,7 @@ static inline struct bpos bkey_unpack_pos(const struct btree *b, /* Disassembled bkeys */ -static inline struct bkey_s_c bkey_disassemble(struct btree *b, +static inline struct bkey_s_c bkey_disassemble(const struct btree *b, const struct bkey_packed *k, struct bkey *u) { @@ -442,7 +515,7 @@ static inline struct bkey_s_c bkey_disassemble(struct btree *b, } /* non const version: */ -static inline struct bkey_s __bkey_disassemble(struct btree *b, +static inline struct bkey_s __bkey_disassemble(const struct btree *b, struct bkey_packed *k, struct bkey *u) { @@ -546,20 +619,20 @@ struct bkey_s_##name { \ \ static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ { \ - EBUG_ON(k->k.type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline const struct bkey_i_##name * \ bkey_i_to_##name##_c(const struct bkey_i *k) \ { \ - EBUG_ON(k->k.type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return container_of(&k->k, struct bkey_i_##name, k); \ } \ \ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ { \ - EBUG_ON(k.k->type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ @@ -568,7 +641,7 @@ static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ \ static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ { \ - EBUG_ON(k.k->type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = k.k, \ .v = container_of(k.v, struct bch_##name, v), \ @@ -594,7 +667,7 @@ name##_i_to_s_c(const struct bkey_i_##name *k) \ \ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ { \ - EBUG_ON(k->k.type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ @@ -604,7 +677,7 @@ static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ static inline struct bkey_s_c_##name \ bkey_i_to_s_c_##name(const struct bkey_i *k) \ { \ - EBUG_ON(k->k.type != KEY_TYPE_##name); \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ return (struct bkey_s_c_##name) { \ .k = &k->k, \ .v = container_of(&k->v, struct bch_##name, v), \ @@ -663,4 +736,39 @@ void bch2_bkey_pack_test(void); static inline void bch2_bkey_pack_test(void) {} #endif +#define bkey_fields() \ + x(BKEY_FIELD_INODE, p.inode) \ + x(BKEY_FIELD_OFFSET, p.offset) \ + x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ + x(BKEY_FIELD_SIZE, size) \ + x(BKEY_FIELD_VERSION_HI, version.hi) \ + x(BKEY_FIELD_VERSION_LO, version.lo) + +struct bkey_format_state { + u64 field_min[BKEY_NR_FIELDS]; + u64 field_max[BKEY_NR_FIELDS]; +}; + +void bch2_bkey_format_init(struct bkey_format_state *); + +static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) +{ + s->field_min[field] = min(s->field_min[field], v); + s->field_max[field] = max(s->field_max[field], v); +} + +/* + * Changes @format so that @k can be successfully packed with @format + */ +static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) +{ +#define x(id, field) 
__bkey_format_add(s, id, k->field); + bkey_fields() +#undef x +} + +void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); +const char *bch2_bkey_format_validate(struct bkey_format *); + #endif /* _BCACHEFS_BKEY_H */ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 6939d74..90557f4 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -24,7 +24,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { return 0; } @@ -38,12 +38,12 @@ static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != 0)", bkey_val_bytes(k.k)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -54,19 +54,14 @@ static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { - if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_cookie)); - return -EINVAL; - } - return 0; } #define bch2_bkey_ops_cookie ((struct bkey_ops) { \ - .key_invalid = key_type_cookie_invalid, \ + .key_invalid = key_type_cookie_invalid, \ + .min_val_size = 8, \ }) #define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ @@ -74,7 +69,7 @@ static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, }) static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { return 0; } @@ -95,12 +90,12 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { if (bkey_val_bytes(k.k)) { prt_printf(err, "incorrect value size (%zu != %zu)", bkey_val_bytes(k.k), sizeof(struct bch_cookie)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -123,136 +118,90 @@ const struct bkey_ops bch2_bkey_ops[] = { #undef x }; +const struct bkey_ops bch2_bkey_null_ops = { + .min_val_size = U8_MAX, +}; + int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { - if (k.k->type >= KEY_TYPE_MAX) { - prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); - return -EINVAL; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + if (bkey_val_bytes(k.k) < ops->min_val_size) { + prt_printf(err, "bad val size (%zu < %u)", + bkey_val_bytes(k.k), ops->min_val_size); + return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ops[k.k->type].key_invalid(c, k, rw, err); + if (!ops->key_invalid) + return 0; + + return ops->key_invalid(c, k, flags, err); } -static unsigned bch2_key_types_allowed[] = { - [BKEY_TYPE_extents] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_whiteout)| - (1U << KEY_TYPE_error)| - (1U << KEY_TYPE_cookie)| - (1U << KEY_TYPE_extent)| - (1U << KEY_TYPE_reservation)| - (1U << 
KEY_TYPE_reflink_p)| - (1U << KEY_TYPE_inline_data), - [BKEY_TYPE_inodes] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_whiteout)| - (1U << KEY_TYPE_inode)| - (1U << KEY_TYPE_inode_v2)| - (1U << KEY_TYPE_inode_v3)| - (1U << KEY_TYPE_inode_generation), - [BKEY_TYPE_dirents] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_whiteout)| - (1U << KEY_TYPE_hash_whiteout)| - (1U << KEY_TYPE_dirent), - [BKEY_TYPE_xattrs] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_whiteout)| - (1U << KEY_TYPE_cookie)| - (1U << KEY_TYPE_hash_whiteout)| - (1U << KEY_TYPE_xattr), - [BKEY_TYPE_alloc] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_alloc)| - (1U << KEY_TYPE_alloc_v2)| - (1U << KEY_TYPE_alloc_v3)| - (1U << KEY_TYPE_alloc_v4), - [BKEY_TYPE_quotas] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_quota), - [BKEY_TYPE_stripes] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_stripe), - [BKEY_TYPE_reflink] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_reflink_v)| - (1U << KEY_TYPE_indirect_inline_data), - [BKEY_TYPE_subvolumes] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_subvolume), - [BKEY_TYPE_snapshots] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_snapshot), - [BKEY_TYPE_lru] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_lru), - [BKEY_TYPE_freespace] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_set), - [BKEY_TYPE_need_discard] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_set), - [BKEY_TYPE_backpointers] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_backpointer), +static u64 bch2_key_types_allowed[] = { +#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, + BCH_BTREE_IDS() +#undef x [BKEY_TYPE_btree] = - (1U << KEY_TYPE_deleted)| - (1U << KEY_TYPE_btree_ptr)| - (1U << KEY_TYPE_btree_ptr_v2), + BIT_ULL(KEY_TYPE_deleted)| + BIT_ULL(KEY_TYPE_btree_ptr)| + BIT_ULL(KEY_TYPE_btree_ptr_v2), }; int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (k.k->u64s < BKEY_U64s) { prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } - if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { + if (flags & BKEY_INVALID_COMMIT && + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) { prt_printf(err, "invalid key type for btree %s (%s)", - bch2_btree_ids[type], bch2_bkey_types[type]); - return -EINVAL; + bch2_btree_ids[type], bch2_bkey_types[k.k->type]); + return -BCH_ERR_invalid_bkey; } if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { if (k.k->size == 0) { prt_printf(err, "size == 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (k.k->size > k.k->p.offset) { prt_printf(err, "size greater than offset (%u > %llu)", k.k->size, k.k->p.offset); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } } else { if (k.k->size) { prt_printf(err, "size != 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } } - if (type != BKEY_TYPE_btree && - !btree_type_has_snapshots(type) && - k.k->p.snapshot) { - prt_printf(err, "nonzero snapshot"); - return -EINVAL; - } + if (type != BKEY_TYPE_btree) { + if (!btree_type_has_snapshots((enum btree_id) type) && + k.k->p.snapshot) { + prt_printf(err, "nonzero snapshot"); + return -BCH_ERR_invalid_bkey; + } - if (type != BKEY_TYPE_btree && - btree_type_has_snapshots(type) && - !k.k->p.snapshot) { - prt_printf(err, "snapshot == 0"); - return -EINVAL; - } + if (btree_type_has_snapshots((enum btree_id) type) && + 
!k.k->p.snapshot) { + prt_printf(err, "snapshot == 0"); + return -BCH_ERR_invalid_bkey; + } - if (type != BKEY_TYPE_btree && - !bkey_cmp(k.k->p, POS_MAX)) { - prt_printf(err, "key at POS_MAX"); - return -EINVAL; + if (bkey_eq(k.k->p, POS_MAX)) { + prt_printf(err, "key at POS_MAX"); + return -BCH_ERR_invalid_bkey; + } } return 0; @@ -260,23 +209,24 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, enum btree_node_type type, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { - return __bch2_bkey_invalid(c, k, type, rw, err) ?: - bch2_bkey_val_invalid(c, k, rw, err); + return __bch2_bkey_invalid(c, k, type, flags, err) ?: + bch2_bkey_val_invalid(c, k, flags, err); } int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, struct printbuf *err) { - if (bpos_cmp(k.k->p, b->data->min_key) < 0) { + if (bpos_lt(k.k->p, b->data->min_key)) { prt_printf(err, "key before start of btree node"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } - if (bpos_cmp(k.k->p, b->data->max_key) > 0) { + if (bpos_gt(k.k->p, b->data->max_key)) { prt_printf(err, "key past end of btree node"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -284,11 +234,11 @@ int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) { - if (!bpos_cmp(pos, POS_MIN)) + if (bpos_eq(pos, POS_MIN)) prt_printf(out, "POS_MIN"); - else if (!bpos_cmp(pos, POS_MAX)) + else if (bpos_eq(pos, POS_MAX)) prt_printf(out, "POS_MAX"); - else if (!bpos_cmp(pos, SPOS_MAX)) + else if (bpos_eq(pos, SPOS_MAX)) prt_printf(out, "SPOS_MAX"); else { if (pos.inode == U64_MAX) @@ -329,14 +279,10 @@ void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - if (k.k->type < KEY_TYPE_MAX) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); - if (likely(ops->val_to_text)) - ops->val_to_text(out, c, k); - } else { - prt_printf(out, "(invalid type %u)", k.k->type); - } + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); } void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, @@ -352,7 +298,7 @@ void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, void bch2_bkey_swab_val(struct bkey_s k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); if (ops->swab) ops->swab(k); @@ -360,7 +306,7 @@ void bch2_bkey_swab_val(struct bkey_s k) bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) { - const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); return ops->key_normalize ? 
ops->key_normalize(c, k) @@ -369,9 +315,13 @@ bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) { - const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); - return bch2_bkey_maybe_mergable(l.k, r.k) && ops->key_merge(c, l, r); + return ops->key_merge && + bch2_bkey_maybe_mergable(l.k, r.k) && + (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && + !bch2_key_merging_disabled && + ops->key_merge(c, l, r); } static const struct old_bkey_type { @@ -469,7 +419,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, u->k.p.snapshot = write ? 0 : U32_MAX; } else { - u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; + u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); u64 max_packed = min_packed + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); @@ -494,7 +444,7 @@ void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, if (big_endian != CPU_BIG_ENDIAN) bch2_bkey_swab_val(u); - ops = &bch2_bkey_ops[k->type]; + ops = bch2_bkey_type_ops(k->type); if (ops->compat) ops->compat(btree_id, version, big_endian, write, u); diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 4739b3c..d7b6376 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -11,6 +11,13 @@ struct bkey; enum btree_node_type; extern const char * const bch2_bkey_types[]; +extern const struct bkey_ops bch2_bkey_null_ops; + +enum bkey_invalid_flags { + BKEY_INVALID_WRITE = (1U << 0), + BKEY_INVALID_COMMIT = (1U << 1), + BKEY_INVALID_JOURNAL = (1U << 2), +}; /* * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If @@ -21,7 +28,7 @@ extern const char * const bch2_bkey_types[]; */ struct bkey_ops { int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err); + enum bkey_invalid_flags flags, struct printbuf *err); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -29,20 +36,31 @@ struct bkey_ops { bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); - int (*atomic_trigger)(struct btree_trans *, struct bkey_s_c, - struct bkey_s_c, unsigned); + int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); void (*compat)(enum btree_id id, unsigned version, unsigned big_endian, int write, struct bkey_s); + + /* Size of value type when first created: */ + unsigned min_val_size; }; extern const struct bkey_ops bch2_bkey_ops[]; -int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, int, struct printbuf *); -int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, - enum btree_node_type, int, struct printbuf *); +static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) +{ + return likely(type < KEY_TYPE_MAX) + ? 
&bch2_bkey_ops[type] + : &bch2_bkey_null_ops; +} + +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bkey_invalid_flags, struct printbuf *); +int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, + enum bkey_invalid_flags, struct printbuf *); int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); void bch2_bpos_to_text(struct printbuf *, struct bpos); @@ -60,28 +78,27 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b { return l->type == r->type && !bversion_cmp(l->version, r->version) && - !bpos_cmp(l->p, bkey_start_pos(r)) && - (u64) l->size + r->size <= KEY_SIZE_MAX && - bch2_bkey_ops[l->type].key_merge && - !bch2_key_merging_disabled; + bpos_eq(l->p, bkey_start_pos(r)); } bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); static inline int bch2_mark_key(struct btree_trans *trans, - struct bkey_s_c old, - struct bkey_s_c new, - unsigned flags) + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) { - const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); return ops->atomic_trigger - ? ops->atomic_trigger(trans, old, new, flags) + ? ops->atomic_trigger(trans, btree, level, old, new, flags) : 0; } enum btree_update_flags { - __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, + __BTREE_UPDATE_NOJOURNAL, + __BTREE_UPDATE_PREJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ @@ -95,6 +112,8 @@ enum btree_update_flags { }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) +#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) #define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) @@ -121,7 +140,7 @@ static inline int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old, struct bkey_i *new, unsigned flags) { - const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); return ops->trans_trigger ? 
ops->trans_trigger(trans, btree_id, level, old, new, flags) diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c index 557a79c..b9aa027 100644 --- a/libbcachefs/bkey_sort.c +++ b/libbcachefs/bkey_sort.c @@ -46,7 +46,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) BUG_ON(!iter->used); - i->k = bkey_next(i->k); + i->k = bkey_p_next(i->k); BUG_ON(i->k > i->end); @@ -108,7 +108,7 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, !should_drop_next_key(iter)) { bkey_copy(out, k); btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); + out = bkey_p_next(out); } sort_iter_advance(iter, key_sort_fix_overlapping_cmp); @@ -147,7 +147,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src, out->needs_whiteout = false; btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); + out = bkey_p_next(out); } dst->u64s = cpu_to_le16((u64 *) out - dst->_data); @@ -188,13 +188,13 @@ unsigned bch2_sort_keys(struct bkey_packed *dst, } if (bkey_deleted(in)) { - memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); + memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); set_bkeyp_val_u64s(f, out, 0); } else { bkey_copy(out, in); } out->needs_whiteout |= needs_whiteout; - out = bkey_next(out); + out = bkey_p_next(out); } return (u64 *) out - (u64 *) dst; diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c index 0942353..bcdf28f 100644 --- a/libbcachefs/bset.c +++ b/libbcachefs/bset.c @@ -10,6 +10,7 @@ #include "btree_cache.h" #include "bset.h" #include "eytzinger.h" +#include "trace.h" #include "util.h" #include @@ -17,10 +18,6 @@ #include #include -/* hack.. */ -#include "alloc_types.h" -#include - static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); @@ -36,16 +33,7 @@ static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) { - unsigned offset = __btree_node_key_to_offset(b, k); - struct bset_tree *t; - - for_each_bset(b, t) - if (offset <= t->end_offset) { - EBUG_ON(offset < btree_bkey_first_offset(t)); - return t; - } - - BUG(); + return bch2_bkey_to_bset_inlined(b, k); } /* @@ -78,7 +66,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, for (_k = i->start; _k < vstruct_last(i); _k = _n) { - _n = bkey_next(_k); + _n = bkey_p_next(_k); k = bkey_disassemble(b, _k, &uk); @@ -95,13 +83,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, n = bkey_unpack_key(b, _n); - if (bpos_cmp(n.p, k.k->p) < 0) { + if (bpos_lt(n.p, k.k->p)) { printk(KERN_ERR "Key skipped backwards\n"); continue; } - if (!bkey_deleted(k.k) && - !bpos_cmp(n.p, k.k->p)) + if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) printk(KERN_ERR "Duplicate keys\n"); } @@ -542,7 +529,7 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, goto start; while (1) { if (rw_aux_to_bkey(b, t, j) == k) { - BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k, + BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, bkey_unpack_pos(b, k))); start: if (++j == t->size) @@ -552,7 +539,7 @@ start: rw_aux_tree(b, t)[j - 1].offset); } - k = bkey_next(k); + k = bkey_p_next(k); BUG_ON(k >= btree_bkey_last(b, t)); } } @@ -612,11 +599,10 @@ static inline unsigned bkey_mantissa(const struct bkey_packed *k, return (u16) v; } -__always_inline -static inline void make_bfloat(struct btree *b, struct bset_tree *t, - unsigned j, - struct bkey_packed *min_key, - struct bkey_packed *max_key) +static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + 
struct bkey_packed *min_key, + struct bkey_packed *max_key) { struct bkey_float *f = bkey_float(b, t, j); struct bkey_packed *m = tree_to_bkey(b, t, j); @@ -743,7 +729,7 @@ retry: /* First we figure out where the first key in each cacheline is */ eytzinger1_for_each(j, t->size - 1) { while (bkey_to_cacheline(b, t, k) < cacheline) - prev = k, k = bkey_next(k); + prev = k, k = bkey_p_next(k); if (k >= btree_bkey_last(b, t)) { /* XXX: this path sucks */ @@ -760,7 +746,7 @@ retry: } while (k != btree_bkey_last(b, t)) - prev = k, k = bkey_next(k); + prev = k, k = bkey_p_next(k); if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { bkey_init(&min_key.k); @@ -898,7 +884,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; while ((p = __bkey_prev(b, t, k)) && !ret) { - for (i = p; i != k; i = bkey_next(i)) + for (i = p; i != k; i = bkey_p_next(i)) if (i->type >= min_key_type) ret = i; @@ -909,10 +895,10 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, BUG_ON(ret >= orig_k); for (i = ret - ? bkey_next(ret) + ? bkey_p_next(ret) : btree_bkey_first(b, t); i != orig_k; - i = bkey_next(i)) + i = bkey_p_next(i)) BUG_ON(i->type >= min_key_type); } @@ -984,7 +970,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b, struct bkey_packed *k = start; while (1) { - k = bkey_next(k); + k = bkey_p_next(k); if (k == end) break; @@ -1035,7 +1021,7 @@ void bch2_bset_insert(struct btree *b, set_btree_bset_end(b, t); } - memcpy_u64s(where, src, + memcpy_u64s_small(where, src, bkeyp_key_u64s(f, src)); memcpy_u64s(bkeyp_val(f, where), &insert->v, bkeyp_val_u64s(f, src)); @@ -1077,7 +1063,7 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b, while (l + 1 != r) { unsigned m = (l + r) >> 1; - if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) + if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) l = m; else r = m; @@ -1218,12 +1204,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, while (m != btree_bkey_last(b, t) && bkey_iter_cmp_p_or_unp(b, m, lossy_packed_search, search) < 0) - m = bkey_next(m); + m = bkey_p_next(m); if (!packed_search) while (m != btree_bkey_last(b, t) && bkey_iter_pos_cmp(b, m, search) < 0) - m = bkey_next(m); + m = bkey_p_next(m); if (bch2_expensive_debug_checks) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); @@ -1330,8 +1316,8 @@ void bch2_btree_node_iter_init(struct btree_node_iter *iter, struct bkey_packed *k[MAX_BSETS]; unsigned i; - EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0); - EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0); + EBUG_ON(bpos_lt(*search, b->data->min_key)); + EBUG_ON(bpos_gt(*search, b->data->max_key)); bset_aux_tree_verify(b); memset(iter, 0, sizeof(*iter)); @@ -1546,9 +1532,9 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, /* Mergesort */ -void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) +void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) { - struct bset_tree *t; + const struct bset_tree *t; for_each_bset(b, t) { enum bset_aux_tree_type type = bset_aux_tree_type(t); diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index acef143..632c2b8 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -211,9 +211,9 @@ static inline size_t btree_aux_data_u64s(const struct btree *b) #define bset_tree_for_each_key(_b, _t, _k) \ for (_k = btree_bkey_first(_b, _t); \ _k != btree_bkey_last(_b, _t); \ - _k = bkey_next(_k)) + _k = bkey_p_next(_k)) -static 
inline bool bset_has_ro_aux_tree(struct bset_tree *t) +static inline bool bset_has_ro_aux_tree(const struct bset_tree *t) { return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; } @@ -291,6 +291,21 @@ static inline int bkey_cmp_p_or_unp(const struct btree *b, return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); } +static inline struct bset_tree * +bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) +{ + unsigned offset = __btree_node_key_to_offset(b, k); + struct bset_tree *t; + + for_each_bset(b, t) + if (offset <= t->end_offset) { + EBUG_ON(offset < btree_bkey_first_offset(t)); + return t; + } + + BUG(); +} + struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, @@ -489,7 +504,7 @@ struct bset_stats { size_t failed; }; -void bch2_btree_keys_stats(struct btree *, struct bset_stats *); +void bch2_btree_keys_stats(const struct btree *, struct bset_stats *); void bch2_bfloat_to_text(struct printbuf *, struct btree *, struct bkey_packed *); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 2ca0a9d..13c88d9 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -9,10 +9,11 @@ #include "debug.h" #include "errcode.h" #include "error.h" +#include "trace.h" #include #include -#include +#include #define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ do { \ @@ -31,13 +32,15 @@ void bch2_recalc_btree_reserve(struct bch_fs *c) { unsigned i, reserve = 16; - if (!c->btree_roots[0].b) + if (!c->btree_roots_known[0].b) reserve += 8; - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].b) - reserve += min_t(unsigned, 1, - c->btree_roots[i].b->c.level) * 8; + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + reserve += min_t(unsigned, 1, r->b->c.level) * 8; + } c->btree_cache.reserve = reserve; } @@ -61,10 +64,12 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) EBUG_ON(btree_node_write_in_flight(b)); + clear_btree_node_just_written(b); + kvpfree(b->data, btree_bytes(c)); b->data = NULL; #ifdef __KERNEL__ - vfree(b->aux_data); + kvfree(b->aux_data); #else munmap(b->aux_data, btree_aux_data_bytes(b)); #endif @@ -97,9 +102,9 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->data = kvpmalloc(btree_bytes(c), gfp); if (!b->data) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ - b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); + b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else b->aux_data = mmap(NULL, btree_aux_data_bytes(b), PROT_READ|PROT_WRITE|PROT_EXEC, @@ -110,7 +115,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } return 0; @@ -125,10 +130,6 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) return NULL; bkey_btree_ptr_init(&b->key); - __six_lock_init(&b->c.lock, "b->c.lock", &bch2_btree_node_lock_key); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - lockdep_set_no_check_recursion(&b->c.lock.dep_map); -#endif INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); b->byte_order = ilog2(btree_bytes(c)); @@ -149,6 +150,8 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } + bch2_btree_lock_init(&b->c, 0); + bc->used++; list_add(&b->list, &bc->freeable); return b; @@ 
-222,7 +225,7 @@ wait_on_io: BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); else if (btree_node_write_in_flight(b)) BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_reclaim; } /* XXX: waiting on IO with btree cache lock held */ @@ -232,7 +235,7 @@ wait_on_io: if (!six_trylock_intent(&b->c.lock)) { BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_reclaim; } if (!six_trylock_write(&b->c.lock)) { @@ -298,7 +301,7 @@ out_unlock: six_unlock_write(&b->c.lock); out_unlock_intent: six_unlock_intent(&b->c.lock); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_btree_node_reclaim; goto out; } @@ -427,12 +430,16 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, return btree_cache_can_free(bc); } -static void bch2_btree_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) +static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) { struct bch_fs *c = container_of(shrink, struct bch_fs, btree_cache.shrink); + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); - bch2_btree_cache_to_text(out, &c->btree_cache); + bch2_btree_cache_to_text(&out, &c->btree_cache); + seq_buf_commit(s, out.pos); } void bch2_fs_btree_cache_exit(struct bch_fs *c) @@ -441,8 +448,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) struct btree *b; unsigned i, flags; - if (bc->shrink.list.next) - unregister_shrinker(&bc->shrink); + unregister_shrinker(&bc->shrink); /* vfree() can allocate memory: */ flags = memalloc_nofs_save(); @@ -453,9 +459,12 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) kvpfree(c->verify_ondisk, btree_bytes(c)); - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].b) - list_add(&c->btree_roots[i].b->list, &bc->live); + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + list_add(&r->b->list, &bc->live); + } list_splice(&bc->freeable, &bc->live); @@ -479,7 +488,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) while (!list_empty(&bc->freed_nonpcpu)) { b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); list_del(&b->list); - six_lock_pcpu_free(&b->c.lock); + six_lock_exit(&b->c.lock); kfree(b); } @@ -496,21 +505,17 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) unsigned i; int ret = 0; - pr_verbose_init(c->opts, ""); - ret = rhashtable_init(&bc->table, &bch_btree_cache_params); if (ret) - goto out; + goto err; bc->table_init_done = true; bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!__bch2_btree_node_mem_alloc(c)) { - ret = -ENOMEM; - goto out; - } + if (!__bch2_btree_node_mem_alloc(c)) + goto err; list_splice_init(&bc->live, &bc->freeable); @@ -521,9 +526,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; bc->shrink.seeks = 4; ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); -out: - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + if (ret) + goto err; + + return 0; +err: + return -BCH_ERR_ENOMEM_fs_btree_cache_init; } void bch2_fs_btree_cache_init_early(struct btree_cache *bc) @@ -563,7 +571,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) if (!cl) { trace_and_count(c, btree_cache_cannibalize_lock_fail, c); - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; } closure_wait(&bc->alloc_wait, cl); @@ -577,7 +585,7 @@ int 
bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) } trace_and_count(c, btree_cache_cannibalize_lock_fail, c); - return -EAGAIN; + return -BCH_ERR_btree_cache_cannibalize_lock_blocked; success: trace_and_count(c, btree_cache_cannibalize_lock, c); @@ -607,8 +615,9 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) } } -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct list_head *freed = pcpu_read_locks ? &bc->freed_pcpu @@ -630,17 +639,17 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) goto got_node; } - b = __btree_node_mem_alloc(c, __GFP_NOWARN); + b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); if (!b) { mutex_unlock(&bc->lock); + bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; mutex_lock(&bc->lock); } - if (pcpu_read_locks) - six_lock_pcpu_alloc(&b->c.lock); + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); @@ -662,8 +671,11 @@ got_node: mutex_unlock(&bc->lock); - if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) - goto err; + if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { + bch2_trans_unlock(trans); + if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) + goto err; + } mutex_lock(&bc->lock); bc->used++; @@ -694,6 +706,7 @@ err: /* Try to cannibalize another cached btree node: */ if (bc->alloc_lock == current) { b2 = btree_node_cannibalize(c); + clear_btree_node_just_written(b2); bch2_btree_node_hash_remove(bc, b2); if (b) { @@ -715,12 +728,11 @@ err: mutex_unlock(&bc->lock); memalloc_nofs_restore(flags); - return ERR_PTR(-ENOMEM); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); } /* Slowpath, don't want it inlined into btree_iter_traverse() */ -static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, - struct btree_trans *trans, +static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, @@ -728,6 +740,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, enum six_lock_type lock_type, bool sync) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; u32 seq; @@ -737,14 +750,14 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, * Parent node must be locked, else we could read in a btree node that's * been freed: */ - if (trans && !bch2_btree_node_relock(trans, path, level + 1)) { + if (path && !bch2_btree_node_relock(trans, path, level + 1)) { trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); } - b = bch2_btree_node_mem_alloc(c, level != 0); + b = bch2_btree_node_mem_alloc(trans, level != 0); - if (trans && b == ERR_PTR(-ENOMEM)) { + if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -753,6 +766,12 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, if (IS_ERR(b)) return b; + /* + * Btree nodes read in from 
disk should not have the accessed bit set + * initially, so that linear scans don't thrash the cache: + */ + clear_btree_node_accessed(b); + bkey_copy(&b->key, k); if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { /* raced with another fill: */ @@ -772,19 +791,19 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, set_btree_node_read_in_flight(b); six_unlock_write(&b->c.lock); - seq = b->c.lock.state.seq; + seq = six_lock_seq(&b->c.lock); six_unlock_intent(&b->c.lock); /* Unlock before doing IO: */ if (trans && sync) - bch2_trans_unlock(trans); + bch2_trans_unlock_noassert(trans); bch2_btree_node_read(c, b, sync); if (!sync) return NULL; - if (trans) { + if (path) { int ret = bch2_trans_relock(trans) ?: bch2_btree_path_relock_intent(trans, path); if (ret) { @@ -794,7 +813,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (trans) + if (path) trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -806,7 +825,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, @@ -833,46 +852,26 @@ static inline void btree_check_header(struct bch_fs *c, struct btree *b) { if (b->c.btree_id != BTREE_NODE_ID(b->data) || b->c.level != BTREE_NODE_LEVEL(b->data) || - bpos_cmp(b->data->max_key, b->key.k.p) || + !bpos_eq(b->data->max_key, b->key.k.p) || (b->key.k.type == KEY_TYPE_btree_ptr_v2 && - bpos_cmp(b->data->min_key, + !bpos_eq(b->data->min_key, bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) btree_bad_header(c, b); } -/** - * bch_btree_node_get - find a btree node in the cache and lock it, reading it - * in from disk if necessary. - * - * If IO is necessary and running under generic_make_request, returns -EAGAIN. - * - * The btree node will have either a read or a write lock held, depending on - * the @write parameter. 
- */ -struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, - const struct bkey_i *k, unsigned level, - enum six_lock_type lock_type, - unsigned long trace_ip) +static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; struct bset_tree *t; + bool need_relock = false; int ret; EBUG_ON(level >= BTREE_MAX_DEPTH); - - b = btree_node_mem_ptr(k); - - /* - * Check b->hash_val _before_ calling btree_node_lock() - this might not - * be the node we want anymore, and trying to lock the wrong node could - * cause an unneccessary transaction restart: - */ - if (likely(c->opts.btree_node_mem_ptr_optimization && - b && - b->hash_val == btree_ptr_hash_val(k))) - goto lock_node; retry: b = btree_cache_find(bc, k); if (unlikely(!b)) { @@ -881,8 +880,9 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(c, trans, path, k, path->btree_id, + b = bch2_btree_node_fill(trans, path, k, path->btree_id, level, lock_type, true); + need_relock = true; /* We raced and found the btree node in the cache */ if (!b) @@ -891,35 +891,6 @@ retry: if (IS_ERR(b)) return b; } else { -lock_node: - /* - * There's a potential deadlock with splits and insertions into - * interior nodes we have to avoid: - * - * The other thread might be holding an intent lock on the node - * we want, and they want to update its parent node so they're - * going to upgrade their intent lock on the parent node to a - * write lock. - * - * But if we're holding a read lock on the parent, and we're - * trying to get the intent lock they're holding, we deadlock. - * - * So to avoid this we drop the read locks on parent nodes when - * we're starting to take intent locks - and handle the race. - * - * The race is that they might be about to free the node we - * want, and dropping our read lock on the parent node lets them - * update the parent marking the node we want as freed, and then - * free it: - * - * To guard against this, btree nodes are evicted from the cache - * when they're freed - and b->hash_val is zeroed out, which we - * check for after we lock the node. 
- * - * Then, bch2_btree_node_relock() on the parent will fail - because - * the parent was modified, when the pointer to the node we want - * was removed - and we'll bail out: - */ if (btree_node_read_locked(path, level + 1)) btree_node_unlock(trans, path, level + 1); @@ -939,10 +910,113 @@ lock_node: trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); } if (unlikely(btree_node_read_in_flight(b))) { - u32 seq = b->c.lock.state.seq; + u32 seq = six_lock_seq(&b->c.lock); + + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(trans); + need_relock = true; + + bch2_btree_node_wait_on_read(b); + + /* + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: + */ + if (!six_relock_type(&b->c.lock, lock_type, seq)) + goto retry; + } + + if (unlikely(need_relock)) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(ret); + } + } + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != path->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); + + return b; +} + +/** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * The btree node will have either a read or a write lock held, depending on + * the @write parameter. 
+ */ +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bset_tree *t; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + + /* + * Check b->hash_val _before_ calling btree_node_lock() - this might not + * be the node we want anymore, and trying to lock the wrong node could + * cause an unneccessary transaction restart: + */ + if (unlikely(!c->opts.btree_node_mem_ptr_optimization || + !b || + b->hash_val != btree_ptr_hash_val(k))) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); + if (bch2_btree_node_relock(trans, path, level + 1)) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + + if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = six_lock_seq(&b->c.lock); six_unlock_type(&b->c.lock, lock_type); bch2_trans_unlock(trans); @@ -963,7 +1037,7 @@ lock_node: } if (!six_relock_type(&b->c.lock, lock_type, seq)) - goto retry; + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); } prefetch(b->aux_data); @@ -1017,7 +1091,7 @@ retry: if (nofill) goto out; - b = bch2_btree_node_fill(c, NULL, NULL, k, btree_id, + b = bch2_btree_node_fill(trans, NULL, k, btree_id, level, SIX_LOCK_read, true); /* We raced and found the btree node in the cache */ @@ -1032,7 +1106,7 @@ retry: goto out; } else { lock_node: - ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read); + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ERR_PTR(ret); @@ -1077,12 +1151,12 @@ out: return b; } -int bch2_btree_node_prefetch(struct bch_fs *c, - struct btree_trans *trans, +int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_path *path, const struct bkey_i *k, enum btree_id btree_id, unsigned level) { + struct bch_fs *c = trans->c; struct btree_cache *bc = &c->btree_cache; struct btree *b; @@ -1093,7 +1167,7 @@ int bch2_btree_node_prefetch(struct bch_fs *c, if (b) return 0; - b = bch2_btree_node_fill(c, trans, path, k, btree_id, + b = bch2_btree_node_fill(trans, path, k, btree_id, level, SIX_LOCK_read, false); return PTR_ERR_OR_ZERO(b); } @@ -1138,7 +1212,7 @@ wait_on_io: } void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, - struct btree *b) + const struct btree *b) { const struct bkey_format *f = &b->format; struct bset_stats stats; @@ -1183,7 +1257,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, struct btree_cache *bc) +void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) { prt_printf(out, "nr nodes:\t\t%u\n", bc->used); prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty)); diff --git 
a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index b623c70..00c9b92 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -21,7 +21,7 @@ void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_mem_alloc(struct bch_fs *, bool); +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, const struct bkey_i *, unsigned, @@ -30,7 +30,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, enum btree_id, unsigned, bool); -int bch2_btree_node_prefetch(struct bch_fs *, struct btree_trans *, struct btree_path *, +int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *, const struct bkey_i *, enum btree_id, unsigned); void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); @@ -45,7 +45,11 @@ static inline u64 btree_ptr_hash_val(const struct bkey_i *k) case KEY_TYPE_btree_ptr: return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); case KEY_TYPE_btree_ptr_v2: - return bkey_i_to_btree_ptr_v2_c(k)->v.seq; + /* + * The cast/deref is only necessary to avoid sparse endianness + * warnings: + */ + return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); default: return 0; } @@ -97,10 +101,30 @@ static inline unsigned btree_blocks(struct bch_fs *c) (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) -#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) +static inline unsigned btree_id_nr_alive(struct bch_fs *c) +{ + return BTREE_ID_NR + c->btree_roots_extra.nr; +} + +static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) +{ + if (likely(id < BTREE_ID_NR)) { + return &c->btree_roots_known[id]; + } else { + unsigned idx = id - BTREE_ID_NR; + + EBUG_ON(idx >= c->btree_roots_extra.nr); + return &c->btree_roots_extra.data[idx]; + } +} + +static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) +{ + return bch2_btree_id_root(c, b->c.btree_id)->b; +} void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, - struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, struct btree_cache *); + const struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 20e804e..49e9822 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -27,6 +27,7 @@ #include "reflink.h" #include "replicas.h" #include "super-io.h" +#include "trace.h" #include #include @@ -35,11 +36,16 @@ #include #include #include -#include #define DROP_THIS_NODE 10 #define DROP_PREV_NODE 11 +static bool should_restart_for_topology_repair(struct bch_fs *c) +{ + return c->opts.fix_errors != FSCK_FIX_no && + !(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); +} + static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) { preempt_disable(); @@ -76,7 +82,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); - if (bpos_cmp(expected_start, bp->v.min_key)) { + if (!bpos_eq(expected_start, bp->v.min_key)) { 
bch2_topology_error(c); if (bkey_deleted(&prev->k->k)) { @@ -96,9 +102,9 @@ static int bch2_gc_check_topology(struct bch_fs *c, " cur %s", bch2_btree_ids[b->c.btree_id], b->c.level, buf1.buf, buf2.buf) && - !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = -BCH_ERR_need_topology_repair; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -106,7 +112,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, } } - if (is_last && bpos_cmp(cur.k->k.p, node_end)) { + if (is_last && !bpos_eq(cur.k->k.p, node_end)) { bch2_topology_error(c); printbuf_reset(&buf1); @@ -124,9 +130,9 @@ static int bch2_gc_check_topology(struct bch_fs *c, " expected %s", bch2_btree_ids[b->c.btree_id], b->c.level, buf1.buf, buf2.buf) && - !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); - ret = -BCH_ERR_need_topology_repair; + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); @@ -201,7 +207,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_repair_key; btree_ptr_to_v2(b, new); b->data->min_key = new_min; @@ -230,7 +236,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_repair_key; btree_ptr_to_v2(b, new); b->data->max_key = new_max; @@ -274,12 +280,12 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); if (prev && - bpos_cmp(expected_start, cur->data->min_key) > 0 && + bpos_gt(expected_start, cur->data->min_key) && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { /* cur overwrites prev: */ - if (mustfix_fsck_err_on(bpos_cmp(prev->data->min_key, - cur->data->min_key) >= 0, c, + if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, + cur->data->min_key), c, "btree node overwritten by next node at btree %s level %u:\n" " node %s\n" " next %s", @@ -289,7 +295,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, goto out; } - if (mustfix_fsck_err_on(bpos_cmp(prev->key.k.p, + if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, bpos_predecessor(cur->data->min_key)), c, "btree node with incorrect max_key at btree %s level %u:\n" " node %s\n" @@ -301,8 +307,8 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, } else { /* prev overwrites cur: */ - if (mustfix_fsck_err_on(bpos_cmp(expected_start, - cur->data->max_key) >= 0, c, + if (mustfix_fsck_err_on(bpos_ge(expected_start, + cur->data->max_key), c, "btree node overwritten by prev node at btree %s level %u:\n" " prev %s\n" " node %s", @@ -312,7 +318,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, goto out; } - if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c, + if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, "btree node with incorrect min_key at btree %s level %u:\n" " prev %s\n" " node %s", @@ -336,7 +342,7 @@ static int 
btree_repair_node_end(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); bch2_bpos_to_text(&buf2, b->key.k.p); - if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c, + if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, "btree node with incorrect max_key at btree %s level %u:\n" " %s\n" " expected %s", @@ -374,8 +380,8 @@ again: bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); bch2_btree_and_journal_iter_advance(&iter); bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -404,8 +410,7 @@ again: } if (ret) { - bch_err(c, "%s: error getting btree node: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "getting btree node"); break; } @@ -473,8 +478,7 @@ again: ret = PTR_ERR_OR_ZERO(cur); if (ret) { - bch_err(c, "%s: error getting btree node: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "getting btree node"); goto err; } @@ -522,7 +526,7 @@ fsck_err: return ret; } -static int bch2_repair_topology(struct bch_fs *c) +int bch2_check_topology(struct bch_fs *c) { struct btree_trans trans; struct btree *b; @@ -531,8 +535,13 @@ static int bch2_repair_topology(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for (i = 0; i < BTREE_ID_NR && !ret; i++) { - b = c->btree_roots[i].b; + for (i = 0; i < btree_id_nr_alive(c)&& !ret; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) + continue; + + b = r->b; if (btree_node_fake(b)) continue; @@ -572,15 +581,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); - if (c->opts.reconstruct_alloc || - fsck_err_on(!g->gen_valid, c, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (!g->gen_valid && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -589,14 +598,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (gen_cmp(p.ptr.gen, g->gen) > 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -609,32 +619,33 @@ static int 
bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) do_update = true; - if (fsck_err_on(!p.ptr.cached && - gen_cmp(p.ptr.gen, g->gen) < 0, c, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_types[ptr_data_type(k->k, &p.ptr)], - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) continue; - if (fsck_err_on(g->data_type && - g->data_type != data_type, c, + if (fsck_err_on(bucket_data_type(g->data_type) && + bucket_data_type(g->data_type) != data_type, c, "bucket %u:%zu different types of data in same bucket: %s, %s\n" "while marking %s", p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), @@ -661,7 +672,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (fsck_err_on(!bch2_ptr_matches_stripe_m(m, p), c, + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, "pointer does not match stripe %llu\n" "while marking %s", (u64) p.ec.idx, @@ -685,8 +696,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); if (!new) { - bch_err(c, "%s: error allocating new key", __func__); - ret = -ENOMEM; + bch_err_msg(c, ret, "allocating new key"); + ret = -BCH_ERR_ENOMEM_gc_repair_key; goto err; } @@ -757,7 +768,7 @@ found: if (level) bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); - if (c->opts.verbose) { + if (0) { printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, *k); bch_info(c, "updated %s", buf.buf); @@ -808,11 +819,11 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, } ret = commit_do(trans, NULL, NULL, 0, - bch2_mark_key(trans, old, *k, flags)); + bch2_mark_key(trans, btree_id, level, old, *k, flags)); fsck_err: err: if (ret) - bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -883,11 +894,11 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, return ret; mutex_lock(&c->btree_root_lock); - b = c->btree_roots[btree_id].b; + b = bch2_btree_id_root(c, btree_id)->b; if (!btree_node_fake(b)) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + ret = bch2_gc_mark_key(trans, 
b->c.btree_id, b->c.level + 1, true, &k, initial); } gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); @@ -912,16 +923,13 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b bkey_init(&prev.k->k); while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, &k, true); - if (ret) { - bch_err(c, "%s: error from bch2_gc_mark_key: %s", - __func__, bch2_err_str(ret)); + if (ret) goto fsck_err; - } if (b->c.level) { bch2_bkey_buf_reassemble(&cur, c, k); @@ -967,9 +975,9 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b b->c.level - 1, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && - !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { - ret = -BCH_ERR_need_topology_repair; + should_restart_for_topology_repair(c)) { bch_info(c, "Halting mark and sweep to start topology repair pass"); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto fsck_err; } else { /* Continue marking when opted to not @@ -979,8 +987,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b continue; } } else if (ret) { - bch_err(c, "%s: error getting btree node: %s", - __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "getting btree node"); break; } @@ -1010,7 +1017,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - b = c->btree_roots[btree_id].b; + b = bch2_btree_id_root(c, btree_id)->b; if (btree_node_fake(b)) return 0; @@ -1018,7 +1025,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, six_lock_read(&b->c.lock, NULL, NULL); printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); - if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c, + if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, "btree root with incorrect min_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1027,7 +1034,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans, printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); - if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, SPOS_MAX), c, + if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, "btree root with incorrect max_key: %s", buf.buf)) { bch_err(c, "repair unimplemented"); ret = -BCH_ERR_fsck_repair_unimplemented; @@ -1040,14 +1047,14 @@ static int bch2_gc_btree_init(struct btree_trans *trans, if (!ret) { struct bkey_s_c k = bkey_i_to_s_c(&b->key); - ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, true, + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, &k, true); } fsck_err: six_unlock_read(&b->c.lock); if (ret < 0) - bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); printbuf_exit(&buf); return ret; } @@ -1079,8 +1086,17 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) ? bch2_gc_btree_init(&trans, ids[i], metadata_only) : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { + if (!bch2_btree_id_root(c, i)->alive) + continue; + + ret = initial + ? 
bch2_gc_btree_init(&trans, i, metadata_only) + : bch2_gc_btree(&trans, i, initial, metadata_only); + } + if (ret < 0) - bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); bch2_trans_exit(&trans); return ret; @@ -1222,7 +1238,7 @@ static int bch2_gc_done(struct bch_fs *c, for_each_member_device(ca, c, dev) { struct bch_dev_usage *dst = ca->usage_base; struct bch_dev_usage *src = (void *) - bch2_acc_percpu_u64s((void *) ca->usage_gc, + bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, dev_usage_u64s()); copy_dev_field(buckets_ec, "buckets_ec"); @@ -1238,7 +1254,7 @@ static int bch2_gc_done(struct bch_fs *c, unsigned nr = fs_usage_u64s(c); struct bch_fs_usage *dst = c->usage_base; struct bch_fs_usage *src = (void *) - bch2_acc_percpu_u64s((void *) c->usage_gc, nr); + bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); copy_fs_field(hidden, "hidden"); copy_fs_field(btree, "btree"); @@ -1278,15 +1294,14 @@ fsck_err: if (ca) percpu_ref_put(&ca->ref); if (ret) - bch_err(c, "error from %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); percpu_up_write(&c->mark_lock); printbuf_exit(&buf); return ret; } -static int bch2_gc_start(struct bch_fs *c, - bool metadata_only) +static int bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca = NULL; unsigned i; @@ -1297,18 +1312,17 @@ static int bch2_gc_start(struct bch_fs *c, sizeof(u64), GFP_KERNEL); if (!c->usage_gc) { bch_err(c, "error allocating c->usage_gc"); - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_start; } for_each_member_device(ca, c, i) { - BUG_ON(ca->buckets_gc); BUG_ON(ca->usage_gc); ca->usage_gc = alloc_percpu(struct bch_dev_usage); if (!ca->usage_gc) { bch_err(c, "error allocating ca->usage_gc"); percpu_ref_put(&ca->ref); - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_start; } this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, @@ -1318,6 +1332,22 @@ static int bch2_gc_start(struct bch_fs *c, return 0; } +static int bch2_gc_reset(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; + } + + free_percpu(c->usage_gc); + c->usage_gc = NULL; + + return bch2_gc_start(c); +} + /* returns true if not equal */ static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, struct bch_alloc_v4 r) @@ -1340,15 +1370,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); struct bucket gc, *b; struct bkey_i_alloc_v4 *a; - struct bch_alloc_v4 old, new; + struct bch_alloc_v4 old_convert, new; + const struct bch_alloc_v4 *old; enum bch_data_type type; int ret; - if (bkey_cmp(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0) + if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets))) return 1; - bch2_alloc_to_v4(k, &old); - new = old; + old = bch2_alloc_to_v4(k, &old_convert); + new = *old; percpu_down_read(&c->mark_lock); b = gc_bucket(ca, iter->pos.offset); @@ -1360,7 +1391,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, type = __alloc_data_type(b->dirty_sectors, b->cached_sectors, b->stripe, - old, + *old, b->data_type); if (b->data_type != type) { struct bch_dev_usage *u; @@ -1382,9 +1413,19 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.data_type != BCH_DATA_btree) return 0; - if (gen_after(old.gen, gc.gen)) + if (gen_after(old->gen, gc.gen)) return 0; + if (c->opts.reconstruct_alloc || + fsck_err_on(new.data_type != gc.data_type, c, + "bucket %llu:%llu gen %u has wrong data_type" + ": got %s, should be 
%s", + iter->pos.inode, iter->pos.offset, + gc.gen, + bch2_data_types[new.data_type], + bch2_data_types[gc.data_type])) + new.data_type = gc.data_type; + #define copy_bucket_field(_f) \ if (c->opts.reconstruct_alloc || \ fsck_err_on(new._f != gc._f, c, \ @@ -1397,14 +1438,13 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new._f = gc._f; \ copy_bucket_field(gen); - copy_bucket_field(data_type); copy_bucket_field(dirty_sectors); copy_bucket_field(cached_sectors); copy_bucket_field(stripe_redundancy); copy_bucket_field(stripe); #undef copy_bucket_field - if (!bch2_alloc_v4_cmp(old, new)) + if (!bch2_alloc_v4_cmp(*old, new)) return 0; a = bch2_alloc_to_v4_mut(trans, k); @@ -1462,7 +1502,8 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) struct btree_iter iter; struct bkey_s_c k; struct bucket *g; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; unsigned i; int ret; @@ -1473,7 +1514,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) if (!buckets) { percpu_ref_put(&ca->ref); bch_err(c, "error allocating ca->buckets[gc]"); - return -ENOMEM; + return -BCH_ERR_ENOMEM_gc_alloc_start; } buckets->first_bucket = ca->mi.first_bucket; @@ -1488,20 +1529,20 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) ca = bch_dev_bkey_exists(c, k.k->p.inode); g = gc_bucket(ca, k.k->p.offset); - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); g->gen_valid = 1; - g->gen = a.gen; + g->gen = a->gen; if (metadata_only && - (a.data_type == BCH_DATA_user || - a.data_type == BCH_DATA_cached || - a.data_type == BCH_DATA_parity)) { - g->data_type = a.data_type; - g->dirty_sectors = a.dirty_sectors; - g->cached_sectors = a.cached_sectors; - g->stripe = a.stripe; - g->stripe_redundancy = a.stripe_redundancy; + (a->data_type == BCH_DATA_user || + a->data_type == BCH_DATA_cached || + a->data_type == BCH_DATA_parity)) { + g->data_type = a->data_type; + g->dirty_sectors = a->dirty_sectors; + g->cached_sectors = a->cached_sectors; + g->stripe = a->stripe; + g->stripe_redundancy = a->stripe_redundancy; } } bch2_trans_iter_exit(&trans, &iter); @@ -1567,21 +1608,16 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, " should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { - struct bkey_i *new; + struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; - bkey_reassemble(new, k); - if (!r->refcount) new->k.type = KEY_TYPE_deleted; else *bkey_refcount(new) = cpu_to_le64(r->refcount); - - ret = bch2_trans_update(trans, iter, new, 0); } fsck_err: printbuf_exit(&buf); @@ -1637,7 +1673,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, GFP_KERNEL); if (!r) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_gc_reflink_start; break; } @@ -1668,6 +1704,7 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct printbuf buf = PRINTBUF; const struct bch_stripe *s; struct gc_stripe *m; + bool bad = false; unsigned i; int ret = 0; @@ -1677,18 +1714,21 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans, s = bkey_s_c_to_stripe(k).v; m = genradix_ptr(&c->gc_stripes, k.k->p.offset); - for (i = 0; i < s->nr_blocks; i++) - if (stripe_blockcount_get(s, i) != (m ? 
m->block_sectors[i] : 0)) - goto inconsistent; - return 0; -inconsistent: - if (fsck_err_on(true, c, - "stripe has wrong block sector count %u:\n" - " %s\n" - " should be %u", i, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf), - m ? m->block_sectors[i] : 0)) { + for (i = 0; i < s->nr_blocks; i++) { + u32 old = stripe_blockcount_get(s, i); + u32 new = (m ? m->block_sectors[i] : 0); + + if (old != new) { + prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n", + i, old, new); + bad = true; + } + } + + if (bad) + bch2_bkey_val_to_text(&buf, c, k); + + if (fsck_err_on(bad, c, "%s", buf.buf)) { struct bkey_i_stripe *new; new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); @@ -1764,7 +1804,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) bch2_btree_interior_updates_flush(c); - ret = bch2_gc_start(c, metadata_only) ?: + ret = bch2_gc_start(c) ?: bch2_gc_alloc_start(c, metadata_only) ?: bch2_gc_reflink_start(c, metadata_only); if (ret) @@ -1774,31 +1814,8 @@ again: bch2_mark_superblocks(c); - if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && - c->opts.fix_errors != FSCK_OPT_NO) { - bch_info(c, "Starting topology repair pass"); - ret = bch2_repair_topology(c); - if (ret) - goto out; - bch_info(c, "Topology repair pass done"); - - set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); - } - ret = bch2_gc_btrees(c, initial, metadata_only); - if (ret == -BCH_ERR_need_topology_repair && - !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); - SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); - ret = 0; - } - - if (ret == -BCH_ERR_need_topology_repair) - ret = -BCH_ERR_fsck_errors_not_fixed; - if (ret) goto out; @@ -1825,6 +1842,9 @@ again: bch2_gc_stripes_reset(c, metadata_only); bch2_gc_alloc_reset(c, metadata_only); bch2_gc_reflink_reset(c, metadata_only); + ret = bch2_gc_reset(c); + if (ret) + goto out; /* flush fsck errors, reset counters */ bch2_flush_fsck_errs(c); @@ -1856,6 +1876,9 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + + if (ret) + bch_err_fn(c, ret); return ret; } @@ -1889,28 +1912,25 @@ static int gc_btree_gens_key(struct btree_trans *trans, percpu_up_read(&c->mark_lock); return 0; update: - u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + u = bch2_bkey_make_mut(trans, iter, &k, 0); ret = PTR_ERR_OR_ZERO(u); if (ret) return ret; - bkey_reassemble(u, k); - bch2_extent_normalize(c, bkey_i_to_s(u)); - return bch2_trans_update(trans, iter, u, 0); + return 0; } static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); struct bkey_i_alloc_v4 *a_mut; int ret; - bch2_alloc_to_v4(k, &a); - - if (a.oldest_gen == ca->oldest_gen[iter->pos.offset]) + if (a->oldest_gen == ca->oldest_gen[iter->pos.offset]) return 0; a_mut = bch2_alloc_to_v4_mut(trans, k); @@ -1954,7 +1974,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { percpu_ref_put(&ca->ref); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_gc_gens; goto err; } @@ -1979,7 +1999,7 @@ int bch2_gc_gens(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, 
gc_btree_gens_key(&trans, &iter, k)); - if (ret && ret != -EROFS) + if (ret && !bch2_err_matches(ret, EROFS)) bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); if (ret) goto err; @@ -1992,7 +2012,7 @@ int bch2_gc_gens(struct bch_fs *c) NULL, NULL, BTREE_INSERT_NOFAIL, bch2_alloc_write_oldest_gen(&trans, &iter, k)); - if (ret && ret != -EROFS) + if (ret && !bch2_err_matches(ret, EROFS)) bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); if (ret) goto err; diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 95d803b..b45e382 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -4,6 +4,7 @@ #include "btree_types.h" +int bch2_check_topology(struct bch_fs *); int bch2_gc(struct bch_fs *, bool, bool); int bch2_gc_gens(struct bch_fs *); void bch2_gc_thread_stop(struct bch_fs *); @@ -50,7 +51,7 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) { switch (id) { -#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; +#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; BCH_BTREE_IDS() #undef x default: diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 48f213f..c049876 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -18,9 +18,9 @@ #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "super-io.h" +#include "trace.h" #include -#include void bch2_btree_node_io_unlock(struct btree *b) { @@ -33,7 +33,7 @@ void bch2_btree_node_io_unlock(struct btree *b) void bch2_btree_node_io_lock(struct btree *b) { - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + bch2_assert_btree_nodes_not_locked(); wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); @@ -53,7 +53,7 @@ void __bch2_btree_node_wait_on_write(struct btree *b) void bch2_btree_node_wait_on_read(struct btree *b) { - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + bch2_assert_btree_nodes_not_locked(); wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); @@ -61,7 +61,7 @@ void bch2_btree_node_wait_on_read(struct btree *b) void bch2_btree_node_wait_on_write(struct btree *b) { - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + bch2_assert_btree_nodes_not_locked(); wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); @@ -77,13 +77,13 @@ static void verify_no_dups(struct btree *b, if (start == end) return; - for (p = start, k = bkey_next(start); + for (p = start, k = bkey_p_next(start); k != end; - p = k, k = bkey_next(k)) { + p = k, k = bkey_p_next(k)) { struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); + BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); } #endif } @@ -92,7 +92,7 @@ static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) k->needs_whiteout = v; } @@ -105,8 +105,8 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, vpfree(p, size); } -static void *btree_bounce_alloc(struct bch_fs *c, size_t size, - bool *used_mempool) +static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, + bool *used_mempool) { unsigned flags = memalloc_nofs_save(); void *p; @@ -114,14 +114,16 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > btree_bytes(c)); *used_mempool = 
false; - p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; - p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); } memalloc_nofs_restore(flags); return p; } +#define btree_bounce_alloc(_c, _size, _used_mempool) \ + alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool)) static void sort_bkey_ptrs(const struct btree *bt, struct bkey_packed **ptrs, unsigned nr) @@ -175,7 +177,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) for (k = unwritten_whiteouts_start(c, b); k != unwritten_whiteouts_end(c, b); - k = bkey_next(k)) + k = bkey_p_next(k)) *--ptrs = k; sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); @@ -184,7 +186,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) while (ptrs != ptrs_end) { bkey_copy(k, *ptrs); - k = bkey_next(k); + k = bkey_p_next(k); ptrs++; } @@ -256,11 +258,11 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_p_next(k); if (!bkey_deleted(k)) { bkey_copy(out, k); - out = bkey_next(out); + out = bkey_p_next(out); } else { BUG_ON(k->needs_whiteout); } @@ -483,7 +485,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) struct btree_node_entry *bne; bool reinit_iter = false; - EBUG_ON(!(b->c.lock.state.seq & 1)); + EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]); BUG_ON(bset_written(b, bset(b, &b->set[1]))); BUG_ON(btree_node_just_written(b)); @@ -517,7 +519,7 @@ static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "%s level %u/%u\n ", bch2_btree_ids[b->c.btree_id], b->c.level, - c->btree_roots[b->c.btree_id].level); + bch2_btree_id_root(c, b->c.btree_id)->level); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } @@ -526,11 +528,10 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - prt_printf(out, bch2_log_msg(c, "")); - if (!write) - prt_str(out, "error validating btree node "); - else - prt_str(out, "corrupt btree node before write "); + prt_printf(out, bch2_log_msg(c, "%s"), + write == READ + ? "error validating btree node " + : "corrupt btree node before write "); if (ca) prt_printf(out, "on %s ", ca->name); prt_printf(out, "at btree "); @@ -543,63 +544,96 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, } enum btree_err_type { + /* + * We can repair this locally, and we're after the checksum check so + * there's no need to try another replica: + */ BTREE_ERR_FIXABLE, + /* + * We can repair this if we have to, but we should try reading another + * replica if we can: + */ BTREE_ERR_WANT_RETRY, + /* + * Read another replica if we have one, otherwise consider the whole + * node bad: + */ BTREE_ERR_MUST_RETRY, - BTREE_ERR_FATAL, + BTREE_ERR_BAD_NODE, + BTREE_ERR_INCOMPATIBLE, }; enum btree_validate_ret { BTREE_RETRY_READ = 64, }; +static int __btree_err(enum btree_err_type type, + struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, + struct bset *i, + int write, + bool have_retry, + const char *fmt, ...) 
+{ + struct printbuf out = PRINTBUF; + va_list args; + int ret = -BCH_ERR_fsck_fix; + + btree_err_msg(&out, c, ca, b, i, b->written, write); + + va_start(args, fmt); + prt_vprintf(&out, fmt, args); + va_end(args); + + if (write == WRITE) { + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = c->opts.errors == BCH_ON_ERROR_continue + ? 0 + : -BCH_ERR_fsck_errors_not_fixed; + goto out; + } + + if (!have_retry && type == BTREE_ERR_WANT_RETRY) + type = BTREE_ERR_FIXABLE; + if (!have_retry && type == BTREE_ERR_MUST_RETRY) + type = BTREE_ERR_BAD_NODE; + + switch (type) { + case BTREE_ERR_FIXABLE: + mustfix_fsck_err(c, "%s", out.buf); + ret = -BCH_ERR_fsck_fix; + break; + case BTREE_ERR_WANT_RETRY: + case BTREE_ERR_MUST_RETRY: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = BTREE_RETRY_READ; + break; + case BTREE_ERR_BAD_NODE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + bch2_topology_error(c); + ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + break; + case BTREE_ERR_INCOMPATIBLE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = -BCH_ERR_fsck_errors_not_fixed; + break; + default: + BUG(); + } +out: +fsck_err: + printbuf_exit(&out); + return ret; +} + #define btree_err(type, c, ca, b, i, msg, ...) \ ({ \ - __label__ out; \ - struct printbuf out = PRINTBUF; \ - \ - btree_err_msg(&out, c, ca, b, i, b->written, write); \ - prt_printf(&out, msg, ##__VA_ARGS__); \ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ \ - if (type == BTREE_ERR_FIXABLE && \ - write == READ && \ - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", out.buf); \ - goto out; \ - } \ - \ - bch2_print_string_as_lines(KERN_ERR, out.buf); \ - \ - switch (write) { \ - case READ: \ - switch (type) { \ - case BTREE_ERR_FIXABLE: \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - case BTREE_ERR_WANT_RETRY: \ - if (have_retry) { \ - ret = BTREE_RETRY_READ; \ - goto fsck_err; \ - } \ - break; \ - case BTREE_ERR_MUST_RETRY: \ - ret = BTREE_RETRY_READ; \ - goto fsck_err; \ - case BTREE_ERR_FATAL: \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ - break; \ - case WRITE: \ - if (bch2_fs_inconsistent(c)) { \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ - break; \ - } \ -out: \ - printbuf_exit(&out); \ - true; \ + if (_ret != -BCH_ERR_fsck_fix) \ + goto fsck_err; \ + *saw_error = true; \ }) #define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) @@ -608,6 +642,7 @@ out: \ * When btree topology repair changes the start or end of a node, that might * mean we have to drop keys that are no longer inside the node: */ +__cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { struct bset_tree *t; @@ -619,7 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) struct bset *i = bset(b, t); struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) break; @@ -632,7 +667,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) set_btree_bset_end(b, t); } - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) break; @@ -650,15 +685,15 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) bch2_btree_build_aux_trees(b); for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { - BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); } } static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, unsigned offset, unsigned sectors, - int write, bool have_retry) + int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); const char *err; @@ -666,11 +701,11 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct printbuf buf2 = PRINTBUF; int ret = 0; - btree_err_on((version != BCH_BSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, ca, b, i, - "unsupported bset version"); + btree_err_on(!bch2_version_compatible(version), + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + "unsupported bset version %u.%u", + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version)); if (btree_err_on(version < c->sb.version_min, BTREE_ERR_FIXABLE, c, NULL, b, i, @@ -682,7 +717,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, mutex_unlock(&c->sb_lock); } - if (btree_err_on(version > c->sb.version, + if (btree_err_on(BCH_VERSION_MAJOR(version) > + BCH_VERSION_MAJOR(c->sb.version), BTREE_ERR_FIXABLE, c, NULL, b, i, "bset version %u newer than superblock version %u", version, c->sb.version)) { @@ -693,7 +729,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), @@ -749,7 +785,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, b->data->max_key = b->key.k.p; } - btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), @@ -758,7 +794,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); } - btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", (printbuf_reset(&buf1), @@ -770,7 +806,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, err = 
bch2_bkey_format_validate(&bn->format); btree_err_on(err, - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_BAD_NODE, c, ca, b, i, "invalid bkey format: %s", err); compat_bformat(b->c.level, b->c.btree_id, version, @@ -795,7 +831,8 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, } static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, int write, bool have_retry) + struct bset *i, int write, + bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; @@ -809,7 +846,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bkey_s u; struct bkey tmp; - if (btree_err_on(bkey_next(k) > vstruct_last(i), + if (btree_err_on(bkey_p_next(k) > vstruct_last(i), BTREE_ERR_FIXABLE, c, NULL, b, i, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -820,7 +857,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } @@ -844,7 +881,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } @@ -867,14 +904,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } } prev = k; - k = bkey_next(k); + k = bkey_p_next(k); } fsck_err: printbuf_exit(&buf); @@ -882,7 +919,7 @@ fsck_err: } int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, bool have_retry) + struct btree *b, bool have_retry, bool *saw_error) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -897,13 +934,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned blacklisted_written, nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; - int ret, retry_read = 0, write = READ; + int ret = 0, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; /* We might get called multiple times on read retry: */ b->written = 0; - iter = mempool_alloc(&c->fill_iter, GFP_NOIO); + iter = mempool_alloc(&c->fill_iter, GFP_NOFS); sort_iter_init(iter, b); iter->size = (btree_blocks(c) + 1) * 2; @@ -958,7 +995,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - BTREE_ERR_FATAL, c, NULL, b, NULL, + BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); @@ -993,14 +1030,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le16_to_cpu(i->version)); ret = validate_bset(c, ca, b, i, b->written, sectors, - READ, have_retry); + READ, have_retry, saw_error); if (ret) goto fsck_err; if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, READ, have_retry); + 
ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); if (ret) goto fsck_err; @@ -1105,7 +1142,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); continue; @@ -1117,7 +1154,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bp.v->mem_ptr = 0; } - k = bkey_next(k); + k = bkey_p_next(k); } bch2_bset_build_aux_tree(b, b->set, false); @@ -1140,12 +1177,10 @@ out: printbuf_exit(&buf); return retry_read; fsck_err: - if (ret == BTREE_RETRY_READ) { + if (ret == BTREE_RETRY_READ) retry_read = 1; - } else { - bch2_inconsistent_error(c); + else set_btree_node_read_error(b); - } goto out; } @@ -1195,7 +1230,7 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry)) { + !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { if (retry) bch_info(c, "retry success"); break; @@ -1214,8 +1249,16 @@ start: bio_put(&rb->bio); printbuf_exit(&buf); - if (saw_error && !btree_node_read_error(b)) + if (saw_error && !btree_node_read_error(b)) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, b->key.k.p); + bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", + __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); + printbuf_exit(&buf); + bch2_btree_node_rewrite_async(c, b); + } clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -1243,7 +1286,7 @@ struct btree_node_read_all { unsigned nr; void *buf[BCH_REPLICAS_MAX]; struct bio *bio[BCH_REPLICAS_MAX]; - int err[BCH_REPLICAS_MAX]; + blk_status_t err[BCH_REPLICAS_MAX]; }; static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) @@ -1301,6 +1344,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + bool _saw_error = false, *saw_error = &_saw_error; for (i = 0; i < ra->nr; i++) { struct btree_node *bn = ra->buf[i]; @@ -1387,13 +1431,15 @@ fsck_err: if (best >= 0) { memcpy(b->data, ra->buf[best], btree_bytes(c)); - ret = bch2_btree_node_read_done(c, NULL, b, false); + ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; } if (ret) set_btree_node_read_error(b); + else if (*saw_error) + bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { mempool_free(ra->buf[i], &c->btree_bounce_pool); @@ -1440,7 +1486,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; closure_init(&ra->cl, NULL); ra->c = c; @@ -1522,7 +1568,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, btree_pos_to_text(&buf, c, b); bch_err(c, "%s", buf.buf); - if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) + if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) bch2_fatal_error(c); set_btree_node_read_error(b); @@ -1537,7 +1584,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_bytes(c)), REQ_OP_READ|REQ_SYNC|REQ_META, - GFP_NOIO, + GFP_NOFS, &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; @@ -1573,9 +1620,10 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, } } -int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, - const struct bkey_i *k, unsigned level) +static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, + const struct bkey_i *k, unsigned level) { + struct bch_fs *c = trans->c; struct closure cl; struct btree *b; int ret; @@ -1587,7 +1635,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c, level != 0); + b = bch2_btree_node_mem_alloc(trans, level != 0); bch2_btree_cache_cannibalize_unlock(c); BUG_ON(IS_ERR(b)); @@ -1618,6 +1666,13 @@ err: return ret; } +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level)); + +} + void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { @@ -1695,7 +1750,7 @@ static void btree_node_write_work(struct work_struct *work) struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; struct bch_extent_ptr *ptr; - int ret; + int ret = 0; btree_bounce_free(c, wbio->data_bytes, @@ -1715,7 +1770,11 @@ static void btree_node_write_work(struct work_struct *work) } else { ret = bch2_trans_do(c, NULL, NULL, 0, bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, - !wbio->wbio.failed.nr)); + BCH_WATERMARK_reclaim| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW, + !wbio->wbio.failed.nr)); if (ret) goto err; } @@ -1725,7 +1784,8 @@ out: return; err: set_btree_node_noevict(b); - bch2_fs_fatal_error(c, "fatal error writing btree node"); + if (!bch2_err_matches(ret, EROFS)) + bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret)); goto out; } @@ -1770,6 +1830,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { struct printbuf buf = PRINTBUF; + bool 
saw_error; int ret; ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), @@ -1781,8 +1842,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (ret) return ret; - ret = validate_bset_keys(c, b, i, WRITE, false) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); + ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -1795,7 +1856,7 @@ static void btree_write_submit(struct work_struct *work) { struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); struct bch_extent_ptr *ptr; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; bkey_copy(&tmp.k, &wbio->key); @@ -1962,9 +2023,7 @@ do_write: BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); - i->version = c->sb.version < bcachefs_metadata_version_bkey_renumber - ? cpu_to_le16(BCH_BSET_VERSION_OLD) - : cpu_to_le16(c->sb.version); + i->version = cpu_to_le16(c->sb.version); SET_BSET_OFFSET(i, b->written); SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); @@ -2024,7 +2083,7 @@ do_write: wbio = container_of(bio_alloc_bioset(NULL, buf_pages(data, sectors_to_write << 9), REQ_OP_WRITE|REQ_META, - GFP_NOIO, + GFP_NOFS, &c->btree_bio), struct btree_write_bio, wbio.bio); wbio_init(&wbio->wbio.bio); @@ -2177,7 +2236,7 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c) return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } -const char * const bch2_btree_write_types[] = { +static const char * const bch2_btree_write_types[] = { #define x(t, n) [n] = #t, BCH_BTREE_WRITE_TYPES() NULL diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 4b1810a..0cadf65 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -129,7 +129,7 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, bool); + struct btree *, bool, bool *); void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); @@ -178,7 +178,7 @@ static inline void compat_bformat(unsigned level, enum btree_id btree_id, f->field_offset[BKEY_FIELD_SNAPSHOT] = write ? 
0 - : U32_MAX - max_packed; + : cpu_to_le64(U32_MAX - max_packed); } } @@ -200,8 +200,8 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, struct btree_node *bn) { if (version < bcachefs_metadata_version_inode_btree_change && - btree_node_type_is_extents(btree_id) && - bpos_cmp(bn->min_key, POS_MIN) && + btree_id_is_extents(btree_id) && + !bpos_eq(bn->min_key, POS_MIN) && write) bn->min_key = bpos_nosnap_predecessor(bn->min_key); @@ -217,8 +217,8 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, bn->max_key.snapshot = U32_MAX; if (version < bcachefs_metadata_version_inode_btree_change && - btree_node_type_is_extents(btree_id) && - bpos_cmp(bn->min_key, POS_MIN) && + btree_id_is_extents(btree_id) && + !bpos_eq(bn->min_key, POS_MIN) && !write) bn->min_key = bpos_nosnap_successor(bn->min_key); } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index b6a761d..dfb77b2 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -15,15 +15,10 @@ #include "recovery.h" #include "replicas.h" #include "subvolume.h" +#include "trace.h" -#include +#include #include -#include - -static void btree_trans_verify_sorted(struct btree_trans *); -inline void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); -static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *, - struct btree_path *, int); static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, @@ -31,7 +26,7 @@ static inline void btree_path_list_add(struct btree_trans *, struct btree_path * static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) { -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED return iter->ip_allocated; #else return 0; @@ -40,21 +35,6 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); -/* - * Unlocks before scheduling - * Note: does not revalidate iterator - */ -static inline int bch2_trans_cond_resched(struct btree_trans *trans) -{ - if (need_resched() || race_fault()) { - bch2_trans_unlock(trans); - schedule(); - return bch2_trans_relock(trans); - } else { - return 0; - } -} - static inline int __btree_path_cmp(const struct btree_path *l, enum btree_id r_btree_id, bool r_cached, @@ -107,7 +87,7 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) struct bpos pos = iter->pos; if ((iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(pos, POS_MAX)) + !bkey_eq(pos, POS_MAX)) pos = bkey_successor(iter, pos); return pos; } @@ -115,13 +95,13 @@ static inline struct bpos btree_iter_search_key(struct btree_iter *iter) static inline bool btree_path_pos_before_node(struct btree_path *path, struct btree *b) { - return bpos_cmp(path->pos, b->data->min_key) < 0; + return bpos_lt(path->pos, b->data->min_key); } static inline bool btree_path_pos_after_node(struct btree_path *path, struct btree *b) { - return bpos_cmp(b->key.k.p, path->pos) < 0; + return bpos_gt(path->pos, b->key.k.p); } static inline bool btree_path_pos_in_node(struct btree_path *path, @@ -147,7 +127,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, ck = (void *) path->l[0].b; BUG_ON(ck->key.btree_id != path->btree_id || - bkey_cmp(ck->key.pos, path->pos)); + !bkey_eq(ck->key.pos, path->pos)); if (!locked) btree_node_unlock(trans, path, 0); @@ -246,7 
+226,7 @@ static void bch2_btree_path_verify(struct btree_trans *trans, for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { if (!path->l[i].b) { BUG_ON(!path->cached && - c->btree_roots[path->btree_id].b->c.level > i); + bch2_btree_id_root(c, path->btree_id)->b->c.level > i); break; } @@ -292,8 +272,8 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && iter->pos.snapshot != iter->snapshot); - BUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || - bkey_cmp(iter->pos, iter->k.p) > 0); + BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || + bkey_gt(iter->pos, iter->k.p)); } static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) @@ -327,7 +307,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k if (ret) goto out; - if (!bkey_cmp(prev.k->p, k.k->p) && + if (bkey_eq(prev.k->p, k.k->p) && bch2_snapshot_is_ancestor(trans->c, iter->snapshot, prev.k->p.snapshot) > 0) { struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; @@ -353,6 +333,8 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, unsigned idx; struct printbuf buf = PRINTBUF; + btree_trans_sort_paths(trans); + trans_for_each_path_inorder(trans, path, idx) { int cmp = cmp_int(path->btree_id, id) ?: cmp_int(path->cached, key_cache); @@ -367,11 +349,11 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, continue; if (!key_cache) { - if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && - bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) + if (bkey_ge(pos, path->l[0].b->data->min_key) && + bkey_le(pos, path->l[0].b->key.k.p)) return; } else { - if (!bkey_cmp(pos, path->pos)) + if (bkey_eq(pos, path->pos)) return; } } @@ -540,7 +522,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, unsigned clobber_u64s, unsigned new_u64s) { - struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); struct btree_path *linked; if (node_iter != &path->l[b->c.level].iter) { @@ -595,6 +577,7 @@ static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, bch2_btree_node_iter_peek(&l->iter, l->b)); path->pos = k.k ? k.k->p : l->b->key.k.p; + trans->paths_sorted = false; bch2_btree_path_verify_level(trans, path, l - path->l); return k; } @@ -608,6 +591,7 @@ static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, bch2_btree_node_iter_prev(&l->iter, l->b)); path->pos = k.k ? 
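A large share of this diff is mechanical conversion from open-coded bpos_cmp()/bkey_cmp() tests to named predicates such as bpos_eq(), bpos_lt() and bkey_ge(). A reduced sketch of the pattern, using a simplified two-field position rather than the real struct bpos:

#include <stdbool.h>
#include <stdio.h>

struct pos { unsigned long inode, offset; };

static int pos_cmp(struct pos l, struct pos r)
{
	if (l.inode != r.inode)
		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	return 0;
}

/* Named wrappers read better at call sites than raw cmp() < 0 tests: */
static bool pos_eq(struct pos l, struct pos r) { return !pos_cmp(l, r); }
static bool pos_lt(struct pos l, struct pos r) { return pos_cmp(l, r) < 0; }
static bool pos_gt(struct pos l, struct pos r) { return pos_cmp(l, r) > 0; }

int main(void)
{
	struct pos a = { 1, 10 }, b = { 1, 20 };

	printf("%d %d %d\n", pos_eq(a, b), pos_lt(a, b), pos_gt(a, b)); /* 0 1 0 */
	return 0;
}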
k.k->p : l->b->data->min_key; + trans->paths_sorted = false; bch2_btree_path_verify_level(trans, path, l - path->l); return k; } @@ -653,15 +637,40 @@ void bch2_btree_path_level_init(struct btree_trans *trans, BUG_ON(path->cached); EBUG_ON(!btree_path_pos_in_node(path, b)); - EBUG_ON(b->c.lock.state.seq & 1); - path->l[b->c.level].lock_seq = b->c.lock.state.seq; + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); path->l[b->c.level].b = b; __btree_path_level_init(path, b->c.level); } /* Btree path: fixups after btree node updates: */ +static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!i->cached && + i->level == b->c.level && + i->btree_id == b->c.btree_id && + bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && + bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { + i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, + i->k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } +} + /* * A btree node is being replaced - update the iterator to point to the new * node: @@ -679,12 +688,14 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) if (t != BTREE_NODE_UNLOCKED) { btree_node_unlock(trans, path, b->c.level); - six_lock_increment(&b->c.lock, t); - mark_btree_node_locked(trans, path, b->c.level, t); + six_lock_increment(&b->c.lock, (enum six_lock_type) t); + mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); } bch2_btree_path_level_init(trans, path, b); } + + bch2_trans_revalidate_updates_in_node(trans, b); } /* @@ -697,6 +708,8 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) trans_for_each_path_with_node(trans, b, path) __btree_path_level_init(path, b->c.level); + + bch2_trans_revalidate_updates_in_node(trans, b); } /* Btree path: traverse, set_pos: */ @@ -707,7 +720,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; + struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; enum six_lock_type lock_type; unsigned i; int ret; @@ -786,7 +799,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat break; bch2_bkey_buf_unpack(&tmp, c, l->b, k); - ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); } @@ -821,7 +834,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p break; bch2_bkey_buf_reassemble(&tmp, c, k); - ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id, + ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); } @@ -888,7 +901,6 @@ static __always_inline int btree_path_down(struct btree_trans *trans, struct btree *b; unsigned level = path->level - 1; enum six_lock_type lock_type = __btree_lock_want(path, level); - bool replay_done = test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); struct bkey_buf tmp; int ret; @@ -896,7 +908,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); - if (unlikely(!replay_done)) { + if (unlikely(trans->journal_replay_not_finished)) { ret = 
btree_node_iter_and_journal_peek(trans, path, flags, &tmp); if (ret) goto err; @@ -916,7 +928,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, if (unlikely(ret)) goto err; - if (likely(replay_done && tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && + if (likely(!trans->journal_replay_not_finished && + tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && unlikely(b != btree_node_mem_ptr(tmp.k))) btree_node_mem_ptr_set(trans, path, level + 1, b); @@ -933,15 +946,13 @@ err: return ret; } -static int btree_path_traverse_one(struct btree_trans *, struct btree_path *, - unsigned, unsigned long); static int bch2_btree_path_traverse_all(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct btree_path *path; unsigned long trace_ip = _RET_IP_; - int ret = 0; + int i, ret = 0; if (trans->in_traverse_all) return -BCH_ERR_transaction_restart_in_traverse_all; @@ -949,12 +960,12 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) trans->in_traverse_all = true; retry_all: trans->restarted = 0; - trans->traverse_all_idx = U8_MAX; + trans->last_restarted_ip = 0; trans_for_each_path(trans, path) path->should_be_locked = false; - btree_trans_verify_sorted(trans); + btree_trans_sort_paths(trans); bch2_trans_unlock(trans); cond_resched(); @@ -971,34 +982,35 @@ retry_all: } /* Now, redo traversals in correct order: */ - trans->traverse_all_idx = 0; - while (trans->traverse_all_idx < trans->nr_sorted) { - path = trans->paths + trans->sorted[trans->traverse_all_idx]; + i = 0; + while (i < trans->nr_sorted) { + path = trans->paths + trans->sorted[i]; /* * Traversing a path can cause another path to be added at about * the same position: */ if (path->uptodate) { - ret = btree_path_traverse_one(trans, path, 0, _THIS_IP_); + __btree_path_get(path, false); + ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); + __btree_path_put(path, false); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || - ret == -ENOMEM) + bch2_err_matches(ret, ENOMEM)) goto retry_all; if (ret) goto err; - BUG_ON(path->uptodate); } else { - trans->traverse_all_idx++; + i++; } } /* - * BTREE_ITER_NEED_RELOCK is ok here - if we called bch2_trans_unlock() - * and relock(), relock() won't relock since path->should_be_locked - * isn't set yet, which is all fine + * We used to assert that all paths had been traversed here + * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since + * path->Should_be_locked is not set yet, we we might have unlocked and + * then failed to relock a path - that's fine. */ - trans_for_each_path(trans, path) - BUG_ON(path->uptodate >= BTREE_ITER_NEED_TRAVERSE); err: bch2_btree_cache_cannibalize_unlock(c); @@ -1085,10 +1097,10 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, * On error, caller (peek_node()/peek_key()) must return NULL; the error is * stashed in the iterator and returned from bch2_trans_exit(). 
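bch2_btree_path_traverse_all(), reworked above, walks the paths in sorted order and starts the whole pass over whenever a traversal fails transiently, rather than continuing with half-traversed state. The standalone sketch below mirrors only that control flow; the fake -EAGAIN, item count and counters are invented for illustration (the real code also re-sorts paths and distinguishes transaction restarts from ENOMEM).

#include <errno.h>
#include <stdio.h>

#define NR_ITEMS 4

static int attempts_left = 2;

/* Pretend traversal: fails transiently a couple of times on item 2. */
static int traverse_one(int i)
{
	if (i == 2 && attempts_left-- > 0)
		return -EAGAIN;
	return 0;
}

static int traverse_all(void)
{
	int i, ret;
retry_all:
	for (i = 0; i < NR_ITEMS;) {
		ret = traverse_one(i);
		if (ret == -EAGAIN) {
			printf("restarting pass at item %d\n", i);
			goto retry_all;
		}
		if (ret)
			return ret;
		i++;
	}
	return 0;
}

int main(void)
{
	return traverse_all();
}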
*/ -static int btree_path_traverse_one(struct btree_trans *trans, - struct btree_path *path, - unsigned flags, - unsigned long trace_ip) +int bch2_btree_path_traverse_one(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + unsigned long trace_ip) { unsigned depth_want = path->level; int ret = -((int) trans->restarted); @@ -1147,31 +1159,14 @@ static int btree_path_traverse_one(struct btree_trans *trans, path->uptodate = BTREE_ITER_UPTODATE; out: - BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) + panic("ret %s (%i) trans->restarted %s (%i)\n", + bch2_err_str(ret), ret, + bch2_err_str(trans->restarted), trans->restarted); bch2_btree_path_verify(trans, path); return ret; } -int __must_check bch2_btree_path_traverse(struct btree_trans *trans, - struct btree_path *path, unsigned flags) -{ - if (0 && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - unsigned restart_probability_bits = 4 << min(trans->restart_count, 32U); - u64 mask = ~(~0ULL << restart_probability_bits); - - if ((prandom_u32() & mask) == mask) { - trace_and_count(trans->c, trans_restart_injected, trans, _RET_IP_); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); - } - } - - if (path->uptodate < BTREE_ITER_NEED_RELOCK) - return 0; - - return bch2_trans_cond_resched(trans) ?: - btree_path_traverse_one(trans, path, flags, _RET_IP_); -} - static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, struct btree_path *src) { @@ -1207,10 +1202,6 @@ struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, __btree_path_put(path, intent); path = btree_path_clone(trans, path, intent); path->preserve = false; -#ifdef CONFIG_BCACHEFS_DEBUG - path->ip_allocated = ip; -#endif - btree_trans_verify_sorted(trans); return path; } @@ -1219,16 +1210,15 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos new_pos, bool intent, unsigned long ip, int cmp) { - unsigned l = path->level; + unsigned level = path->level; - EBUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); EBUG_ON(!path->ref); path = bch2_btree_path_make_mut(trans, path, intent, ip); - path->pos = new_pos; - - bch2_btree_path_check_sort_fast(trans, path, cmp); + path->pos = new_pos; + trans->paths_sorted = false; if (unlikely(path->cached)) { btree_node_unlock(trans, path, 0); @@ -1237,10 +1227,12 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, goto out; } - l = btree_path_up_until_good_node(trans, path, cmp); + level = btree_path_up_until_good_node(trans, path, cmp); - if (btree_path_node(path, l)) { - BUG_ON(!btree_node_locked(path, l)); + if (btree_path_node(path, level)) { + struct btree_path_level *l = &path->l[level]; + + BUG_ON(!btree_node_locked(path, level)); /* * We might have to skip over many keys, or just a few: try * advancing the node iterator, and if we have to skip over too @@ -1248,11 +1240,18 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, * is expensive). 
*/ if (cmp < 0 || - !btree_path_advance_to_pos(path, &path->l[l], 8)) - __btree_path_level_init(path, l); + !btree_path_advance_to_pos(path, l, 8)) + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (unlikely(level)) + bch2_btree_node_iter_peek(&l->iter, l->b); } - if (unlikely(l != path->level)) { + if (unlikely(level != path->level)) { btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); __bch2_btree_path_unlock(trans, path); } @@ -1342,9 +1341,25 @@ static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *p __bch2_path_free(trans, path); } +void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +{ + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_begin_ip); +} + +void bch2_trans_in_restart_error(struct btree_trans *trans) +{ + panic("in transaction restart: %s, last restarted by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); +} + +noinline __cold void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) { struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; prt_printf(buf, "transaction updates for %s journal seq %llu", trans->fn, trans->journal_res.seq); @@ -1369,6 +1384,17 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } + trans_for_each_wb_update(trans, wb) { + prt_printf(buf, "update: btree=%s wb=1 %pS", + bch2_btree_ids[wb->btree], + (void *) i->ip_allocated); + prt_newline(buf); + + prt_printf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); + prt_newline(buf); + } + printbuf_indent_sub(buf, 2); } @@ -1382,6 +1408,7 @@ void bch2_dump_trans_updates(struct btree_trans *trans) printbuf_exit(&buf); } +noinline __cold void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) { prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", @@ -1393,39 +1420,59 @@ void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) bch2_bpos_to_text(out, path->pos); prt_printf(out, " locks %u", path->nodes_locked); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED prt_printf(out, " %pS", (void *) path->ip_allocated); #endif prt_newline(out); } -void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +static noinline __cold +void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, + bool nosort) { struct btree_path *path; unsigned idx; + if (!nosort) + btree_trans_sort_paths(trans); + trans_for_each_path_inorder(trans, path, idx) bch2_btree_path_to_text(out, path); } noinline __cold -void bch2_dump_trans_paths_updates(struct btree_trans *trans) +void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +{ + __bch2_trans_paths_to_text(out, trans, false); +} + +static noinline __cold +void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) { struct printbuf buf = PRINTBUF; - bch2_trans_paths_to_text(&buf, trans); + __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); } -noinline +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + __bch2_dump_trans_paths_updates(trans, false); +} + +noinline __cold static void 
bch2_trans_update_max_paths(struct btree_trans *trans) { struct btree_transaction_stats *s = btree_trans_stats(trans); struct printbuf buf = PRINTBUF; + if (!s) + return; + bch2_trans_paths_to_text(&buf, trans); if (!buf.allocation_failure) { @@ -1439,6 +1486,8 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans) } printbuf_exit(&buf); + + trans->nr_max_paths = hweight64(trans->paths_allocated); } static noinline void btree_path_overflow(struct btree_trans *trans) @@ -1458,19 +1507,24 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, btree_path_overflow(trans); idx = __ffs64(~trans->paths_allocated); - trans->paths_allocated |= 1ULL << idx; + /* + * Do this before marking the new path as allocated, since it won't be + * initialized yet: + */ if (unlikely(idx > trans->nr_max_paths)) bch2_trans_update_max_paths(trans); - path = &trans->paths[idx]; + trans->paths_allocated |= 1ULL << idx; + path = &trans->paths[idx]; path->idx = idx; path->ref = 0; path->intent_ref = 0; path->nodes_locked = 0; btree_path_list_add(trans, pos, path); + trans->paths_sorted = false; return path; } @@ -1484,10 +1538,11 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, bool intent = flags & BTREE_ITER_INTENT; int i; - BUG_ON(trans->restarted); - btree_trans_verify_sorted(trans); + bch2_trans_verify_not_in_restart(trans); bch2_trans_verify_locks(trans); + btree_trans_sort_paths(trans); + trans_for_each_path_inorder(trans, path, i) { if (__btree_path_cmp(path, btree_id, @@ -1520,10 +1575,10 @@ struct btree_path *bch2_path_get(struct btree_trans *trans, path->nodes_locked = 0; for (i = 0; i < ARRAY_SIZE(path->l); i++) path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED path->ip_allocated = ip; #endif - btree_trans_verify_sorted(trans); + trans->paths_sorted = false; } if (!(flags & BTREE_ITER_NOPRESERVE)) @@ -1564,17 +1619,18 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); k = _k ? 
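btree_path_alloc() above picks a free path slot with __ffs64(~trans->paths_allocated) and only marks it allocated after the bookkeeping that must not see the uninitialized slot. A userspace sketch of that slot allocation, using a compiler builtin in place of __ffs64():

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned alloc_slot(uint64_t *allocated)
{
	unsigned idx;

	assert(*allocated != UINT64_MAX);	/* the real code reports overflow */

	idx = __builtin_ctzll(~*allocated);	/* index of the first clear bit */
	*allocated |= 1ULL << idx;
	return idx;
}

int main(void)
{
	uint64_t allocated = 0xb;	/* slots 0, 1, 3 in use */

	printf("got slot %u\n", alloc_slot(&allocated));	/* 2 */
	printf("got slot %u\n", alloc_slot(&allocated));	/* 4 */
	return 0;
}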
bkey_disassemble(l->b, _k, u) : bkey_s_c_null; - EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0); + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); - if (!k.k || bpos_cmp(path->pos, k.k->p)) + if (!k.k || !bpos_eq(path->pos, k.k->p)) goto hole; } else { struct bkey_cached *ck = (void *) path->l[0].b; EBUG_ON(ck && (path->btree_id != ck->key.btree_id || - bkey_cmp(path->pos, ck->key.pos))); - EBUG_ON(!ck || !ck->valid); + !bkey_eq(path->pos, ck->key.pos))); + if (!ck || !ck->valid) + return bkey_s_c_null; *u = ck->k->k; k = bkey_i_to_s_c(ck->k); @@ -1632,7 +1688,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) if (!b) goto out; - BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0); + BUG_ON(bpos_lt(b->key.k.p, iter->pos)); bkey_init(&iter->k); iter->k.p = iter->pos = b->key.k.p; @@ -1651,6 +1707,17 @@ err: goto out; } +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) +{ + struct btree *b; + + while (b = bch2_btree_iter_peek_node(iter), + bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) + bch2_trans_begin(iter->trans); + + return b; +} + struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; @@ -1658,7 +1725,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) struct btree *b = NULL; int ret; - BUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); EBUG_ON(iter->path->cached); bch2_btree_iter_verify(iter); @@ -1684,7 +1751,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) b = btree_path_node(path, path->level + 1); - if (!bpos_cmp(iter->pos, b->key.k.p)) { + if (bpos_eq(iter->pos, b->key.k.p)) { __btree_path_set_level_up(trans, path, path->level++); } else { /* @@ -1729,9 +1796,9 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) { if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { struct bpos pos = iter->k.p; - bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_cmp(pos, SPOS_MAX) - : bkey_cmp(pos, SPOS_MAX)) != 0; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, SPOS_MAX) + : bkey_eq(pos, SPOS_MAX)); if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_successor(iter, pos); @@ -1749,9 +1816,9 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) inline bool bch2_btree_iter_rewind(struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); - bool ret = (iter->flags & BTREE_ITER_ALL_SNAPSHOTS - ? bpos_cmp(pos, POS_MIN) - : bkey_cmp(pos, POS_MIN)) != 0; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? 
bpos_eq(pos, POS_MIN) + : bkey_eq(pos, POS_MIN)); if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) pos = bkey_predecessor(iter, pos); @@ -1759,53 +1826,66 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) return ret; } -static inline struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans, - enum btree_id btree_id, - struct bpos pos) +static noinline +struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) { struct btree_insert_entry *i; struct bkey_i *ret = NULL; - trans_for_each_update(trans, i) { - if (i->btree_id < btree_id) + trans_for_each_update(iter->trans, i) { + if (i->btree_id < iter->btree_id) continue; - if (i->btree_id > btree_id) + if (i->btree_id > iter->btree_id) break; - if (bpos_cmp(i->k->k.p, pos) < 0) + if (bpos_lt(i->k->k.p, iter->path->pos)) continue; if (i->key_cache_already_flushed) continue; - if (!ret || bpos_cmp(i->k->k.p, ret->k.p) < 0) + if (!ret || bpos_lt(i->k->k.p, ret->k.p)) ret = i->k; } return ret; } -struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos start_pos, - struct bpos end_pos) +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_WITH_UPDATES + ? __bch2_btree_trans_peek_updates(iter) + : NULL; +} + +static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) { struct bkey_i *k; - if (bpos_cmp(start_pos, iter->journal_pos) < 0) + if (bpos_lt(iter->path->pos, iter->journal_pos)) iter->journal_idx = 0; k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, iter->path->level, - start_pos, end_pos, + iter->path->pos, + end_pos, &iter->journal_idx); iter->journal_pos = k ? k->k.p : end_pos; return k; } -struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) +static noinline +struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, + struct btree_iter *iter) { - return bch2_btree_journal_peek(trans, iter, pos, pos); + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); + + if (k) { + iter->k = k->k; + return bkey_i_to_s_c(k); + } else { + return bkey_s_c_null; + } } static noinline @@ -1814,7 +1894,7 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, struct bkey_s_c k) { struct bkey_i *next_journal = - bch2_btree_journal_peek(trans, iter, iter->path->pos, + bch2_btree_journal_peek(trans, iter, k.k ? 
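__bch2_btree_trans_peek_updates(), introduced above, scans the transaction's pending updates for the lowest-positioned update at or after the iterator position on the same btree. A simplified standalone version of that scan, omitting the key-cache flush check and using stand-in types:

#include <stddef.h>
#include <stdio.h>

struct update {
	int	btree_id;
	long	pos;
};

static const struct update *peek_updates(const struct update *u, size_t nr,
					 int btree_id, long iter_pos)
{
	const struct update *ret = NULL;
	size_t i;

	/* updates are kept sorted by btree_id, so we can stop early: */
	for (i = 0; i < nr; i++) {
		if (u[i].btree_id < btree_id)
			continue;
		if (u[i].btree_id > btree_id)
			break;
		if (u[i].pos < iter_pos)
			continue;
		if (!ret || u[i].pos < ret->pos)
			ret = &u[i];
	}
	return ret;
}

int main(void)
{
	const struct update updates[] = {
		{ 0, 5 }, { 1, 3 }, { 1, 9 }, { 2, 1 },
	};
	const struct update *u = peek_updates(updates, 4, 1, 4);

	printf("next update at pos %ld\n", u ? u->pos : -1L);	/* 9 */
	return 0;
}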
k.k->p : path_l(iter->path)->b->key.k.p); if (next_journal) { @@ -1830,42 +1910,46 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c __btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) { struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey u; + struct bkey_s_c k; int ret; + if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bpos_eq(iter->pos, pos)) + return bkey_s_c_null; + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) return bkey_s_c_null; if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, iter->flags & BTREE_ITER_INTENT, 0, - iter->flags|BTREE_ITER_CACHED, + iter->flags|BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL, _THIS_IP_); iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, iter->flags & BTREE_ITER_INTENT, btree_iter_ip_allocated(iter)); - ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED); + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + iter->flags|BTREE_ITER_CACHED) ?: + bch2_btree_path_relock(trans, iter->path, _THIS_IP_); if (unlikely(ret)) return bkey_s_c_err(ret); btree_path_set_should_be_locked(iter->key_cache_path); - return bch2_btree_path_peek_slot(iter->key_cache_path, &u); -} - -static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) -{ - struct bkey_s_c ret = __btree_trans_peek_key_cache(iter, pos); - int err = bkey_err(ret) ?: bch2_btree_path_relock(iter->trans, iter->path, _THIS_IP_); - - return err ? bkey_s_c_err(err) : ret; + k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + if (k.k && !bkey_err(k)) { + iter->k = u; + k.k = &iter->k; + } + return k; } static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) @@ -1920,12 +2004,11 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) k = btree_trans_peek_journal(trans, iter, k); - next_update = iter->flags & BTREE_ITER_WITH_UPDATES - ? btree_trans_peek_updates(trans, iter->btree_id, search_key) - : NULL; + next_update = btree_trans_peek_updates(iter); + if (next_update && - bpos_cmp(next_update->k.p, - k.k ? k.k->p : l->b->key.k.p) <= 0) { + bpos_le(next_update->k.p, + k.k ? k.k->p : l->b->key.k.p)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); } @@ -1938,7 +2021,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp * whiteout, with a real key at the same position, since * in the btree deleted keys sort before non deleted. */ - search_key = bpos_cmp(search_key, k.k->p) + search_key = !bpos_eq(search_key, k.k->p) ? 
k.k->p : bpos_successor(k.k->p); continue; @@ -1946,7 +2029,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (likely(k.k)) { break; - } else if (likely(bpos_cmp(l->b->key.k.p, SPOS_MAX))) { + } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) { /* Advance to next leaf node: */ search_key = bpos_successor(l->b->key.k.p); } else { @@ -1975,6 +2058,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e int ret; EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, @@ -1986,7 +2070,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e while (1) { k = __bch2_btree_iter_peek(iter, search_key); - if (!k.k || bkey_err(k)) + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) goto out_no_locked; /* @@ -1996,19 +2082,16 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e */ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) iter_pos = k.k->p; - else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) - iter_pos = bkey_start_pos(k.k); else - iter_pos = iter->pos; + iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); - if (bkey_cmp(iter_pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - k = bkey_s_c_null; - goto out_no_locked; - } + if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_gt(iter_pos, end) + : bkey_ge(iter_pos, end))) + goto end; if (iter->update_path && - bkey_cmp(iter->update_path->pos, k.k->p)) { + !bkey_eq(iter->update_path->pos, k.k->p)) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); iter->update_path = NULL; @@ -2038,6 +2121,11 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->update_path, pos, iter->flags & BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } } /* @@ -2070,8 +2158,8 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e btree_path_set_should_be_locked(iter->path); out_no_locked: if (iter->update_path) { - if (iter->update_path->uptodate && - (ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_))) + ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + if (unlikely(ret)) k = bkey_s_c_err(ret); else btree_path_set_should_be_locked(iter->update_path); @@ -2089,6 +2177,10 @@ out_no_locked: bch2_btree_iter_verify_entry_exit(iter); return k; +end: + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; } /** @@ -2134,7 +2226,7 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) /* Check if we should go up to the parent node: */ if (!k.k || (iter->advanced && - !bpos_cmp(path_l(iter->path)->b->key.k.p, iter->pos))) { + bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { iter->pos = path_l(iter->path)->b->key.k.p; btree_path_set_level_up(trans, iter->path); iter->advanced = false; @@ -2150,7 +2242,7 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) if (iter->path->level != iter->min_depth && (iter->advanced || !k.k || - bpos_cmp(iter->pos, k.k->p))) { + !bpos_eq(iter->pos, k.k->p))) { btree_path_set_level_down(trans, iter->path, iter->min_depth); iter->pos = bpos_successor(iter->pos); iter->advanced = false; @@ -2161,7 +2253,7 @@ struct bkey_s_c 
bch2_btree_iter_peek_all_levels(struct btree_iter *iter) if (iter->path->level == iter->min_depth && iter->advanced && k.k && - !bpos_cmp(iter->pos, k.k->p)) { + bpos_eq(iter->pos, k.k->p)) { iter->pos = bpos_successor(iter->pos); iter->advanced = false; continue; @@ -2169,7 +2261,7 @@ struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) if (iter->advanced && iter->path->level == iter->min_depth && - bpos_cmp(k.k->p, iter->pos)) + !bpos_eq(k.k->p, iter->pos)) iter->advanced = false; BUG_ON(iter->advanced); @@ -2240,13 +2332,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) &iter->path->l[0], &iter->k); if (!k.k || ((iter->flags & BTREE_ITER_IS_EXTENTS) - ? bpos_cmp(bkey_start_pos(k.k), search_key) >= 0 - : bpos_cmp(k.k->p, search_key) > 0)) + ? bpos_ge(bkey_start_pos(k.k), search_key) + : bpos_gt(k.k->p, search_key))) k = btree_path_level_prev(trans, iter->path, &iter->path->l[0], &iter->k); - bch2_btree_path_check_sort(trans, iter->path, 0); - if (likely(k.k)) { if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { if (k.k->p.snapshot == iter->snapshot) @@ -2257,7 +2347,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) * longer at the same _key_ (not pos), return * that candidate */ - if (saved_path && bkey_cmp(k.k->p, saved_k.p)) { + if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { bch2_path_put_nokeep(trans, iter->path, iter->flags & BTREE_ITER_INTENT); iter->path = saved_path; @@ -2292,7 +2382,7 @@ got_key: } break; - } else if (likely(bpos_cmp(iter->path->l[0].b->data->min_key, POS_MIN))) { + } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { /* Advance to previous leaf node: */ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); } else { @@ -2303,10 +2393,10 @@ got_key: } } - EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0); + EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); /* Extents can straddle iter->pos: */ - if (bkey_cmp(k.k->p, iter->pos) < 0) + if (bkey_lt(k.k->p, iter->pos)) iter->pos = k.k->p; if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) @@ -2371,25 +2461,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { struct bkey_i *next_update; - if ((iter->flags & BTREE_ITER_WITH_UPDATES) && - (next_update = btree_trans_peek_updates(trans, - iter->btree_id, search_key)) && - !bpos_cmp(next_update->k.p, iter->pos)) { + if ((next_update = btree_trans_peek_updates(iter)) && + bpos_eq(next_update->k.p, iter->pos)) { iter->k = next_update->k; k = bkey_i_to_s_c(next_update); goto out; } if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && - (next_update = bch2_btree_journal_peek_slot(trans, - iter, iter->pos))) { - iter->k = next_update->k; - k = bkey_i_to_s_c(next_update); + (k = btree_trans_peek_slot_journal(trans, iter)).k) goto out; - } if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && - (k = __btree_trans_peek_key_cache(iter, iter->pos)).k) { + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; /* We're not returning a key from iter->path: */ @@ -2401,15 +2485,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out_no_locked; } else { struct bpos next; + struct bpos end = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + end.offset = U64_MAX; EBUG_ON(iter->path->level); if (iter->flags & BTREE_ITER_INTENT) { struct btree_iter iter2; - struct bpos end = iter->pos; - - if (iter->flags & BTREE_ITER_IS_EXTENTS) - 
end.offset = U64_MAX; bch2_trans_copy_iter(&iter2, iter); k = bch2_btree_iter_peek_upto(&iter2, end); @@ -2422,7 +2506,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek_upto(iter, end); if (unlikely(bkey_err(k))) bch2_btree_iter_set_pos(iter, pos); else @@ -2434,7 +2518,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) next = k.k ? bkey_start_pos(k.k) : POS_MAX; - if (bkey_cmp(iter->pos, next) < 0) { + if (bkey_lt(iter->pos, next)) { bkey_init(&iter->k); iter->k.p = iter->pos; @@ -2479,29 +2563,43 @@ struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) return bch2_btree_iter_peek_slot(iter); } -/* new transactional stuff: */ - -static inline void btree_path_verify_sorted_ref(struct btree_trans *trans, - struct btree_path *path) +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) { - EBUG_ON(path->sorted_idx >= trans->nr_sorted); - EBUG_ON(trans->sorted[path->sorted_idx] != path->idx); - EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + struct bkey_s_c k; + + while (btree_trans_too_many_iters(iter->trans) || + (k = bch2_btree_iter_peek_type(iter, iter->flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(iter->trans); + + return k; } -static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) -{ +/* new transactional stuff: */ + #ifdef CONFIG_BCACHEFS_DEBUG +static void btree_trans_verify_sorted_refs(struct btree_trans *trans) +{ + struct btree_path *path; unsigned i; - for (i = 0; i < trans->nr_sorted; i++) - btree_path_verify_sorted_ref(trans, trans->paths + trans->sorted[i]); -#endif + BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); + + trans_for_each_path(trans, path) { + BUG_ON(path->sorted_idx >= trans->nr_sorted); + BUG_ON(trans->sorted[path->sorted_idx] != path->idx); + } + + for (i = 0; i < trans->nr_sorted; i++) { + unsigned idx = trans->sorted[i]; + + EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(trans->paths[idx].sorted_idx != i); + } } static void btree_trans_verify_sorted(struct btree_trans *trans) { -#ifdef CONFIG_BCACHEFS_DEBUG struct btree_path *path, *prev = NULL; unsigned i; @@ -2510,80 +2608,54 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) trans_for_each_path_inorder(trans, path, i) { if (prev && btree_path_cmp(prev, path) > 0) { - bch2_dump_trans_paths_updates(trans); + __bch2_dump_trans_paths_updates(trans, true); panic("trans paths out of order!\n"); } prev = path; } -#endif -} - -static inline void btree_path_swap(struct btree_trans *trans, - struct btree_path *l, struct btree_path *r) -{ - swap(l->sorted_idx, r->sorted_idx); - swap(trans->sorted[l->sorted_idx], - trans->sorted[r->sorted_idx]); - - btree_path_verify_sorted_ref(trans, l); - btree_path_verify_sorted_ref(trans, r); -} - -static inline struct btree_path *sib_btree_path(struct btree_trans *trans, - struct btree_path *path, int sib) -{ - unsigned idx = (unsigned) path->sorted_idx + sib; - - EBUG_ON(sib != -1 && sib != 1); - - return idx < trans->nr_sorted - ? 
trans->paths + trans->sorted[idx] - : NULL; } +#else +static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} +static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} +#endif -static __always_inline void bch2_btree_path_check_sort_fast(struct btree_trans *trans, - struct btree_path *path, - int cmp) +void __bch2_btree_trans_sort_paths(struct btree_trans *trans) { - struct btree_path *n; - int cmp2; - - EBUG_ON(!cmp); - - while ((n = sib_btree_path(trans, path, cmp)) && - (cmp2 = btree_path_cmp(n, path)) && - cmp2 != cmp) - btree_path_swap(trans, n, path); - - btree_trans_verify_sorted(trans); -} + int i, l = 0, r = trans->nr_sorted, inc = 1; + bool swapped; -inline void bch2_btree_path_check_sort(struct btree_trans *trans, struct btree_path *path, - int cmp) -{ - struct btree_path *n; + btree_trans_verify_sorted_refs(trans); - if (cmp <= 0) { - n = prev_btree_path(trans, path); - if (n && btree_path_cmp(n, path) > 0) { - do { - btree_path_swap(trans, n, path); - n = prev_btree_path(trans, path); - } while (n && btree_path_cmp(n, path) > 0); + if (trans->paths_sorted) + goto out; - goto out; + /* + * Cocktail shaker sort: this is efficient because iterators will be + * mostly sorted. + */ + do { + swapped = false; + + for (i = inc > 0 ? l : r - 2; + i + 1 < r && i >= l; + i += inc) { + if (btree_path_cmp(trans->paths + trans->sorted[i], + trans->paths + trans->sorted[i + 1]) > 0) { + swap(trans->sorted[i], trans->sorted[i + 1]); + trans->paths[trans->sorted[i]].sorted_idx = i; + trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; + swapped = true; + } } - } - if (cmp >= 0) { - n = next_btree_path(trans, path); - if (n && btree_path_cmp(path, n) > 0) { - do { - btree_path_swap(trans, path, n); - n = next_btree_path(trans, path); - } while (n && btree_path_cmp(path, n) > 0); - } - } + if (inc > 0) + --r; + else + l++; + inc = -inc; + } while (swapped); + + trans->paths_sorted = true; out: btree_trans_verify_sorted(trans); } @@ -2594,15 +2666,18 @@ static inline void btree_path_list_remove(struct btree_trans *trans, unsigned i; EBUG_ON(path->sorted_idx >= trans->nr_sorted); - +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + trans->nr_sorted--; + memmove_u64s_down_small(trans->sorted + path->sorted_idx, + trans->sorted + path->sorted_idx + 1, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); +#else array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); - +#endif for (i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; path->sorted_idx = U8_MAX; - - btree_trans_verify_sorted_refs(trans); } static inline void btree_path_list_add(struct btree_trans *trans, @@ -2611,16 +2686,17 @@ static inline void btree_path_list_add(struct btree_trans *trans, { unsigned i; - btree_trans_verify_sorted_refs(trans); - - path->sorted_idx = pos ? pos->sorted_idx + 1 : 0; - - if (trans->in_traverse_all && - trans->traverse_all_idx != U8_MAX && - trans->traverse_all_idx >= path->sorted_idx) - trans->traverse_all_idx++; + path->sorted_idx = pos ? 
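bch2_btree_path_check_sort() is replaced above by __bch2_btree_trans_sort_paths(), a cocktail shaker sort over the sorted[] index array that also keeps each path's sorted_idx back-pointer in sync; it is a good fit because the array is usually nearly sorted already, so the passes terminate quickly. A standalone sketch with simplified stand-in types:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct path {
	int	key;		/* stand-in for (btree_id, cached, pos, level) */
	uint8_t	sorted_idx;	/* back-pointer into sorted[] */
};

static void sort_paths(struct path *paths, uint8_t *sorted, unsigned nr)
{
	int i, l = 0, r = nr, inc = 1;
	bool swapped;

	/* Alternate forward and backward passes; mostly-sorted input exits fast. */
	do {
		swapped = false;

		for (i = inc > 0 ? l : r - 2;
		     i + 1 < r && i >= l;
		     i += inc)
			if (paths[sorted[i]].key > paths[sorted[i + 1]].key) {
				uint8_t tmp = sorted[i];

				sorted[i] = sorted[i + 1];
				sorted[i + 1] = tmp;
				paths[sorted[i]].sorted_idx	= i;
				paths[sorted[i + 1]].sorted_idx	= i + 1;
				swapped = true;
			}

		if (inc > 0)
			--r;
		else
			l++;
		inc = -inc;
	} while (swapped);
}

int main(void)
{
	struct path paths[4] = { { 30, 0 }, { 10, 1 }, { 20, 2 }, { 40, 3 } };
	uint8_t sorted[4] = { 0, 1, 2, 3 };

	sort_paths(paths, sorted, 4);

	for (unsigned i = 0; i < 4; i++)
		printf("%d ", paths[sorted[i]].key);	/* 10 20 30 40 */
	printf("\n");
	return 0;
}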
pos->sorted_idx + 1 : trans->nr_sorted; +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, + trans->sorted + path->sorted_idx, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + trans->nr_sorted++; + trans->sorted[path->sorted_idx] = path->idx; +#else array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); +#endif for (i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; @@ -2630,12 +2706,12 @@ static inline void btree_path_list_add(struct btree_trans *trans, void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { - if (iter->path) - bch2_path_put(trans, iter->path, - iter->flags & BTREE_ITER_INTENT); if (iter->update_path) bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_INTENT); + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); if (iter->key_cache_path) bch2_path_put(trans, iter->key_cache_path, iter->flags & BTREE_ITER_INTENT); @@ -2644,72 +2720,14 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->key_cache_path = NULL; } -static inline void __bch2_trans_iter_init(struct btree_trans *trans, - struct btree_iter *iter, - unsigned btree_id, struct bpos pos, - unsigned locks_want, - unsigned depth, - unsigned flags, - unsigned long ip) -{ - if (unlikely(trans->restarted)) - panic("bch2_trans_iter_init(): in transaction restart, %s by %pS\n", - bch2_err_str(trans->restarted), - (void *) trans->last_restarted_ip); - - if (flags & BTREE_ITER_ALL_LEVELS) - flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; - - if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && - btree_node_type_is_extents(btree_id)) - flags |= BTREE_ITER_IS_EXTENTS; - - if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && - !btree_type_has_snapshots(btree_id)) - flags &= ~BTREE_ITER_ALL_SNAPSHOTS; - - if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && - btree_type_has_snapshots(btree_id)) - flags |= BTREE_ITER_FILTER_SNAPSHOTS; - - if (trans->journal_replay_not_finished) - flags |= BTREE_ITER_WITH_JOURNAL; - - iter->trans = trans; - iter->path = NULL; - iter->update_path = NULL; - iter->key_cache_path = NULL; - iter->btree_id = btree_id; - iter->min_depth = depth; - iter->flags = flags; - iter->snapshot = pos.snapshot; - iter->pos = pos; - iter->k.type = KEY_TYPE_deleted; - iter->k.p = pos; - iter->k.size = 0; - iter->journal_idx = 0; - iter->journal_pos = POS_MIN; -#ifdef CONFIG_BCACHEFS_DEBUG - iter->ip_allocated = ip; -#endif - - iter->path = bch2_path_get(trans, btree_id, iter->pos, - locks_want, depth, flags, ip); -} - -void bch2_trans_iter_init(struct btree_trans *trans, +void bch2_trans_iter_init_outlined(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, unsigned flags) { - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_CACHED; - flags &= ~BTREE_ITER_WITH_KEY_CACHE; - } else if (!(flags & BTREE_ITER_CACHED)) - flags |= BTREE_ITER_WITH_KEY_CACHE; - - __bch2_trans_iter_init(trans, iter, btree_id, pos, - 0, 0, flags, _RET_IP_); + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); } void bch2_trans_node_iter_init(struct btree_trans *trans, @@ -2720,11 +2738,16 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, unsigned depth, unsigned flags) { - __bch2_trans_iter_init(trans, iter, btree_id, pos, locks_want, depth, - 
BTREE_ITER_NOT_EXTENTS| - __BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_ALL_SNAPSHOTS| - flags, _RET_IP_); + flags |= BTREE_ITER_NOT_EXTENTS; + flags |= __BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_ALL_SNAPSHOTS; + + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, + __bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); + + iter->min_depth = depth; + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); BUG_ON(iter->path->level != depth); BUG_ON(iter->min_depth != depth); @@ -2745,6 +2768,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) unsigned new_top = trans->mem_top + size; size_t old_bytes = trans->mem_bytes; size_t new_bytes = roundup_pow_of_two(new_top); + int ret; void *new_mem; void *p; @@ -2752,15 +2776,27 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); - new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); - if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { - new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); - new_bytes = BTREE_TRANS_MEM_MAX; - kfree(trans->mem); - } + new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); + if (unlikely(!new_mem)) { + bch2_trans_unlock(trans); + + new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); + if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { + new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); + new_bytes = BTREE_TRANS_MEM_MAX; + kfree(trans->mem); + } - if (!new_mem) - return ERR_PTR(-ENOMEM); + if (!new_mem) + return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + + trans->mem = new_mem; + trans->mem_bytes = new_bytes; + + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + } trans->mem = new_mem; trans->mem_bytes = new_bytes; @@ -2776,6 +2812,20 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) return p; } +static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->cached && !btree_node_locked(path, 0)) + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); + + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; +} + /** * bch2_trans_begin() - reset a transaction after a interrupted attempt * @trans: transaction to reset @@ -2787,20 +2837,13 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) u32 bch2_trans_begin(struct btree_trans *trans) { struct btree_path *path; + u64 now; bch2_trans_reset_updates(trans); trans->restart_count++; trans->mem_top = 0; - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } - trans_for_each_path(trans, path) { path->should_be_locked = false; @@ -2823,30 +2866,27 @@ u32 bch2_trans_begin(struct btree_trans *trans) path->preserve = false; } + now = local_clock(); if (!trans->restarted && (need_resched() || - local_clock() - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { - bch2_trans_unlock(trans); - cond_resched(); - bch2_trans_relock(trans); + now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { + drop_locks_do(trans, (cond_resched(), 0)); + now = local_clock(); } + 
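__bch2_trans_kmalloc() above now tries a GFP_NOWAIT allocation first and, only if that fails, unlocks the transaction, allocates with GFP_KERNEL, and relocks (the real code must also handle the relock failing with a restart). A userspace sketch of the same shape, with a pthread mutex standing in for the transaction's btree locks and the fast path deliberately failing so the slow path runs:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

static pthread_mutex_t trans_lock = PTHREAD_MUTEX_INITIALIZER;

/* Simulate the non-blocking fast path failing so the slow path is exercised: */
static void *alloc_nowait(size_t size)		{ (void) size; return NULL; }
static void *alloc_blocking(size_t size)	{ return malloc(size); }

static void *trans_kmalloc(size_t size)
{
	void *p = alloc_nowait(size);

	if (!p) {
		/* Can't block while holding the lock: drop it first. */
		pthread_mutex_unlock(&trans_lock);
		p = alloc_blocking(size);
		pthread_mutex_lock(&trans_lock);
	}
	if (p)
		memset(p, 0, size);
	return p;
}

int main(void)
{
	void *p;

	pthread_mutex_lock(&trans_lock);
	p = trans_kmalloc(128);
	pthread_mutex_unlock(&trans_lock);
	free(p);
	return p ? 0 : 1;
}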
trans->last_begin_time = now; + + if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) + bch2_trans_reset_srcu_lock(trans); - trans->last_restarted_ip = _RET_IP_; - if (trans->restarted) + trans->last_begin_ip = _RET_IP_; + if (trans->restarted) { bch2_btree_path_traverse_all(trans); + trans->notrace_relock_fail = false; + } - trans->last_begin_time = local_clock(); return trans->restart_count; } -void bch2_trans_verify_not_restarted(struct btree_trans *trans, u32 restart_count) -{ - if (trans_was_restarted(trans, restart_count)) - panic("trans->restart_count %u, should be %u, last restarted by %pS\n", - trans->restart_count, restart_count, - (void *) trans->last_restarted_ip); -} - static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) { size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; @@ -2860,6 +2900,10 @@ static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) #endif if (!p) p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); + /* + * paths need to be zeroed, bch2_check_for_deadlock looks at paths in + * other threads + */ trans->paths = p; p += paths_bytes; trans->updates = p; p += updates_bytes; @@ -2886,9 +2930,8 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ __acquires(&c->btree_trans_barrier) { struct btree_transaction_stats *s; - struct btree_trans *pos; - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + bch2_assert_btree_nodes_not_locked(); memset(trans, 0, sizeof(*trans)); trans->c = c; @@ -2916,21 +2959,38 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_ trans->mem_bytes = expected_mem_bytes; } } - if (s) + + if (s) { trans->nr_max_paths = s->nr_max_paths; + trans->wb_updates_size = s->wb_updates_size; + } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + struct btree_trans *pos; - mutex_lock(&c->btree_trans_lock); - list_for_each_entry(pos, &c->btree_trans_list, list) { - if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { - list_add_tail(&trans->list, &pos->list); - goto list_add_done; + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(pos, &c->btree_trans_list, list) { + /* + * We'd much prefer to be stricter here and completely + * disallow multiple btree_trans in the same thread - + * but the data move path calls bch2_write when we + * already have a btree_trans initialized. 
+ */ + BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && + bch2_trans_locked(pos)); + + if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + list_add_tail(&trans->list, &pos->list); + goto list_add_done; + } } - } - list_add_tail(&trans->list, &c->btree_trans_list); + list_add_tail(&trans->list, &c->btree_trans_list); list_add_done: - mutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + } } static void check_btree_paths_leaked(struct btree_trans *trans) @@ -2964,6 +3024,12 @@ void bch2_trans_exit(struct btree_trans *trans) bch2_trans_unlock(trans); + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + closure_sync(&trans->ref); if (s) @@ -2975,10 +3041,6 @@ void bch2_trans_exit(struct btree_trans *trans) check_btree_paths_leaked(trans); - mutex_lock(&c->btree_trans_lock); - list_del(&trans->list); - mutex_unlock(&c->btree_trans_lock); - srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); bch2_journal_preres_put(&c->journal, &trans->journal_preres); @@ -3041,7 +3103,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) struct btree_path *path; struct btree_bkey_cached_common *b; static char lock_types[] = { 'r', 'i', 'w' }; - unsigned l; + unsigned l, idx; if (!out->nr_tabstops) { printbuf_tabstop_push(out, 16); @@ -3050,7 +3112,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); - trans_for_each_path(trans, path) { + trans_for_each_path_safe(trans, path, idx) { if (!path->nodes_locked) continue; @@ -3075,7 +3137,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) b = READ_ONCE(trans->locking); if (b) { - prt_str(out, " want"); + prt_printf(out, " blocked for %lluus on", + div_u64(local_clock() - trans->locking_wait.start_time, + 1000)); prt_newline(out); prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); bch2_btree_bkey_cached_common_to_text(out, b); @@ -3089,8 +3153,10 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); - s++) + s++) { kfree(s->max_paths_text); + bch2_time_stats_exit(&s->lock_hold_times); + } if (c->btree_trans_barrier_initialized) cleanup_srcu_struct(&c->btree_trans_barrier); @@ -3100,14 +3166,19 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) int bch2_fs_btree_iter_init(struct bch_fs *c) { - unsigned i, nr = BTREE_ITER_MAX; + struct btree_transaction_stats *s; + unsigned nr = BTREE_ITER_MAX; int ret; - for (i = 0; i < ARRAY_SIZE(c->btree_transaction_stats); i++) - mutex_init(&c->btree_transaction_stats[i].lock); + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) { + bch2_time_stats_init(&s->lock_hold_times); + mutex_init(&s->lock); + } INIT_LIST_HEAD(&c->btree_trans_list); - mutex_init(&c->btree_trans_lock); + seqmutex_init(&c->btree_trans_lock); ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, sizeof(struct btree_path) * nr + diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 8ed5aee..c472aa8 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -4,8 +4,14 @@ #include "bset.h" #include "btree_types.h" +#include "trace.h" -#include +static inline int __bkey_err(const struct bkey *k) +{ + 
return PTR_ERR_OR_ZERO(k); +} + +#define bkey_err(_k) __bkey_err((_k).k) static inline void __btree_path_get(struct btree_path *path, bool intent) { @@ -36,14 +42,7 @@ static inline struct btree *btree_path_node(struct btree_path *path, static inline bool btree_node_lock_seq_matches(const struct btree_path *path, const struct btree *b, unsigned level) { - /* - * We don't compare the low bits of the lock sequence numbers because - * @path might have taken a write lock on @b, and we don't want to skip - * the linked path if the sequence numbers were equal before taking that - * write lock. The lock sequence number is incremented by taking and - * releasing write locks and is even when unlocked: - */ - return path->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; + return path->l[level].lock_seq == six_lock_seq(&b->c.lock); } static inline struct btree *btree_node_parent(struct btree_path *path, @@ -54,6 +53,16 @@ static inline struct btree *btree_node_parent(struct btree_path *path, /* Iterate over paths within a transaction: */ +void __bch2_btree_trans_sort_paths(struct btree_trans *); + +static inline void btree_trans_sort_paths(struct btree_trans *trans) +{ + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + trans->paths_sorted) + return; + __bch2_btree_trans_sort_paths(trans); +} + static inline struct btree_path * __trans_next_path(struct btree_trans *trans, unsigned idx) { @@ -72,8 +81,6 @@ __trans_next_path(struct btree_trans *trans, unsigned idx) return &trans->paths[idx]; } -void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); - #define trans_for_each_path_from(_trans, _path, _start) \ for (_path = __trans_next_path((_trans), _start); \ (_path); \ @@ -82,6 +89,35 @@ void bch2_btree_path_check_sort(struct btree_trans *, struct btree_path *, int); #define trans_for_each_path(_trans, _path) \ trans_for_each_path_from(_trans, _path, 0) +static inline struct btree_path * +__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) +{ + u64 l; + + if (*idx == BTREE_ITER_MAX) + return NULL; + + l = trans->paths_allocated >> *idx; + if (!l) + return NULL; + + *idx += __ffs64(l); + EBUG_ON(*idx >= BTREE_ITER_MAX); + return &trans->paths[*idx]; +} + +/* + * This version is intended to be safe for use on a btree_trans that is owned by + * another thread, for bch2_btree_trans_to_text(); + */ +#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ + for (_idx = _start; \ + (_path = __trans_next_path_safe((_trans), &_idx)); \ + _idx++) + +#define trans_for_each_path_safe(_trans, _path, _idx) \ + trans_for_each_path_safe_from(_trans, _path, _idx, 0) + static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) { unsigned idx = path ? path->sorted_idx + 1 : 0; @@ -95,9 +131,10 @@ static inline struct btree_path *next_btree_path(struct btree_trans *trans, stru static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) { - EBUG_ON(path->sorted_idx >= trans->nr_sorted); - return path->sorted_idx - ? trans->paths + trans->sorted[path->sorted_idx - 1] + unsigned idx = path ? path->sorted_idx : trans->nr_sorted; + + return idx + ? 
trans->paths + trans->sorted[idx - 1] : NULL; } @@ -106,6 +143,11 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ _i++) +#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ + for (_i = trans->nr_sorted - 1; \ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\ + --_i) + static inline bool __path_has_node(const struct btree_path *path, const struct btree *b) { @@ -161,6 +203,18 @@ bch2_btree_path_set_pos(struct btree_trans *trans, : path; } +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); + +static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + + return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); +} + int __must_check bch2_btree_path_traverse(struct btree_trans *, struct btree_path *, unsigned); struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, @@ -172,6 +226,15 @@ struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); +int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *); + +static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) +{ + return mutex_trylock(lock) + ? 0 + : __bch2_trans_mutex_lock(trans, lock); +} + #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, @@ -193,6 +256,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); void bch2_path_put(struct btree_trans *, struct btree_path *, bool); int bch2_trans_relock(struct btree_trans *); +int bch2_trans_relock_notrace(struct btree_trans *); void bch2_trans_unlock(struct btree_trans *); bool bch2_trans_locked(struct btree_trans *); @@ -201,20 +265,36 @@ static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_co return restart_count != trans->restart_count; } -void bch2_trans_verify_not_restarted(struct btree_trans *, u32); +void bch2_trans_restart_error(struct btree_trans *, u32); + +static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, + u32 restart_count) +{ + if (trans_was_restarted(trans, restart_count)) + bch2_trans_restart_error(trans, restart_count); +} + +void bch2_trans_in_restart_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +{ + if (trans->restarted) + bch2_trans_in_restart_error(trans); +} __always_inline -static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) { BUG_ON(err <= 0); - BUG_ON(!bch2_err_matches(err, BCH_ERR_transaction_restart)); + BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; + trans->last_restarted_ip = _THIS_IP_; return -err; } __always_inline -static inline int btree_trans_restart(struct btree_trans *trans, int err) +static int btree_trans_restart(struct btree_trans *trans, int err) { btree_trans_restart_nounlock(trans, err); return -err; @@ -243,6 +323,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); int __must_check bch2_btree_iter_traverse(struct btree_iter *); struct btree 
*bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); @@ -303,8 +384,85 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna } void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -void bch2_trans_iter_init(struct btree_trans *, struct btree_iter *, + +static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned flags) +{ + if (flags & BTREE_ITER_ALL_LEVELS) + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + btree_node_type_is_extents(btree_id)) + flags |= BTREE_ITER_IS_EXTENTS; + + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + + if (trans->journal_replay_not_finished) + flags |= BTREE_ITER_WITH_JOURNAL; + + return flags; +} + +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned flags) +{ + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + + return __bch2_btree_iter_flags(trans, btree_id, flags); +} + +static inline void bch2_trans_iter_init_common(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags, + unsigned long ip) +{ + memset(iter, 0, sizeof(*iter)); + iter->trans = trans; + iter->btree_id = btree_id; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k.p = pos; + +#ifdef CONFIG_BCACHEFS_DEBUG + iter->ip_allocated = ip; +#endif + iter->path = bch2_path_get(trans, btree_id, iter->pos, + locks_want, depth, flags, ip); +} + +void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, unsigned, struct bpos, unsigned); + +static inline void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + if (__builtin_constant_p(btree_id) && + __builtin_constant_p(flags)) + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags), + _THIS_IP_); + else + bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); +} + void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); @@ -320,33 +478,91 @@ void *__bch2_trans_kmalloc(struct btree_trans *, size_t); static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) { - unsigned new_top = trans->mem_top + size; - void *p = trans->mem + trans->mem_top; + size = roundup(size, 8); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; - if (likely(new_top <= trans->mem_bytes)) { trans->mem_top += size; memset(p, 0, size); return p; } else { return __bch2_trans_kmalloc(trans, size); + } +} + +static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + size = roundup(size, 8); + + if (likely(trans->mem_top + size <= 
trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + trans->mem_top += size; + return p; + } else { + return __bch2_trans_kmalloc(trans, size); } } -u32 bch2_trans_begin(struct btree_trans *); +static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type) +{ + struct bkey_s_c k; + + bch2_trans_iter_init(trans, iter, btree_id, pos, flags); + k = bch2_btree_iter_peek_slot(iter); -static inline struct btree * -__btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter *iter) + if (!bkey_err(k) && type && k.k->type != type) + k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); + if (unlikely(bkey_err(k))) + bch2_trans_iter_exit(trans, iter); + return k; +} + +static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) { - struct btree *b; + return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); +} - while (b = bch2_btree_iter_peek_node(iter), - bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) - bch2_trans_begin(trans); +#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ + bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ + _btree_id, _pos, _flags, KEY_TYPE_##_type)) - return b; +static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, + unsigned val_size, void *val) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + ret = bkey_err(k); + if (!ret) { + unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); + + memcpy(val, k.v, b); + if (unlikely(b < sizeof(*val))) + memset((void *) val + b, 0, sizeof(*val) - b); + bch2_trans_iter_exit(trans, &iter); + } + + return ret; } +#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\ + __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(*_val), _val) + +u32 bch2_trans_begin(struct btree_trans *); + /* * XXX * this does not handle transaction restarts from bch2_btree_iter_next_node() @@ -356,7 +572,7 @@ __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter _locks_want, _depth, _flags, _b, _ret) \ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ _start, _locks_want, _depth, _flags); \ - (_b) = __btree_iter_peek_node_and_restart((_trans), &(_iter)),\ + (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ (_b) = bch2_btree_iter_next_node(&(_iter))) @@ -365,11 +581,6 @@ __btree_iter_peek_node_and_restart(struct btree_trans *trans, struct btree_iter __for_each_btree_node(_trans, _iter, _btree_id, _start, \ 0, 0, _flags, _b, _ret) -static inline int bkey_err(struct bkey_s_c k) -{ - return PTR_ERR_OR_ZERO(k.k); -} - static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, unsigned flags) { @@ -394,7 +605,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter * if (!(flags & BTREE_ITER_SLOTS)) return bch2_btree_iter_peek_upto(iter, end); - if (bkey_cmp(iter->pos, end) > 0) + if (bkey_gt(iter->pos, end)) return bkey_s_c_null; return bch2_btree_iter_peek_slot(iter); @@ -410,6 +621,8 @@ static inline int btree_trans_too_many_iters(struct btree_trans *trans) return 0; } +struct 
bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); + static inline struct bkey_s_c __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) @@ -424,6 +637,22 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, return k; } +static inline struct bkey_s_c +__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_upto_type(iter, end, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + #define lockrestart_do(_trans, _do) \ ({ \ u32 _restart_count; \ @@ -498,6 +727,36 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, _ret; \ }) +#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + \ + _ret = 0; \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ + if (!(_k).k) \ + break; \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_advance(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ ({ \ @@ -536,6 +795,22 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) +#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ + _start, _end, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -544,6 +819,15 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \ + &(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ @@ -572,6 +856,43 @@ __bch2_btree_iter_peek_and_restart(struct btree_trans *trans, !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) +#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ + for (; \ + 
(_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define drop_locks_do(_trans, _do) \ +({ \ + bch2_trans_unlock(_trans); \ + _do ?: bch2_trans_relock(_trans); \ +}) + +#define allocate_dropping_locks_errcode(_trans, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + int _ret = _do; \ + \ + if (bch2_err_matches(_ret, ENOMEM)) { \ + _gfp = GFP_KERNEL; \ + _ret = drop_locks_do(trans, _do); \ + } \ + _ret; \ +}) + +#define allocate_dropping_locks(_trans, _ret, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + typeof(_do) _p = _do; \ + \ + _ret = 0; \ + if (unlikely(!_p)) { \ + _gfp = GFP_KERNEL; \ + _ret = drop_locks_do(trans, ((_p = _do), 0)); \ + } \ + _p; \ +}) + /* new multiple iterator interface: */ void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index 634c673..f7c001d 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -10,9 +10,10 @@ #include "error.h" #include "journal.h" #include "journal_reclaim.h" +#include "trace.h" #include -#include +#include static inline bool btree_uses_pcpu_readers(enum btree_id id) { @@ -27,8 +28,8 @@ static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, const struct bkey_cached *ck = obj; const struct bkey_cached_key *key = arg->key; - return cmp_int(ck->key.btree_id, key->btree_id) ?: - bpos_cmp(ck->key.pos, key->pos); + return ck->key.btree_id != key->btree_id || + !bpos_eq(ck->key.pos, key->pos); } static const struct rhashtable_params bch2_btree_key_cache_params = { @@ -56,13 +57,12 @@ static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) if (!six_trylock_intent(&ck->c.lock)) return false; - if (!six_trylock_write(&ck->c.lock)) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { six_unlock_intent(&ck->c.lock); return false; } - if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - six_unlock_write(&ck->c.lock); + if (!six_trylock_write(&ck->c.lock)) { six_unlock_intent(&ck->c.lock); return false; } @@ -197,6 +197,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, struct btree_key_cache *bc = &c->btree_key_cache; struct bkey_cached *ck = NULL; bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + int ret; if (!pcpu_readers) { #ifdef __KERNEL__ @@ -244,14 +245,14 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, if (ck) { int ret; - ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent); + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); if (unlikely(ret)) { bkey_cached_move_to_freelist(bc, ck); return ERR_PTR(ret); } path->l[0].b = (void *) ck; - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); ret = bch2_btree_node_lock_write(trans, path, &ck->c); @@ -264,22 +265,24 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, return ck; } - /* GFP_NOFS because we're holding btree locks: */ - ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); - if (likely(ck)) { - INIT_LIST_HEAD(&ck->list); - __six_lock_init(&ck->c.lock, "b->c.lock", &bch2_btree_node_lock_key); - if (pcpu_readers) - six_lock_pcpu_alloc(&ck->c.lock); - - ck->c.cached = true; - BUG_ON(!six_trylock_intent(&ck->c.lock)); - BUG_ON(!six_trylock_write(&ck->c.lock)); - *was_new = true; - return ck; + ck = 
allocate_dropping_locks(trans, ret, + kmem_cache_zalloc(bch2_key_cache, _gfp)); + if (ret) { + kmem_cache_free(bch2_key_cache, ck); + return ERR_PTR(ret); } - return NULL; + if (!ck) + return NULL; + + INIT_LIST_HEAD(&ck->list); + bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + + ck->c.cached = true; + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + *was_new = true; + return ck; } static struct bkey_cached * @@ -325,13 +328,10 @@ btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", bch2_btree_ids[path->btree_id]); - return ERR_PTR(-ENOMEM); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); } mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); - } else { - if (path->btree_id == BTREE_ID_subvolumes) - six_lock_pcpu_alloc(&ck->c.lock); } ck->c.level = 0; @@ -369,24 +369,22 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct btree_path *ck_path, struct bkey_cached *ck) { - struct btree_path *path; + struct btree_iter iter; struct bkey_s_c k; unsigned new_u64s = 0; struct bkey_i *new_k = NULL; - struct bkey u; int ret; - path = bch2_path_get(trans, ck->key.btree_id, - ck->key.pos, 0, 0, 0, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, path, 0); + k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + ret = bkey_err(k); if (ret) goto err; - k = bch2_btree_path_peek_slot(path, &u); - if (!bch2_btree_node_relock(trans, ck_path, 0)) { trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); goto err; } @@ -405,12 +403,30 @@ static int btree_key_cache_fill(struct btree_trans *trans, if (new_u64s > ck->u64s) { new_u64s = roundup_pow_of_two(new_u64s); - new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", - bch2_btree_ids[ck->key.btree_id], new_u64s); - ret = -ENOMEM; - goto err; + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[ck->key.btree_id], new_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + goto err; + } + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_k); + goto err; + } } } @@ -431,9 +447,9 @@ static int btree_key_cache_fill(struct btree_trans *trans, bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); /* We're not likely to need this iterator again: */ - path->preserve = false; + set_btree_iter_dontneed(&iter); err: - bch2_path_put(trans, path, 0); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -449,7 +465,7 @@ bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree path->l[1].b = NULL; - if (bch2_btree_node_relock(trans, path, 0)) { + if (bch2_btree_node_relock_notrace(trans, path, 0)) { ck = (void 
*) path->l[0].b; goto fill; } @@ -476,7 +492,7 @@ retry: BUG_ON(ret); if (ck->key.btree_id != path->btree_id || - bpos_cmp(ck->key.pos, path->pos)) { + !bpos_eq(ck->key.pos, path->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } @@ -484,10 +500,12 @@ retry: mark_btree_node_locked(trans, path, 0, lock_want); } - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); path->l[0].b = (void *) ck; fill: - if (!ck->valid) { + path->uptodate = BTREE_ITER_UPTODATE; + + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { /* * Using the underscore version because we haven't set * path->uptodate yet: @@ -502,17 +520,23 @@ fill: ret = btree_key_cache_fill(trans, path, ck); if (ret) goto err; + + ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (ret) + goto err; + + path->uptodate = BTREE_ITER_UPTODATE; } if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) set_bit(BKEY_CACHED_ACCESSED, &ck->flags); - path->uptodate = BTREE_ITER_UPTODATE; - BUG_ON(!ck->valid); BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + BUG_ON(path->uptodate); return ret; err: + path->uptodate = BTREE_ITER_NEED_TRAVERSE; if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(ret); @@ -531,7 +555,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path path->l[1].b = NULL; - if (bch2_btree_node_relock(trans, path, 0)) { + if (bch2_btree_node_relock_notrace(trans, path, 0)) { ck = (void *) path->l[0].b; goto fill; } @@ -550,7 +574,7 @@ retry: return ret; if (ck->key.btree_id != path->btree_id || - bpos_cmp(ck->key.pos, path->pos)) { + !bpos_eq(ck->key.pos, path->pos)) { six_unlock_type(&ck->c.lock, lock_want); goto retry; } @@ -558,7 +582,7 @@ retry: mark_btree_node_locked(trans, path, 0, lock_want); } - path->l[0].lock_seq = ck->c.lock.state.seq; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); path->l[0].b = (void *) ck; fill: if (!ck->valid) @@ -627,9 +651,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) - ? JOURNAL_WATERMARK_reserved + ? 
BCH_WATERMARK_reclaim : 0)| commit_flags); @@ -696,6 +719,13 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, six_unlock_read(&ck->c.lock); goto unlock; } + + if (ck->seq != seq) { + bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, + bch2_btree_key_cache_journal_flush); + six_unlock_read(&ck->c.lock); + goto unlock; + } six_unlock_read(&ck->c.lock); ret = commit_do(&trans, NULL, NULL, 0, @@ -725,21 +755,22 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans, } bool bch2_btree_insert_key_cached(struct btree_trans *trans, - struct btree_path *path, - struct bkey_i *insert) + unsigned flags, + struct btree_insert_entry *insert_entry) { struct bch_fs *c = trans->c; - struct bkey_cached *ck = (void *) path->l[0].b; + struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_i *insert = insert_entry->k; bool kick_reclaim = false; - BUG_ON(insert->u64s > ck->u64s); + BUG_ON(insert->k.u64s > ck->u64s); - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { int difference; - BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); + BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); - difference = jset_u64s(insert->u64s) - ck->res.u64s; + difference = jset_u64s(insert->k.u64s) - ck->res.u64s; if (difference > 0) { trans->journal_preres.u64s -= difference; ck->res.u64s += difference; @@ -750,6 +781,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); set_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_inc(&c->btree_key_cache.nr_dirty); @@ -757,8 +789,24 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, kick_reclaim = true; } - bch2_journal_pin_update(&c->journal, trans->journal_res.seq, - &ck->journal, bch2_btree_key_cache_journal_flush); + /* + * To minimize lock contention, we only add the journal pin here and + * defer pin updates to the flush callback via ->seq. Be careful not to + * update ->seq on nojournal commits because we don't want to update the + * pin to a seq that doesn't include journal updates on disk. Otherwise + * we risk losing the update after a crash. + * + * The only exception is if the pin is not active in the first place. We + * have to add the pin because journal reclaim drives key cache + * flushing. The flush callback will not proceed unless ->seq matches + * the latest pin, so make sure it starts with a consistent value. 
+ */ + if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + !journal_pin_active(&ck->journal)) { + ck->seq = trans->journal_res.seq; + } + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + &ck->journal, bch2_btree_key_cache_journal_flush); if (kick_reclaim) journal_reclaim_kick(&c->journal); @@ -812,7 +860,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, break; list_del(&ck->list); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; @@ -828,7 +876,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, break; list_del(&ck->list); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); atomic_long_dec(&bc->nr_freed); scanned++; @@ -902,13 +950,13 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) struct bucket_table *tbl; struct bkey_cached *ck, *n; struct rhash_head *pos; + LIST_HEAD(items); unsigned i; #ifdef __KERNEL__ int cpu; #endif - if (bc->shrink.list.next) - unregister_shrinker(&bc->shrink); + unregister_shrinker(&bc->shrink); mutex_lock(&bc->lock); @@ -922,7 +970,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { bkey_cached_evict(bc, ck); - list_add(&ck->list, &bc->freed_nonpcpu); + list_add(&ck->list, &items); } rcu_read_unlock(); } @@ -934,14 +982,17 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) for (i = 0; i < f->nr; i++) { ck = f->objs[i]; - list_add(&ck->list, &bc->freed_nonpcpu); + list_add(&ck->list, &items); } } #endif - list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); + list_splice(&bc->freed_pcpu, &items); + list_splice(&bc->freed_nonpcpu, &items); - list_for_each_entry_safe(ck, n, &bc->freed_nonpcpu, list) { + mutex_unlock(&bc->lock); + + list_for_each_entry_safe(ck, n, &items, list) { cond_resched(); bch2_journal_pin_drop(&c->journal, &ck->journal); @@ -949,7 +1000,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) list_del(&ck->list); kfree(ck->k); - six_lock_pcpu_free(&ck->c.lock); + six_lock_exit(&ck->c.lock); kmem_cache_free(bch2_key_cache, ck); } @@ -963,8 +1014,6 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", atomic_long_read(&bc->nr_keys)); - mutex_unlock(&bc->lock); - if (bc->table_init_done) rhashtable_destroy(&bc->table); @@ -978,28 +1027,30 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) INIT_LIST_HEAD(&c->freed_nonpcpu); } -static void bch2_btree_key_cache_shrinker_to_text(struct printbuf *out, struct shrinker *shrink) +static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) { struct btree_key_cache *bc = container_of(shrink, struct btree_key_cache, shrink); + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); - bch2_btree_key_cache_to_text(out, bc); + bch2_btree_key_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); } int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); - int ret; #ifdef __KERNEL__ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); if (!bc->pcpu_freed) - return -ENOMEM; + return -BCH_ERR_ENOMEM_fs_btree_cache_init; #endif - ret = rhashtable_init(&bc->table, &bch2_btree_key_cache_params); - if (ret) - 
return ret; + if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; bc->table_init_done = true; @@ -1007,7 +1058,9 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->shrink.count_objects = bch2_btree_key_cache_count; bc->shrink.scan_objects = bch2_btree_key_cache_scan; bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; - return register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name); + if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return 0; } void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) @@ -1027,7 +1080,7 @@ void bch2_btree_key_cache_exit(void) int __init bch2_btree_key_cache_init(void) { - bch2_key_cache = KMEM_CACHE(bkey_cached, 0); + bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); if (!bch2_key_cache) return -ENOMEM; diff --git a/libbcachefs/btree_key_cache.h b/libbcachefs/btree_key_cache.h index eccea15..be3acde 100644 --- a/libbcachefs/btree_key_cache.h +++ b/libbcachefs/btree_key_cache.h @@ -29,8 +29,8 @@ bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, unsigned); -bool bch2_btree_insert_key_cached(struct btree_trans *, - struct btree_path *, struct bkey_i *); +bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, + struct btree_insert_entry *); int bch2_btree_key_cache_flush(struct btree_trans *, enum btree_id, struct bpos); void bch2_btree_key_cache_drop(struct btree_trans *, diff --git a/libbcachefs/btree_locking.c b/libbcachefs/btree_locking.c index dce2dc0..0b0f9d6 100644 --- a/libbcachefs/btree_locking.c +++ b/libbcachefs/btree_locking.c @@ -4,19 +4,25 @@ #include "btree_locking.h" #include "btree_types.h" -struct lock_class_key bch2_btree_node_lock_key; +static struct lock_class_key bch2_btree_node_lock_key; -/* Btree node locking: */ +void bch2_btree_lock_init(struct btree_bkey_cached_common *b, + enum six_lock_init_flags flags) +{ + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + lockdep_set_no_check_recursion(&b->lock.dep_map); +#endif +} -static inline void six_lock_readers_add(struct six_lock *lock, int nr) +#ifdef CONFIG_LOCKDEP +void bch2_assert_btree_nodes_not_locked(void) { - if (lock->readers) - this_cpu_add(*lock->readers, nr); - else if (nr > 0) - atomic64_add(__SIX_VAL(read_lock, nr), &lock->state.counter); - else - atomic64_sub(__SIX_VAL(read_lock, -nr), &lock->state.counter); + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); } +#endif + +/* Btree node locking: */ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, struct btree_path *skip, @@ -99,10 +105,14 @@ static void lock_graph_up(struct lock_graph *g) closure_put(&g->g[--g->nr].trans->ref); } -static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +static noinline void lock_graph_pop_all(struct lock_graph *g) { - closure_get(&trans->ref); + while (g->nr) + lock_graph_up(g); +} +static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ g->g[g->nr++] = (struct trans_waiting_for_lock) { .trans = trans, .node_want = trans->locking, @@ -110,6 +120,12 @@ static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) }; } +static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + closure_get(&trans->ref); + 
__lock_graph_down(g, trans); +} + static bool lock_graph_remove_non_waiters(struct lock_graph *g) { struct trans_waiting_for_lock *i; @@ -185,7 +201,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) prt_printf(&buf, "backtrace:"); prt_newline(&buf); printbuf_indent_add(&buf, 2); - bch2_prt_backtrace(&buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task); printbuf_indent_sub(&buf, 2); prt_newline(&buf); } @@ -210,20 +226,28 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, struct trans_waiting_for_lock *i; for (i = g->g; i < g->g + g->nr; i++) - if (i->trans == trans) + if (i->trans == trans) { + closure_put(&trans->ref); return break_cycle(g, cycle); + } if (g->nr == ARRAY_SIZE(g->g)) { + closure_put(&trans->ref); + if (orig_trans->lock_may_not_fail) return 0; while (g->nr) lock_graph_up(g); + + if (cycle) + return 0; + trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); } - lock_graph_down(g, trans); + __lock_graph_down(g, trans); return 0; } @@ -238,9 +262,13 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) struct trans_waiting_for_lock *top; struct btree_bkey_cached_common *b; struct btree_path *path; + unsigned path_idx; int ret; if (trans->lock_must_abort) { + if (cycle) + return -1; + trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); } @@ -253,12 +281,12 @@ next: top = &g.g[g.nr - 1]; - trans_for_each_path_from(top->trans, path, top->path_idx) { + trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { if (!path->nodes_locked) continue; - if (top->path_idx != path->idx) { - top->path_idx = path->idx; + if (path_idx != top->path_idx) { + top->path_idx = path_idx; top->level = 0; top->lock_start_time = 0; } @@ -274,7 +302,25 @@ next: b = &READ_ONCE(path->l[top->level].b)->c; if (IS_ERR_OR_NULL(b)) { - BUG_ON(!lock_graph_remove_non_waiters(&g)); + /* + * If we get here, it means we raced with the + * other thread updating its btree_path + * structures - which means it can't be blocked + * waiting on a lock: + */ + if (!lock_graph_remove_non_waiters(&g)) { + /* + * If lock_graph_remove_non_waiters() + * didn't do anything, it must be + * because we're being called by debugfs + * checking for lock cycles, which + * invokes us on btree_transactions that + * aren't actually waiting on anything. 
+ * Just bail out: + */ + lock_graph_pop_all(&g); + } + goto next; } @@ -296,9 +342,10 @@ next: !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) continue; - ret = lock_graph_descend(&g, trans, cycle); + closure_get(&trans->ref); raw_spin_unlock(&b->lock.wait_lock); + ret = lock_graph_descend(&g, trans, cycle); if (ret) return ret; goto next; @@ -335,15 +382,50 @@ int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *p * locked: */ six_lock_readers_add(&b->lock, -readers); - ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail); + ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, + lock_may_not_fail, _RET_IP_); six_lock_readers_add(&b->lock, readers); if (ret) - mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); return ret; } +void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + struct btree_path *linked; + unsigned i; + int ret; + + /* + * XXX BIG FAT NOTICE + * + * Drop all read locks before taking a write lock: + * + * This is a hack, because bch2_btree_node_lock_write_nofail() is a + * hack - but by dropping read locks first, this should never fail, and + * we only use this in code paths where whatever read locks we've + * already taken are no longer needed: + */ + + trans_for_each_path(trans, linked) { + if (!linked->nodes_locked) + continue; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_read_locked(linked, i)) { + btree_node_unlock(trans, linked, i); + btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); + } + } + + ret = __btree_node_lock_write(trans, path, b, true); + BUG_ON(ret); +} + /* relock */ static inline bool btree_path_get_locks(struct btree_trans *trans, @@ -407,7 +489,7 @@ bool __bch2_btree_node_relock(struct btree_trans *trans, return true; } fail: - if (trace) + if (trace && !trans->notrace_relock_fail) trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); return false; } @@ -469,7 +551,7 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); return false; success: - mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); return true; } @@ -504,11 +586,15 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, return btree_path_get_locks(trans, path, false); } -__flatten -bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, +int __bch2_btree_path_relock(struct btree_trans *trans, struct btree_path *path, unsigned long trace_ip) { - return btree_path_get_locks(trans, path, true); + if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } + + return 0; } bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, @@ -580,7 +666,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, } else { if (btree_node_intent_locked(path, l)) { six_lock_downgrade(&path->l[l].b->c.lock); - mark_btree_node_locked_noreset(path, l, SIX_LOCK_read); + mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); } break; } @@ -615,6 +701,29 @@ int bch2_trans_relock(struct btree_trans *trans) return 0; } +int 
bch2_trans_relock_notrace(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + +void bch2_trans_unlock_noassert(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); +} + void bch2_trans_unlock(struct btree_trans *trans) { struct btree_path *path; @@ -626,8 +735,8 @@ void bch2_trans_unlock(struct btree_trans *trans) * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking * btree nodes, it implements its own walking: */ - EBUG_ON(!trans->is_initial_gc && - lock_class_is_held(&bch2_btree_node_lock_key)); + if (!trans->is_initial_gc) + bch2_assert_btree_nodes_not_locked(); } bool bch2_trans_locked(struct btree_trans *trans) @@ -640,6 +749,16 @@ bool bch2_trans_locked(struct btree_trans *trans) return false; } +int __bch2_trans_mutex_lock(struct btree_trans *trans, + struct mutex *lock) +{ + int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); + + if (ret) + mutex_unlock(lock); + return ret; +} + /* Debug */ #ifdef CONFIG_BCACHEFS_DEBUG diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index fb237c9..ce3c7d9 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -14,7 +14,15 @@ #include "btree_iter.h" -extern struct lock_class_key bch2_btree_node_lock_key; +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); + +#ifdef CONFIG_LOCKDEP +void bch2_assert_btree_nodes_not_locked(void); +#else +static inline void bch2_assert_btree_nodes_not_locked(void) {} +#endif + +void bch2_trans_unlock_noassert(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) { @@ -86,7 +94,7 @@ static inline void mark_btree_node_locked(struct btree_trans *trans, unsigned level, enum six_lock_type type) { - mark_btree_node_locked_noreset(path, level, type); + mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[level].lock_taken_time = local_clock(); #endif @@ -170,13 +178,13 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat struct btree_path *linked; EBUG_ON(path->l[b->c.level].b != b); - EBUG_ON(path->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); + EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); - mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); trans_for_each_path_with_node(trans, b, linked) - linked->l[b->c.level].lock_seq += 2; + linked->l[b->c.level].lock_seq++; six_unlock_write(&b->c.lock); } @@ -191,7 +199,8 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); static inline int __btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type, - bool lock_may_not_fail) + bool lock_may_not_fail, + unsigned long ip) { int ret; @@ -199,8 +208,8 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, trans->lock_must_abort = false; trans->locking = b; - ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait, - 
bch2_six_check_for_deadlock, trans); + ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); WRITE_ONCE(trans->locking, NULL); WRITE_ONCE(trans->locking_wait.start_time, 0); return ret; @@ -209,16 +218,17 @@ static inline int __btree_node_lock_nopath(struct btree_trans *trans, static inline int __must_check btree_node_lock_nopath(struct btree_trans *trans, struct btree_bkey_cached_common *b, - enum six_lock_type type) + enum six_lock_type type, + unsigned long ip) { - return __btree_node_lock_nopath(trans, b, type, false); + return __btree_node_lock_nopath(trans, b, type, false, ip); } static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, struct btree_bkey_cached_common *b, enum six_lock_type type) { - int ret = __btree_node_lock_nopath(trans, b, type, true); + int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); BUG_ON(ret); } @@ -237,7 +247,7 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans, trans_for_each_path(trans, path) if (&path->l[level].b->c == b && btree_node_locked_type(path, level) >= want) { - six_lock_increment(&b->lock, want); + six_lock_increment(&b->lock, (enum six_lock_type) want); return true; } @@ -257,8 +267,8 @@ static inline int btree_node_lock(struct btree_trans *trans, EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); if (likely(six_trylock_type(&b->lock, type)) || - btree_node_lock_increment(trans, b, level, type) || - !(ret = btree_node_lock_nopath(trans, b, type))) { + btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || + !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { #ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS path->l[b->level].lock_taken_time = local_clock(); #endif @@ -276,7 +286,7 @@ static inline int __btree_node_lock_write(struct btree_trans *trans, bool lock_may_not_fail) { EBUG_ON(&path->l[b->level].b->c != b); - EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq); + EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); EBUG_ON(!btree_node_intent_locked(path, b->level)); /* @@ -284,22 +294,13 @@ static inline int __btree_node_lock_write(struct btree_trans *trans, * write lock: thus, we need to tell the cycle detector we have a write * lock _before_ taking the lock: */ - mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); + mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); return likely(six_trylock_write(&b->lock)) ? 
0 : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); } -static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, - struct btree_path *path, - struct btree_bkey_cached_common *b) -{ - int ret = __btree_node_lock_write(trans, path, b, true); - - BUG_ON(ret); -} - static inline int __must_check bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, @@ -308,10 +309,25 @@ bch2_btree_node_lock_write(struct btree_trans *trans, return __btree_node_lock_write(trans, path, b, false); } +void bch2_btree_node_lock_write_nofail(struct btree_trans *, + struct btree_path *, + struct btree_bkey_cached_common *); + /* relock: */ bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *, unsigned long); +int __bch2_btree_path_relock(struct btree_trans *, + struct btree_path *, unsigned long); + +static inline int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_node_locked(path, path->level) + ? 0 + : __bch2_btree_path_relock(trans, path, trace_ip); +} + bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); static inline bool bch2_btree_node_relock(struct btree_trans *trans, @@ -338,17 +354,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, __bch2_btree_node_relock(trans, path, level, false)); } -static inline int bch2_btree_path_relock(struct btree_trans *trans, - struct btree_path *path, unsigned long trace_ip) -{ - if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { - trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); - } - - return 0; -} - /* upgrade */ bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 6d44b2e..6b6333d 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -9,7 +9,9 @@ //#include "bkey_methods.h" #include "buckets_types.h" #include "darray.h" +#include "errcode.h" #include "journal_types.h" +#include "replicas_types.h" struct open_bucket; struct btree_update; @@ -192,31 +194,34 @@ struct btree_node_iter { /* * Iterate over all possible positions, synthesizing deleted keys for holes: */ -#define BTREE_ITER_SLOTS (1 << 0) -#define BTREE_ITER_ALL_LEVELS (1 << 1) +static const u16 BTREE_ITER_SLOTS = 1 << 0; +static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1; /* * Indicates that intent locks should be taken on leaf nodes, because we expect * to be doing updates: */ -#define BTREE_ITER_INTENT (1 << 2) +static const u16 BTREE_ITER_INTENT = 1 << 2; /* * Causes the btree iterator code to prefetch additional btree nodes from disk: */ -#define BTREE_ITER_PREFETCH (1 << 3) +static const u16 BTREE_ITER_PREFETCH = 1 << 3; /* * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for * @pos or the first key strictly greater than @pos */ -#define BTREE_ITER_IS_EXTENTS (1 << 4) -#define BTREE_ITER_NOT_EXTENTS (1 << 5) -#define BTREE_ITER_CACHED (1 << 6) -#define BTREE_ITER_WITH_KEY_CACHE (1 << 7) -#define BTREE_ITER_WITH_UPDATES (1 << 8) -#define BTREE_ITER_WITH_JOURNAL (1 << 9) -#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 10) -#define BTREE_ITER_ALL_SNAPSHOTS (1 << 11) -#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 12) -#define BTREE_ITER_NOPRESERVE (1 << 13) +static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4; +static const u16 BTREE_ITER_NOT_EXTENTS 
= 1 << 5; +static const u16 BTREE_ITER_CACHED = 1 << 6; +static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; +static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8; +static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; +static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; +static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; +static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; +static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; +static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; +#define __BTREE_ITER_FLAGS_END 16 enum btree_path_uptodate { BTREE_ITER_UPTODATE = 0, @@ -224,6 +229,10 @@ enum btree_path_uptodate { BTREE_ITER_NEED_TRAVERSE = 2, }; +#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) +#define TRACK_PATH_ALLOCATED +#endif + struct btree_path { u8 idx; u8 sorted_idx; @@ -233,7 +242,7 @@ struct btree_path { /* btree_iter_copy starts here: */ struct bpos pos; - enum btree_id btree_id:4; + enum btree_id btree_id:5; bool cached:1; bool preserve:1; enum btree_path_uptodate uptodate:2; @@ -243,7 +252,7 @@ struct btree_path { */ bool should_be_locked:1; unsigned level:3, - locks_want:4; + locks_want:3; u8 nodes_locked; struct btree_path_level { @@ -254,7 +263,7 @@ struct btree_path { u64 lock_taken_time; #endif } l[BTREE_MAX_DEPTH]; -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif }; @@ -264,6 +273,15 @@ static inline struct btree_path_level *path_l(struct btree_path *path) return path->l + path->level; } +static inline unsigned long btree_path_ip_allocated(struct btree_path *path) +{ +#ifdef TRACK_PATH_ALLOCATED + return path->ip_allocated; +#else + return _THIS_IP_; +#endif +} + /* * @pos - iterator's current position * @level - current btree depth @@ -277,7 +295,7 @@ struct btree_iter { struct btree_path *update_path; struct btree_path *key_cache_path; - enum btree_id btree_id:4; + enum btree_id btree_id:8; unsigned min_depth:3; unsigned advanced:1; @@ -288,7 +306,6 @@ struct btree_iter { unsigned snapshot; struct bpos pos; - struct bpos pos_after_commit; /* * Current unpacked key - so that bch2_btree_iter_next()/ * bch2_btree_iter_next_slot() can correctly advance pos. 
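
The hunk above replaces the BTREE_ITER_* flag #defines with typed `static const u16` constants and adds `__BTREE_ITER_FLAGS_END` to mark the first unused bit. A minimal standalone sketch of that style follows — the flag names here are invented for illustration, not taken from the patch — showing that the typed constants still compose with ordinary bitwise operators and that the end marker permits a compile-time check that the flags fit in the 16-bit flags word.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Typed flag constants in the same style as the patch; names are invented. */
static const uint16_t ITER_SLOTS    = 1 << 0;
static const uint16_t ITER_INTENT   = 1 << 1;
static const uint16_t ITER_PREFETCH = 1 << 2;
#define ITER_FLAGS_END 3	/* first unused bit */

/* The end marker makes overflowing the u16 flags word a build error: */
static_assert(ITER_FLAGS_END <= 16, "iterator flags must fit in a u16");

int main(void)
{
	uint16_t flags = ITER_SLOTS | ITER_PREFETCH;	/* compose with | */

	if (!(flags & ITER_INTENT))			/* test with &   */
		printf("read locks only\n");
	return 0;
}

Compared with plain #defines, typed constants are visible to the debugger and participate in ordinary type checking, which is presumably the motivation for the change.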
@@ -298,7 +315,7 @@ struct btree_iter { /* BTREE_ITER_WITH_JOURNAL: */ size_t journal_idx; struct bpos journal_pos; -#ifdef CONFIG_BCACHEFS_DEBUG +#ifdef TRACK_PATH_ALLOCATED unsigned long ip_allocated; #endif }; @@ -345,6 +362,7 @@ struct bkey_cached { struct journal_preres res; struct journal_entry_pin journal; + u64 seq; struct bkey_i *k; }; @@ -372,17 +390,14 @@ struct btree_insert_entry { u8 old_btree_u64s; struct bkey_i *k; struct btree_path *path; + u64 seq; /* key being overwritten: */ struct bkey old_k; const struct bch_val *old_v; unsigned long ip_allocated; }; -#ifndef CONFIG_LOCKDEP #define BTREE_ITER_MAX 64 -#else -#define BTREE_ITER_MAX 32 -#endif struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); @@ -413,15 +428,21 @@ struct btree_trans { u8 fn_idx; u8 nr_sorted; u8 nr_updates; - u8 traverse_all_idx; + u8 nr_wb_updates; + u8 wb_updates_size; bool used_mempool:1; bool in_traverse_all:1; + bool paths_sorted:1; bool memory_allocation_failure:1; - bool is_initial_gc:1; + bool journal_transaction_names:1; bool journal_replay_not_finished:1; + bool is_initial_gc:1; + bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; + unsigned long last_begin_ip; unsigned long last_restarted_ip; + unsigned long srcu_lock_time; /* * For when bch2_trans_update notices we'll be splitting a compressed @@ -437,20 +458,20 @@ struct btree_trans { unsigned mem_bytes; void *mem; - u8 sorted[BTREE_ITER_MAX]; + u8 sorted[BTREE_ITER_MAX + 8]; struct btree_path *paths; struct btree_insert_entry *updates; + struct btree_write_buffered_key *wb_updates; /* update path: */ struct btree_trans_commit_hook *hooks; - DARRAY(u64) extra_journal_entries; + darray_u64 extra_journal_entries; struct journal_entry_pin *journal_pin; struct journal_res journal_res; struct journal_preres journal_preres; u64 *journal_seq; struct disk_reservation *disk_res; - unsigned flags; unsigned journal_u64s; unsigned journal_preres_u64s; struct replicas_delta_list *fs_usage_deltas; @@ -622,7 +643,7 @@ static inline unsigned bset_byte_offset(struct btree *b, void *i) } enum btree_node_type { -#define x(kwd, val) BKEY_TYPE_##kwd = val, +#define x(kwd, val, ...) 
BKEY_TYPE_##kwd = val, BCH_BTREE_IDS() #undef x BKEY_TYPE_btree, @@ -641,56 +662,64 @@ static inline enum btree_node_type btree_node_type(struct btree *b) } #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ - ((1U << BKEY_TYPE_extents)| \ - (1U << BKEY_TYPE_alloc)| \ - (1U << BKEY_TYPE_inodes)| \ - (1U << BKEY_TYPE_stripes)| \ - (1U << BKEY_TYPE_reflink)| \ - (1U << BKEY_TYPE_btree)) + (BIT(BKEY_TYPE_extents)| \ + BIT(BKEY_TYPE_alloc)| \ + BIT(BKEY_TYPE_inodes)| \ + BIT(BKEY_TYPE_stripes)| \ + BIT(BKEY_TYPE_reflink)| \ + BIT(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ - ((1U << BKEY_TYPE_alloc)| \ - (1U << BKEY_TYPE_inodes)| \ - (1U << BKEY_TYPE_stripes)| \ - (1U << BKEY_TYPE_snapshots)) + (BIT(BKEY_TYPE_alloc)| \ + BIT(BKEY_TYPE_inodes)| \ + BIT(BKEY_TYPE_stripes)| \ + BIT(BKEY_TYPE_snapshots)) #define BTREE_NODE_TYPE_HAS_TRIGGERS \ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) -#define BTREE_ID_IS_EXTENTS \ - ((1U << BTREE_ID_extents)| \ - (1U << BTREE_ID_reflink)| \ - (1U << BTREE_ID_freespace)) +static inline bool btree_node_type_needs_gc(enum btree_node_type type) +{ + return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); +} static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return (1U << type) & BTREE_ID_IS_EXTENTS; -} + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr) + BCH_BTREE_IDS() +#undef x + ; -#define BTREE_ID_HAS_SNAPSHOTS \ - ((1U << BTREE_ID_extents)| \ - (1U << BTREE_ID_inodes)| \ - (1U << BTREE_ID_dirents)| \ - (1U << BTREE_ID_xattrs)) + return (1U << type) & mask; +} -#define BTREE_ID_HAS_PTRS \ - ((1U << BTREE_ID_extents)| \ - (1U << BTREE_ID_reflink)) +static inline bool btree_id_is_extents(enum btree_id btree) +{ + return btree_node_type_is_extents((enum btree_node_type) btree); +} static inline bool btree_type_has_snapshots(enum btree_id id) { - return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; + const unsigned mask = 0 +#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) + BCH_BTREE_IDS() +#undef x + ; + + return (1U << id) & mask; } static inline bool btree_type_has_ptrs(enum btree_id id) { - return (1 << id) & BTREE_ID_HAS_PTRS; -} + const unsigned mask = 0 +#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_ID_DATA)) << nr) + BCH_BTREE_IDS() +#undef x + ; -static inline bool btree_node_type_needs_gc(enum btree_node_type type) -{ - return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); + return (1U << id) & mask; } struct btree_root { @@ -703,15 +732,6 @@ struct btree_root { s8 error; }; -enum btree_insert_ret { - BTREE_INSERT_OK, - /* leaf node needs to be split */ - BTREE_INSERT_BTREE_NODE_FULL, - BTREE_INSERT_NEED_MARK_REPLICAS, - BTREE_INSERT_NEED_JOURNAL_RES, - BTREE_INSERT_NEED_JOURNAL_RECLAIM, -}; - enum btree_gc_coalesce_fail_reason { BTREE_GC_COALESCE_FAIL_RESERVE_GET, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 1c2e7b2..2281140 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -4,6 +4,7 @@ #include "btree_iter.h" #include "journal.h" +#include "journal.h" struct bch_fs; struct btree; @@ -13,14 +14,19 @@ void bch2_btree_node_prep_for_write(struct btree_trans *, bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, struct btree *, struct btree_node_iter *, struct bkey_i *); + +int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); +int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); +void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, + struct bkey_i *, u64); + enum btree_insert_flags { - /* First two bits for journal watermark: */ - __BTREE_INSERT_NOFAIL = 2, + /* First bits for bch_watermark: */ + __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, - __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, __BTREE_INSERT_JOURNAL_RECLAIM, __BTREE_INSERT_NOWAIT, @@ -30,32 +36,34 @@ enum btree_insert_flags { }; /* Don't check for -ENOSPC: */ -#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) - -#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) -#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) +#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) -/* for copygc, or when merging btree nodes */ -#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) +#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) +#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) /* Insert is for journal replay - don't get journal reservations: */ -#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) +#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) /* Insert is being called from journal reclaim path: */ -#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) +#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) /* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) -#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) +#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) +#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) -#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) -#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) +#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, 
unsigned); +int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); -int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); +int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, + struct bkey_i *, enum btree_update_flags); + +int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *, + enum btree_update_flags); int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, u64 *, int flags); @@ -64,25 +72,50 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); -int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned); -void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); -int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, - struct btree *, struct bkey_i *, bool); -int bch2_btree_node_update_key_get_iter(struct btree_trans *, - struct btree *, struct bkey_i *, bool); +int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, + struct bpos, struct bpos); + +/* + * For use when splitting extents in existing snapshots: + * + * If @old_pos is an interior snapshot node, iterate over descendent snapshot + * nodes: for every descendent snapshot in whiche @old_pos is overwritten and + * not visible, emit a whiteout at @new_pos. + */ +static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id btree, + struct bpos old_pos, + struct bpos new_pos) +{ + if (!btree_type_has_snapshots(btree) || + bkey_eq(old_pos, new_pos)) + return 0; + + return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); +} + +int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, + enum btree_update_flags, + struct bkey_s_c, struct bkey_s_c); -int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_update_flags); +int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos); int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, enum btree_update_flags); +int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); +int __must_check bch2_trans_update_buffered(struct btree_trans *, + enum btree_id, struct bkey_i *); void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); -int __bch2_trans_commit(struct btree_trans *); +int __bch2_trans_commit(struct btree_trans *, unsigned); -int bch2_trans_log_msg(struct btree_trans *, const char *); +int bch2_fs_log_msg(struct bch_fs *, const char *, ...); +int bch2_journal_log_msg(struct bch_fs *, const char *, ...); /** * bch2_trans_commit - insert keys at given iterator positions @@ -100,9 +133,8 @@ static inline int bch2_trans_commit(struct btree_trans *trans, { trans->disk_res = disk_res; trans->journal_seq = journal_seq; - trans->flags = flags; - return __bch2_trans_commit(trans); + return __bch2_trans_commit(trans, flags); } #define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ @@ -142,6 +174,11 @@ static inline int bch2_trans_commit(struct btree_trans *trans, (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) +#define trans_for_each_wb_update(_trans, 
_i) \ + for ((_i) = (_trans)->wb_updates; \ + (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ + (_i)++) + static inline void bch2_trans_reset_updates(struct btree_trans *trans) { struct btree_insert_entry *i; @@ -151,8 +188,166 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) trans->extra_journal_res = 0; trans->nr_updates = 0; + trans->nr_wb_updates = 0; + trans->wb_updates = NULL; trans->hooks = NULL; trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } +} + +static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, + unsigned type, unsigned min_bytes) +{ + unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); + struct bkey_i *mut; + + if (type && k.k->type != type) + return ERR_PTR(-ENOENT); + + mut = bch2_trans_kmalloc_nomemzero(trans, bytes); + if (!IS_ERR(mut)) { + bkey_reassemble(mut, k); + + if (unlikely(bytes > bkey_bytes(k.k))) { + memset((void *) mut + bkey_bytes(k.k), 0, + bytes - bkey_bytes(k.k)); + mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64)); + } + } + return mut; +} + +static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +{ + return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); +} + +#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k, unsigned flags, + unsigned type, unsigned min_bytes) +{ + struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); + int ret; + + if (IS_ERR(mut)) + return mut; + + ret = bch2_trans_update(trans, iter, mut, flags); + if (ret) + return ERR_PTR(ret); + + *k = bkey_i_to_s_c(mut); + return mut; +} + +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k, unsigned flags) +{ + return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); +} + +#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, unsigned min_bytes) +{ + struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type); + struct bkey_i *ret = unlikely(IS_ERR(k.k)) + ? 
ERR_CAST(k.k) + : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); + if (unlikely(IS_ERR(ret))) + bch2_trans_iter_exit(trans, iter); + return ret; } +static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); +} + +static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, unsigned min_bytes) +{ + struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); + int ret; + + if (IS_ERR(mut)) + return mut; + + ret = bch2_trans_update(trans, iter, mut, flags); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } + + return mut; +} + +static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned min_bytes) +{ + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); +} + +static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); +} + +#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ + bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ + _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, + unsigned flags, unsigned type, unsigned val_size) +{ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); + int ret; + + if (IS_ERR(k)) + return k; + + bkey_init(&k->k); + k->k.p = iter->pos; + k->k.type = type; + set_bkey_val_bytes(&k->k, val_size); + + ret = bch2_trans_update(trans, iter, k, flags); + if (unlikely(ret)) + return ERR_PTR(ret); + return k; +} + +#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \ + KEY_TYPE_##_type, sizeof(struct bch_##_type))) + #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index d4f1920..f42ef46 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -11,6 +11,7 @@ #include "btree_iter.h" #include "btree_locking.h" #include "buckets.h" +#include "clock.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -19,9 +20,9 @@ #include "recovery.h" #include "replicas.h" #include "super-io.h" +#include "trace.h" #include -#include static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, struct btree_path *, struct btree *, @@ -72,7 +73,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) break; bp = bkey_s_c_to_btree_ptr_v2(k); - if (bpos_cmp(next_node, bp.v->min_key)) { + if (!bpos_eq(next_node, bp.v->min_key)) { bch2_dump_btree_node(c, b); bch2_bpos_to_text(&buf1, next_node); bch2_bpos_to_text(&buf2, bp.v->min_key); @@ -82,7 +83,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) bch2_btree_node_iter_advance(&iter, b); if (bch2_btree_node_iter_end(&iter)) { - if (bpos_cmp(k.k->p, b->key.k.p)) { + if (!bpos_eq(k.k->p, b->key.k.p)) { 
bch2_dump_btree_node(c, b); bch2_bpos_to_text(&buf1, b->key.k.p); bch2_bpos_to_text(&buf2, k.k->p); @@ -161,6 +162,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b) { trace_and_count(c, btree_node_free, c, b); + BUG_ON(btree_node_write_blocked(b)); BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_need_write(b)); BUG_ON(b == btree_node_root(c, b)); @@ -186,7 +188,7 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, bch2_btree_node_hash_remove(&c->btree_cache, b); __btree_node_free(c, b); six_unlock_write(&b->c.lock); - mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); trans_for_each_path(trans, path) if (path->l[level].b == b) { @@ -241,21 +243,15 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, struct bch_fs *c = trans->c; struct write_point *wp; struct btree *b; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct open_buckets ob = { .nr = 0 }; struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; - unsigned nr_reserve; - enum alloc_reserve alloc_reserve; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + ? BTREE_NODE_RESERVE + : 0; int ret; - if (flags & BTREE_INSERT_USE_RESERVE) { - nr_reserve = 0; - alloc_reserve = RESERVE_btree_movinggc; - } else { - nr_reserve = BTREE_NODE_RESERVE; - alloc_reserve = RESERVE_btree; - } - mutex_lock(&c->btree_reserve_cache_lock); if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = @@ -277,7 +273,7 @@ retry: &devs_have, res->nr_replicas, c->opts.metadata_replicas_required, - alloc_reserve, 0, cl, &wp); + watermark, 0, cl, &wp); if (unlikely(ret)) return ERR_PTR(ret); @@ -299,7 +295,7 @@ retry: bch2_open_bucket_get(c, wp, &ob); bch2_alloc_sectors_done(c, wp); mem_alloc: - b = bch2_btree_node_mem_alloc(c, interior_node); + b = bch2_btree_node_mem_alloc(trans, interior_node); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -362,6 +358,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, BUG_ON(ret); trace_and_count(c, btree_node_alloc, c, b); + bch2_increment_clock(c, btree_sectors(c), WRITE); return b; } @@ -644,11 +641,10 @@ static void btree_update_nodes_written(struct btree_update *as) * which may require allocations as well. 
*/ ret = commit_do(&trans, &as->disk_res, &journal_seq, + BCH_WATERMARK_reclaim| BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RECLAIM| - JOURNAL_WATERMARK_reserved, + BTREE_INSERT_JOURNAL_RECLAIM, btree_update_nodes_written_trans(&trans, as)); bch2_trans_unlock(&trans); @@ -685,7 +681,8 @@ err: bch2_trans_unlock(&trans); btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); - bch2_btree_path_level_init(&trans, path, b); + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); + path->l[b->c.level].b = b; bch2_btree_node_lock_write_nofail(&trans, path, &b->c); @@ -723,7 +720,7 @@ err: mutex_unlock(&c->btree_interior_update_lock); - mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); six_unlock_write(&b->c.lock); btree_node_write_if_need(c, b, SIX_LOCK_intent); @@ -806,6 +803,7 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); BUG_ON(!btree_node_dirty(b)); + BUG_ON(!b->c.level); as->mode = BTREE_INTERIOR_UPDATING_NODE; as->b = b; @@ -975,6 +973,7 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as, clear_btree_node_dirty_acct(c, b); clear_btree_node_need_write(b); + clear_btree_node_write_blocked(b); /* * Does this node have unwritten data that has a pin on the journal? @@ -1043,14 +1042,24 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2] = { 0, 0 }; unsigned update_level = level; - int journal_flags = flags & JOURNAL_WATERMARK_MASK; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + unsigned journal_flags = 0; int ret = 0; u32 restart_count = trans->restart_count; BUG_ON(!path->should_be_locked); + if (watermark == BCH_WATERMARK_copygc) + watermark = BCH_WATERMARK_btree_copygc; + if (watermark < BCH_WATERMARK_btree) + watermark = BCH_WATERMARK_btree; + + flags &= ~BCH_WATERMARK_MASK; + flags |= watermark; + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) journal_flags |= JOURNAL_RES_GET_NONBLOCK; + journal_flags |= watermark; while (1) { nr_nodes[!!update_level] += 1 + split; @@ -1071,22 +1080,20 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, BKEY_BTREE_PTR_U64s_MAX * (1 + split))) break; - split = true; + split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); } if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); else if (!down_read_trylock(&c->gc_lock)) { - bch2_trans_unlock(trans); - down_read(&c->gc_lock); - ret = bch2_trans_relock(trans); + ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); if (ret) { up_read(&c->gc_lock); return ERR_PTR(ret); } } - as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); memset(as, 0, sizeof(*as)); closure_init(&as->cl, NULL); as->c = c; @@ -1122,23 +1129,19 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, BTREE_UPDATE_JOURNAL_RES, journal_flags|JOURNAL_RES_GET_NONBLOCK); if (ret) { - bch2_trans_unlock(trans); - if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { ret = -BCH_ERR_journal_reclaim_would_deadlock; goto err; } - ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + ret = drop_locks_do(trans, + bch2_journal_preres_get(&c->journal, 
&as->journal_preres, BTREE_UPDATE_JOURNAL_RES, - journal_flags); - if (ret) { + journal_flags)); + if (ret == -BCH_ERR_journal_preres_get_blocked) { trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); - goto err; } - - ret = bch2_trans_relock(trans); if (ret) goto err; } @@ -1155,6 +1158,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, bch2_err_matches(ret, ENOMEM)) { struct closure cl; + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if (bch2_err_matches(ret, ENOSPC) && + (flags & BTREE_INSERT_JOURNAL_RECLAIM) && + watermark != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto err; + } + closure_init_stack(&cl); do { @@ -1162,7 +1176,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, bch2_trans_unlock(trans); closure_sync(&cl); - } while (ret == -EAGAIN); + } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); } if (ret) { @@ -1196,7 +1210,7 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) (b->c.level < btree_node_root(c, b)->c.level || !btree_node_dying(btree_node_root(c, b)))); - btree_node_root(c, b) = b; + bch2_btree_id_root(c, b->c.btree_id)->b = b; mutex_unlock(&c->btree_root_lock); bch2_recalc_btree_reserve(c); @@ -1329,7 +1343,7 @@ __bch2_btree_insert_keys_interior(struct btree_update *as, while (!bch2_keylist_empty(keys)) { struct bkey_i *k = bch2_keylist_front(keys); - if (bpos_cmp(k->k.p, b->key.k.p) > 0) + if (bpos_gt(k->k.p, b->key.k.p)) break; bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k); @@ -1347,7 +1361,7 @@ static void __btree_split_node(struct btree_update *as, struct btree *n[2]) { struct bkey_packed *k; - struct bpos n1_pos; + struct bpos n1_pos = POS_MIN; struct btree_node_iter iter; struct bset *bsets[2]; struct bkey_format_state format[2]; @@ -1409,7 +1423,7 @@ static void __btree_split_node(struct btree_update *as, out[i]->needs_whiteout = false; btree_keys_account_key_add(&n[i]->nr, 0, out[i]); - out[i] = bkey_next(out[i]); + out[i] = bkey_p_next(out[i]); } for (i = 0; i < 2; i++) { @@ -1446,8 +1460,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct keylist *keys) { if (!bch2_keylist_empty(keys) && - bpos_cmp(bch2_keylist_front(keys)->k.p, - b->data->max_key) <= 0) { + bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { struct btree_node_iter node_iter; bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); @@ -1675,9 +1688,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); - if (!(local_clock() & 63)) - return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); - ret = bch2_btree_node_lock_write(trans, path, &b->c); if (ret) return ret; @@ -1714,8 +1724,10 @@ split: * We could attempt to avoid the transaction restart, by calling * bch2_btree_path_upgrade() and allocating more nodes: */ - if (b->c.level >= as->update_level) + if (b->c.level >= as->update_level) { + trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); + } return btree_split(as, trans, path, b, keys, flags); } @@ -1771,8 +1783,8 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, b = path->l[level].b; - if ((sib == btree_prev_sib && 
!bpos_cmp(b->data->min_key, POS_MIN)) || - (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) { + if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { b->sib_u64s[sib] = U16_MAX; return 0; } @@ -1805,7 +1817,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, next = m; } - if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) { + if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; bch2_bpos_to_text(&buf1, prev->data->max_key); @@ -1847,9 +1859,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(path, b); as = bch2_btree_update_start(trans, path, level, false, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - flags); + BTREE_INSERT_NOFAIL|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -1995,6 +2005,7 @@ err: struct async_btree_rewrite { struct bch_fs *c; struct work_struct work; + struct list_head list; enum btree_id btree_id; unsigned level; struct bpos pos; @@ -2004,6 +2015,7 @@ struct async_btree_rewrite { static int async_btree_node_rewrite_trans(struct btree_trans *trans, struct async_btree_rewrite *a) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct btree *b; int ret; @@ -2015,8 +2027,18 @@ static int async_btree_node_rewrite_trans(struct btree_trans *trans, if (ret) goto out; - if (!b || b->data->keys.seq != a->seq) + if (!b || b->data->keys.seq != a->seq) { + struct printbuf buf = PRINTBUF; + + if (b) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + else + prt_str(&buf, "(null"); + bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", + __func__, a->seq, buf.buf); + printbuf_exit(&buf); goto out; + } ret = bch2_btree_node_rewrite(trans, &iter, b, 0); out: @@ -2025,28 +2047,29 @@ out: return ret; } -void async_btree_node_rewrite_work(struct work_struct *work) +static void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; + int ret; - bch2_trans_do(c, NULL, NULL, 0, + ret = bch2_trans_do(c, NULL, NULL, 0, async_btree_node_rewrite_trans(&trans, a)); - percpu_ref_put(&c->writes); + if (ret) + bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); + bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { struct async_btree_rewrite *a; - - if (!percpu_ref_tryget_live(&c->writes)) - return; + int ret; a = kmalloc(sizeof(*a), GFP_NOFS); if (!a) { - percpu_ref_put(&c->writes); + bch_err(c, "%s: error allocating memory", __func__); return; } @@ -2055,15 +2078,68 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) a->level = b->c.level; a->pos = b->key.k.p; a->seq = b->data->keys.seq; - INIT_WORK(&a->work, async_btree_node_rewrite_work); + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + mutex_lock(&c->pending_node_rewrites_lock); + list_add(&a->list, &c->pending_node_rewrites); + mutex_unlock(&c->pending_node_rewrites_lock); + return; + } + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (test_bit(BCH_FS_STARTED, &c->flags)) { + bch_err(c, "%s: error getting c->writes ref", __func__); + kfree(a); + return; + } + + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err(c, "%s: error going read-write: %s", + __func__, bch2_err_str(ret)); + 
kfree(a); + return; + } + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + } + queue_work(c->btree_interior_update_worker, &a->work); } +void bch2_do_pending_node_rewrites(struct bch_fs *c) +{ + struct async_btree_rewrite *a, *n; + + mutex_lock(&c->pending_node_rewrites_lock); + list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { + list_del(&a->list); + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + queue_work(c->btree_interior_update_worker, &a->work); + } + mutex_unlock(&c->pending_node_rewrites_lock); +} + +void bch2_free_pending_node_rewrites(struct bch_fs *c) +{ + struct async_btree_rewrite *a, *n; + + mutex_lock(&c->pending_node_rewrites_lock); + list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { + list_del(&a->list); + + kfree(a); + } + mutex_unlock(&c->pending_node_rewrites_lock); +} + static int __bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, struct btree *new_hash, struct bkey_i *new_key, + unsigned commit_flags, bool skip_triggers) { struct bch_fs *c = trans->c; @@ -2099,11 +2175,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, _THIS_IP_); BUG_ON(iter2.path->level != b->c.level); - BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p)); + BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); btree_path_set_level_up(trans, iter2.path); - bch2_btree_path_check_sort(trans, iter2.path, 0); + trans->paths_sorted = false; ret = bch2_btree_iter_traverse(&iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); @@ -2124,12 +2200,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); } - ret = bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RECLAIM| - JOURNAL_WATERMARK_reserved); + ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); if (ret) goto err; @@ -2163,7 +2234,7 @@ err: int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, struct bkey_i *new_key, - bool skip_triggers) + unsigned commit_flags, bool skip_triggers) { struct bch_fs *c = trans->c; struct btree *new_hash = NULL; @@ -2184,19 +2255,17 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite if (btree_ptr_hash_val(new_key) != b->hash_val) { ret = bch2_btree_cache_cannibalize_lock(c, &cl); if (ret) { - bch2_trans_unlock(trans); - closure_sync(&cl); - ret = bch2_trans_relock(trans); + ret = drop_locks_do(trans, (closure_sync(&cl), 0)); if (ret) return ret; } - new_hash = bch2_btree_node_mem_alloc(c, false); + new_hash = bch2_btree_node_mem_alloc(trans, false); } path->intent_ref++; - ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, - new_key, skip_triggers); + ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, + commit_flags, skip_triggers); --path->intent_ref; if (new_hash) { @@ -2214,7 +2283,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, struct btree *b, struct bkey_i *new_key, - bool skip_triggers) + unsigned commit_flags, bool skip_triggers) { struct btree_iter iter; int ret; @@ -2235,7 +2304,8 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, BUG_ON(!btree_node_hashed(b)); - ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers); + ret = bch2_btree_node_update_key(trans, &iter, b, 
new_key, + commit_flags, skip_triggers); out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -2254,8 +2324,9 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) bch2_btree_set_root_inmem(c, b); } -void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) { + struct bch_fs *c = trans->c; struct closure cl; struct btree *b; int ret; @@ -2267,7 +2338,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c, false); + b = bch2_btree_node_mem_alloc(trans, false); bch2_btree_cache_cannibalize_unlock(c); set_btree_node_fake(b); @@ -2296,6 +2367,12 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); + return 0; +} + +void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +{ + bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id)); } void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) @@ -2334,20 +2411,15 @@ bool bch2_btree_interior_updates_flush(struct bch_fs *c) return ret; } -void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) +void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) { - struct btree_root *r; - struct jset_entry *entry; + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); mutex_lock(&c->btree_root_lock); - vstruct_for_each(jset, entry) - if (entry->type == BCH_JSET_ENTRY_btree_root) { - r = &c->btree_roots[entry->btree_id]; - r->level = entry->level; - r->alive = true; - bkey_copy(&r->key, &entry->start[0]); - } + r->level = entry->level; + r->alive = true; + bkey_copy(&r->key, &entry->start[0]); mutex_unlock(&c->btree_root_lock); } @@ -2367,15 +2439,15 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, mutex_lock(&c->btree_root_lock); - for (i = 0; i < BTREE_ID_NR; i++) - if (c->btree_roots[i].alive && !test_bit(i, &have)) { - journal_entry_set(end, - BCH_JSET_ENTRY_btree_root, - i, c->btree_roots[i].level, - &c->btree_roots[i].key, - c->btree_roots[i].key.u64s); + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->alive && !test_bit(i, &have)) { + journal_entry_set(end, BCH_JSET_ENTRY_btree_root, + i, r->level, &r->key, r->key.k.u64s); end = vstruct_next(end); } + } mutex_unlock(&c->btree_root_lock); @@ -2389,7 +2461,7 @@ void bch2_fs_btree_interior_update_exit(struct bch_fs *c) mempool_exit(&c->btree_interior_update_pool); } -int bch2_fs_btree_interior_update_init(struct bch_fs *c) +void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) { mutex_init(&c->btree_reserve_cache_lock); INIT_LIST_HEAD(&c->btree_interior_update_list); @@ -2397,11 +2469,20 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) mutex_init(&c->btree_interior_update_lock); INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); + INIT_LIST_HEAD(&c->pending_node_rewrites); + mutex_init(&c->pending_node_rewrites_lock); +} + +int bch2_fs_btree_interior_update_init(struct bch_fs *c) +{ c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); if (!c->btree_interior_update_worker) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; - return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, - sizeof(struct btree_update)); + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + 
sizeof(struct btree_update))) + return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + + return 0; } diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h index 2e6d220..5e0a467 100644 --- a/libbcachefs/btree_update_interior.h +++ b/libbcachefs/btree_update_interior.h @@ -154,6 +154,15 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, btree_next_sib); } +int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); +void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); +int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *, + unsigned, bool); +int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, + struct bkey_i *, unsigned, bool); + void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); @@ -314,11 +323,15 @@ void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); bool bch2_btree_interior_updates_flush(struct bch_fs *); -void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); +void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, struct jset_entry *, struct jset_entry *); +void bch2_do_pending_node_rewrites(struct bch_fs *); +void bch2_free_pending_node_rewrites(struct bch_fs *); + void bch2_fs_btree_interior_update_exit(struct bch_fs *); +void bch2_fs_btree_interior_update_init_early(struct bch_fs *); int bch2_fs_btree_interior_update_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 05c1b28..369e37a 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -8,6 +8,7 @@ #include "btree_iter.h" #include "btree_key_cache.h" #include "btree_locking.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "debug.h" #include "errcode.h" @@ -19,14 +20,54 @@ #include "recovery.h" #include "subvolume.h" #include "replicas.h" +#include "trace.h" #include #include -#include + +/* + * bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ +static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + + if (k.k && bpos_eq(path->pos, k.k->p)) + return k; + + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + +static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); + + if (j_k) + k = bkey_i_to_s_c(j_k); + } + + u = *k.k; + u.needs_whiteout = i->old_k.needs_whiteout; + + BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); + BUG_ON(i->old_v != k.v); +#endif +} static int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, - struct bkey_i *, enum btree_update_flags); + struct bkey_i *, enum btree_update_flags, + unsigned long ip); static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) @@ -62,9 +103,6 
@@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, { struct bch_fs *c = trans->c; - if (path->cached) - return; - if (unlikely(btree_node_just_written(b)) && bch2_btree_post_write_cleanup(c, b)) bch2_trans_node_reinit_iter(trans, b); @@ -92,8 +130,8 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(btree_node_just_written(b)); EBUG_ON(bset_written(b, btree_bset_last(b))); EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0); - EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0); + EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); + EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(trans->c, b)); @@ -190,12 +228,12 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, return 0; } -static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 0, seq); } -static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) { return __btree_node_flush(j, pin, 1, seq); } @@ -207,35 +245,38 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c, bch2_journal_pin_add(&c->journal, seq, &w->journal, btree_node_write_idx(b) == 0 - ? btree_node_flush0 - : btree_node_flush1); + ? bch2_btree_node_flush0 + : bch2_btree_node_flush1); } /** * btree_insert_key - insert a key one key into a leaf node */ -static void btree_insert_key_leaf(struct btree_trans *trans, - struct btree_insert_entry *insert) +inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, + struct btree_path *path, + struct bkey_i *insert, + u64 journal_seq) { struct bch_fs *c = trans->c; - struct btree *b = insert_l(insert)->b; + struct btree *b = path_l(path)->b; struct bset_tree *t = bset_tree_last(b); struct bset *i = bset(b, t); int old_u64s = bset_u64s(t); int old_live_u64s = b->nr.live_u64s; int live_u64s_added, u64s_added; - if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b, - &insert_l(insert)->iter, insert->k))) + if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, + &path_l(path)->iter, insert))) return; - i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, - le64_to_cpu(i->journal_seq))); + i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); - bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); + bch2_btree_add_journal_pin(c, b, journal_seq); - if (unlikely(!btree_node_dirty(b))) + if (unlikely(!btree_node_dirty(b))) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); set_btree_node_dirty_acct(c, b); + } live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; u64s_added = (int) bset_u64s(t) - old_u64s; @@ -257,7 +298,7 @@ static void btree_insert_key_leaf(struct btree_trans *trans, static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { - BUG_ON(bpos_cmp(i->k->k.p, i->path->pos)); + BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); BUG_ON(i->cached != i->path->cached); BUG_ON(i->level != i->path->level); BUG_ON(i->btree_id != i->path->btree_id); @@ -265,49 +306,30 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && i->k->k.p.snapshot && - 
bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); } static noinline int -bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s, +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, unsigned long trace_ip) { - struct bch_fs *c = trans->c; - int ret; - - bch2_trans_unlock(trans); - - ret = bch2_journal_preres_get(&c->journal, - &trans->journal_preres, u64s, 0); - if (ret) - return ret; - - ret = bch2_trans_relock(trans); - if (ret) { - trace_and_count(c, trans_restart_journal_preres_get, trans, trace_ip, 0); - return ret; - } - - return 0; + return drop_locks_do(trans, + bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, + trans->journal_preres_u64s, + (flags & BCH_WATERMARK_MASK))); } static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, - unsigned flags) + unsigned flags) { - struct bch_fs *c = trans->c; - int ret; - - ret = bch2_journal_res_get(&c->journal, &trans->journal_res, - trans->journal_u64s, - flags| - (trans->flags & JOURNAL_WATERMARK_MASK)); - - return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret; + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, + trans->journal_u64s, flags); } #define JSET_ENTRY_LOG_U64s 4 -static void journal_transaction_name(struct btree_trans *trans) +static noinline void journal_transaction_name(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; @@ -321,26 +343,23 @@ static void journal_transaction_name(struct btree_trans *trans) strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); } -static inline enum btree_insert_ret -btree_key_can_insert(struct btree_trans *trans, - struct btree *b, - unsigned u64s) +static inline int btree_key_can_insert(struct btree_trans *trans, + struct btree *b, unsigned u64s) { struct bch_fs *c = trans->c; if (!bch2_btree_node_insert_fits(c, b, u64s)) - return BTREE_INSERT_BTREE_NODE_FULL; + return -BCH_ERR_btree_insert_btree_node_full; - return BTREE_INSERT_OK; + return 0; } -static enum btree_insert_ret -btree_key_can_insert_cached(struct btree_trans *trans, - struct btree_path *path, - unsigned u64s) +static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned u64s) { struct bch_fs *c = trans->c; struct bkey_cached *ck = (void *) path->l[0].b; + struct btree_insert_entry *i; unsigned new_u64s; struct bkey_i *new_k; @@ -348,8 +367,8 @@ btree_key_can_insert_cached(struct btree_trans *trans, if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c) && - !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)) - return BTREE_INSERT_NEED_JOURNAL_RECLAIM; + !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return -BCH_ERR_btree_insert_need_journal_reclaim; /* * bch2_varint_decode can read past the end of the buffer by at most 7 @@ -358,16 +377,20 @@ btree_key_can_insert_cached(struct btree_trans *trans, u64s += 1; if (u64s <= ck->u64s) - return BTREE_INSERT_OK; + return 0; new_u64s = roundup_pow_of_two(u64s); new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); if (!new_k) { bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_ids[path->btree_id], new_u64s); - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_key_cache_insert; } + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + ck->u64s = new_u64s; ck->k = new_k; return 0; @@ -381,18 +404,22 @@ static 
int run_one_mem_trigger(struct btree_trans *trans, { struct bkey_s_c old = { &i->old_k, i->old_v }; struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); int ret; + verify_update_old_key(trans, i); + if (unlikely(flags & BTREE_TRIGGER_NORUN)) return 0; - if (!btree_node_type_needs_gc(i->btree_id)) + if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) return 0; - if (bch2_bkey_ops[old.k->type].atomic_trigger == - bch2_bkey_ops[i->k->k.type].atomic_trigger && + if (old_ops->atomic_trigger == new_ops->atomic_trigger && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { - ret = bch2_mark_key(trans, old, bkey_i_to_s_c(new), + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); } else { struct bkey _deleted = KEY(0, 0, 0); @@ -400,9 +427,11 @@ static int run_one_mem_trigger(struct btree_trans *trans, _deleted.p = i->path->pos; - ret = bch2_mark_key(trans, deleted, bkey_i_to_s_c(new), + ret = bch2_mark_key(trans, i->btree_id, i->level, + deleted, bkey_i_to_s_c(new), BTREE_TRIGGER_INSERT|flags) ?: - bch2_mark_key(trans, old, deleted, + bch2_mark_key(trans, i->btree_id, i->level, + old, deleted, BTREE_TRIGGER_OVERWRITE|flags); } @@ -419,6 +448,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ */ struct bkey old_k = i->old_k; struct bkey_s_c old = { &old_k, i->old_v }; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + verify_update_old_key(trans, i); if ((i->flags & BTREE_TRIGGER_NORUN) || !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) @@ -426,8 +459,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ if (!i->insert_trigger_run && !i->overwrite_trigger_run && - bch2_bkey_ops[old.k->type].trans_trigger == - bch2_bkey_ops[i->k->k.type].trans_trigger && + old_ops->trans_trigger == new_ops->trans_trigger && ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { i->overwrite_trigger_run = true; i->insert_trigger_run = true; @@ -517,11 +549,12 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) } } +#ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && (!i->insert_trigger_run || !i->overwrite_trigger_run)); - +#endif return 0; } @@ -549,12 +582,13 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) } static inline int -bch2_trans_commit_write_locked(struct btree_trans *trans, +bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; struct btree_trans_commit_hook *h; unsigned u64s = 0; bool marking = false; @@ -573,14 +607,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, prefetch(&trans->c->journal.flags); - h = trans->hooks; - while (h) { - ret = h->fn(trans, h); - if (ret) - return ret; - h = h->next; - } - trans_for_each_update(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) @@ -589,7 +615,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, u64s += i->k->k.u64s; ret = !i->cached ? 
btree_key_can_insert(trans, insert_l(i)->b, u64s) - : btree_key_can_insert_cached(trans, i->path, u64s); + : btree_key_can_insert_cached(trans, flags, i->path, u64s); if (ret) { *stopped_at = i; return ret; @@ -597,65 +623,36 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (btree_node_type_needs_gc(i->bkey_type)) marking = true; - - /* - * Revalidate before calling mem triggers - XXX, ugly: - * - * - successful btree node splits don't cause transaction - * restarts and will have invalidated the pointer to the bkey - * value - * - btree_node_lock_for_insert() -> btree_node_prep_for_write() - * when it has to resort - * - btree_key_can_insert_cached() when it has to reallocate - * - * Ugly because we currently have no way to tell if the - * pointer's been invalidated, which means it's debatabale - * whether we should be stashing the old key at all. - */ - i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; - - if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { - struct bkey_i *j_k = - bch2_journal_keys_peek_slot(c, i->btree_id, i->level, - i->k->k.p); - - if (j_k) { - i->old_k = j_k->k; - i->old_v = &j_k->v; - } - } } + if (trans->nr_wb_updates && + trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) + return -BCH_ERR_btree_insert_need_flush_buffer; + /* * Don't get journal reservation until after we know insert will * succeed: */ - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { ret = bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_NONBLOCK); if (ret) return ret; - journal_transaction_name(trans); + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); } else { trans->journal_res.seq = c->journal.replay_journal_seq; } - if (unlikely(trans->extra_journal_entries.nr)) { - memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->extra_journal_entries.data, - trans->extra_journal_entries.nr); - - trans->journal_res.offset += trans->extra_journal_entries.nr; - trans->journal_res.u64s -= trans->extra_journal_entries.nr; - } - /* * Not allowed to fail after we've gotten our journal reservation - we * have to use it: */ - if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { if (bch2_journal_seq_verify) trans_for_each_update(trans, i) i->k->k.version.lo = trans->journal_res.seq; @@ -666,35 +663,67 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (trans->fs_usage_deltas && bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) - return BTREE_INSERT_NEED_MARK_REPLICAS; + return -BCH_ERR_btree_insert_need_mark_replicas; + + if (trans->nr_wb_updates) { + EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); + + ret = bch2_btree_insert_keys_write_buffer(trans); + if (ret) + goto revert_fs_usage; + } + + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + goto revert_fs_usage; + h = h->next; + } trans_for_each_update(trans, i) if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { ret = run_one_mem_trigger(trans, i, i->flags); if (ret) - return ret; + goto fatal_err; } if (unlikely(c->gc_pos.phase)) { ret = bch2_trans_commit_run_gc_triggers(trans); if (ret) - return ret; + goto fatal_err; } - if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - trans_for_each_update(trans, i) { - struct journal *j = &c->journal; - struct jset_entry *entry; + if 
(unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + struct journal *j = &c->journal; + struct jset_entry *entry; + + trans_for_each_update(trans, i) { if (i->key_cache_already_flushed) continue; - entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_overwrite, - i->btree_id, i->level, - i->old_k.u64s); - bkey_reassemble(&entry->start[0], - (struct bkey_s_c) { &i->old_k, i->old_v }); + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + verify_update_old_key(trans, i); + + if (trans->journal_transaction_names) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble(&entry->start[0], + (struct bkey_s_c) { &i->old_k, i->old_v }); + } entry = bch2_journal_add_entry(j, &trans->journal_res, BCH_JSET_ENTRY_btree_keys, @@ -703,6 +732,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, bkey_copy(&entry->start[0], i->k); } + trans_for_each_wb_update(trans, wb) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + wb->btree, 0, + wb->k.k.u64s); + bkey_copy(&entry->start[0], &wb->k); + } + if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; } @@ -710,16 +747,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) { i->k->k.needs_whiteout = false; - if (!i->cached) - btree_insert_key_leaf(trans, i); - else if (!i->key_cache_already_flushed) - bch2_btree_insert_key_cached(trans, i->path, i->k); + if (!i->cached) { + u64 seq = trans->journal_res.seq; + + if (i->flags & BTREE_UPDATE_PREJOURNAL) + seq = i->seq; + + bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); + } else if (!i->key_cache_already_flushed) + bch2_btree_insert_key_cached(trans, flags, i); else { bch2_btree_key_cache_drop(trans, i->path); btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); } } + return 0; +fatal_err: + bch2_fatal_error(c); +revert_fs_usage: + if (trans->fs_usage_deltas) + bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); return ret; } @@ -747,7 +795,8 @@ static inline int trans_lock_write(struct btree_trans *trans) if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) return trans_lock_write_fail(trans, i); - bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); } return 0; @@ -756,17 +805,22 @@ static inline int trans_lock_write(struct btree_trans *trans) static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) { struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; trans_for_each_update(trans, i) bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + + trans_for_each_wb_update(trans, wb) + bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); } -static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, +#ifdef CONFIG_BCACHEFS_DEBUG +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *i, struct printbuf *err) { struct bch_fs *c = trans->c; - int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; + int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; printbuf_reset(err); prt_printf(err, "invalid bkey on insert from %s -> %ps", @@ -787,28 +841,37 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, return -EINVAL; } +#endif /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ -static inline int do_bch2_trans_commit(struct btree_trans *trans, +static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, unsigned long trace_ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i; - struct printbuf buf = PRINTBUF; - int ret, u64s_delta = 0; - int rw = (trans->flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; + int ret = 0, u64s_delta = 0; +#ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) { + struct printbuf buf = PRINTBUF; + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, rw, &buf))) - return bch2_trans_commit_bkey_invalid(trans, i, &buf); + i->bkey_type, invalid_flags, &buf))) + ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); btree_insert_entry_checks(trans, i); - } + printbuf_exit(&buf); - printbuf_exit(&buf); + if (ret) + return ret; + } +#endif trans_for_each_update(trans, i) { if (i->cached) @@ -820,7 +883,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (!same_leaf_as_next(trans, i)) { if (u64s_delta <= 0) { ret = bch2_foreground_maybe_merge(trans, i->path, - i->level, trans->flags); + i->level, flags); if (unlikely(ret)) return ret; } @@ -831,11 +894,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, - JOURNAL_RES_GET_NONBLOCK| - (trans->flags & JOURNAL_WATERMARK_MASK)); - if (unlikely(ret == -EAGAIN)) - ret = bch2_trans_journal_preres_get_cold(trans, - trans->journal_preres_u64s, trace_ip); + (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) + ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); if (unlikely(ret)) return ret; @@ -843,9 +904,9 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, if (unlikely(ret)) return ret; - ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip); + ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); - if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + if (!ret && unlikely(trans->journal_replay_not_finished)) bch2_drop_overwrites_from_journal(trans); trans_for_each_update(trans, i) @@ -882,47 +943,39 @@ static int journal_reclaim_wait_done(struct bch_fs *c) } static noinline -int bch2_trans_commit_error(struct btree_trans *trans, +int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, struct btree_insert_entry *i, int ret, unsigned long trace_ip) { struct bch_fs *c = trans->c; switch (ret) { - case BTREE_INSERT_BTREE_NODE_FULL: - ret = bch2_btree_split_leaf(trans, i->path, trans->flags); + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); break; - case BTREE_INSERT_NEED_MARK_REPLICAS: - bch2_trans_unlock(trans); 
- - ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas); - if (ret) - break; - - ret = bch2_trans_relock(trans); - if (ret) - trace_and_count(c, trans_restart_mark_replicas, trans, trace_ip); + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); break; - case BTREE_INSERT_NEED_JOURNAL_RES: - bch2_trans_unlock(trans); - - if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & JOURNAL_WATERMARK_reserved)) { + case -BCH_ERR_journal_res_get_blocked: + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; break; } - ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); - if (ret) - break; - - ret = bch2_trans_relock(trans); - if (ret) - trace_and_count(c, trans_restart_journal_res_get, trans, trace_ip); + ret = drop_locks_do(trans, + bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_CHECK)); break; - case BTREE_INSERT_NEED_JOURNAL_RECLAIM: + case -BCH_ERR_btree_insert_need_journal_reclaim: bch2_trans_unlock(trans); trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); @@ -933,9 +986,31 @@ int bch2_trans_commit_error(struct btree_trans *trans, break; ret = bch2_trans_relock(trans); - if (ret) - trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip); break; + case -BCH_ERR_btree_insert_need_flush_buffer: { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + ret = 0; + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_unlock(trans); + mutex_lock(&wb->flush_lock); + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_begin(trans); + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } else { + mutex_unlock(&wb->flush_lock); + ret = bch2_trans_relock(trans); + } + } + break; + } default: BUG_ON(ret >= 0); break; @@ -944,31 +1019,28 @@ int bch2_trans_commit_error(struct btree_trans *trans, BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && - !(trans->flags & BTREE_INSERT_NOWAIT) && - (trans->flags & BTREE_INSERT_NOFAIL), c, + !(flags & BTREE_INSERT_NOWAIT) && + (flags & BTREE_INSERT_NOFAIL), c, "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); return ret; } static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans) +bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; int ret; - if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)) || + if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || test_bit(BCH_FS_STARTED, &c->flags)) - return -EROFS; + return -BCH_ERR_erofs_trans_commit; - bch2_trans_unlock(trans); - - ret = bch2_fs_read_write_early(c) ?: - bch2_trans_relock(trans); + ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); if (ret) return ret; - percpu_ref_get(&c->writes); + bch2_write_ref_get(c, BCH_WRITE_REF_trans); return 0; } @@ -993,18 +1065,20 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) return ret; } -int __bch2_trans_commit(struct btree_trans *trans) +int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) { 
struct bch_fs *c = trans->c; struct btree_insert_entry *i = NULL; + struct btree_write_buffered_key *wb; unsigned u64s; int ret = 0; if (!trans->nr_updates && + !trans->nr_wb_updates && !trans->extra_journal_entries.nr) goto out_reset; - if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) + if (flags & BTREE_INSERT_GC_LOCK_HELD) lockdep_assert_held(&c->gc_lock); ret = bch2_trans_commit_run_triggers(trans); @@ -1016,13 +1090,27 @@ int __bch2_trans_commit(struct btree_trans *trans) goto out_reset; } - if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && - unlikely(!percpu_ref_tryget_live(&c->writes))) { - ret = bch2_trans_commit_get_rw_cold(trans); + if (!(flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + ret = bch2_trans_commit_get_rw_cold(trans, flags); if (ret) goto out_reset; } + if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && + mutex_trylock(&c->btree_write_buffer.flush_lock)) { + bch2_trans_begin(trans); + bch2_trans_unlock(trans); + + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + goto out; + } + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); @@ -1030,17 +1118,19 @@ int __bch2_trans_commit(struct btree_trans *trans) trans->journal_u64s = trans->extra_journal_entries.nr; trans->journal_preres_u64s = 0; - /* For journalling transaction name: */ - trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); trans_for_each_update(trans, i) { - BUG_ON(!i->path->should_be_locked); + EBUG_ON(!i->path->should_be_locked); ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); if (unlikely(ret)) goto out; - BUG_ON(!btree_node_intent_locked(i->path, i->level)); + EBUG_ON(!btree_node_intent_locked(i->path, i->level)); if (i->key_cache_already_flushed) continue; @@ -1048,27 +1138,35 @@ int __bch2_trans_commit(struct btree_trans *trans) /* we're going to journal the key being updated: */ u64s = jset_u64s(i->k->k.u64s); if (i->cached && - likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) + likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) trans->journal_preres_u64s += u64s; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + trans->journal_u64s += u64s; /* and we're also going to log the overwrite: */ - trans->journal_u64s += jset_u64s(i->old_k.u64s); + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(i->old_k.u64s); } + trans_for_each_wb_update(trans, wb) + trans->journal_u64s += jset_u64s(wb->k.k.u64s); + if (trans->extra_journal_res) { ret = bch2_disk_reservation_add(c, trans->disk_res, trans->extra_journal_res, - (trans->flags & BTREE_INSERT_NOFAIL) + (flags & BTREE_INSERT_NOFAIL) ? 
BCH_DISK_RESERVATION_NOFAIL : 0); if (ret) goto err; } retry: - BUG_ON(trans->restarted); + bch2_trans_verify_not_in_restart(trans); memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = do_bch2_trans_commit(trans, &i, _RET_IP_); + ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); /* make sure we didn't drop or screw up locks: */ bch2_trans_verify_locks(trans); @@ -1080,22 +1178,14 @@ retry: out: bch2_journal_preres_put(&c->journal, &trans->journal_preres); - if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) - percpu_ref_put(&c->writes); + if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + bch2_write_ref_put(c, BCH_WRITE_REF_trans); out_reset: bch2_trans_reset_updates(trans); - if (trans->fs_usage_deltas) { - trans->fs_usage_deltas->used = 0; - memset((void *) trans->fs_usage_deltas + - offsetof(struct replicas_delta_list, memset_start), 0, - (void *) &trans->fs_usage_deltas->memset_end - - (void *) &trans->fs_usage_deltas->memset_start); - } - return ret; err: - ret = bch2_trans_commit_error(trans, i, ret, _RET_IP_); + ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); if (ret) goto out; @@ -1123,7 +1213,7 @@ static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, if (!k.k) break; - if (bkey_cmp(pos, k.k->p)) + if (!bkey_eq(pos, k.k->p)) break; if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { @@ -1141,181 +1231,298 @@ static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, struct bpos pos) { if (!btree_type_has_snapshots(id) || - pos.snapshot == U32_MAX || - !snapshot_t(trans->c, pos.snapshot)->children[0]) + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) return 0; return __check_pos_snapshot_overwritten(trans, id, pos); } -int bch2_trans_update_extent(struct btree_trans *trans, - struct btree_iter *orig_iter, - struct bkey_i *insert, - enum btree_update_flags flags) +static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bkey_i **insert, + enum btree_update_flags flags) { struct bch_fs *c = trans->c; - struct btree_iter iter, update_iter; - struct bpos start = bkey_start_pos(&insert->k); struct bkey_i *update; - struct bkey_s_c k; - enum btree_id btree_id = orig_iter->btree_id; - int ret = 0, compressed_sectors; + int ret; - bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES| - BTREE_ITER_NOT_EXTENTS); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); - if ((ret = bkey_err(k))) - goto err; - if (!k.k) - goto out; + update = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; - if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { - /* - * We can't merge extents if they belong to interior snapshot - * tree nodes, and there's a snapshot in which one extent is - * visible and the other is not - i.e. if visibility is - * different. 
- * - * Instead of checking if visibilitiy of the two extents is - * different, for now we just check if either has been - * overwritten: - */ - ret = check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); - if (ret < 0) - goto err; - if (ret) - goto nomerge1; + if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) + return 0; - ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); - if (ret < 0) - goto err; - if (ret) - goto nomerge1; + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); + if (ret < 0) + return ret; + if (ret) + return 0; - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + ret = bch2_btree_delete_at(trans, iter, flags); + if (ret) + return ret; - bkey_reassemble(update, k); + *insert = update; + return 0; +} - if (bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(insert))) { - ret = bch2_btree_delete_at(trans, &iter, flags); - if (ret) - goto err; +static noinline int extent_back_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret; - insert = update; - goto next; - } - } -nomerge1: - ret = 0; - if (!bkey_cmp(k.k->p, start)) - goto next; + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); + if (ret < 0) + return ret; + if (ret) + return 0; - while (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) > 0) { - bool front_split = bkey_cmp(bkey_start_pos(k.k), start) < 0; - bool back_split = bkey_cmp(k.k->p, insert->k.p) > 0; + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + return 0; +} - /* - * If we're going to be splitting a compressed extent, note it - * so that __bch2_trans_commit() can increase our disk - * reservation: - */ - if (((front_split && back_split) || - ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && - (compressed_sectors = bch2_bkey_sectors_compressed(k))) - trans->extra_journal_res += compressed_sectors; - - if (front_split) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; - bkey_reassemble(update, k); + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; - bch2_cut_back(start, update); + pos.snapshot++; - bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags); - bch2_trans_iter_exit(trans, &update_iter); + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; - if (ret) - goto err; + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + break; } + } + bch2_trans_iter_exit(trans, &iter); - if (k.k->p.snapshot != insert->k.p.snapshot && - (front_split || back_split)) { - update = 
bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + return ret; +} - bkey_reassemble(update, k); +int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter old_iter, new_iter = { NULL }; + struct bkey_s_c old_k, new_k; + snapshot_id_list s; + struct bkey_i *update; + int ret; - bch2_cut_front(start, update); - bch2_cut_back(insert->k.p, update); + if (!bch2_snapshot_has_children(c, old_pos.snapshot)) + return 0; - bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags); - bch2_trans_iter_exit(trans, &update_iter); - if (ret) - goto err; - } + darray_init(&s); - if (bkey_cmp(k.k->p, insert->k.p) <= 0) { - update = bch2_trans_kmalloc(trans, sizeof(*update)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + bch2_trans_iter_init(trans, &old_iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + !(ret = bkey_err(old_k)) && + bkey_eq(old_pos, old_k.k->p)) { + struct bpos whiteout_pos = + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + + if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || + snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) + continue; + + new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bkey_err(new_k); + if (ret) + break; + + if (new_k.k->type == KEY_TYPE_deleted) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; bkey_init(&update->k); - update->k.p = k.k->p; + update->k.p = whiteout_pos; + update->k.type = KEY_TYPE_whiteout; - if (insert->k.p.snapshot != k.k->p.snapshot) { - update->k.p.snapshot = insert->k.p.snapshot; - update->k.type = KEY_TYPE_whiteout; - } + ret = bch2_trans_update(trans, &new_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } + bch2_trans_iter_exit(trans, &new_iter); + + ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &new_iter); + bch2_trans_iter_exit(trans, &old_iter); + darray_exit(&s); + + return ret; +} + +int bch2_trans_update_extent_overwrite(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_update_flags flags, + struct bkey_s_c old, + struct bkey_s_c new) +{ + enum btree_id btree_id = iter->btree_id; + struct bkey_i *update; + struct bpos new_start = bkey_start_pos(new.k); + bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); + bool back_split = bkey_gt(old.k->p, new.k->p); + int ret = 0, compressed_sectors; + + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(old))) + trans->extra_journal_res += compressed_sectors; + + if (front_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_back(new_start, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, 
update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + /* If we're overwriting in a different snapshot - middle split: */ + if (old.k->p.snapshot != new.k->p.snapshot && + (front_split || back_split)) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new_start, update); + bch2_cut_back(new.k->p, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + old.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } + + if (bkey_le(old.k->p, new.k->p)) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; - bch2_trans_iter_init(trans, &update_iter, btree_id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags); - bch2_trans_iter_exit(trans, &update_iter); + bkey_init(&update->k); + update->k.p = old.k->p; + update->k.p.snapshot = new.k->p.snapshot; + if (new.k->p.snapshot != old.k->p.snapshot) { + update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + return ret; if (ret) - goto err; + update->k.type = KEY_TYPE_whiteout; } - if (back_split) { - update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - if ((ret = PTR_ERR_OR_ZERO(update))) - goto err; + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + return ret; + } - bkey_reassemble(update, k); - bch2_cut_front(insert->k.p, update); + if (back_split) { + update = bch2_bkey_make_mut_noupdate(trans, old); + if ((ret = PTR_ERR_OR_ZERO(update))) + return ret; + + bch2_cut_front(new.k->p, update); + + ret = bch2_trans_update_by_path(trans, iter->path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags, _RET_IP_); + if (ret) + return ret; + } + + return 0; +} - ret = bch2_trans_update_by_path(trans, iter.path, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| - flags); +static int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + + if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + ret = extent_front_merge(trans, &iter, k, &insert, flags); if (ret) goto err; - goto out; } + + goto next; + } + + while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { + bool done = bkey_lt(insert->k.p, k.k->p); + + ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); + if (ret) + goto err; + + if (done) + goto out; next: bch2_btree_iter_advance(&iter); k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); @@ -1326,80 +1533,19 @@ next: } if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { - ret = 
check_pos_snapshot_overwritten(trans, btree_id, insert->k.p); - if (ret < 0) - goto err; + ret = extent_back_merge(trans, &iter, insert, k); if (ret) - goto nomerge2; - - ret = check_pos_snapshot_overwritten(trans, btree_id, k.k->p); - if (ret < 0) goto err; - if (ret) - goto nomerge2; - - bch2_bkey_merge(c, bkey_i_to_s(insert), k); } -nomerge2: - ret = 0; out: - if (!bkey_deleted(&insert->k)) { - /* - * Rewinding iterators is expensive: get a new one and the one - * that points to the start of insert will be cloned from: - */ - bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, insert, flags); - } + if (!bkey_deleted(&insert->k)) + ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); err: bch2_trans_iter_exit(trans, &iter); return ret; } -/* - * When deleting, check if we need to emit a whiteout (because we're overwriting - * something in an ancestor snapshot) - */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) -{ - struct btree_iter iter; - struct bkey_s_c k; - u32 snapshot = pos.snapshot; - int ret; - - if (!bch2_snapshot_parent(trans->c, pos.snapshot)) - return 0; - - pos.snapshot++; - - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { - if (bkey_cmp(k.k->p, pos)) - break; - - if (bch2_snapshot_is_ancestor(trans->c, snapshot, - k.k->p.snapshot)) { - ret = !bkey_whiteout(k.k); - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static int __must_check -bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags, - unsigned long ip); - static noinline int flush_new_cached_update(struct btree_trans *trans, struct btree_path *path, struct btree_insert_entry *i, @@ -1407,37 +1553,56 @@ static noinline int flush_new_cached_update(struct btree_trans *trans, unsigned long ip) { struct btree_path *btree_path; + struct bkey k; int ret; - i->key_cache_already_flushed = true; - i->flags |= BTREE_TRIGGER_NORUN; - btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, BTREE_ITER_INTENT, _THIS_IP_); - ret = bch2_btree_path_traverse(trans, btree_path, 0); if (ret) - goto err; + goto out; + + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key. 
+ */ + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; btree_path_set_should_be_locked(btree_path); - ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip); -err: + ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); +out: bch2_path_put(trans, btree_path, true); return ret; } static int __must_check -bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags, - unsigned long ip) +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) { struct bch_fs *c = trans->c; struct btree_insert_entry *i, n; + u64 seq = 0; + int cmp; - BUG_ON(!path->should_be_locked); + EBUG_ON(!path->should_be_locked); + EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(!bpos_eq(k->k.p, path->pos)); - BUG_ON(trans->nr_updates >= BTREE_ITER_MAX); - BUG_ON(bpos_cmp(k->k.p, path->pos)); + /* + * The transaction journal res hasn't been allocated at this point. + * That occurs at commit time. Reuse the seq field to pass in the seq + * of a prejournaled key. + */ + if (flags & BTREE_UPDATE_PREJOURNAL) + seq = trans->journal_res.seq; n = (struct btree_insert_entry) { .flags = flags, @@ -1447,6 +1612,7 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa .cached = path->cached, .path = path, .k = k, + .seq = seq, .ip_allocated = ip, }; @@ -1460,28 +1626,30 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa * Pending updates are kept sorted: first, find position of new update, * then delete/trim any updates the new update overwrites: */ - trans_for_each_update(trans, i) - if (btree_insert_entry_cmp(&n, i) <= 0) + trans_for_each_update(trans, i) { + cmp = btree_insert_entry_cmp(&n, i); + if (cmp <= 0) break; + } - if (i < trans->updates + trans->nr_updates && - !btree_insert_entry_cmp(&n, i)) { - BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + if (!cmp && i < trans->updates + trans->nr_updates) { + EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); bch2_path_put(trans, i->path, true); i->flags = n.flags; i->cached = n.cached; i->k = n.k; i->path = n.path; + i->seq = n.seq; i->ip_allocated = n.ip_allocated; } else { array_insert_item(trans->updates, trans->nr_updates, i - trans->updates, n); - i->old_v = bch2_btree_path_peek_slot(path, &i->old_k).v; + i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; i->old_btree_u64s = !bkey_deleted(&i->old_k) ? 
i->old_k.u64s : 0; - if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) { + if (unlikely(trans->journal_replay_not_finished)) { struct bkey_i *j_k = bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); @@ -1501,19 +1669,12 @@ bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *pa * the key cache - but the key has to exist in the btree for that to * work: */ - if (unlikely(path->cached && bkey_deleted(&i->old_k))) + if (path->cached && bkey_deleted(&i->old_k)) return flush_new_cached_update(trans, path, i, flags, ip); return 0; } -static int __must_check -bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, - struct bkey_i *k, enum btree_update_flags flags) -{ - return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_); -} - int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { @@ -1544,7 +1705,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter btree_id_cached(trans->c, path->btree_id)) { if (!iter->key_cache_path || !iter->key_cache_path->should_be_locked || - bpos_cmp(iter->key_cache_path->pos, k->k.p)) { + !bpos_eq(iter->key_cache_path->pos, k->k.p)) { if (!iter->key_cache_path) iter->key_cache_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, @@ -1574,7 +1735,103 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter path = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path, k, flags); + return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); +} + +/* + * Add a transaction update for a key that has already been journaled. + */ +int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, + struct btree_iter *iter, struct bkey_i *k, + enum btree_update_flags flags) +{ + trans->journal_res.seq = seq; + return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| + BTREE_UPDATE_PREJOURNAL); +} + +int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct btree_write_buffered_key *i; + int ret; + + EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + trans_for_each_wb_update(trans, i) { + if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { + bkey_copy(&i->k, k); + return 0; + } + } + + if (!trans->wb_updates || + trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_write_buffered_key *u; + + if (trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_transaction_stats *s = btree_trans_stats(trans); + + BUG_ON(trans->wb_updates_size > U8_MAX / 2); + trans->wb_updates_size = max(1, trans->wb_updates_size * 2); + if (s) + s->wb_updates_size = trans->wb_updates_size; + } + + u = bch2_trans_kmalloc_nomemzero(trans, + trans->wb_updates_size * + sizeof(struct btree_write_buffered_key)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + if (trans->nr_wb_updates) + memcpy(u, trans->wb_updates, trans->nr_wb_updates * + sizeof(struct btree_write_buffered_key)); + trans->wb_updates = u; + } + + trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { + .btree = btree, + }; + + bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); + trans->nr_wb_updates++; + + return 0; +} + +int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos end) +{ + struct bkey_s_c k; + int ret = 
0; + + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + k = bch2_btree_iter_prev(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_btree_iter_advance(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + BUG_ON(k.k->type != KEY_TYPE_deleted); + + if (bkey_gt(k.k->p, end)) { + ret = -BCH_ERR_ENOSPC_btree_slot; + goto err; + } + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; } void bch2_trans_commit_hook(struct btree_trans *trans, @@ -1584,16 +1841,33 @@ void bch2_trans_commit_hook(struct btree_trans *trans, trans->hooks = h; } -int __bch2_btree_insert(struct btree_trans *trans, - enum btree_id id, struct bkey_i *k) +int bch2_btree_insert_nonextent(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k, + enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, k->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) { struct btree_iter iter; int ret; bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_CACHED| BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, 0); + bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1611,7 +1885,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, u64 *journal_seq, int flags) { return bch2_trans_do(c, disk_res, journal_seq, flags, - __bch2_btree_insert(&trans, id, k)); + __bch2_btree_insert(&trans, id, k, 0)); } int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, @@ -1635,6 +1909,20 @@ int bch2_btree_delete_at(struct btree_trans *trans, return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); } +int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = pos; + return bch2_trans_update_buffered(trans, btree, k); +} + int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, struct bpos start, struct bpos end, unsigned update_flags, @@ -1646,7 +1934,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); - while ((k = bch2_btree_iter_peek(&iter)).k) { + while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; @@ -1655,9 +1943,6 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, if (ret) goto err; - if (bkey_cmp(iter.pos, end) >= 0) - break; - bkey_init(&delete.k); /* @@ -1676,18 +1961,10 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, */ delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) { - unsigned max_sectors = - KEY_SIZE_MAX & (~0 << trans->c->block_bits); - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); - - ret = bch2_extent_trim_atomic(trans, &iter, &delete); - if (ret) - goto err; - } + if (iter.flags & 
BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: bch2_trans_commit(trans, &disk_res, journal_seq, @@ -1732,18 +2009,43 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, return ret; } -int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) { - unsigned len = strlen(msg); - unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); + struct bkey_i *k; + int ret = 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + return bch2_trans_update_buffered(trans, btree, k); +} + +static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; struct jset_entry_log *l; + unsigned u64s; int ret; - ret = darray_make_room(&trans->extra_journal_entries, jset_u64s(u64s)); + prt_vprintf(&buf, fmt, args); + ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) - return ret; + goto err; - l = (void *) &darray_top(trans->extra_journal_entries); + u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + + ret = darray_make_room(entries, jset_u64s(u64s)); + if (ret) + goto err; + + l = (void *) &darray_top(*entries); l->entry.u64s = cpu_to_le16(u64s); l->entry.btree_id = 0; l->entry.level = 1; @@ -1751,10 +2053,55 @@ int bch2_trans_log_msg(struct btree_trans *trans, const char *msg) l->entry.pad[0] = 0; l->entry.pad[1] = 0; l->entry.pad[2] = 0; - memcpy(l->d, msg, len); - while (len & 7) - l->d[len++] = '\0'; + memcpy(l->d, buf.buf, buf.pos); + while (buf.pos & 7) + l->d[buf.pos++] = '\0'; - trans->extra_journal_entries.nr += jset_u64s(u64s); - return 0; + entries->nr += jset_u64s(u64s); +err: + printbuf_exit(&buf); + return ret; +} + +static int +__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) +{ + int ret; + + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + } else { + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW|commit_flags, + __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); + } + + return ret; +} + +int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, 0, fmt, args); + va_end(args); + return ret; +} + +/* + * Use for logging messages during recovery to enable reserved space and avoid + * blocking. + */ +int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
+{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); + va_end(args); + return ret; } diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c new file mode 100644 index 0000000..5f96db5 --- /dev/null +++ b/libbcachefs/btree_write_buffer.c @@ -0,0 +1,372 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" + +#include + +static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->btree, r->btree) ?: + bpos_cmp(l->k.k.p, r->k.k.p) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + +static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_write_buffered_key *wb, + unsigned commit_flags, + bool *write_locked, + size_t *fast) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + path = iter->path; + + if (!*write_locked) { + ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); + if (ret) + return ret; + + bch2_btree_node_prep_for_write(trans, path, path->l[0].b); + *write_locked = true; + } + + if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + goto trans_commit; + } + + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); + (*fast)++; + + if (path->ref > 1) { + /* + * We can't clone a path that has write locks: if the path is + * shared, unlock before set_pos(), traverse(): + */ + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + } + return 0; +trans_commit: + return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RECLAIM); +} + +static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) +{ + union btree_write_buffer_state old, new; + u64 v = READ_ONCE(wb->state.v); + + do { + old.v = new.v = v; + + new.nr = 0; + new.idx++; + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) + cpu_relax(); + + smp_mb(); + + return old; +} + +/* + * Update a btree with a write buffered key using the journal seq of the + * original write buffer insert. + * + * It is not safe to rejournal the key once it has been inserted into the write + * buffer because that may break recovery ordering. For example, the key may + * have already been modified in the active write buffer in a seq that comes + * before the current transaction. If we were to journal this key again and + * crash, recovery would process updates in the wrong order. 
+ */ +static int +btree_write_buffered_insert(struct btree_trans *trans, + struct btree_write_buffered_key *wb) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), + BTREE_ITER_CACHED|BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, + bool locked) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct journal_entry_pin pin; + struct btree_write_buffered_key *i, *keys; + struct btree_iter iter = { NULL }; + size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; + bool write_locked = false; + union btree_write_buffer_state s; + int ret = 0; + + memset(&pin, 0, sizeof(pin)); + + if (!locked && !mutex_trylock(&wb->flush_lock)) + return 0; + + bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); + bch2_journal_pin_drop(j, &wb->journal_pin); + + s = btree_write_buffer_switch(wb); + keys = wb->keys[s.idx]; + nr = s.nr; + + if (race_fault()) + goto slowpath; + + /* + * We first sort so that we can detect and skip redundant updates, and + * then we attempt to flush in sorted btree order, as this is most + * efficient. + * + * However, since we're not flushing in the order they appear in the + * journal we won't be able to drop our journal pin until everything is + * flushed - which means this could deadlock the journal if we weren't + * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail + * if it would block taking a journal reservation. + * + * If that happens, simply skip the key so we can optimistically insert + * as many keys as possible in the fast path. + */ + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_key_cmp, NULL); + + for (i = keys; i < keys + nr; i++) { + if (i + 1 < keys + nr && + i[0].btree == i[1].btree && + bpos_eq(i[0].k.k.p, i[1].k.k.p)) { + skipped++; + i->journal_seq = 0; + continue; + } + + if (write_locked && + (iter.path->btree_id != i->btree || + bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { + bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + write_locked = false; + } + + if (!iter.path || iter.path->btree_id != i->btree) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT); + } + + bch2_btree_iter_set_pos(&iter, i->k.k.p); + iter.path->preserve = false; + + do { + ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, + commit_flags, &write_locked, &fast); + if (!write_locked) + bch2_trans_begin(trans); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + slowpath++; + continue; + } + if (ret) + break; + + i->journal_seq = 0; + } + + if (write_locked) + bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + bch2_trans_iter_exit(trans, &iter); + + trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); + + if (slowpath) + goto slowpath; + + bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); +out: + bch2_journal_pin_drop(j, &pin); + mutex_unlock(&wb->flush_lock); + return ret; +slowpath: + trace_write_buffer_flush_slowpath(trans, i - keys, nr); + + /* + * Now sort the rest by journal seq and bump the journal pin as we go. 
+ * The slowpath zapped the seq of keys that were successfully flushed so + * we can skip those here. + */ + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_journal_cmp, + NULL); + + commit_flags &= ~BCH_WATERMARK_MASK; + commit_flags |= BCH_WATERMARK_reclaim; + + for (i = keys; i < keys + nr; i++) { + if (!i->journal_seq) + continue; + + if (i->journal_seq > pin.seq) { + struct journal_entry_pin pin2; + + memset(&pin2, 0, sizeof(pin2)); + + bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL); + bch2_journal_pin_drop(j, &pin); + bch2_journal_pin_copy(j, &pin, &pin2, NULL); + bch2_journal_pin_drop(j, &pin2); + } + + ret = commit_do(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RECLAIM, + btree_write_buffered_insert(trans, i)); + if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) + break; + } + + goto out; +} + +int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +{ + bch2_trans_unlock(trans); + mutex_lock(&trans->c->btree_write_buffer.flush_lock); + return __bch2_btree_write_buffer_flush(trans, 0, true); +} + +int bch2_btree_write_buffer_flush(struct btree_trans *trans) +{ + return __bch2_btree_write_buffer_flush(trans, 0, false); +} + +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_lock(&wb->flush_lock); + + return bch2_trans_run(c, + __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true)); +} + +static inline u64 btree_write_buffer_ref(int idx) +{ + return ((union btree_write_buffer_state) { + .ref0 = idx == 0, + .ref1 = idx == 1, + }).v; +} + +int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key *i; + union btree_write_buffer_state old, new; + int ret = 0; + u64 v; + + trans_for_each_wb_update(trans, i) { + EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + i->journal_seq = trans->journal_res.seq; + i->journal_offset = trans->journal_res.offset; + } + + preempt_disable(); + v = READ_ONCE(wb->state.v); + do { + old.v = new.v = v; + + new.v += btree_write_buffer_ref(new.idx); + new.nr += trans->nr_wb_updates; + if (new.nr > wb->size) { + ret = -BCH_ERR_btree_insert_need_flush_buffer; + goto out; + } + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + memcpy(wb->keys[new.idx] + old.nr, + trans->wb_updates, + sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_btree_write_buffer_journal_flush); + + atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); +out: + preempt_enable(); + return ret; +} + +void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + + kvfree(wb->keys[1]); + kvfree(wb->keys[0]); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_init(&wb->flush_lock); + wb->size = c->opts.btree_write_buffer_size; + + wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); + wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); + if (!wb->keys[0] || 
!wb->keys[1]) + return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; + + return 0; +} diff --git a/libbcachefs/btree_write_buffer.h b/libbcachefs/btree_write_buffer.h new file mode 100644 index 0000000..322df1c --- /dev/null +++ b/libbcachefs/btree_write_buffer.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H +#define _BCACHEFS_BTREE_WRITE_BUFFER_H + +int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool); +int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +int bch2_btree_write_buffer_flush(struct btree_trans *); + +int bch2_btree_insert_keys_write_buffer(struct btree_trans *); + +void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +int bch2_fs_btree_write_buffer_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h new file mode 100644 index 0000000..99993ba --- /dev/null +++ b/libbcachefs/btree_write_buffer_types.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H + +#include "journal_types.h" + +#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 +#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) + +struct btree_write_buffered_key { + u64 journal_seq; + unsigned journal_offset; + enum btree_id btree; + __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); +}; + +union btree_write_buffer_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 nr:23; + u64 idx:1; + u64 ref0:20; + u64 ref1:20; + }; +}; + +struct btree_write_buffer { + struct mutex flush_lock; + struct journal_entry_pin journal_pin; + + union btree_write_buffer_state state; + size_t size; + + struct btree_write_buffered_key *keys[2]; +}; + +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index bf01837..7bb7f0c 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -21,9 +21,9 @@ #include "reflink.h" #include "replicas.h" #include "subvolume.h" +#include "trace.h" #include -#include static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, enum bch_data_type data_type, @@ -102,18 +102,6 @@ void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) } while (read_seqcount_retry(&c->usage_lock, seq)); } -static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, - unsigned journal_seq, - bool gc) -{ - percpu_rwsem_assert_held(&c->mark_lock); - BUG_ON(!gc && !journal_seq); - - return this_cpu_ptr(gc - ? 
c->usage_gc - : c->usage[journal_seq & JOURNAL_BUF_MASK]); -} - u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) { ssize_t offset = v - (u64 *) c->usage_base; @@ -137,25 +125,32 @@ u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) { struct bch_fs_usage_online *ret; - unsigned seq, i, u64s; + unsigned nr_replicas = READ_ONCE(c->replicas.nr); + unsigned seq, i; +retry: + ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); + if (unlikely(!ret)) + return NULL; percpu_down_read(&c->mark_lock); - ret = kmalloc(sizeof(struct bch_fs_usage_online) + - sizeof(u64) * c->replicas.nr, GFP_NOFS); - if (unlikely(!ret)) { + if (nr_replicas != c->replicas.nr) { + nr_replicas = c->replicas.nr; percpu_up_read(&c->mark_lock); - return NULL; + kfree(ret); + goto retry; } ret->online_reserved = percpu_u64_get(c->online_reserved); - u64s = fs_usage_u64s(c); do { seq = read_seqcount_begin(&c->usage_lock); - memcpy(&ret->u, c->usage_base, u64s * sizeof(u64)); + unsafe_memcpy(&ret->u, c->usage_base, + __fs_usage_u64s(nr_replicas) * sizeof(u64), + "embedded variable length struct"); for (i = 0; i < ARRAY_SIZE(c->usage); i++) - acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s); + acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], + __fs_usage_u64s(nr_replicas)); } while (read_seqcount_retry(&c->usage_lock, seq)); return ret; @@ -367,7 +362,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, struct bch_replicas_entry *r, s64 sectors, unsigned journal_seq, bool gc) { - struct bch_fs_usage __percpu *fs_usage; + struct bch_fs_usage *fs_usage; int idx, ret = 0; struct printbuf buf = PRINTBUF; @@ -416,8 +411,8 @@ static inline int update_cached_sectors(struct bch_fs *c, return update_replicas(c, k, &r.e, sectors, journal_seq, gc); } -static struct replicas_delta_list * -replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, + gfp_t gfp) { struct replicas_delta_list *d = trans->fs_usage_deltas; unsigned new_size = d ? 
(d->size + more) * 2 : 128; @@ -426,12 +421,16 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more) WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); if (!d || d->used + more > d->size) { - d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO); + d = krealloc(d, alloc_size, gfp|__GFP_ZERO); + + if (unlikely(!d)) { + if (alloc_size > REPLICAS_DELTA_LIST_MAX) + return -ENOMEM; - BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX); + d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); + if (!d) + return -ENOMEM; - if (!d) { - d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO); memset(d, 0, REPLICAS_DELTA_LIST_MAX); if (trans->fs_usage_deltas) @@ -445,49 +444,64 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more) d->size = new_size; trans->fs_usage_deltas = d; } - return d; + + return 0; } -static inline void update_replicas_list(struct btree_trans *trans, +int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +{ + return allocate_dropping_locks_errcode(trans, + __replicas_deltas_realloc(trans, more, _gfp)); +} + +static inline int update_replicas_list(struct btree_trans *trans, struct bch_replicas_entry *r, s64 sectors) { struct replicas_delta_list *d; struct replicas_delta *n; unsigned b; + int ret; if (!sectors) - return; + return 0; b = replicas_entry_bytes(r) + 8; - d = replicas_deltas_realloc(trans, b); + ret = bch2_replicas_deltas_realloc(trans, b); + if (ret) + return ret; + d = trans->fs_usage_deltas; n = (void *) d->d + d->used; n->delta = sectors; memcpy((void *) n + offsetof(struct replicas_delta, r), r, replicas_entry_bytes(r)); bch2_replicas_entry_sort(&n->r); d->used += b; + return 0; } -static inline void update_cached_sectors_list(struct btree_trans *trans, +static inline int update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors) { struct bch_replicas_padded r; bch2_replicas_entry_cached(&r.e, dev); - update_replicas_list(trans, &r.e, sectors); + return update_replicas_list(trans, &r.e, sectors); } int bch2_mark_alloc(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; u64 journal_seq = trans->journal_res.seq; + u64 bucket_journal_seq; struct bch_fs *c = trans->c; - struct bch_alloc_v4 old_a, new_a; + struct bch_alloc_v4 old_a_convert, new_a_convert; + const struct bch_alloc_v4 *old_a, *new_a; struct bch_dev *ca; int ret = 0; @@ -504,36 +518,38 @@ int bch2_mark_alloc(struct btree_trans *trans, ca = bch_dev_bkey_exists(c, new.k->p.inode); - bch2_alloc_to_v4(old, &old_a); - bch2_alloc_to_v4(new, &new_a); + old_a = bch2_alloc_to_v4(old, &old_a_convert); + new_a = bch2_alloc_to_v4(new, &new_a_convert); + + bucket_journal_seq = new_a->journal_seq; if ((flags & BTREE_TRIGGER_INSERT) && - data_type_is_empty(old_a.data_type) != - data_type_is_empty(new_a.data_type) && + data_type_is_empty(old_a->data_type) != + data_type_is_empty(new_a->data_type) && new.k->type == KEY_TYPE_alloc_v4) { struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; - BUG_ON(!journal_seq); + EBUG_ON(!journal_seq); /* * If the btree updates referring to a bucket weren't flushed * before the bucket became empty again, then the we don't have * to wait on a journal flush before we can reuse the bucket: */ - new_a.journal_seq = data_type_is_empty(new_a.data_type) && + v->journal_seq = bucket_journal_seq = + data_type_is_empty(new_a->data_type) && (journal_seq == v->journal_seq || bch2_journal_noflush_seq(&c->journal, 
v->journal_seq)) ? 0 : journal_seq; - v->journal_seq = new_a.journal_seq; } - if (!data_type_is_empty(old_a.data_type) && - data_type_is_empty(new_a.data_type) && - new_a.journal_seq) { + if (!data_type_is_empty(old_a->data_type) && + data_type_is_empty(new_a->data_type) && + bucket_journal_seq) { ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, c->journal.flushed_seq_ondisk, new.k->p.inode, new.k->p.offset, - new_a.journal_seq); + bucket_journal_seq); if (ret) { bch2_fs_fatal_error(c, "error setting bucket_needs_journal_commit: %i", ret); @@ -542,10 +558,10 @@ int bch2_mark_alloc(struct btree_trans *trans, } percpu_down_read(&c->mark_lock); - if (!gc && new_a.gen != old_a.gen) - *bucket_gen(ca, new.k->p.offset) = new_a.gen; + if (!gc && new_a->gen != old_a->gen) + *bucket_gen(ca, new.k->p.offset) = new_a->gen; - bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); + bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); if (gc) { struct bucket *g = gc_bucket(ca, new.k->p.offset); @@ -553,12 +569,12 @@ int bch2_mark_alloc(struct btree_trans *trans, bucket_lock(g); g->gen_valid = 1; - g->gen = new_a.gen; - g->data_type = new_a.data_type; - g->stripe = new_a.stripe; - g->stripe_redundancy = new_a.stripe_redundancy; - g->dirty_sectors = new_a.dirty_sectors; - g->cached_sectors = new_a.cached_sectors; + g->gen = new_a->gen; + g->data_type = new_a->data_type; + g->stripe = new_a->stripe; + g->stripe_redundancy = new_a->stripe_redundancy; + g->dirty_sectors = new_a->dirty_sectors; + g->cached_sectors = new_a->cached_sectors; bucket_unlock(g); } @@ -570,9 +586,9 @@ int bch2_mark_alloc(struct btree_trans *trans, */ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && - old_a.cached_sectors) { + old_a->cached_sectors) { ret = update_cached_sectors(c, new, ca->dev_idx, - -((s64) old_a.cached_sectors), + -((s64) old_a->cached_sectors), journal_seq, gc); if (ret) { bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", @@ -581,20 +597,20 @@ int bch2_mark_alloc(struct btree_trans *trans, } } - if (new_a.data_type == BCH_DATA_free && - (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + if (new_a->data_type == BCH_DATA_free && + (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) closure_wake_up(&c->freelist_wait); - if (new_a.data_type == BCH_DATA_need_discard && - (!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk)) + if (new_a->data_type == BCH_DATA_need_discard && + (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) bch2_do_discards(c); - if (old_a.data_type != BCH_DATA_cached && - new_a.data_type == BCH_DATA_cached && + if (old_a->data_type != BCH_DATA_cached && + new_a->data_type == BCH_DATA_cached && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_do_invalidates(c); - if (new_a.data_type == BCH_DATA_need_gc_gens) + if (new_a->data_type == BCH_DATA_need_gc_gens) bch2_do_gc_gens(c); return 0; @@ -654,13 +670,14 @@ err: return ret; } -static int check_bucket_ref(struct bch_fs *c, +static int check_bucket_ref(struct btree_trans *trans, struct bkey_s_c k, const struct bch_extent_ptr *ptr, s64 sectors, enum bch_data_type ptr_data_type, u8 b_gen, u8 bucket_data_type, u32 dirty_sectors, u32 cached_sectors) { + struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); u16 bucket_sectors = !ptr->cached @@ -717,7 +734,7 @@ static int check_bucket_ref(struct bch_fs *c, 
if (b_gen != ptr->gen) { ret = 1; - goto err; + goto out; } if (!data_type_is_empty(bucket_data_type) && @@ -747,9 +764,12 @@ static int check_bucket_ref(struct bch_fs *c, ret = -EIO; goto err; } -err: +out: printbuf_exit(&buf); return ret; +err: + bch2_dump_trans_updates(trans); + goto out; } static int mark_stripe_bucket(struct btree_trans *trans, @@ -762,7 +782,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; - enum bch_data_type data_type = parity ? BCH_DATA_parity : 0; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); @@ -791,14 +811,13 @@ static int mark_stripe_bucket(struct btree_trans *trans, bucket_lock(g); old = *g; - ret = check_bucket_ref(c, k, ptr, sectors, data_type, + ret = check_bucket_ref(trans, k, ptr, sectors, data_type, g->gen, g->data_type, g->dirty_sectors, g->cached_sectors); if (ret) goto err; - if (data_type) - g->data_type = data_type; + g->data_type = data_type; g->dirty_sectors += sectors; g->stripe = k.k->p.offset; @@ -823,7 +842,7 @@ static int __mark_pointer(struct btree_trans *trans, u32 *dst_sectors = !ptr->cached ? dirty_sectors : cached_sectors; - int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type, + int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, bucket_gen, *bucket_data_type, *dirty_sectors, *cached_sectors); @@ -837,15 +856,17 @@ static int __mark_pointer(struct btree_trans *trans, } static int bch2_mark_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, - s64 sectors, enum bch_data_type data_type, + s64 sectors, unsigned flags) { u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket old, new, *g; + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); u8 bucket_data_type; int ret = 0; @@ -891,13 +912,13 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", (u64) p.idx); - return -ENOMEM; + return -BCH_ERR_ENOMEM_mark_stripe_ptr; } - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); if (!m || !m->alive) { - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); bch2_inconsistent_error(c); @@ -907,7 +928,7 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, m->block_sectors[p.block] += sectors; r = m->r; - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); r.e.data_type = data_type; update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); @@ -916,6 +937,7 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans, } int bch2_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -948,8 +970,7 @@ int bch2_mark_extent(struct btree_trans *trans, if (flags & BTREE_TRIGGER_OVERWRITE) disk_sectors = -disk_sectors; - ret = bch2_mark_pointer(trans, k, p, disk_sectors, - data_type, flags); + ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); if 
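bch2_mark_pointer() now takes the btree id and level and derives the pointer's data type itself via bkey_ptr_data_type() instead of having callers pass it in. A simplified sketch of what that helper is assumed to compute; the in-tree version may also distinguish erasure-coded pointers, so treat this as an approximation rather than the real definition:

/* simplified assumption: keys above level 0 are btree node pointers, leaf extents are user data */
static enum bch_data_type ptr_data_type_sketch(enum btree_id btree_id, unsigned level)
{
	return level ? BCH_DATA_btree : BCH_DATA_user;
}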
(ret < 0) return ret; @@ -999,6 +1020,7 @@ int bch2_mark_extent(struct btree_trans *trans, } int bch2_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -1018,7 +1040,7 @@ int bch2_mark_stripe(struct btree_trans *trans, if (!gc) { struct stripe *m = genradix_ptr(&c->stripes, idx); - if (!m || (old_s && !m->alive)) { + if (!m) { struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; @@ -1034,13 +1056,10 @@ int bch2_mark_stripe(struct btree_trans *trans, } if (!new_s) { - spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_del(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); memset(m, 0, sizeof(*m)); } else { - m->alive = true; m->sectors = le16_to_cpu(new_s->sectors); m->algorithm = new_s->algorithm; m->nr_blocks = new_s->nr_blocks; @@ -1050,9 +1069,10 @@ int bch2_mark_stripe(struct btree_trans *trans, for (i = 0; i < new_s->nr_blocks; i++) m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_update(c, m, idx); - spin_unlock(&c->ec_stripes_heap_lock); + if (!old_s) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_update(c, m, idx); } } else { struct gc_stripe *m = @@ -1061,7 +1081,7 @@ int bch2_mark_stripe(struct btree_trans *trans, if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return -ENOMEM; + return -BCH_ERR_ENOMEM_mark_stripe; } /* * This will be wrong when we bring back runtime gc: we should @@ -1105,44 +1125,14 @@ int bch2_mark_stripe(struct btree_trans *trans, return 0; } -int bch2_mark_inode(struct btree_trans *trans, - struct bkey_s_c old, struct bkey_s_c new, - unsigned flags) -{ - struct bch_fs *c = trans->c; - struct bch_fs_usage __percpu *fs_usage; - u64 journal_seq = trans->journal_res.seq; - - if (flags & BTREE_TRIGGER_INSERT) { - struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; - - BUG_ON(!journal_seq); - BUG_ON(new.k->type != KEY_TYPE_inode_v3); - - v->bi_journal_seq = cpu_to_le64(journal_seq); - } - - if (flags & BTREE_TRIGGER_GC) { - percpu_down_read(&c->mark_lock); - preempt_disable(); - - fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); - fs_usage->nr_inodes += bkey_is_inode(new.k); - fs_usage->nr_inodes -= bkey_is_inode(old.k); - - preempt_enable(); - percpu_up_read(&c->mark_lock); - } - return 0; -} - int bch2_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { struct bch_fs *c = trans->c; struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; - struct bch_fs_usage __percpu *fs_usage; + struct bch_fs_usage *fs_usage; unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; @@ -1199,23 +1189,31 @@ not_found: " missing range %llu-%llu", (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), *idx, next_idx)) { - struct bkey_i_error new; - - bkey_init(&new.k); - new.k.type = KEY_TYPE_error; - new.k.p = bkey_start_pos(p.k); - new.k.p.offset += *idx - start; - bch2_key_resize(&new.k, next_idx - *idx); - ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new.k_i); + struct bkey_i_error *new; + + new = bch2_trans_kmalloc(trans, sizeof(*new)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = bkey_start_pos(p.k); + new->k.p.offset += *idx - start; + bch2_key_resize(&new->k, next_idx - *idx); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i, + BTREE_TRIGGER_NORUN); } *idx = next_idx; +err: fsck_err: printbuf_exit(&buf); return ret; } int bch2_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { @@ -1230,7 +1228,7 @@ int bch2_mark_reflink_p(struct btree_trans *trans, BUG_ON(!(flags & BTREE_TRIGGER_GC)); - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { idx -= le32_to_cpu(p.v->front_pad); end += le32_to_cpu(p.v->back_pad); } @@ -1254,34 +1252,45 @@ int bch2_mark_reflink_p(struct btree_trans *trans, return ret; } -static noinline __cold -void fs_usage_apply_warn(struct btree_trans *trans, - unsigned disk_res_sectors, - s64 should_not_have_added) +void bch2_trans_fs_usage_revert(struct btree_trans *trans, + struct replicas_delta_list *deltas) { struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct printbuf buf = PRINTBUF; + struct bch_fs_usage *dst; + struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; + s64 added = 0; + unsigned i; - prt_printf(&buf, - bch2_fmt(c, "disk usage increased %lli more than %u sectors reserved)"), - should_not_have_added, disk_res_sectors); + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); - trans_for_each_update(trans, i) { - struct bkey_s_c old = { &i->old_k, i->old_v }; + /* revert changes: */ + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); + } - prt_str(&buf, "new "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - prt_newline(&buf); + dst->nr_inodes -= deltas->nr_inodes; - prt_str(&buf, "old "); - bch2_bkey_val_to_text(&buf, c, old); - prt_newline(&buf); + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added -= deltas->persistent_reserved[i]; + dst->reserved -= deltas->persistent_reserved[i]; + dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; } - __WARN(); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); + if (added > 0) { + trans->disk_res->sectors += added; + this_cpu_add(*c->online_reserved, added); + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); } int bch2_trans_fs_usage_apply(struct btree_trans *trans, @@ -1348,7 +1357,9 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans, percpu_up_read(&c->mark_lock); if (unlikely(warn) && !xchg(&warned_disk_usage, 
1)) - fs_usage_apply_warn(trans, disk_res_sectors, should_not_have_added); + bch2_trans_inconsistent(trans, + "disk usage increased %lli more than %u sectors reserved)", + should_not_have_added, disk_res_sectors); return 0; need_mark: /* revert changes: */ @@ -1362,7 +1373,7 @@ need_mark: /* trans_mark: */ -static int bch2_trans_mark_pointer(struct btree_trans *trans, +static inline int bch2_trans_mark_pointer(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, unsigned flags) @@ -1370,38 +1381,36 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans, bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); struct btree_iter iter; struct bkey_i_alloc_v4 *a; - struct bpos bucket_pos; + struct bpos bucket; struct bch_backpointer bp; s64 sectors; int ret; - bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket_pos, &bp); + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); sectors = bp.bucket_len; if (!insert) sectors = -sectors; - a = bch2_trans_start_alloc_update(trans, &iter, bucket_pos); + a = bch2_trans_start_alloc_update(trans, &iter, bucket); if (IS_ERR(a)) return PTR_ERR(a); ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, a->v.gen, &a->v.data_type, - &a->v.dirty_sectors, &a->v.cached_sectors); + &a->v.dirty_sectors, &a->v.cached_sectors) ?: + bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + if (ret) - goto err; + return ret; if (!p.ptr.cached) { - ret = insert - ? bch2_bucket_backpointer_add(trans, a, bp, k) - : bch2_bucket_backpointer_del(trans, a, bp, k); + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); if (ret) - goto err; + return ret; } - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; + return 0; } static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, @@ -1409,28 +1418,22 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, s64 sectors, enum bch_data_type data_type) { struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_stripe *s; struct bch_replicas_padded r; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, POS(0, p.ec.idx), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_stripe) { - bch2_trans_inconsistent(trans, + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_WITH_UPDATES, stripe); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, "pointer to nonexistent stripe %llu", (u64) p.ec.idx); - ret = -EIO; goto err; } - if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) { + if (!bch2_ptr_matches_stripe(&s->v, p)) { bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); @@ -1438,23 +1441,13 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, goto err; } - s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(s); - if (ret) - goto err; - - bkey_reassemble(&s->k_i, k); stripe_blockcount_set(&s->v, p.ec.block, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - ret = bch2_trans_update(trans, &iter, &s->k_i, 0); - if (ret) - goto err; - bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); r.e.data_type = data_type; - update_replicas_list(trans, &r.e, sectors); + ret = update_replicas_list(trans, &r.e, sectors); 
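The conversion just above replaces the open-coded lookup (bch2_trans_iter_init + bch2_btree_iter_peek_slot + bch2_trans_kmalloc + bkey_reassemble + bch2_trans_update) with a single typed accessor. A condensed sketch of the resulting read-modify-write shape; idx, block and sectors stand in for the values used in the hunk:

	struct btree_iter iter;
	struct bkey_i_stripe *s;
	int ret;

	s = bch2_bkey_get_mut_typed(trans, &iter,
				    BTREE_ID_stripes, POS(0, idx),
				    BTREE_ITER_WITH_UPDATES, stripe);
	ret = PTR_ERR_OR_ZERO(s);
	if (ret)
		goto err;

	/*
	 * The returned key is a transaction-owned mutable copy that is already
	 * queued as an update (note the old explicit bch2_trans_update() call
	 * is gone), so it can simply be modified in place:
	 */
	stripe_blockcount_set(&s->v, block,
			      stripe_blockcount_get(&s->v, block) + sectors);
err:
	bch2_trans_iter_exit(trans, &iter);

Contrast this with __bch2_trans_mark_reflink_p() further down, which uses the _noupdate variant and therefore still calls bch2_trans_update() explicitly.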
err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1481,7 +1474,7 @@ int bch2_trans_mark_extent(struct btree_trans *trans, : k.k->size; s64 dirty_sectors = 0; bool stale; - int ret; + int ret = 0; r.e.data_type = data_type; r.e.nr_devs = 0; @@ -1500,9 +1493,12 @@ int bch2_trans_mark_extent(struct btree_trans *trans, stale = ret > 0; if (p.ptr.cached) { - if (!stale) - update_cached_sectors_list(trans, p.ptr.dev, - disk_sectors); + if (!stale) { + ret = update_cached_sectors_list(trans, p.ptr.dev, + disk_sectors); + if (ret) + return ret; + } } else if (!p.has_ec) { dirty_sectors += disk_sectors; r.e.devs[r.e.nr_devs++] = p.ptr.dev; @@ -1517,9 +1513,9 @@ int bch2_trans_mark_extent(struct btree_trans *trans, } if (r.e.nr_devs) - update_replicas_list(trans, &r.e, dirty_sectors); + ret = update_replicas_list(trans, &r.e, dirty_sectors); - return 0; + return ret; } static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, @@ -1542,7 +1538,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - ret = check_bucket_ref(c, s.s_c, ptr, sectors, data_type, + ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, a->v.gen, a->v.data_type, a->v.dirty_sectors, a->v.cached_sectors); if (ret) @@ -1572,6 +1568,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, a->v.stripe = s.k->p.offset; a->v.stripe_redundancy = s.v->nr_redundant; + a->v.data_type = BCH_DATA_stripe; } else { if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || a->v.stripe_redundancy != s.v->nr_redundant, trans, @@ -1584,6 +1581,7 @@ static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, a->v.stripe = 0; a->v.stripe_redundancy = 0; + a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); } a->v.dirty_sectors += sectors; @@ -1634,14 +1632,18 @@ int bch2_trans_mark_stripe(struct btree_trans *trans, s64 sectors = le16_to_cpu(new_s->sectors); bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); - update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + if (ret) + return ret; } if (old_s) { s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); bch2_bkey_to_replicas(&r.e, old); - update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + if (ret) + return ret; } for (i = 0; i < nr_blocks; i++) { @@ -1669,23 +1671,6 @@ int bch2_trans_mark_stripe(struct btree_trans *trans, return ret; } -int bch2_trans_mark_inode(struct btree_trans *trans, - enum btree_id btree_id, unsigned level, - struct bkey_s_c old, - struct bkey_i *new, - unsigned flags) -{ - int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); - - if (nr) { - struct replicas_delta_list *d = - replicas_deltas_realloc(trans, 0); - d->nr_inodes += nr; - } - - return 0; -} - int bch2_trans_mark_reservation(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, @@ -1698,13 +1683,17 @@ int bch2_trans_mark_reservation(struct btree_trans *trans, unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; s64 sectors = (s64) k.k->size; struct replicas_delta_list *d; + int ret; if (flags & BTREE_TRIGGER_OVERWRITE) sectors = -sectors; sectors *= replicas; - d = replicas_deltas_realloc(trans, 0); + ret = bch2_replicas_deltas_realloc(trans, 0); + if (ret) + return ret; + d = trans->fs_usage_deltas; replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(d->persistent_reserved)); @@ -1718,29 
+1707,20 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i *n; + struct bkey_i *k; __le64 *refcount; int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; struct printbuf buf = PRINTBUF; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(n); + k = bch2_bkey_get_mut_noupdate(trans, &iter, + BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_WITH_UPDATES); + ret = PTR_ERR_OR_ZERO(k); if (ret) goto err; - bkey_reassemble(n, k); - - refcount = bkey_refcount(n); + refcount = bkey_refcount(k); if (!refcount) { bch2_bkey_val_to_text(&buf, c, p.s_c); bch2_trans_inconsistent(trans, @@ -1764,12 +1744,12 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, u64 pad; pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(k.k)); + le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); BUG_ON(pad > U32_MAX); v->front_pad = cpu_to_le32(pad); pad = max_t(s64, le32_to_cpu(v->back_pad), - k.k->p.offset - p.k->size - le64_to_cpu(v->idx)); + k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); BUG_ON(pad > U32_MAX); v->back_pad = cpu_to_le32(pad); } @@ -1777,11 +1757,11 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, le64_add_cpu(refcount, add); bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, n, 0); + ret = bch2_trans_update(trans, &iter, k, 0); if (ret) goto err; - *idx = k.k->p.offset; + *idx = k->k.p.offset; err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); @@ -1837,7 +1817,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, if (IS_ERR(a)) return PTR_ERR(a); - if (a->v.data_type && a->v.data_type != type) { + if (a->v.data_type && type && a->v.data_type != type) { bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", @@ -1943,7 +1923,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) { - return bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + if (ret) + bch_err_fn(c, ret); + return ret; } /* Disk reservations: */ @@ -2027,15 +2010,21 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; unsigned long *buckets_nouse = NULL; bool resize = ca->bucket_gens != NULL; - int ret = -ENOMEM; + int ret; if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO)) || - (c->opts.buckets_nouse && + GFP_KERNEL|__GFP_ZERO))) { + ret = -BCH_ERR_ENOMEM_bucket_gens; + goto err; + } + + if ((c->opts.buckets_nouse && !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) + GFP_KERNEL|__GFP_ZERO)))) { + ret = -BCH_ERR_ENOMEM_buckets_nouse; goto err; + } bucket_gens->first_bucket = ca->mi.first_bucket; bucket_gens->nbuckets = nbuckets; @@ -2105,12 +2094,12 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); if (!ca->usage_base) - return -ENOMEM; + return 
-BCH_ERR_ENOMEM_usage_init; for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { ca->usage[i] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[i]) - return -ENOMEM; + return -BCH_ERR_ENOMEM_usage_init; } return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index b6a1db7..a418f66 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -150,21 +150,27 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) void bch2_dev_usage_init(struct bch_dev *); -static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reserve reserve) +static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { s64 reserved = 0; - switch (reserve) { - case RESERVE_none: + switch (watermark) { + case BCH_WATERMARK_NR: + unreachable(); + case BCH_WATERMARK_stripe: reserved += ca->mi.nbuckets >> 6; fallthrough; - case RESERVE_movinggc: + case BCH_WATERMARK_normal: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case BCH_WATERMARK_copygc: reserved += ca->nr_btree_reserve; fallthrough; - case RESERVE_btree: + case BCH_WATERMARK_btree: reserved += ca->nr_btree_reserve; fallthrough; - case RESERVE_btree_movinggc: + case BCH_WATERMARK_btree_copygc: + case BCH_WATERMARK_reclaim: break; } @@ -173,17 +179,17 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum alloc_reser static inline u64 dev_buckets_free(struct bch_dev *ca, struct bch_dev_usage usage, - enum alloc_reserve reserve) + enum bch_watermark watermark) { return max_t(s64, 0, usage.d[BCH_DATA_free].buckets - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, reserve)); + bch2_dev_buckets_reserved(ca, watermark)); } static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage usage, - enum alloc_reserve reserve) + enum bch_watermark watermark) { return max_t(s64, 0, usage.d[BCH_DATA_free].buckets @@ -191,21 +197,35 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, + usage.d[BCH_DATA_need_gc_gens].buckets + usage.d[BCH_DATA_need_discard].buckets - ca->nr_open_buckets - - bch2_dev_buckets_reserved(ca, reserve)); + - bch2_dev_buckets_reserved(ca, watermark)); } static inline u64 dev_buckets_available(struct bch_dev *ca, - enum alloc_reserve reserve) + enum bch_watermark watermark) { - return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark); } /* Filesystem usage: */ +static inline unsigned __fs_usage_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; +} + static inline unsigned fs_usage_u64s(struct bch_fs *c) { - return sizeof(struct bch_fs_usage) / sizeof(u64) + - READ_ONCE(c->replicas.nr); + return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); +} + +static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas; +} + +static inline unsigned fs_usage_online_u64s(struct bch_fs *c) +{ + return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr)); } static inline unsigned dev_usage_u64s(void) @@ -229,25 +249,43 @@ bch2_fs_usage_read_short(struct bch_fs *); /* key/bucket marking: */ +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) +{ + percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc + ? 
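Because the bch2_dev_buckets_reserved() switch above falls through, each watermark reserves everything the stricter ones below it do. A self-contained sketch of the resulting totals; the enum values and device numbers here (1M buckets, 512 buckets of btree reserve) are made up for illustration, only the fallthrough order matches the hunk:

#include <stdint.h>
#include <stdio.h>

/* local stand-ins for BCH_WATERMARK_*; the real ordering lives in alloc_types.h */
enum wm { WM_RECLAIM, WM_BTREE, WM_COPYGC, WM_NORMAL, WM_STRIPE };

static uint64_t reserved_sketch(int w, uint64_t nbuckets, uint64_t nr_btree_reserve)
{
	uint64_t r = 0;

	switch (w) {
	case WM_STRIPE:	 r += nbuckets >> 6;	/* fall through */
	case WM_NORMAL:	 r += nbuckets >> 6;	/* fall through */
	case WM_COPYGC:	 r += nr_btree_reserve;	/* fall through */
	case WM_BTREE:	 r += nr_btree_reserve;	/* fall through */
	case WM_RECLAIM: break;
	}
	return r;
}

int main(void)
{
	for (int w = WM_STRIPE; w >= WM_RECLAIM; w--)
		printf("watermark %d reserves %llu buckets\n",
		       w, (unsigned long long) reserved_sketch(w, 1 << 20, 512));
	/* stripe: 33792, normal: 17408, copygc: 1024, btree: 512, reclaim: 0 */
	return 0;
}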
c->usage_gc + : c->usage[journal_seq & JOURNAL_BUF_MASK]); +} + +int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); + void bch2_fs_usage_initialize(struct bch_fs *); int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_alloc(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_extent(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_stripe(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_inode(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reservation(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); -int bch2_mark_reflink_p(struct btree_trans *, struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 1dbba7d..2a9dab9 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -89,15 +89,4 @@ struct disk_reservation { unsigned nr_replicas; }; -struct copygc_heap_entry { - u8 dev; - u8 gen; - u8 replicas; - u32 fragmentation; - u32 sectors; - u64 bucket; -}; - -typedef HEAP(struct copygc_heap_entry) copygc_heap; - #endif /* _BUCKETS_TYPES_H */ diff --git a/libbcachefs/buckets_waiting_for_journal.c b/libbcachefs/buckets_waiting_for_journal.c index 0f4ef9e..81ab685 100644 --- a/libbcachefs/buckets_waiting_for_journal.c +++ b/libbcachefs/buckets_waiting_for_journal.c @@ -2,28 +2,24 @@ #include "bcachefs.h" #include "buckets_waiting_for_journal.h" +#include #include -#include static inline struct bucket_hashed * bucket_hash(struct buckets_waiting_for_journal_table *t, unsigned hash_seed_idx, u64 dev_bucket) { - unsigned h = siphash_1u64(dev_bucket, &t->hash_seeds[hash_seed_idx]); - - EBUG_ON(!is_power_of_2(t->size)); - - return t->d + (h & (t->size - 1)); + return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); } -static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t size) +static void bucket_table_init(struct 
buckets_waiting_for_journal_table *t, size_t bits) { unsigned i; - t->size = size; + t->bits = bits; for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); - memset(t->d, 0, sizeof(t->d[0]) * size); + memset(t->d, 0, sizeof(t->d[0]) << t->bits); } bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, @@ -97,7 +93,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, .dev_bucket = (u64) dev << 56 | bucket, .journal_seq = journal_seq, }; - size_t i, new_size, nr_elements = 1, nr_rehashes = 0; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; int ret = 0; mutex_lock(&b->lock); @@ -106,25 +102,26 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, goto out; t = b->t; - for (i = 0; i < t->size; i++) + size = 1UL << t->bits; + for (i = 0; i < size; i++) nr_elements += t->d[i].journal_seq > flushed_seq; - new_size = nr_elements < t->size / 3 ? t->size : t->size * 2; + new_bits = t->bits + (nr_elements * 3 > size); - n = kvmalloc(sizeof(*n) + sizeof(n->d[0]) * new_size, GFP_KERNEL); + n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; goto out; } retry_rehash: nr_rehashes++; - bucket_table_init(n, new_size); + bucket_table_init(n, new_bits); tmp = new; BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); - for (i = 0; i < t->size; i++) { + for (i = 0; i < 1UL << t->bits; i++) { if (t->d[i].journal_seq <= flushed_seq) continue; @@ -137,7 +134,7 @@ retry_rehash: kvfree(t); pr_debug("took %zu rehashes, table at %zu/%zu elements", - nr_rehashes, nr_elements, b->t->size); + nr_rehashes, nr_elements, 1UL << b->t->bits); out: mutex_unlock(&b->lock); @@ -151,7 +148,7 @@ void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) kvfree(b->t); } -#define INITIAL_TABLE_SIZE 8 +#define INITIAL_TABLE_BITS 3 int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) { @@ -159,10 +156,11 @@ int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) mutex_init(&b->lock); - b->t = kvmalloc(sizeof(*b->t) + sizeof(b->t->d[0]) * INITIAL_TABLE_SIZE, GFP_KERNEL); + b->t = kvmalloc(sizeof(*b->t) + + (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); if (!b->t) - return -ENOMEM; + return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init; - bucket_table_init(b->t, INITIAL_TABLE_SIZE); + bucket_table_init(b->t, INITIAL_TABLE_BITS); return 0; } diff --git a/libbcachefs/buckets_waiting_for_journal_types.h b/libbcachefs/buckets_waiting_for_journal_types.h index fea7f94..e593db0 100644 --- a/libbcachefs/buckets_waiting_for_journal_types.h +++ b/libbcachefs/buckets_waiting_for_journal_types.h @@ -10,8 +10,8 @@ struct bucket_hashed { }; struct buckets_waiting_for_journal_table { - size_t size; - siphash_key_t hash_seeds[3]; + unsigned bits; + u64 hash_seeds[3]; struct bucket_hashed d[]; }; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index dbb7e5e..fb603df 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -284,6 +284,8 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, return PTR_ERR(ca); ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); + if (ret) + bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); percpu_ref_put(&ca->ref); return ret; @@ -576,7 +578,7 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, return i; } - return -ENOENT; + return -BCH_ERR_ENOENT_dev_idx_not_found; } static long 
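buckets_waiting_for_journal now keeps a power-of-two table (t->bits) indexed with hash_64() and grows it once live entries exceed a third of the capacity. A self-contained sketch of both calculations; hash_64_sketch() is only a stand-in for the kernel's hash_64():

#include <stdint.h>
#include <stdio.h>

/* stand-in for hash_64(): multiply by the 64-bit golden ratio, keep the top `bits` bits */
static uint64_t hash_64_sketch(uint64_t val, unsigned bits)
{
	return (val * 0x61c8864680b583ebULL) >> (64 - bits);
}

int main(void)
{
	unsigned bits = 3;			/* INITIAL_TABLE_BITS: 8 slots */
	size_t size = 1UL << bits;
	size_t nr_elements = 1 + 3;		/* the new entry plus 3 still-unflushed ones */
	unsigned new_bits = bits + (nr_elements * 3 > size);	/* grow once load factor exceeds 1/3 */

	printf("slot for dev_bucket 42: %llu\n",
	       (unsigned long long) hash_64_sketch(42, bits));
	printf("resize: %zu -> %lu slots\n", size, 1UL << new_bits);	/* 8 -> 16 */
	return 0;
}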
bch2_ioctl_disk_resize(struct bch_fs *c, @@ -631,11 +633,14 @@ do { \ \ if (copy_from_user(&i, arg, sizeof(i))) \ return -EFAULT; \ - return bch2_ioctl_##_name(c, i); \ + ret = bch2_ioctl_##_name(c, i); \ + goto out; \ } while (0) long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) { + long ret; + switch (cmd) { case BCH_IOCTL_QUERY_UUID: return bch2_ioctl_query_uuid(c, arg); @@ -679,6 +684,10 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) default: return -ENOTTY; } +out: + if (ret < 0) + ret = bch2_err_class(ret); + return ret; } static DEFINE_IDR(bch_chardev_minor); @@ -744,7 +753,7 @@ int __init bch2_chardev_init(void) if (bch_chardev_major < 0) return bch_chardev_major; - bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); + bch_chardev_class = class_create("bcachefs"); if (IS_ERR(bch_chardev_class)) return PTR_ERR(bch_chardev_class); diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 43d22fe..a08997a 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -133,7 +133,7 @@ static inline int do_encrypt(struct crypto_sync_skcipher *tfm, sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); if (!sg) - return -ENOMEM; + return -BCH_ERR_ENOMEM_do_encrypt; sg_init_table(sg, pages); @@ -648,7 +648,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); if (!crypt) { - ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ + ret = -BCH_ERR_ENOSPC_sb_crypt; goto err; } @@ -680,8 +680,6 @@ int bch2_fs_encryption_init(struct bch_fs *c) struct bch_key key; int ret = 0; - pr_verbose_init(c->opts, ""); - c->sha256 = crypto_alloc_shash("sha256", 0, 0); ret = PTR_ERR_OR_ZERO(c->sha256); if (ret) { @@ -707,6 +705,5 @@ int bch2_fs_encryption_init(struct bch_fs *c) goto out; out: memzero_explicit(&key, sizeof(key)); - pr_verbose_init(c->opts, "ret %i", ret); return ret; } diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index 409ad53..1ad1d5f 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -120,12 +120,6 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); } -static const unsigned bch2_compression_opt_to_type[] = { -#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, - BCH_COMPRESSION_OPTS() -#undef x -}; - static inline bool bch2_checksum_type_valid(const struct bch_fs *c, unsigned type) { diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index f3ffdbc..f418890 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -122,7 +122,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, } __set_current_state(TASK_RUNNING); - del_singleshot_timer_sync(&wait.cpu_timer); + del_timer_sync(&wait.cpu_timer); destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); } @@ -157,6 +157,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) unsigned long now; unsigned i; + out->atomic++; spin_lock(&clock->timer_lock); now = atomic64_read(&clock->now); @@ -165,6 +166,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); + --out->atomic; } void bch2_io_clock_exit(struct io_clock *clock) @@ -182,10 +184,10 @@ int bch2_io_clock_init(struct io_clock *clock) clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); if (!clock->pcpu_buf) - return -ENOMEM; + 
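As the ioctl dispatch above shows, the bcachefs-private error codes introduced throughout this patch (-BCH_ERR_ENOMEM_*, -BCH_ERR_ENOENT_*, ...) stay inside the filesystem: bch2_err_class() folds them back to their errno parent before anything is returned to userspace, while bch2_err_str() keeps the detailed name for logging. A small fragment of the intended usage, assuming a struct bch_fs *c in scope; the error value is simply the one bch2_ioctl_disk_get_idx() now returns:

	long ret = -BCH_ERR_ENOENT_dev_idx_not_found;

	if (ret < 0) {
		bch_err(c, "disk_get_idx error: %s", bch2_err_str(ret));	/* logs the specific code */
		ret = bch2_err_class(ret);	/* userspace only ever sees plain -ENOENT */
	}
	return ret;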
return -BCH_ERR_ENOMEM_io_clock_init; if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_io_clock_init; return 0; } diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 2b7080b..c9ca7cc 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -28,11 +28,11 @@ static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) BUG_ON(size > c->opts.encoded_extent_max); - b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); + b = kmalloc(size, GFP_NOFS|__GFP_NOWARN); if (b) return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; - b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS); if (b) return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; @@ -94,7 +94,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); pages = nr_pages > ARRAY_SIZE(stack_pages) - ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) + ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) : stack_pages; if (!pages) goto bounce; @@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, .avail_out = dst_len, }; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); @@ -196,7 +196,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, if (real_src_len > src_len - 4) goto err; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); ret = zstd_decompress_dctx(ctx, @@ -240,7 +240,8 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, data = __bounce_alloc(c, dst_len, WRITE); if (__bio_uncompress(c, bio, data.b, *crc)) { - bch_err(c, "error rewriting existing data: decompression error"); + if (!c->opts.no_data_io) + bch_err(c, "error rewriting existing data: decompression error"); bio_unmap_or_unbounce(c, data); return -EIO; } @@ -270,7 +271,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, { struct bbuf dst_data = { NULL }; size_t dst_len = crc.uncompressed_size << 9; - int ret = -ENOMEM; + int ret; if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) @@ -296,21 +297,32 @@ static int attempt_compress(struct bch_fs *c, void *workspace, void *dst, size_t dst_len, void *src, size_t src_len, - enum bch_compression_type compression_type) + struct bch_compression_opt compression) { - switch (compression_type) { - case BCH_COMPRESSION_TYPE_lz4: { - int len = src_len; - int ret = LZ4_compress_destSize( - src, dst, - &len, dst_len, - workspace); + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; - if (len < src_len) - return -len; - - return ret; - } + switch (compression_type) { + case BCH_COMPRESSION_TYPE_lz4: + if (compression.level < LZ4HC_MIN_CLEVEL) { + int len = src_len; + int ret = LZ4_compress_destSize( + src, dst, + &len, dst_len, + workspace); + if (len < src_len) + return -len; + + return ret; + } else { + int ret = LZ4_compress_HC( + src, dst, + src_len, dst_len, + compression.level, + workspace); + + return ret ?: -1; + } case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { .next_in = src, @@ -320,7 +332,11 @@ static int 
attempt_compress(struct bch_fs *c, }; zlib_set_workspace(&strm, workspace); - zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + zlib_deflateInit2(&strm, + compression.level + ? clamp_t(unsigned, compression.level, + Z_BEST_SPEED, Z_BEST_COMPRESSION) + : Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); @@ -333,8 +349,14 @@ static int attempt_compress(struct bch_fs *c, return strm.total_out; } case BCH_COMPRESSION_TYPE_zstd: { + /* + * rescale: + * zstd max compression level is 22, our max level is 15 + */ + unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); + ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); ZSTD_CCtx *ctx = zstd_init_cctx(workspace, - zstd_cctx_workspace_bound(&c->zstd_params.cParams)); + zstd_cctx_workspace_bound(¶ms.cParams)); /* * ZSTD requires that when we decompress we pass in the exact @@ -365,10 +387,12 @@ static int attempt_compress(struct bch_fs *c, static unsigned __bio_compress(struct bch_fs *c, struct bio *dst, size_t *dst_len, struct bio *src, size_t *src_len, - enum bch_compression_type compression_type) + struct bch_compression_opt compression) { struct bbuf src_data = { NULL }, dst_data = { NULL }; void *workspace; + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; unsigned pad; int ret = 0; @@ -382,7 +406,7 @@ static unsigned __bio_compress(struct bch_fs *c, dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); - workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); + workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); *src_len = src->bi_iter.bi_size; *dst_len = dst->bi_iter.bi_size; @@ -400,7 +424,7 @@ static unsigned __bio_compress(struct bch_fs *c, ret = attempt_compress(c, workspace, dst_data.b, *dst_len, src_data.b, *src_len, - compression_type); + compression); if (ret > 0) { *dst_len = ret; ret = 0; @@ -447,22 +471,24 @@ static unsigned __bio_compress(struct bch_fs *c, BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); BUG_ON(*dst_len & (block_bytes(c) - 1)); BUG_ON(*src_len & (block_bytes(c) - 1)); + ret = compression_type; out: bio_unmap_or_unbounce(c, src_data); bio_unmap_or_unbounce(c, dst_data); - return compression_type; + return ret; err: - compression_type = BCH_COMPRESSION_TYPE_incompressible; + ret = BCH_COMPRESSION_TYPE_incompressible; goto out; } unsigned bch2_bio_compress(struct bch_fs *c, struct bio *dst, size_t *dst_len, struct bio *src, size_t *src_len, - unsigned compression_type) + unsigned compression_opt) { unsigned orig_dst = dst->bi_iter.bi_size; unsigned orig_src = src->bi_iter.bi_size; + unsigned compression_type; /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, @@ -470,11 +496,9 @@ unsigned bch2_bio_compress(struct bch_fs *c, /* Don't generate a bigger output than input: */ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) - compression_type = BCH_COMPRESSION_TYPE_lz4; - compression_type = - __bio_compress(c, dst, dst_len, src, src_len, compression_type); + __bio_compress(c, dst, dst_len, src, src_len, + bch2_compression_decode(compression_opt)); dst->bi_iter.bi_size = orig_dst; src->bi_iter.bi_size = orig_src; @@ -521,8 +545,10 @@ static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) } int bch2_check_set_has_compressed_data(struct bch_fs 
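Compression levels are stored on bcachefs's own 1..15 scale; the zstd branch above rescales them by 3/2 and caps the result at zstd's maximum, while gzip clamps to Z_BEST_SPEED..Z_BEST_COMPRESSION and lz4 switches to LZ4_compress_HC() at or above LZ4HC_MIN_CLEVEL. A self-contained check of the zstd mapping, assuming zstd_max_clevel() returns 22 as the hunk's comment states:

#include <stdio.h>

int main(void)
{
	unsigned zstd_max = 22;		/* assumed value of zstd_max_clevel() */

	for (unsigned level = 1; level <= 15; level++) {
		unsigned scaled = (level * 3) / 2;
		printf("opt level %2u -> zstd level %2u\n",
		       level, scaled < zstd_max ? scaled : zstd_max);
	}
	/* 1 -> 1, 2 -> 3, ... 10 -> 15, 14 -> 21, 15 -> 22 */
	return 0;
}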
*c, - unsigned compression_type) + unsigned compression_opt) { + unsigned compression_type = bch2_compression_decode(compression_opt).type; + BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); return compression_type @@ -546,14 +572,16 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { size_t decompress_workspace_size = 0; bool decompress_workspace_needed; - ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); + ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), + c->opts.encoded_extent_max); struct { - unsigned feature; - unsigned type; - size_t compress_workspace; - size_t decompress_workspace; + unsigned feature; + enum bch_compression_type type; + size_t compress_workspace; + size_t decompress_workspace; } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, + { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), zlib_inflate_workspacesize(), }, @@ -561,34 +589,27 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) zstd_cctx_workspace_bound(¶ms.cParams), zstd_dctx_workspace_bound() }, }, *i; - int ret = 0; - - pr_verbose_init(c->opts, ""); + bool have_compressed = false; c->zstd_params = params; for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) - if (features & (1 << i->feature)) - goto have_compressed; + have_compressed |= (features & (1 << i->feature)) != 0; - goto out; -have_compressed: + if (!have_compressed) + return 0; - if (!mempool_initialized(&c->compression_bounce[READ])) { - ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max); - if (ret) - goto out; - } + if (!mempool_initialized(&c->compression_bounce[READ]) && + mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) + return -BCH_ERR_ENOMEM_compression_bounce_read_init; - if (!mempool_initialized(&c->compression_bounce[WRITE])) { - ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max); - if (ret) - goto out; - } + if (!mempool_initialized(&c->compression_bounce[WRITE]) && + mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) + return -BCH_ERR_ENOMEM_compression_bounce_write_init; for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); @@ -605,35 +626,88 @@ have_compressed: if (mempool_initialized(&c->compress_workspace[i->type])) continue; - ret = mempool_init_kvpmalloc_pool( + if (mempool_init_kvpmalloc_pool( &c->compress_workspace[i->type], - 1, i->compress_workspace); - if (ret) - goto out; + 1, i->compress_workspace)) + return -BCH_ERR_ENOMEM_compression_workspace_init; } - if (!mempool_initialized(&c->decompress_workspace)) { - ret = mempool_init_kvpmalloc_pool( - &c->decompress_workspace, - 1, decompress_workspace_size); - if (ret) - goto out; - } -out: - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + if (!mempool_initialized(&c->decompress_workspace) && + mempool_init_kvpmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) + return -BCH_ERR_ENOMEM_decompression_workspace_init; + + return 0; +} + +static u64 compression_opt_to_feature(unsigned v) +{ + unsigned type = bch2_compression_decode(v).type; + return 1ULL << bch2_compression_opt_to_feature[type]; } int 
bch2_fs_compress_init(struct bch_fs *c) { u64 f = c->sb.features; - if (c->opts.compression) - f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; - - if (c->opts.background_compression) - f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; + f |= compression_opt_to_feature(c->opts.compression); + f |= compression_opt_to_feature(c->opts.background_compression); return __bch2_fs_compress_init(c, f); +} + +int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, + struct printbuf *err) +{ + char *val = kstrdup(_val, GFP_KERNEL); + char *p = val, *type_str, *level_str; + struct bch_compression_opt opt = { 0 }; + int ret; + + if (!val) + return -ENOMEM; + + type_str = strsep(&p, ":"); + level_str = p; + + ret = match_string(bch2_compression_opts, -1, type_str); + if (ret < 0 && err) + prt_str(err, "invalid compression type"); + if (ret < 0) + goto err; + + opt.type = ret; + + if (level_str) { + unsigned level; + + ret = kstrtouint(level_str, 10, &level); + if (!ret && !opt.type && level) + ret = -EINVAL; + if (!ret && level > 15) + ret = -EINVAL; + if (ret < 0 && err) + prt_str(err, "invalid compression level"); + if (ret < 0) + goto err; + + opt.level = level; + } + + *res = bch2_compression_encode(opt); +err: + kfree(val); + return ret; +} + +void bch2_opt_compression_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + struct bch_compression_opt opt = bch2_compression_decode(v); + prt_str(out, bch2_compression_opts[opt.type]); + if (opt.level) + prt_printf(out, ":%u", opt.level); } diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h index 4bab1f6..052ea30 100644 --- a/libbcachefs/compress.h +++ b/libbcachefs/compress.h @@ -4,6 +4,35 @@ #include "extents_types.h" +struct bch_compression_opt { + u8 type:4, + level:4; +}; + +static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +{ + return (struct bch_compression_opt) { + .type = v & 15, + .level = v >> 4, + }; +} + +static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) +{ + return opt.type|(opt.level << 4); +} + +static const unsigned __bch2_compression_opt_to_type[] = { +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_OPTS() +#undef x +}; + +static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) +{ + return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; +} + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, @@ -15,4 +44,12 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); void bch2_fs_compress_exit(struct bch_fs *); int bch2_fs_compress_init(struct bch_fs *); +int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + +#define bch2_opt_compression (struct bch_opt_fn) { \ + .parse = bch2_opt_compression_parse, \ + .to_text = bch2_opt_compression_to_text, \ +} + #endif /* _BCACHEFS_COMPRESS_H */ diff --git a/libbcachefs/counters.c b/libbcachefs/counters.c index edd1b25..442a9b8 100644 --- a/libbcachefs/counters.c +++ b/libbcachefs/counters.c @@ -5,7 +5,7 @@ /* BCH_SB_FIELD_counters */ -const char * const bch2_counter_names[] = { +static const char * const bch2_counter_names[] = { #define x(t, n, ...) 
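The new option format packs the compression type and level into a single value, four bits each, and the textual form parsed above is "type" or "type:level" with levels 1..15 (a nonzero level on type "none" is rejected). A self-contained round-trip sketch using the same bit layout as struct bch_compression_opt; the type index 3 for "zstd" is an assumption made for the example, the real index comes from bch2_compression_opts[]:

#include <stdio.h>

struct opt { unsigned type:4, level:4; };	/* same layout as bch_compression_opt */

static unsigned encode(struct opt o) { return o.type | (o.level << 4); }
static struct opt decode(unsigned v) { return (struct opt) { .type = v & 15, .level = v >> 4 }; }

int main(void)
{
	/* "zstd:7", assuming "zstd" parses to type index 3 */
	struct opt o = { .type = 3, .level = 7 };
	unsigned v = encode(o);

	printf("encoded: 0x%02x\n", v);					/* 0x73 */
	printf("decoded: type %u level %u\n", decode(v).type, decode(v).level);
	return 0;
}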
(#t), BCH_PERSISTENT_COUNTERS() #undef x @@ -27,7 +27,7 @@ static int bch2_sb_counters_validate(struct bch_sb *sb, return 0; }; -void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, +static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_counters *ctrs = field_to_type(f, counters); @@ -96,7 +96,7 @@ int bch2_fs_counters_init(struct bch_fs *c) { c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); if (!c->counters) - return -ENOMEM; + return -BCH_ERR_ENOMEM_fs_counters_init; return bch2_sb_counters_to_cpu(c); } diff --git a/libbcachefs/darray.h b/libbcachefs/darray.h index 519ab9b..114f86b 100644 --- a/libbcachefs/darray.h +++ b/libbcachefs/darray.h @@ -19,11 +19,11 @@ struct { \ typedef DARRAY(void) darray_void; -static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) { if (d->nr + more > d->size) { size_t new_size = roundup_pow_of_two(d->nr + more); - void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); + void *data = krealloc_array(d->data, new_size, t_size, gfp); if (!data) return -ENOMEM; @@ -35,27 +35,37 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) return 0; } +#define darray_make_room_gfp(_d, _more, _gfp) \ + __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + #define darray_make_room(_d, _more) \ - __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) + darray_make_room_gfp(_d, _more, GFP_KERNEL) #define darray_top(_d) ((_d).data[(_d).nr]) -#define darray_push(_d, _item) \ +#define darray_push_gfp(_d, _item, _gfp) \ ({ \ - int _ret = darray_make_room((_d), 1); \ + int _ret = darray_make_room_gfp((_d), 1, _gfp); \ \ if (!_ret) \ (_d)->data[(_d)->nr++] = (_item); \ _ret; \ }) -#define darray_insert_item(_d, _pos, _item) \ +#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) + +#define darray_pop(_d) ((_d)->data[--(_d)->nr]) + +#define darray_first(_d) ((_d).data[0]) +#define darray_last(_d) ((_d).data[(_d).nr - 1]) + +#define darray_insert_item(_d, pos, _item) \ ({ \ - size_t pos = (_pos); \ + size_t _pos = (pos); \ int _ret = darray_make_room((_d), 1); \ \ if (!_ret) \ - array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ + array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ _ret; \ }) diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c index 3015528..cfc6244 100644 --- a/libbcachefs/data_update.c +++ b/libbcachefs/data_update.c @@ -7,94 +7,88 @@ #include "buckets.h" #include "data_update.h" #include "ec.h" +#include "error.h" #include "extents.h" #include "io.h" #include "keylist.h" #include "move.h" +#include "nocow_locking.h" #include "subvolume.h" +#include "trace.h" -#include - -static int insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) +static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) { - struct bch_fs *c = trans->c; - struct btree_iter iter, update_iter; - struct bkey_s_c k; - snapshot_id_list s; - int ret; - - if (!btree_type_has_snapshots(id)) - return 0; + if (trace_move_extent_finish_enabled()) { + struct printbuf buf = PRINTBUF; - darray_init(&s); + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); + } +} - if (!bkey_cmp(old_pos, new_pos)) - return 0; +static void 
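The darray changes above add a gfp-aware growth path plus pop/first/last accessors. A short usage sketch in the style of callers elsewhere in this patch, assuming darray_init()/darray_exit() from the same header (used but not modified here) and pushing arbitrary values:

typedef DARRAY(u64) darray_u64;

static int collect_seqs_sketch(void)
{
	darray_u64 seqs;
	int ret;

	darray_init(&seqs);

	ret =   darray_push(&seqs, 128) ?:		/* grows data[] to the next power of two */
		darray_push_gfp(&seqs, 256, GFP_NOFS);	/* same, but usable from IO paths */

	if (!ret && seqs.nr)
		pr_debug("first %llu last %llu",
			 darray_first(seqs), darray_last(seqs));

	darray_exit(&seqs);
	return ret;
}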
trace_move_extent_fail2(struct data_update *m, + struct bkey_s_c new, + struct bkey_s_c wrote, + struct bkey_i *insert, + const char *msg) +{ + struct bch_fs *c = m->op.c; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); + const union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + unsigned i, rewrites_found = 0; - if (!snapshot_t(c, old_pos.snapshot)->children[0]) - return 0; + if (!trace_move_extent_fail_enabled()) + return; - bch2_trans_iter_init(trans, &iter, id, old_pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; + prt_str(&buf, msg); - if (bkey_cmp(old_pos, k.k->p)) - break; + if (insert) { + i = 0; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + struct bkey_s new_s; + new_s.k = (void *) new.k; + new_s.v = (void *) new.v; - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { - struct bkey_i *update; + if (((1U << i) & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) + rewrites_found |= 1U << i; + i++; + } + } - if (snapshot_list_has_ancestor(c, &s, k.k->p.snapshot)) - continue; + prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", + (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + prt_printf(&buf, "\nrewrites found: %u%u%u%u", + (rewrites_found & (1 << 0)) != 0, + (rewrites_found & (1 << 1)) != 0, + (rewrites_found & (1 << 2)) != 0, + (rewrites_found & (1 << 3)) != 0); - ret = PTR_ERR_OR_ZERO(update); - if (ret) - break; - - bkey_init(&update->k); - update->k.p = new_pos; - update->k.p.snapshot = k.k->p.snapshot; - - bch2_trans_iter_init(trans, &update_iter, id, update->k.p, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_INTENT); - ret = bch2_btree_iter_traverse(&update_iter) ?: - bch2_trans_update(trans, &update_iter, update, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch2_trans_iter_exit(trans, &update_iter); - if (ret) - break; + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); - ret = snapshot_list_add(c, &s, k.k->p.snapshot); - if (ret) - break; - } - } - bch2_trans_iter_exit(trans, &iter); - darray_exit(&s); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, new); - return ret; -} + prt_str(&buf, "\nwrote: "); + bch2_bkey_val_to_text(&buf, c, wrote); -static void bch2_bkey_mark_dev_cached(struct bkey_s k, unsigned dev) -{ - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); - struct bch_extent_ptr *ptr; + if (insert) { + prt_str(&buf, "\ninsert: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + } - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == dev) - ptr->cached = true; + trace_move_extent_fail(c, buf.buf); + printbuf_exit(&buf); } static int __bch2_data_update_index_update(struct btree_trans *trans, @@ -119,15 +113,17 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, while (1) { struct bkey_s_c k; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - struct bkey_i *insert; + struct bkey_i *insert = NULL; struct bkey_i_extent *new; - const union bch_extent_entry *entry; + const union bch_extent_entry *entry_c; + union bch_extent_entry *entry; struct extent_ptr_decoded p; + struct bch_extent_ptr *ptr; + const struct bch_extent_ptr *ptr_c; struct bpos 
next_pos; - bool did_work = false; bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned i; + unsigned rewrites_found = 0, durability, i; bch2_trans_begin(trans); @@ -138,8 +134,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, new = bkey_i_to_extent(bch2_keylist_front(keys)); - if (!bch2_extents_match(k, old)) - goto nomatch; + if (!bch2_extents_match(k, old)) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), + NULL, "no match:"); + goto nowork; + } bkey_reassemble(_insert.k, k); insert = _insert.k; @@ -162,40 +161,75 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * Fist, drop rewrite_ptrs from @new: */ i = 0; - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { if (((1U << i) & m->data_opts.rewrite_ptrs) && - bch2_extent_has_ptr(old, p, bkey_i_to_s_c(insert))) { + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); /* - * If we're going to be adding a pointer to the - * same device, we have to drop the old one - - * otherwise, we can just mark it cached: - */ - if (bch2_bkey_has_device(bkey_i_to_s_c(&new->k_i), p.ptr.dev)) - bch2_bkey_drop_device_noerror(bkey_i_to_s(insert), p.ptr.dev); - else - bch2_bkey_mark_dev_cached(bkey_i_to_s(insert), p.ptr.dev); + * See comment below: + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + */ + rewrites_found |= 1U << i; } i++; } + if (m->data_opts.rewrite_ptrs && + !rewrites_found && + bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); + goto nowork; + } - /* Add new ptrs: */ - extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { - /* - * raced with another move op? 
extent already - * has a pointer to the device we just wrote - * data to - */ - continue; + /* + * A replica that we just wrote might conflict with a replica + * that we want to keep, due to racing with another move: + */ +restart_drop_conflicting_replicas: + extent_for_each_ptr(extent_i_to_s(new), ptr) + if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && + !ptr_c->cached) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); + goto restart_drop_conflicting_replicas; } - bch2_extent_ptr_decoded_append(insert, &p); - did_work = true; + if (!bkey_val_u64s(&new->k)) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); + goto nowork; + } + + /* Now, drop pointers that conflict with what we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); + + durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + + bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); + + /* Now, drop excess replicas: */ +restart_drop_extra_replicas: + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr); + /* + * Currently, we're dropping unneeded replicas + * instead of marking them as cached, since + * cached data in stripe buckets prevents them + * from being reused: + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + */ + goto restart_drop_extra_replicas; + } } - if (!did_work) - goto nomatch; + /* Finally, add the pointers we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + bch2_extent_ptr_decoded_append(insert, &p); bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); bch2_extent_normalize(c, bkey_i_to_s(insert)); @@ -218,19 +252,25 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, next_pos = insert->k.p; - ret = insert_snapshot_whiteouts(trans, m->btree_id, - k.k->p, insert->k.p) ?: - bch2_trans_update(trans, &iter, insert, + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, bkey_start_pos(&insert->k)) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, insert->k.p); + if (ret) + goto err; + + ret = bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, &op->res, NULL, + BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| m->data_opts.btree_insert_flags); if (!ret) { bch2_btree_iter_set_pos(&iter, next_pos); this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - trace_move_extent_finish(&new->k); + trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); } err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -238,22 +278,21 @@ err: if (ret) break; next: - while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { + while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) goto out; } continue; -nomatch: - if (m->ctxt) { +nowork: + if (m->ctxt && m->ctxt->stats) { BUG_ON(k.k->p.offset <= iter.pos.offset); atomic64_inc(&m->ctxt->stats->keys_raced); atomic64_add(k.k->p.offset - iter.pos.offset, &m->ctxt->stats->sectors_raced); } - this_cpu_add(c->counters[BCH_COUNTER_move_extent_race], 
new->k.size); - trace_move_extent_race(&new->k); + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); bch2_btree_iter_advance(&iter); goto next; @@ -268,15 +307,7 @@ out: int bch2_data_update_index_update(struct bch_write_op *op) { - struct bch_fs *c = op->c; - struct btree_trans trans; - int ret; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - ret = __bch2_data_update_index_update(&trans, op); - bch2_trans_exit(&trans); - - return ret; + return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op)); } void bch2_data_update_read_done(struct data_update *m, @@ -298,9 +329,12 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); const struct bch_extent_ptr *ptr; - bkey_for_each_ptr(ptrs, ptr) - bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), 0); + bkey_for_each_ptr(ptrs, ptr) { + if (c->opts.nocow_enabled) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), 0); + percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); + } bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); @@ -347,9 +381,9 @@ void bch2_update_unwritten_extent(struct btree_trans *trans, &update->op.devs_have, update->op.nr_replicas, update->op.nr_replicas, - update->op.alloc_reserve, + update->op.watermark, 0, &cl, &wp); - if (ret == -EAGAIN) { + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { bch2_trans_unlock(trans); closure_sync(&cl); continue; @@ -387,17 +421,22 @@ void bch2_update_unwritten_extent(struct btree_trans *trans, } } -int bch2_data_update_init(struct bch_fs *c, struct data_update *m, +int bch2_data_update_init(struct btree_trans *trans, + struct moving_context *ctxt, + struct data_update *m, struct write_point_specifier wp, struct bch_io_opts io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; + const struct bch_extent_ptr *ptr; unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned ptrs_locked = 0; int ret; bch2_bkey_buf_init(&m->k); @@ -410,29 +449,32 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, m->op.version = k.k->version; m->op.target = data_opts.target; m->op.write_point = wp; + m->op.nr_replicas = 0; m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_type = - bch2_compression_opt_to_type[io_opts.background_compression ?: - io_opts.compression]; - if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) - m->op.alloc_reserve = RESERVE_movinggc; + m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; + + bkey_for_each_ptr(ptrs, ptr) + percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); i = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && - p.ptr.cached) - BUG(); + bool locked; - if (!((1U << i) & m->data_opts.rewrite_ptrs)) - bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + if (((1U << i) & m->data_opts.rewrite_ptrs)) { + BUG_ON(p.ptr.cached); - if (((1U << i) & m->data_opts.rewrite_ptrs) && - crc_is_compressed(p.crc)) - reserve_sectors += k.k->size; + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += 
bch2_extent_ptr_desired_durability(c, &p); + } else if (!p.ptr.cached) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + } /* * op->csum_type is normally initialized from the fs/file's @@ -447,10 +489,27 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - i++; + if (c->opts.nocow_enabled) { + if (ctxt) { + move_ctxt_wait_event(ctxt, trans, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) || + !atomic_read(&ctxt->read_sectors)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto err; + } + } + ptrs_locked |= (1U << i); + } - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, &p.ptr), 0); + i++; } if (reserve_sectors) { @@ -459,11 +518,11 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, ? 0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - return ret; + goto err; } - m->op.nr_replicas = m->op.nr_replicas_required = - hweight32(m->data_opts.rewrite_ptrs) + m->data_opts.extra_replicas; + m->op.nr_replicas += m->data_opts.extra_replicas; + m->op.nr_replicas_required = m->op.nr_replicas; BUG_ON(!m->op.nr_replicas); @@ -471,6 +530,19 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, if (bkey_extent_is_unwritten(k)) return -BCH_ERR_unwritten_extent_update; return 0; +err: + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if ((1U << i) & ptrs_locked) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); + i++; + } + + bch2_bkey_buf_exit(&m->k, c); + bch2_bio_free_pages_pool(c, &m->op.wbio.bio); + return ret; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) diff --git a/libbcachefs/data_update.h b/libbcachefs/data_update.h index f304c33..49e9055 100644 --- a/libbcachefs/data_update.h +++ b/libbcachefs/data_update.h @@ -33,7 +33,8 @@ void bch2_data_update_read_done(struct data_update *, void bch2_data_update_exit(struct data_update *); void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); -int bch2_data_update_init(struct bch_fs *, struct data_update *, +int bch2_data_update_init(struct btree_trans *, struct moving_context *, + struct data_update *, struct write_point_specifier, struct bch_io_opts, struct data_update_opts, enum btree_id, struct bkey_s_c); diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 57602c8..ae47e18 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include @@ -40,7 +39,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct bset *sorted, *inmemory = &b->data->keys; struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); struct bio *bio; - bool failed = false; + bool failed = false, saw_error = false; if (!bch2_dev_get_ioref(ca, READ)) return false; @@ -48,7 +47,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, bio = bio_alloc_bioset(ca->disk_sb.bdev, buf_pages(n_sorted, btree_bytes(c)), REQ_OP_READ|REQ_META, - GFP_NOIO, + GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; bch2_bio_map(bio, n_sorted, btree_bytes(c)); @@ -61,7 +60,7 @@ static bool bch2_btree_verify_replica(struct bch_fs 
*c, struct btree *b, memcpy(n_ondisk, n_sorted, btree_bytes(c)); v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, false)) + if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) return false; n_sorted = c->verify_data->data; @@ -154,7 +153,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) BUG_ON(b->nsets != 1); - for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k)) + for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) if (k->type == KEY_TYPE_btree_ptr_v2) { struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); v->mem_ptr = 0; @@ -182,6 +181,125 @@ out: bch2_btree_node_io_unlock(b); } +void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, + const struct btree *b) +{ + struct btree_node *n_ondisk = NULL; + struct extent_ptr_decoded pick; + struct bch_dev *ca; + struct bio *bio = NULL; + unsigned offset = 0; + int ret; + + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + prt_printf(out, "error getting device to read from: invalid device\n"); + return; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) { + prt_printf(out, "error getting device to read from: not online\n"); + return; + } + + n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!n_ondisk) { + prt_printf(out, "memory allocation failure\n"); + goto out; + } + + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_ondisk, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOFS, + &c->btree_bio); + bio->bi_iter.bi_sector = pick.ptr.offset; + bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + + ret = submit_bio_wait(bio); + if (ret) { + prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); + goto out; + } + + while (offset < btree_sectors(c)) { + struct bset *i; + struct nonce nonce; + struct bch_csum csum; + struct bkey_packed *k; + unsigned sectors; + + if (!offset) { + i = &n_ondisk->keys; + + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { + prt_printf(out, "unknown checksum type at offset %u: %llu\n", + offset, BSET_CSUM_TYPE(i)); + goto out; + } + + nonce = btree_nonce(i, offset << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); + + if (bch2_crc_cmp(csum, n_ondisk->csum)) { + prt_printf(out, "invalid checksum\n"); + goto out; + } + + bset_encrypt(c, i, offset << 9); + + sectors = vstruct_sectors(n_ondisk, c->block_bits); + } else { + struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); + + i = &bne->keys; + + if (i->seq != n_ondisk->keys.seq) + break; + + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { + prt_printf(out, "unknown checksum type at offset %u: %llu\n", + offset, BSET_CSUM_TYPE(i)); + goto out; + } + + nonce = btree_nonce(i, offset << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + if (bch2_crc_cmp(csum, bne->csum)) { + prt_printf(out, "invalid checksum"); + goto out; + } + + bset_encrypt(c, i, offset << 9); + + sectors = vstruct_sectors(bne, c->block_bits); + } + + prt_printf(out, " offset %u version %u, journal seq %llu\n", + offset, + le16_to_cpu(i->version), + le64_to_cpu(i->journal_seq)); + offset += sectors; + + printbuf_indent_add(out, 4); + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { + struct bkey u; + + bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); + prt_newline(out); + } + + printbuf_indent_sub(out, 4); + } +out: + if (bio) + bio_put(bio); + kvpfree(n_ondisk, btree_bytes(c)); + percpu_ref_put(&ca->io_ref); +} + 
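
Editorial note (not part of the patch): the new bch2_btree_node_ondisk_to_text() above walks the on-disk layout of a btree node: the first bset is embedded in the btree_node header, every later bset is wrapped in a btree_node_entry at a sector offset, and the walk stops when an entry's seq no longer matches the header's. A minimal sketch of that traversal is below, with the checksum, nonce, and bset_encrypt() steps omitted; it reuses the helpers visible in the hunk above (btree_sectors, vstruct_sectors), while print_bset() is a hypothetical stand-in for the key-printing loop.

	/*
	 * Sketch only, under the assumptions stated above -- not upstream code.
	 * Checksum validation and decryption are deliberately left out.
	 */
	static void walk_ondisk_bsets(struct bch_fs *c, struct btree_node *n_ondisk)
	{
		unsigned offset = 0;

		while (offset < btree_sectors(c)) {
			struct bset *i;
			unsigned sectors;

			if (!offset) {
				/* first bset lives inside the btree_node header itself */
				i = &n_ondisk->keys;
				sectors = vstruct_sectors(n_ondisk, c->block_bits);
			} else {
				/* later bsets are btree_node_entries at sector offsets */
				struct btree_node_entry *bne =
					(void *) n_ondisk + (offset << 9);

				i = &bne->keys;
				if (i->seq != n_ondisk->keys.seq)
					break;	/* past the last bset written to this node */
				sectors = vstruct_sectors(bne, c->block_bits);
			}

			print_bset(c, i);	/* hypothetical: dump the keys in this bset */
			offset += sectors;
		}
	}
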
#ifdef CONFIG_DEBUG_FS /* XXX: bch_fs refcounting */ @@ -260,26 +378,25 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->size = size; i->ret = 0; - bch2_trans_init(&trans, i->c, 0, 0); + ret = flush_buf(i); + if (ret) + return ret; + bch2_trans_init(&trans, i->c, 0, 0); ret = for_each_btree_key2(&trans, iter, i->id, i->from, BTREE_ITER_PREFETCH| BTREE_ITER_ALL_SNAPSHOTS, k, ({ - ret = flush_buf(i); - if (ret) - break; - bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); - 0; + drop_locks_do(&trans, flush_buf(i)); })); i->from = iter.pos; + bch2_trans_exit(&trans); + if (!ret) ret = flush_buf(i); - bch2_trans_exit(&trans); - return ret ?: i->ret; } @@ -307,23 +424,28 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, if (ret) return ret; - if (!bpos_cmp(SPOS_MAX, i->from)) + if (bpos_eq(SPOS_MAX, i->from)) return i->ret; bch2_trans_init(&trans, i->c, 0, 0); +retry: + bch2_trans_begin(&trans); for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { - ret = flush_buf(i); - if (ret) - break; - bch2_btree_node_to_text(&i->buf, i->c, b); - i->from = bpos_cmp(SPOS_MAX, b->key.k.p) + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; + + ret = drop_locks_do(&trans, flush_buf(i)); + if (ret) + break; } bch2_trans_iter_exit(&trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); if (!ret) @@ -365,17 +487,13 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); - ret = flush_buf(i); - if (ret) - break; - - if (bpos_cmp(l->b->key.k.p, i->prev_node) > 0) { + if (bpos_gt(l->b->key.k.p, i->prev_node)) { bch2_btree_node_to_text(&i->buf, i->c, l->b); i->prev_node = l->b->key.k.p; } bch2_bfloat_to_text(&i->buf, l->b, _k); - 0; + drop_locks_do(&trans, flush_buf(i)); })); i->from = iter.pos; @@ -501,6 +619,7 @@ static const struct file_operations cached_btree_nodes_ops = { .read = bch2_cached_btree_nodes_read, }; +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { @@ -508,33 +627,45 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; + u32 seq; i->ubuf = buf; i->size = size; i->ret = 0; - - mutex_lock(&c->btree_trans_lock); +restart: + seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { if (trans->locking_wait.task->pid <= i->iter) continue; + closure_get(&trans->ref); + seq = seqmutex_seq(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + ret = flush_buf(i); - if (ret) - return ret; + if (ret) { + closure_put(&trans->ref); + goto unlocked; + } bch2_btree_trans_to_text(&i->buf, trans); prt_printf(&i->buf, "backtrace:"); prt_newline(&i->buf); printbuf_indent_add(&i->buf, 2); - bch2_prt_backtrace(&i->buf, trans->locking_wait.task); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); i->iter = trans->locking_wait.task->pid; - } - mutex_unlock(&c->btree_trans_lock); + closure_put(&trans->ref); + + if (!seqmutex_relock(&c->btree_trans_lock, seq)) + goto restart; + } + seqmutex_unlock(&c->btree_trans_lock); +unlocked: if (i->buf.allocation_failure) ret = -ENOMEM; @@ -550,6 +681,7 @@ static const struct file_operations btree_transactions_ops = { 
.release = bch2_dump_release, .read = bch2_btree_transactions_read, }; +#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) @@ -695,6 +827,7 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; + u32 seq; i->ubuf = buf; i->size = size; @@ -702,21 +835,32 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, if (i->iter) goto out; - - mutex_lock(&c->btree_trans_lock); +restart: + seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { if (trans->locking_wait.task->pid <= i->iter) continue; + closure_get(&trans->ref); + seq = seqmutex_seq(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + ret = flush_buf(i); - if (ret) - return ret; + if (ret) { + closure_put(&trans->ref); + goto out; + } bch2_check_for_deadlock(trans, &i->buf); i->iter = trans->locking_wait.task->pid; + + closure_put(&trans->ref); + + if (!seqmutex_relock(&c->btree_trans_lock, seq)) + goto restart; } - mutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); out: if (i->buf.allocation_failure) ret = -ENOMEM; @@ -756,8 +900,10 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, c->btree_debug, &btree_transactions_ops); +#endif debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); diff --git a/libbcachefs/debug.h b/libbcachefs/debug.h index 0b86736..2c37143 100644 --- a/libbcachefs/debug.h +++ b/libbcachefs/debug.h @@ -9,6 +9,8 @@ struct btree; struct bch_fs; void __bch2_btree_verify(struct bch_fs *, struct btree *); +void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, + const struct btree *); static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 288f46b..065ea59 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "btree_update.h" #include "extents.h" @@ -84,54 +85,49 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); unsigned len; - if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*d.v)); - return -EINVAL; - } - len = bch2_dirent_name_bytes(d); if (!len) { prt_printf(err, "empty name"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { prt_printf(err, "value too big (%zu > %u)", bkey_val_u64s(k.k), dirent_val_u64s(len)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (len > BCH_NAME_MAX) { prt_printf(err, "dirent name too big (%u > %u)", len, BCH_NAME_MAX); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { prt_printf(err, "invalid name"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { 
prt_printf(err, "invalid name"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (memchr(d.v->d_name, '/', len)) { prt_printf(err, "invalid name"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (d.v->d_type != DT_SUBVOL && le64_to_cpu(d.v->d_inum) == d.k->p.inode) { prt_printf(err, "dirent points to own directory"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -224,7 +220,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, int ret = 0; if (d.v->d_type == DT_SUBVOL && - d.v->d_parent_subvol != dir.subvol) + le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) return 1; if (likely(d.v->d_type != DT_SUBVOL)) { @@ -350,8 +346,8 @@ int bch2_dirent_rename(struct btree_trans *trans, bkey_init(&new_src->k); new_src->k.p = src_iter.pos; - if (bkey_cmp(dst_pos, src_iter.pos) <= 0 && - bkey_cmp(src_iter.pos, dst_iter.pos) < 0) { + if (bkey_le(dst_pos, src_iter.pos) && + bkey_lt(src_iter.pos, dst_iter.pos)) { /* * We have a hash collision for the new dst key, * and new_src - the key we're deleting - is between @@ -510,8 +506,10 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) struct bkey_s_c_dirent dirent; subvol_inum target; u32 snapshot; + struct bkey_buf sk; int ret; + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -534,10 +532,11 @@ retry: if (ret) continue; - /* - * XXX: dir_emit() can fault and block, while we're holding - * locks - */ + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + dirent = bkey_i_to_s_c_dirent(sk.k); + bch2_trans_unlock(&trans); + ctx->pos = dirent.k->p.offset; if (!dir_emit(ctx, dirent.v->d_name, bch2_dirent_name_bytes(dirent), @@ -560,6 +559,7 @@ err: goto retry; bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); return ret; } diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index 1a2c910..b42f4a1 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -4,14 +4,17 @@ #include "str_hash.h" +enum bkey_invalid_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ .key_invalid = bch2_dirent_invalid, \ .val_to_text = bch2_dirent_to_text, \ + .min_val_size = 16, \ }) struct qstr; diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index 6b81f35..de14ca3 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -27,7 +27,7 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field_members *mi = bch2_sb_get_members(sb); unsigned nr_groups = disk_groups_nr(groups); unsigned i, len; - int ret = -EINVAL; + int ret = 0; for (i = 0; i < sb->nr_devices; i++) { struct bch_member *m = mi->members + i; @@ -41,12 +41,12 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, if (g >= nr_groups) { prt_printf(err, "disk %u has invalid label %u (have %u)", i, g, nr_groups); - return -EINVAL; + return -BCH_ERR_invalid_sb_disk_groups; } if (BCH_GROUP_DELETED(&groups->entries[g])) { prt_printf(err, "disk %u has deleted label %u", i, g); - return -EINVAL; + return -BCH_ERR_invalid_sb_disk_groups; } } @@ -62,13 +62,13 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, len = strnlen(g->label, 
sizeof(g->label)); if (!len) { prt_printf(err, "label %u empty", i); - return -EINVAL; + return -BCH_ERR_invalid_sb_disk_groups; } } sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); if (!sorted) - return -ENOMEM; + return -BCH_ERR_ENOMEM_disk_groups_validate; memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); @@ -79,13 +79,46 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, prt_printf(err, "duplicate label %llu.%.*s", BCH_GROUP_PARENT(g), (int) sizeof(g->label), g->label); + ret = -BCH_ERR_invalid_sb_disk_groups; goto err; } - - ret = 0; err: kfree(sorted); - return 0; + return ret; +} + +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_disk_groups_cpu *g; + struct bch_dev *ca; + int i; + unsigned iter; + + out->atomic++; + rcu_read_lock(); + + g = rcu_dereference(c->disk_groups); + if (!g) + goto out; + + for (i = 0; i < g->nr; i++) { + if (i) + prt_printf(out, " "); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + continue; + } + + prt_printf(out, "[parent %d devs", g->entries[i].parent); + for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); + prt_printf(out, "]"); + } + +out: + rcu_read_unlock(); + out->atomic--; } static void bch2_sb_disk_groups_to_text(struct printbuf *out, @@ -135,7 +168,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) cpu_g = kzalloc(sizeof(*cpu_g) + sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); if (!cpu_g) - return -ENOMEM; + return -BCH_ERR_ENOMEM_disk_groups_to_cpu; cpu_g->nr = nr_groups; @@ -175,26 +208,36 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) { struct target t = target_decode(target); + struct bch_devs_mask *devs; + + rcu_read_lock(); switch (t.type) { case TARGET_NULL: - return NULL; + devs = NULL; + break; case TARGET_DEV: { struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; - return ca ? &ca->self : NULL; + devs = ca ? &ca->self : NULL; + break; } case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - return g && t.group < g->nr && !g->entries[t.group].deleted + devs = g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; + break; } default: BUG(); } + + rcu_read_unlock(); + + return devs; } bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) @@ -417,30 +460,37 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) return ret; } -int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) +int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, + struct printbuf *err) { struct bch_dev *ca; int g; - if (!strlen(buf) || !strcmp(buf, "none")) { - *v = 0; + if (!val) + return -EINVAL; + + if (!c) + return 0; + + if (!strlen(val) || !strcmp(val, "none")) { + *res = 0; return 0; } /* Is it a device? 
*/ - ca = bch2_dev_lookup(c, buf); + ca = bch2_dev_lookup(c, val); if (!IS_ERR(ca)) { - *v = dev_to_target(ca->dev_idx); + *res = dev_to_target(ca->dev_idx); percpu_ref_put(&ca->ref); return 0; } mutex_lock(&c->sb_lock); - g = bch2_disk_path_find(&c->disk_sb, buf); + g = bch2_disk_path_find(&c->disk_sb, val); mutex_unlock(&c->sb_lock); if (g >= 0) { - *v = group_to_target(g); + *res = group_to_target(g); return 0; } diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h index e4470c3..bd77117 100644 --- a/libbcachefs/disk_groups.h +++ b/libbcachefs/disk_groups.h @@ -68,6 +68,14 @@ static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, return devs; } +static inline bool bch2_target_accepts_data(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); + return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); +} + bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); @@ -77,9 +85,14 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); -int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); +#define bch2_opt_target (struct bch_opt_fn) { \ + .parse = bch2_opt_target_parse, \ + .to_text = bch2_opt_target_to_text, \ +} + int bch2_sb_disk_groups_to_cpu(struct bch_fs *); int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); @@ -88,4 +101,6 @@ int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); const char *bch2_sb_validate_disk_groups(struct bch_sb *, struct bch_sb_field *); +void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index dfe3796..f58e84a 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -9,6 +9,7 @@ #include "bset.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "disk_groups.h" #include "ec.h" @@ -104,53 +105,56 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - if (!bkey_cmp(k.k->p, POS_MIN)) { + if (bkey_eq(k.k->p, POS_MIN)) { prt_printf(err, "stripe at POS_MIN"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (k.k->p.inode) { prt_printf(err, "nonzero inode field"); - return -EINVAL; - } - - if (bkey_val_bytes(k.k) < sizeof(*s)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*s)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { prt_printf(err, "incorrect value size (%zu < %u)", bkey_val_u64s(k.k), stripe_val_u64s(s)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; - unsigned i; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; prt_printf(out, "algo %u sectors %u blocks %u:%u csum 
%u gran %u", s->algorithm, le16_to_cpu(s->sectors), - s->nr_blocks - s->nr_redundant, + nr_data, s->nr_redundant, s->csum_type, 1U << s->csum_granularity_bits); - for (i = 0; i < s->nr_blocks; i++) - prt_printf(out, " %u:%llu:%u", s->ptrs[i].dev, - (u64) s->ptrs[i].offset, - stripe_blockcount_get(s, i)); + for (i = 0; i < s->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = s->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); + if (i < nr_data) + prt_printf(out, "#%u", stripe_blockcount_get(s, i)); + if (ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } } /* returns blocknr in stripe that we matched: */ @@ -196,18 +200,22 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) { - unsigned i; + if (buf->key.k.type == KEY_TYPE_stripe) { + struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); + unsigned i; - for (i = 0; i < buf->key.v.nr_blocks; i++) { - kvpfree(buf->data[i], buf->size << 9); - buf->data[i] = NULL; + for (i = 0; i < s->v.nr_blocks; i++) { + kvpfree(buf->data[i], buf->size << 9); + buf->data[i] = NULL; + } } } +/* XXX: this is a non-mempoolified memory allocation: */ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, - unsigned offset, unsigned size) + unsigned offset, unsigned size) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1U << v->csum_granularity_bits; unsigned end = offset + size; unsigned i; @@ -223,7 +231,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, memset(buf->valid, 0xFF, sizeof(buf->valid)); - for (i = 0; i < buf->key.v.nr_blocks; i++) { + for (i = 0; i < v->nr_blocks; i++) { buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; @@ -232,7 +240,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, return 0; err: ec_stripe_buf_exit(buf); - return -ENOMEM; + return -BCH_ERR_ENOMEM_stripe_buf; } /* Checksumming: */ @@ -240,7 +248,7 @@ err: static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, unsigned block, unsigned offset) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1 << v->csum_granularity_bits; unsigned end = buf->offset + buf->size; unsigned len = min(csum_granularity, end - offset); @@ -259,7 +267,7 @@ static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, static void ec_generate_checksums(struct ec_stripe_buf *buf) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned i, j, csums_per_device = stripe_csums_per_device(v); if (!v->csum_type) @@ -276,7 +284,7 @@ static void ec_generate_checksums(struct ec_stripe_buf *buf) static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned csum_granularity = 1 << v->csum_granularity_bits; unsigned i; @@ -299,7 +307,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) if (bch2_crc_cmp(want, got)) { struct printbuf buf2 = PRINTBUF; - bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); bch_err_ratelimited(c, "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got 
%llx\n%s", @@ -319,7 +327,7 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) static void ec_generate_ec(struct ec_stripe_buf *buf) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = le16_to_cpu(v->sectors) << 9; @@ -328,13 +336,14 @@ static void ec_generate_ec(struct ec_stripe_buf *buf) static unsigned ec_nr_failed(struct ec_stripe_buf *buf) { - return buf->key.v.nr_blocks - - bitmap_weight(buf->valid, buf->key.v.nr_blocks); + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; + + return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); } static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; unsigned nr_data = v->nr_blocks - v->nr_redundant; unsigned bytes = buf->size << 9; @@ -358,7 +367,7 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) static void ec_block_endio(struct bio *bio) { struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); - struct bch_stripe *v = &ec_bio->buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; @@ -381,15 +390,16 @@ static void ec_block_endio(struct bio *bio) } static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, - unsigned rw, unsigned idx, struct closure *cl) + blk_opf_t opf, unsigned idx, struct closure *cl) { - struct bch_stripe *v = &buf->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; unsigned offset = 0, bytes = buf->size << 9; struct bch_extent_ptr *ptr = &v->ptrs[idx]; struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant + enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant ? 
BCH_DATA_user : BCH_DATA_parity; + int rw = op_is_write(opf); if (ptr_stale(ca, ptr)) { bch_err_ratelimited(c, @@ -415,7 +425,7 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, nr_iovecs, - rw, + opf, GFP_KERNEL, &c->ec_bioset), struct ec_bio, bio); @@ -441,17 +451,15 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, percpu_ref_put(&ca->io_ref); } -static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, + struct ec_stripe_buf *stripe) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - bch2_trans_iter_init(&trans, &iter, BTREE_ID_stripes, - POS(0, idx), BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + POS(0, idx), BTREE_ITER_SLOTS); ret = bkey_err(k); if (ret) goto err; @@ -459,13 +467,17 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip ret = -ENOENT; goto err; } - bkey_reassemble(&stripe->key.k_i, k); + bkey_reassemble(&stripe->key, k); err: - bch2_trans_iter_exit(&trans, &iter); - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); return ret; } +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +{ + return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe)); +} + /* recovery read path: */ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) { @@ -479,9 +491,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) BUG_ON(!rbio->pick.has_ec); - buf = kzalloc(sizeof(*buf), GFP_NOIO); + buf = kzalloc(sizeof(*buf), GFP_NOFS); if (!buf) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_read_extent; ret = get_stripe_key(c, rbio->pick.ec.idx, buf); if (ret) { @@ -491,7 +503,7 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) return -EIO; } - v = &buf->key.v; + v = &bkey_i_to_stripe(&buf->key)->v; if (!bch2_ptr_matches_stripe(v, rbio->pick)) { bch_err_ratelimited(c, @@ -546,25 +558,25 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) if (idx >= h->size) { if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); if (n.size > h->size) { memcpy(n.data, h->data, h->used * sizeof(h->data[0])); n.used = h->used; swap(*h, n); } - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); free_heap(&n); } if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; return 0; } @@ -572,23 +584,82 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { - size_t idx = iter->pos.offset; + return allocate_dropping_locks_errcode(trans, + __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); +} - if (!__ec_stripe_mem_alloc(trans->c, idx, GFP_NOWAIT|__GFP_NOWARN)) - return 0; +/* + * Hash table of open stripes: + * Stripes that are being created or modified are kept in a hash table, so that + * stripe deletion can skip them. 
+ */ - bch2_trans_unlock(trans); +static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) +{ + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + struct ec_stripe_new *s; - return __ec_stripe_mem_alloc(trans->c, idx, GFP_KERNEL) ?: - bch2_trans_relock(trans); + hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) + if (s->idx == idx) + return true; + return false; +} + +static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) +{ + bool ret = false; + + spin_lock(&c->ec_stripes_new_lock); + ret = __bch2_stripe_is_open(c, idx); + spin_unlock(&c->ec_stripes_new_lock); + + return ret; +} + +static bool bch2_try_open_stripe(struct bch_fs *c, + struct ec_stripe_new *s, + u64 idx) +{ + bool ret; + + spin_lock(&c->ec_stripes_new_lock); + ret = !__bch2_stripe_is_open(c, idx); + if (ret) { + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + + s->idx = idx; + hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); + } + spin_unlock(&c->ec_stripes_new_lock); + + return ret; +} + +static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) +{ + BUG_ON(!s->idx); + + spin_lock(&c->ec_stripes_new_lock); + hlist_del_init(&s->hash); + spin_unlock(&c->ec_stripes_new_lock); + + s->idx = 0; } -static ssize_t stripe_idx_to_delete(struct bch_fs *c) +/* Heap of all existing stripes, ordered by blocks_nonempty */ + +static u64 stripe_idx_to_delete(struct bch_fs *c) { ec_stripes_heap *h = &c->ec_stripes_heap; - return h->used && h->data[0].blocks_nonempty == 0 - ? h->data[0].idx : -1; + lockdep_assert_held(&c->ec_stripes_heap_lock); + + if (h->used && + h->data[0].blocks_nonempty == 0 && + !bch2_stripe_is_open(c, h->data[0].idx)) + return h->data[0].idx; + + return 0; } static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, @@ -612,7 +683,6 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) ec_stripes_heap *h = &c->ec_stripes_heap; struct stripe *m = genradix_ptr(&c->stripes, idx); - BUG_ON(!m->alive); BUG_ON(m->heap_idx >= h->used); BUG_ON(h->data[m->heap_idx].idx != idx); } @@ -620,28 +690,21 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { - if (!m->on_heap) - return; - - m->on_heap = false; - + mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); heap_del(&c->ec_stripes_heap, m->heap_idx, ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); + mutex_unlock(&c->ec_stripes_heap_lock); } void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { - if (m->on_heap) - return; - + mutex_lock(&c->ec_stripes_heap_lock); BUG_ON(heap_full(&c->ec_stripes_heap)); - m->on_heap = true; - heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { .idx = idx, .blocks_nonempty = m->blocks_nonempty, @@ -650,17 +713,17 @@ void bch2_stripes_heap_insert(struct bch_fs *c, ec_stripes_heap_set_backpointer); heap_verify_backpointer(c, idx); + mutex_unlock(&c->ec_stripes_heap_lock); } void bch2_stripes_heap_update(struct bch_fs *c, struct stripe *m, size_t idx) { ec_stripes_heap *h = &c->ec_stripes_heap; + bool do_deletes; size_t i; - if (!m->on_heap) - return; - + mutex_lock(&c->ec_stripes_heap_lock); heap_verify_backpointer(c, idx); h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; @@ -673,125 +736,137 @@ void bch2_stripes_heap_update(struct bch_fs *c, heap_verify_backpointer(c, idx); - if (stripe_idx_to_delete(c) >= 0 && - !percpu_ref_is_dying(&c->writes)) - schedule_work(&c->ec_stripe_delete_work); + 
do_deletes = stripe_idx_to_delete(c) != 0; + mutex_unlock(&c->ec_stripes_heap_lock); + + if (do_deletes) + bch2_do_stripe_deletes(c); } /* stripe deletion */ -static int ec_stripe_delete(struct bch_fs *c, size_t idx) +static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - return bch2_btree_delete_range(c, BTREE_ID_stripes, - POS(0, idx), - POS(0, idx + 1), - 0, NULL); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_stripe s; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_stripe) { + bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); + ret = -EINVAL; + goto err; + } + + s = bkey_s_c_to_stripe(k); + for (unsigned i = 0; i < s.v->nr_blocks; i++) + if (stripe_blockcount_get(s.v, i)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); + printbuf_exit(&buf); + ret = -EINVAL; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; } static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - ssize_t idx; + struct btree_trans trans; + int ret; + u64 idx; + + bch2_trans_init(&trans, c, 0, 0); while (1) { - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); - if (idx < 0) { - spin_unlock(&c->ec_stripes_heap_lock); - break; - } - - bch2_stripes_heap_del(c, genradix_ptr(&c->stripes, idx), idx); - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); - if (ec_stripe_delete(c, idx)) + if (!idx) break; - } -} - -/* stripe creation: */ -static int ec_stripe_bkey_insert(struct btree_trans *trans, - struct bkey_i_stripe *stripe, - struct disk_reservation *res) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bpos min_pos = POS(0, 1); - struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { - if (start_pos.offset) { - start_pos = min_pos; - bch2_btree_iter_set_pos(&iter, start_pos); - continue; - } - - ret = -BCH_ERR_ENOSPC_stripe_create; + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_delete(&trans, idx)); + if (ret) { + bch_err_fn(c, ret); break; } - - if (bkey_deleted(k.k)) - break; } - c->ec_stripe_hint = iter.pos.offset; - - if (ret) - goto err; - - ret = ec_stripe_mem_alloc(trans, &iter); - if (ret) - goto err; - - stripe->k.p = iter.pos; + bch2_trans_exit(&trans); - ret = bch2_trans_update(trans, &iter, &stripe->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); +} - return ret; +void bch2_do_stripe_deletes(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && + !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); } -static int ec_stripe_bkey_update(struct btree_trans *trans, - struct bkey_i_stripe *new, - struct disk_reservation *res) +/* stripe creation: */ + +static int ec_stripe_key_update(struct btree_trans *trans, + struct bkey_i_stripe *new, + bool 
create) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - const struct bch_stripe *existing; - unsigned i; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_stripes, - new->k.p, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); ret = bkey_err(k); if (ret) goto err; - if (!k.k || k.k->type != KEY_TYPE_stripe) { - bch_err(trans->c, "error updating stripe: not found"); - ret = -ENOENT; + if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { + bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", + create ? "creating" : "updating", + bch2_bkey_types[k.k->type]); + ret = -EINVAL; goto err; } - existing = bkey_s_c_to_stripe(k).v; + if (k.k->type == KEY_TYPE_stripe) { + const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; + unsigned i; - if (existing->nr_blocks != new->v.nr_blocks) { - bch_err(trans->c, "error updating stripe: nr_blocks does not match"); - ret = -EINVAL; - goto err; - } + if (old->nr_blocks != new->v.nr_blocks) { + bch_err(c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } - for (i = 0; i < new->v.nr_blocks; i++) - stripe_blockcount_set(&new->v, i, - stripe_blockcount_get(existing, i)); + for (i = 0; i < new->v.nr_blocks; i++) { + unsigned v = stripe_blockcount_get(old, i); + + BUG_ON(v && + (old->ptrs[i].dev != new->v.ptrs[i].dev || + old->ptrs[i].gen != new->v.ptrs[i].gen || + old->ptrs[i].offset != new->v.ptrs[i].offset)); + + stripe_blockcount_set(&new->v, i, v); + } + } ret = bch2_trans_update(trans, &iter, &new->k_i, 0); err: @@ -799,134 +874,188 @@ err: return ret; } -static void extent_stripe_ptr_add(struct bkey_s_extent e, - struct ec_stripe_buf *s, - struct bch_extent_ptr *ptr, - unsigned block) -{ - struct bch_extent_stripe_ptr *dst = (void *) ptr; - union bch_extent_entry *end = extent_entry_last(e); - - memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); - e.k->u64s += sizeof(*dst) / sizeof(u64); - - *dst = (struct bch_extent_stripe_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, - .block = block, - .redundancy = s->key.v.nr_redundant, - .idx = s->key.k.p.offset, - }; -} - static int ec_stripe_update_extent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - struct ec_stripe_buf *s) + struct bpos bucket, u8 gen, + struct ec_stripe_buf *s, + struct bpos *bp_pos) { + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_fs *c = trans->c; + struct bch_backpointer bp; + struct btree_iter iter; + struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_stripe_ptr stripe_ptr; struct bkey_i *n; int ret, dev, block; - if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + ret = bch2_get_next_backpointer(trans, bucket, gen, + bp_pos, &bp, BTREE_ITER_CACHED); + if (ret) + return ret; + if (bpos_eq(*bp_pos, SPOS_MAX)) + return 0; + + if (bp.level) { + struct printbuf buf = PRINTBUF; + struct btree_iter node_iter; + struct btree *b; + + b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); + bch2_trans_iter_exit(trans, &node_iter); + + if (!b) + return 0; + + prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); + bch2_backpointer_to_text(&buf, &bp); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + return -EIO; + } + + k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + 
return ret; + if (!k.k) { + /* + * extent no longer exists - we could flush the btree + * write buffer and retry to verify, but no need: + */ return 0; + } + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + goto out; - ptr_c = bkey_matches_stripe(&s->key.v, k, &block); + ptr_c = bkey_matches_stripe(v, k, &block); /* * It doesn't generally make sense to erasure code cached ptrs: * XXX: should we be incrementing a counter? */ if (!ptr_c || ptr_c->cached) - return 0; + goto out; - dev = s->key.v.ptrs[block].dev; + dev = v->ptrs[block].dev; - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); ret = PTR_ERR_OR_ZERO(n); if (ret) - return ret; + goto out; bkey_reassemble(n, k); bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); - ec_ptr = (void *) bch2_bkey_has_device(bkey_i_to_s_c(n), dev); + ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); BUG_ON(!ec_ptr); - extent_stripe_ptr_add(bkey_i_to_s_extent(n), s, ec_ptr, block); + stripe_ptr = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .redundancy = v->nr_redundant, + .idx = s->key.k.p.offset, + }; + + __extent_entry_insert(n, + (union bch_extent_entry *) ec_ptr, + (union bch_extent_entry *) &stripe_ptr); - return bch2_trans_update(trans, iter, n, 0); + ret = bch2_trans_update(trans, &iter, n, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; } static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, unsigned block) { struct bch_fs *c = trans->c; - struct bch_extent_ptr bucket = s->key.v.ptrs[block]; + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_extent_ptr bucket = v->ptrs[block]; struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); - struct bch_backpointer bp; - struct btree_iter iter; - struct bkey_s_c k; - u64 bp_offset = 0; + struct bpos bp_pos = POS_MIN; int ret = 0; -retry: - while (1) { - bch2_trans_begin(trans); - - ret = bch2_get_next_backpointer(trans, bucket_pos, bucket.gen, - &bp_offset, &bp, - BTREE_ITER_CACHED); - if (ret) - break; - if (bp_offset == U64_MAX) - break; - - if (bch2_fs_inconsistent_on(bp.level, c, "found btree node in erasure coded bucket!?")) { - ret = -EIO; - break; - } - k = bch2_backpointer_get_key(trans, &iter, bucket_pos, bp_offset, bp); - ret = bkey_err(k); + while (1) { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, + ec_stripe_update_extent(trans, bucket_pos, bucket.gen, + s, &bp_pos)); if (ret) break; - if (!k.k) - continue; - - ret = ec_stripe_update_extent(trans, &iter, k, s); - bch2_trans_iter_exit(trans, &iter); - if (ret) + if (bkey_eq(bp_pos, POS_MAX)) break; - bp_offset++; + bp_pos = bpos_nosnap_successor(bp_pos); } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - return ret; } static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans trans; - struct bch_stripe *v = &s->key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; bch2_trans_init(&trans, c, 0, 0); + ret = bch2_btree_write_buffer_flush(&trans); + if (ret) + goto err; + for (i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(&trans, s, i); if (ret) break; } - - +err: bch2_trans_exit(&trans); return ret; } +static void zero_out_rest_of_ec_bucket(struct bch_fs *c, + struct ec_stripe_new *s, + unsigned block, + struct open_bucket *ob) +{ + struct bch_dev *ca = 
bch_dev_bkey_exists(c, ob->dev); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; + int ret; + + if (!bch2_dev_get_ioref(ca, WRITE)) { + s->err = -BCH_ERR_erofs_no_writes; + return; + } + + memset(s->new_stripe.data[block] + (offset << 9), + 0, + ob->sectors_free << 9); + + ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + ob->bucket * ca->mi.bucket_size + offset, + ob->sectors_free, + GFP_KERNEL, 0); + + percpu_ref_put(&ca->io_ref); + + if (ret) + s->err = ret; +} + +void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) +{ + if (s->idx) + bch2_stripe_close(c, s); + kfree(s); +} + /* * data buckets of new stripe all written: create the stripe */ @@ -934,8 +1063,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) { struct bch_fs *c = s->c; struct open_bucket *ob; - struct stripe *m; - struct bch_stripe *v = &s->new_stripe.key.v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret; @@ -943,8 +1071,18 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_sync(&s->iodone); + if (!s->err) { + for (i = 0; i < nr_data; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (ob->sectors_free) + zero_out_rest_of_ec_bucket(c, s, i, ob); + } + } + if (s->err) { - if (s->err != -EROFS) + if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); goto err; } @@ -958,7 +1096,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) } for (i = 0; i < nr_data; i++) - if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) + if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) swap(s->new_stripe.data[i], s->existing_stripe.data[i]); @@ -966,9 +1104,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) } BUG_ON(!s->allocated); - - if (!percpu_ref_tryget_live(&c->writes)) - goto err; + BUG_ON(!s->idx); ec_generate_ec(&s->new_stripe); @@ -981,31 +1117,26 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); - goto err_put_writes; + goto err; } - ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL, - s->have_existing_stripe - ? 
ec_stripe_bkey_update(&trans, &s->new_stripe.key, &s->res) - : ec_stripe_bkey_insert(&trans, &s->new_stripe.key, &s->res)); + ret = bch2_trans_do(c, &s->res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, + ec_stripe_key_update(&trans, + bkey_i_to_stripe(&s->new_stripe.key), + !s->have_existing_stripe)); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); - goto err_put_writes; + goto err; } ret = ec_stripe_update_extents(c, &s->new_stripe); - if (ret) + if (ret) { bch_err(c, "error creating stripe: error updating pointers: %s", bch2_err_str(ret)); - - spin_lock(&c->ec_stripes_heap_lock); - m = genradix_ptr(&c->stripes, s->new_stripe.key.k.p.offset); - - BUG_ON(m->on_heap); - bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset); - spin_unlock(&c->ec_stripes_heap_lock); -err_put_writes: - percpu_ref_put(&c->writes); + goto err; + } err: bch2_disk_reservation_put(c, &s->res); @@ -1021,37 +1152,51 @@ err: } } + mutex_lock(&c->ec_stripe_new_lock); + list_del(&s->list); + mutex_unlock(&c->ec_stripe_new_lock); + wake_up(&c->ec_stripe_new_wait); + ec_stripe_buf_exit(&s->existing_stripe); ec_stripe_buf_exit(&s->new_stripe); closure_debug_destroy(&s->iodone); - kfree(s); + + ec_stripe_new_put(c, s, STRIPE_REF_stripe); +} + +static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) +{ + struct ec_stripe_new *s; + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(s, &c->ec_stripe_new_list, list) + if (!atomic_read(&s->ref[STRIPE_REF_io])) + goto out; + s = NULL; +out: + mutex_unlock(&c->ec_stripe_new_lock); + + return s; } static void ec_stripe_create_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_create_work); - struct ec_stripe_new *s, *n; -restart: - mutex_lock(&c->ec_stripe_new_lock); - list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) - if (!atomic_read(&s->pin)) { - list_del(&s->list); - mutex_unlock(&c->ec_stripe_new_lock); - ec_stripe_create(s); - goto restart; - } - mutex_unlock(&c->ec_stripe_new_lock); + struct ec_stripe_new *s; + + while ((s = get_pending_stripe(c))) + ec_stripe_create(s); + + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); } -static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) +void bch2_ec_do_stripe_creates(struct bch_fs *c) { - BUG_ON(atomic_read(&s->pin) <= 0); + bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); - if (atomic_dec_and_test(&s->pin)) { - BUG_ON(!s->pending); - queue_work(system_long_wq, &c->ec_stripe_create_work); - } + if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); } static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) @@ -1067,18 +1212,7 @@ static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) list_add(&s->list, &c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); - ec_stripe_new_put(c, s); -} - -/* have a full bucket - hand it off to be erasure coded: */ -void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) -{ - struct ec_stripe_new *s = ob->ec; - - if (ob->sectors_free) - s->err = -1; - - ec_stripe_new_put(c, s); + ec_stripe_new_put(c, s, STRIPE_REF_io); } void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) @@ -1097,6 +1231,8 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) if (!ob) return NULL; + BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); + ca = bch_dev_bkey_exists(c, ob->dev); offset = ca->mi.bucket_size - 
ob->sectors_free; @@ -1150,14 +1286,14 @@ static bool may_create_new_stripe(struct bch_fs *c) } static void ec_stripe_key_init(struct bch_fs *c, - struct bkey_i_stripe *s, + struct bkey_i *k, unsigned nr_data, unsigned nr_parity, unsigned stripe_size) { + struct bkey_i_stripe *s = bkey_stripe_init(k); unsigned u64s; - bkey_stripe_init(&s->k_i); s->v.sectors = cpu_to_le16(stripe_size); s->v.algorithm = 0; s->v.nr_blocks = nr_data + nr_parity; @@ -1184,19 +1320,20 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) - return -ENOMEM; + return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; mutex_init(&s->lock); closure_init(&s->iodone, NULL); - atomic_set(&s->pin, 1); + atomic_set(&s->ref[STRIPE_REF_stripe], 1); + atomic_set(&s->ref[STRIPE_REF_io], 1); s->c = c; s->h = h; s->nr_data = min_t(unsigned, h->nr_active_devs, BCH_BKEY_PTRS_MAX) - h->redundancy; s->nr_parity = h->redundancy; - ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, - s->nr_parity, h->blocksize); + ec_stripe_key_init(c, &s->new_stripe.key, + s->nr_data, s->nr_parity, h->blocksize); h->s = s; return 0; @@ -1205,7 +1342,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) static struct ec_stripe_head * ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, unsigned algo, unsigned redundancy, - bool copygc) + enum bch_watermark watermark) { struct ec_stripe_head *h; struct bch_dev *ca; @@ -1216,12 +1353,12 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, return NULL; mutex_init(&h->lock); - mutex_lock(&h->lock); + BUG_ON(!mutex_trylock(&h->lock)); h->target = target; h->algo = algo; h->redundancy = redundancy; - h->copygc = copygc; + h->watermark = watermark; rcu_read_lock(); h->devs = target_rw_devs(c, BCH_DATA_user, target); @@ -1252,51 +1389,67 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) mutex_unlock(&h->lock); } -struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, - unsigned target, - unsigned algo, - unsigned redundancy, - bool copygc) +static struct ec_stripe_head * +__bch2_ec_stripe_head_get(struct btree_trans *trans, + unsigned target, + unsigned algo, + unsigned redundancy, + enum bch_watermark watermark) { + struct bch_fs *c = trans->c; struct ec_stripe_head *h; + int ret; if (!redundancy) return NULL; - mutex_lock(&c->ec_stripe_head_lock); + ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); + if (ret) + return ERR_PTR(ret); + + if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + h = ERR_PTR(-BCH_ERR_erofs_no_writes); + goto found; + } + list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && h->redundancy == redundancy && - h->copygc == copygc) { - mutex_lock(&h->lock); + h->watermark == watermark) { + ret = bch2_trans_mutex_lock(trans, &h->lock); + if (ret) + h = ERR_PTR(ret); goto found; } - h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc); + h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); found: mutex_unlock(&c->ec_stripe_head_lock); return h; } -static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, - struct closure *cl) +static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, + enum bch_watermark watermark, struct closure *cl) { + struct bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; + struct bch_stripe *v = 
&bkey_i_to_stripe(&h->s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; int ret = 0; - for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { - if (test_bit(i, h->s->blocks_gotten)) { - __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); - if (i < h->s->nr_data) - nr_have_data++; - else - nr_have_parity++; - } + BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); + BUG_ON(v->nr_redundant != h->s->nr_parity); + + for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { + __clear_bit(v->ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else + nr_have_parity++; } BUG_ON(nr_have_data > h->s->nr_data); @@ -1304,16 +1457,14 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, buckets.nr = 0; if (nr_have_parity < h->s->nr_parity) { - ret = bch2_bucket_alloc_set(c, &buckets, + ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->parity_stripe, &devs, h->s->nr_parity, &nr_have_parity, - &have_cache, - h->copygc - ? RESERVE_movinggc - : RESERVE_none, - 0, + &have_cache, 0, + BCH_DATA_parity, + watermark, cl); open_bucket_for_each(c, &buckets, ob, i) { @@ -1323,7 +1474,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(j >= h->s->nr_data + h->s->nr_parity); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1333,16 +1484,14 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, buckets.nr = 0; if (nr_have_data < h->s->nr_data) { - ret = bch2_bucket_alloc_set(c, &buckets, + ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->block_stripe, &devs, h->s->nr_data, &nr_have_data, - &have_cache, - h->copygc - ? 
RESERVE_movinggc - : RESERVE_none, - 0, + &have_cache, 0, + BCH_DATA_user, + watermark, cl); open_bucket_for_each(c, &buckets, ob, i) { @@ -1351,7 +1500,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, BUG_ON(j >= h->s->nr_data); h->s->blocks[j] = buckets.v[i]; - h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, h->s->blocks_gotten); } @@ -1375,59 +1524,82 @@ static s64 get_existing_stripe(struct bch_fs *c, if (may_create_new_stripe(c)) return -1; - spin_lock(&c->ec_stripes_heap_lock); + mutex_lock(&c->ec_stripes_heap_lock); for (heap_idx = 0; heap_idx < h->used; heap_idx++) { /* No blocks worth reusing, stripe will just be deleted: */ if (!h->data[heap_idx].blocks_nonempty) continue; stripe_idx = h->data[heap_idx].idx; + m = genradix_ptr(&c->stripes, stripe_idx); if (m->algorithm == head->algo && m->nr_redundant == head->redundancy && m->sectors == head->blocksize && - m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { - bch2_stripes_heap_del(c, m, stripe_idx); + m->blocks_nonempty < m->nr_blocks - m->nr_redundant && + bch2_try_open_stripe(c, head->s, stripe_idx)) { ret = stripe_idx; break; } } - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); return ret; } -static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, - struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) { + struct bch_fs *c = trans->c; + struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + struct bch_stripe *existing_v; unsigned i; s64 idx; int ret; + /* + * If we can't allocate a new stripe, and there's no stripes with empty + * blocks for us to reuse, that means we have to wait on copygc: + */ idx = get_existing_stripe(c, h); if (idx < 0) - return -BCH_ERR_ENOSPC_stripe_reuse; + return -BCH_ERR_stripe_alloc_blocked; - h->s->have_existing_stripe = true; - ret = get_stripe_key(c, idx, &h->s->existing_stripe); + ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); if (ret) { - bch2_fs_fatal_error(c, "error reading stripe key: %i", ret); + bch2_stripe_close(c, h->s); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret)); return ret; } - if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) { - /* - * this is a problem: we have deleted from the - * stripes heap already - */ - BUG(); + existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; + + BUG_ON(existing_v->nr_redundant != h->s->nr_parity); + h->s->nr_data = existing_v->nr_blocks - + existing_v->nr_redundant; + + ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + if (ret) { + bch2_stripe_close(c, h->s); + return ret; } BUG_ON(h->s->existing_stripe.size != h->blocksize); - BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); + BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + + /* + * Free buckets we initially allocated - they might conflict with + * blocks from the stripe we're reusing: + */ + for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); + h->s->blocks[i] = 0; + } + memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); + memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); - for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { - if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) 
{ + for (i = 0; i < existing_v->nr_blocks; i++) { + if (stripe_blockcount_get(existing_v, i)) { __set_bit(i, h->s->blocks_gotten); __set_bit(i, h->s->blocks_allocated); } @@ -1435,80 +1607,161 @@ static int __bch2_ec_stripe_head_reuse(struct bch_fs *c, ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); } - bkey_copy(&h->s->new_stripe.key.k_i, - &h->s->existing_stripe.key.k_i); + bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); + h->s->have_existing_stripe = true; return 0; } -static int __bch2_ec_stripe_head_reserve(struct bch_fs *c, - struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) { - return bch2_disk_reservation_get(c, &h->s->res, - h->blocksize, - h->s->nr_parity, 0); + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + + if (!h->s->res.sectors) { + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } + + for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_gt(k.k->p, POS(0, U32_MAX))) { + if (start_pos.offset) { + start_pos = min_pos; + bch2_btree_iter_set_pos(&iter, start_pos); + continue; + } + + ret = -BCH_ERR_ENOSPC_stripe_create; + break; + } + + if (bkey_deleted(k.k) && + bch2_try_open_stripe(c, h->s, k.k->p.offset)) + break; + } + + c->ec_stripe_hint = iter.pos.offset; + + if (ret) + goto err; + + ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) { + bch2_stripe_close(c, h->s); + goto err; + } + + h->s->new_stripe.key.k.p = iter.pos; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +err: + bch2_disk_reservation_put(c, &h->s->res); + goto out; } -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, unsigned target, unsigned algo, unsigned redundancy, - bool copygc, + enum bch_watermark watermark, struct closure *cl) { + struct bch_fs *c = trans->c; struct ec_stripe_head *h; + bool waiting = false; int ret; - bool needs_stripe_new; - h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc); - if (!h) { + h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); + if (!h) bch_err(c, "no stripe head"); - return NULL; - } + if (IS_ERR_OR_NULL(h)) + return h; - needs_stripe_new = !h->s; - if (needs_stripe_new) { - if (ec_new_stripe_alloc(c, h)) { - ret = -ENOMEM; + if (!h->s) { + ret = ec_new_stripe_alloc(c, h); + if (ret) { bch_err(c, "failed to allocate new stripe"); goto err; } - - if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) - BUG(); } - /* - * Try reserve a new stripe before reusing an - * existing stripe. This will prevent unnecessary - * read amplification during write oriented workloads. 
- */ - ret = 0; - if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe) - ret = __bch2_ec_stripe_head_reserve(c, h); - if (ret && needs_stripe_new) - ret = __bch2_ec_stripe_head_reuse(c, h); - if (ret) { - bch_err_ratelimited(c, "failed to get stripe: %s", bch2_err_str(ret)); + if (h->s->allocated) + goto allocated; + + if (h->s->have_existing_stripe) + goto alloc_existing; + + /* First, try to allocate a full stripe: */ + ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (!ret) + goto allocate_buf; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, ENOMEM)) goto err; - } - if (!h->s->allocated) { - ret = new_stripe_alloc_buckets(c, h, cl); - if (ret) + /* + * Not enough buckets available for a full stripe: we must reuse an + * existing stripe: + */ + while (1) { + ret = __bch2_ec_stripe_head_reuse(trans, h); + if (!ret) + break; + if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; - h->s->allocated = true; + if (watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (ret) + goto err; + goto allocate_buf; + } + + /* XXX freelist_wait? */ + closure_wait(&c->freelist_wait, cl); + waiting = true; } - return h; + if (waiting) + closure_wake_up(&c->freelist_wait); +alloc_existing: + /* + * Retry allocating buckets, with the watermark for this + * particular write: + */ + ret = new_stripe_alloc_buckets(trans, h, watermark, cl); + if (ret) + goto err; +allocate_buf: + ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); + if (ret) + goto err; + + h->s->allocated = true; +allocated: + BUG_ON(!h->s->idx); + BUG_ON(!h->s->new_stripe.data[0]); + BUG_ON(trans->restarted); + return h; err: bch2_ec_stripe_head_put(c, h); return ERR_PTR(ret); } -void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) { struct ec_stripe_head *h; struct open_bucket *ob; @@ -1516,12 +1769,14 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - mutex_lock(&h->lock); if (!h->s) goto unlock; - for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (!ca) + goto found; + + for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { if (!h->s->blocks[i]) continue; @@ -1531,7 +1786,7 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) } goto unlock; found: - h->s->err = -EROFS; + h->s->err = -BCH_ERR_erofs_no_writes; ec_stripe_set_pending(c, h); unlock: mutex_unlock(&h->lock); @@ -1539,14 +1794,30 @@ unlock: mutex_unlock(&c->ec_stripe_head_lock); } -void bch2_stripes_heap_start(struct bch_fs *c) +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) { - struct genradix_iter iter; - struct stripe *m; + __bch2_ec_stop(c, ca); +} + +void bch2_fs_ec_stop(struct bch_fs *c) +{ + __bch2_ec_stop(c, NULL); +} + +static bool bch2_fs_ec_flush_done(struct bch_fs *c) +{ + bool ret; + + mutex_lock(&c->ec_stripe_new_lock); + ret = list_empty(&c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + return ret; +} - genradix_for_each(&c->stripes, iter, m) - if (m->alive) - bch2_stripes_heap_insert(c, m, iter.pos); +void bch2_fs_ec_flush(struct bch_fs *c) +{ + wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); } int bch2_stripes_read(struct bch_fs *c) @@ -1573,7 +1844,6 @@ 
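The hunks above add bch2_fs_ec_flush(): it sleeps on ec_stripe_new_wait until bch2_fs_ec_flush_done() observes ec_stripe_new_list empty under ec_stripe_new_lock, while ec_stripe_create() removes itself from that list and wakes the waiter. What follows is a minimal userspace sketch of the same flush/wake pattern, using pthreads in place of wait_event()/wake_up(); the struct, function names, and counter are illustrative only and are not part of the patch.

#include <pthread.h>

struct pending {
	pthread_mutex_t lock;      /* stands in for ec_stripe_new_lock */
	pthread_cond_t  drained;   /* stands in for ec_stripe_new_wait */
	unsigned        nr;        /* stands in for the length of ec_stripe_new_list */
};

/* producer side: register an in-flight entry */
static void pending_add(struct pending *p)
{
	pthread_mutex_lock(&p->lock);
	p->nr++;
	pthread_mutex_unlock(&p->lock);
}

/* creation side: drop our entry and wake any flusher once nothing is pending */
static void pending_del(struct pending *p)
{
	pthread_mutex_lock(&p->lock);
	if (!--p->nr)
		pthread_cond_broadcast(&p->drained);
	pthread_mutex_unlock(&p->lock);
}

/* flush side: wait until every in-flight entry has been removed */
static void pending_flush(struct pending *p)
{
	pthread_mutex_lock(&p->lock);
	while (p->nr)
		pthread_cond_wait(&p->drained, &p->lock);
	pthread_mutex_unlock(&p->lock);
}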
int bch2_stripes_read(struct bch_fs *c) s = bkey_s_c_to_stripe(k).v; m = genradix_ptr(&c->stripes, k.k->p.offset); - m->alive = true; m->sectors = le16_to_cpu(s->sectors); m->algorithm = s->algorithm; m->nr_blocks = s->nr_blocks; @@ -1583,16 +1853,14 @@ int bch2_stripes_read(struct bch_fs *c) for (i = 0; i < s->nr_blocks; i++) m->blocks_nonempty += !!stripe_blockcount_get(s, i); - spin_lock(&c->ec_stripes_heap_lock); - bch2_stripes_heap_update(c, m, k.k->p.offset); - spin_unlock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_insert(c, m, k.k->p.offset); } bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); if (ret) - bch_err(c, "error reading stripes: %i", ret); + bch_err_fn(c, ret); return ret; } @@ -1603,16 +1871,19 @@ void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) struct stripe *m; size_t i; - spin_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->used, 20); i++) { + mutex_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min_t(size_t, h->used, 50); i++) { m = genradix_ptr(&c->stripes, h->data[i].idx); - prt_printf(out, "%zu %u/%u+%u\n", h->data[i].idx, + prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, h->data[i].blocks_nonempty, m->nr_blocks - m->nr_redundant, m->nr_redundant); + if (bch2_stripe_is_open(c, h->data[i].idx)) + prt_str(out, " open"); + prt_newline(out); } - spin_unlock(&c->ec_stripes_heap_lock); + mutex_unlock(&c->ec_stripes_heap_lock); } void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) @@ -1622,22 +1893,27 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->ec_stripe_head_lock); list_for_each_entry(h, &c->ec_stripe_head_list, list) { - prt_printf(out, "target %u algo %u redundancy %u:\n", - h->target, h->algo, h->redundancy); + prt_printf(out, "target %u algo %u redundancy %u %s:\n", + h->target, h->algo, h->redundancy, + bch2_watermarks[h->watermark]); if (h->s) - prt_printf(out, "\tpending: blocks %u+%u allocated %u\n", - h->s->nr_data, h->s->nr_parity, + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", + h->s->idx, h->s->nr_data, h->s->nr_parity, bitmap_weight(h->s->blocks_allocated, h->s->nr_data)); } mutex_unlock(&c->ec_stripe_head_lock); + prt_printf(out, "in flight:\n"); + mutex_lock(&c->ec_stripe_new_lock); list_for_each_entry(s, &c->ec_stripe_new_list, list) { - prt_printf(out, "\tin flight: blocks %u+%u pin %u\n", - s->nr_data, s->nr_parity, - atomic_read(&s->pin)); + prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", + s->idx, s->nr_data, s->nr_parity, + atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_watermarks[s->h->watermark]); } mutex_unlock(&c->ec_stripe_new_lock); } @@ -1645,6 +1921,7 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; + unsigned i; while (1) { mutex_lock(&c->ec_stripe_head_lock); @@ -1656,7 +1933,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) if (!h) break; - BUG_ON(h->s); + if (h->s) { + for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) + BUG_ON(h->s->blocks[i]); + + kfree(h->s); + } kfree(h); } @@ -1669,6 +1951,16 @@ void bch2_fs_ec_exit(struct bch_fs *c) void bch2_fs_ec_init_early(struct bch_fs *c) { + spin_lock_init(&c->ec_stripes_new_lock); + mutex_init(&c->ec_stripes_heap_lock); + + INIT_LIST_HEAD(&c->ec_stripe_head_list); + mutex_init(&c->ec_stripe_head_lock); + + INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + 
init_waitqueue_head(&c->ec_stripe_new_wait); + INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); } diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index aba1e82..885ae5d 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -4,9 +4,12 @@ #include "ec_types.h" #include "buckets_types.h" +#include "extents_types.h" + +enum bkey_invalid_flags; int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -16,6 +19,7 @@ void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, .swab = bch2_ptr_swab, \ .trans_trigger = bch2_trans_mark_stripe, \ .atomic_trigger = bch2_mark_stripe, \ + .min_val_size = 8, \ }) static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) @@ -134,23 +138,29 @@ struct ec_stripe_buf { void *data[BCH_BKEY_PTRS_MAX]; - union { - struct bkey_i_stripe key; - u64 pad[255]; - }; + __BKEY_PADDED(key, 255); }; struct ec_stripe_head; +enum ec_stripe_ref { + STRIPE_REF_io, + STRIPE_REF_stripe, + STRIPE_REF_NR +}; + struct ec_stripe_new { struct bch_fs *c; struct ec_stripe_head *h; struct mutex lock; struct list_head list; + + struct hlist_node hash; + u64 idx; + struct closure iodone; - /* counts in flight writes, stripe is created when pin == 0 */ - atomic_t pin; + atomic_t ref[STRIPE_REF_NR]; int err; @@ -176,7 +186,7 @@ struct ec_stripe_head { unsigned target; unsigned algo; unsigned redundancy; - bool copygc; + enum bch_watermark watermark; struct bch_devs_mask devs; unsigned nr_active_devs; @@ -193,24 +203,50 @@ int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, - unsigned, unsigned, unsigned, bool, struct closure *); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, + unsigned, unsigned, unsigned, + enum bch_watermark, struct closure *); void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); -void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); +void bch2_do_stripe_deletes(struct bch_fs *); +void bch2_ec_do_stripe_creates(struct bch_fs *); +void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); -void bch2_ec_flush_new_stripes(struct bch_fs *); +static inline void ec_stripe_new_get(struct ec_stripe_new *s, + enum ec_stripe_ref ref) +{ + atomic_inc(&s->ref[ref]); +} + +static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, + enum ec_stripe_ref ref) +{ + BUG_ON(atomic_read(&s->ref[ref]) <= 0); + + if (atomic_dec_and_test(&s->ref[ref])) + switch (ref) { + case STRIPE_REF_stripe: + bch2_ec_stripe_new_free(c, s); + break; + case STRIPE_REF_io: + bch2_ec_do_stripe_creates(c); + break; + default: + unreachable(); + } +} -void bch2_stripes_heap_start(struct bch_fs *); +void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); +void bch2_fs_ec_stop(struct bch_fs *); +void 
bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index edd93da..e2b02a8 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_EC_TYPES_H #define _BCACHEFS_EC_TYPES_H -#include +#include "bcachefs_format.h" struct bch_replicas_padded { struct bch_replicas_entry e; @@ -11,15 +11,10 @@ struct bch_replicas_padded { struct stripe { size_t heap_idx; - u16 sectors; u8 algorithm; - u8 nr_blocks; u8 nr_redundant; - - unsigned alive:1; /* does a corresponding key exist in stripes btree? */ - unsigned on_heap:1; u8 blocks_nonempty; }; diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 93b515e..735eb24 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -3,24 +3,111 @@ #define _BCACHEFS_ERRCODE_H #define BCH_ERRCODES() \ + x(ENOMEM, ENOMEM_stripe_buf) \ + x(ENOMEM, ENOMEM_replicas_table) \ + x(ENOMEM, ENOMEM_cpu_replicas) \ + x(ENOMEM, ENOMEM_replicas_gc) \ + x(ENOMEM, ENOMEM_disk_groups_validate) \ + x(ENOMEM, ENOMEM_disk_groups_to_cpu) \ + x(ENOMEM, ENOMEM_mark_snapshot) \ + x(ENOMEM, ENOMEM_mark_stripe) \ + x(ENOMEM, ENOMEM_mark_stripe_ptr) \ + x(ENOMEM, ENOMEM_btree_key_cache_create) \ + x(ENOMEM, ENOMEM_btree_key_cache_fill) \ + x(ENOMEM, ENOMEM_btree_key_cache_insert) \ + x(ENOMEM, ENOMEM_trans_kmalloc) \ + x(ENOMEM, ENOMEM_trans_log_msg) \ + x(ENOMEM, ENOMEM_do_encrypt) \ + x(ENOMEM, ENOMEM_ec_read_extent) \ + x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \ + x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \ + x(ENOMEM, ENOMEM_fs_btree_cache_init) \ + x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \ + x(ENOMEM, ENOMEM_fs_counters_init) \ + x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \ + x(ENOMEM, ENOMEM_io_clock_init) \ + x(ENOMEM, ENOMEM_blacklist_table_init) \ + x(ENOMEM, ENOMEM_sb_realloc_injected) \ + x(ENOMEM, ENOMEM_sb_bio_realloc) \ + x(ENOMEM, ENOMEM_sb_buf_realloc) \ + x(ENOMEM, ENOMEM_sb_journal_validate) \ + x(ENOMEM, ENOMEM_sb_journal_v2_validate) \ + x(ENOMEM, ENOMEM_journal_entry_add) \ + x(ENOMEM, ENOMEM_journal_read_buf_realloc) \ + x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\ + x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \ + x(ENOMEM, ENOMEM_bio_read_init) \ + x(ENOMEM, ENOMEM_bio_read_split_init) \ + x(ENOMEM, ENOMEM_bio_write_init) \ + x(ENOMEM, ENOMEM_bio_bounce_pages_init) \ + x(ENOMEM, ENOMEM_writepage_bioset_init) \ + x(ENOMEM, ENOMEM_dio_read_bioset_init) \ + x(ENOMEM, ENOMEM_dio_write_bioset_init) \ + x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ + x(ENOMEM, ENOMEM_promote_table_init) \ + x(ENOMEM, ENOMEM_compression_bounce_read_init) \ + x(ENOMEM, ENOMEM_compression_bounce_write_init) \ + x(ENOMEM, ENOMEM_compression_workspace_init) \ + x(ENOMEM, ENOMEM_decompression_workspace_init) \ + x(ENOMEM, ENOMEM_bucket_gens) \ + x(ENOMEM, ENOMEM_buckets_nouse) \ + x(ENOMEM, ENOMEM_usage_init) \ + x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \ + x(ENOMEM, ENOMEM_btree_node_reclaim) \ + x(ENOMEM, ENOMEM_btree_node_mem_alloc) \ + x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \ + x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\ + x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \ + x(ENOMEM, ENOMEM_set_nr_journal_buckets) \ + x(ENOMEM, ENOMEM_dev_journal_init) \ + x(ENOMEM, ENOMEM_journal_pin_fifo) \ + x(ENOMEM, ENOMEM_journal_buf) \ + x(ENOMEM, ENOMEM_gc_start) \ + x(ENOMEM, ENOMEM_gc_alloc_start) \ + x(ENOMEM, ENOMEM_gc_reflink_start) \ + x(ENOMEM, ENOMEM_gc_gens) \ + x(ENOMEM, ENOMEM_gc_repair_key) \ + x(ENOMEM, 
ENOMEM_fsck_extent_ends_at) \ + x(ENOMEM, ENOMEM_fsck_add_nlink) \ + x(ENOMEM, ENOMEM_journal_key_insert) \ + x(ENOMEM, ENOMEM_journal_keys_sort) \ + x(ENOMEM, ENOMEM_journal_replay) \ + x(ENOMEM, ENOMEM_read_superblock_clean) \ + x(ENOMEM, ENOMEM_fs_alloc) \ + x(ENOMEM, ENOMEM_fs_name_alloc) \ + x(ENOMEM, ENOMEM_fs_other_alloc) \ + x(ENOMEM, ENOMEM_dev_alloc) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ x(ENOSPC, ENOSPC_stripe_create) \ - x(ENOSPC, ENOSPC_stripe_reuse) \ x(ENOSPC, ENOSPC_inode_create) \ x(ENOSPC, ENOSPC_str_hash_create) \ x(ENOSPC, ENOSPC_snapshot_create) \ x(ENOSPC, ENOSPC_subvolume_create) \ x(ENOSPC, ENOSPC_sb) \ x(ENOSPC, ENOSPC_sb_journal) \ + x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \ x(ENOSPC, ENOSPC_sb_quota) \ x(ENOSPC, ENOSPC_sb_replicas) \ x(ENOSPC, ENOSPC_sb_members) \ + x(ENOSPC, ENOSPC_sb_crypt) \ + x(ENOSPC, ENOSPC_btree_slot) \ + x(ENOSPC, ENOSPC_snapshot_tree) \ + x(ENOENT, ENOENT_bkey_type_mismatch) \ + x(ENOENT, ENOENT_str_hash_lookup) \ + x(ENOENT, ENOENT_str_hash_set_must_replace) \ + x(ENOENT, ENOENT_inode) \ + x(ENOENT, ENOENT_not_subvol) \ + x(ENOENT, ENOENT_directory_dead) \ + x(ENOENT, ENOENT_subvolume) \ + x(ENOENT, ENOENT_snapshot_tree) \ + x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ + x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_idx_not_found) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ - x(0, insufficient_devices) \ x(0, transaction_restart) \ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ x(BCH_ERR_transaction_restart, transaction_restart_relock) \ @@ -43,6 +130,7 @@ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ x(0, no_btree_node) \ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ @@ -53,18 +141,78 @@ x(BCH_ERR_no_btree_node, no_btree_node_down) \ x(BCH_ERR_no_btree_node, no_btree_node_init) \ x(BCH_ERR_no_btree_node, no_btree_node_cached) \ + x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \ + x(0, btree_insert_fail) \ + x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ - x(0, fsck) \ + x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ x(BCH_ERR_fsck, fsck_ignore) \ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(0, need_snapshot_cleanup) \ - x(0, need_topology_repair) \ - x(0, unwritten_extent_update) + x(0, restart_recovery) \ + x(0, unwritten_extent_update) \ + x(EINVAL, device_state_not_allowed) \ + x(EINVAL, member_info_missing) \ + x(EINVAL, mismatched_block_size) \ + x(EINVAL, block_size_too_small) \ + x(EINVAL, bucket_size_too_small) \ + x(EINVAL, device_size_too_small) \ + x(EINVAL, device_not_a_member_of_filesystem) \ + x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_already_online) \ + x(EINVAL, 
insufficient_devices_to_start) \ + x(EINVAL, invalid) \ + x(EINVAL, internal_fsck_err) \ + x(EROFS, erofs_trans_commit) \ + x(EROFS, erofs_no_writes) \ + x(EROFS, erofs_journal_err) \ + x(EROFS, erofs_sb_err) \ + x(EROFS, erofs_unfixed_errors) \ + x(EROFS, erofs_norecovery) \ + x(EROFS, erofs_nochanges) \ + x(EROFS, insufficient_devices) \ + x(0, operation_blocked) \ + x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ + x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ + x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ + x(BCH_ERR_invalid, invalid_sb) \ + x(BCH_ERR_invalid_sb, invalid_sb_magic) \ + x(BCH_ERR_invalid_sb, invalid_sb_version) \ + x(BCH_ERR_invalid_sb, invalid_sb_features) \ + x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ + x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \ + x(BCH_ERR_invalid_sb, invalid_sb_csum) \ + x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ + x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ + x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ + x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ + x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ + x(BCH_ERR_invalid_sb, invalid_sb_layout) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ + x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ + x(BCH_ERR_invalid_sb, invalid_sb_members) \ + x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ + x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ + x(BCH_ERR_invalid_sb, invalid_sb_journal) \ + x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ + x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ + x(BCH_ERR_invalid_sb, invalid_sb_clean) \ + x(BCH_ERR_invalid_sb, invalid_sb_quota) \ + x(BCH_ERR_invalid, invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ enum bch_errcode { BCH_ERR_START = 2048, @@ -79,13 +227,13 @@ bool __bch2_err_matches(int, int); static inline bool _bch2_err_matches(int err, int class) { - return err && __bch2_err_matches(err, class); + return err < 0 && __bch2_err_matches(err, class); } #define bch2_err_matches(_err, _class) \ ({ \ BUILD_BUG_ON(!__builtin_constant_p(_class)); \ - _bch2_err_matches(_err, _class); \ + unlikely(_bch2_err_matches(_err, _class)); \ }) int __bch2_err_class(int); diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 3e49d72..39009cf 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -28,7 +28,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) void bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) bch2_inconsistent_error(c); } @@ -62,10 +62,52 @@ void bch2_io_error(struct bch_dev *ca) //queue_work(system_long_wq, &ca->io_error_work); } +enum ask_yn { + YN_NO, + YN_YES, + YN_ALLNO, + YN_ALLYES, +}; + #ifdef __KERNEL__ -#define ask_yn() false +#define bch2_fsck_ask_yn() YN_NO #else + #include "tools-util.h" + +enum ask_yn bch2_fsck_ask_yn(void) +{ + char *buf = NULL; + size_t buflen = 0; + bool ret; + + while (true) { + fputs(" (y,n, or Y,N for all errors of this type) ", stdout); + fflush(stdout); + + if (getline(&buf, &buflen, stdin) < 0) + die("error reading from standard input"); + + strim(buf); + if (strlen(buf) != 1) + continue; + + switch 
(buf[0]) { + case 'n': + return YN_NO; + case 'y': + return YN_YES; + case 'N': + return YN_ALLNO; + case 'Y': + return YN_ALLYES; + } + } + + free(buf); + return ret; +} + #endif static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) @@ -95,7 +137,6 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) INIT_LIST_HEAD(&s->list); s->fmt = fmt; - s->buf = PRINTBUF; list_add(&s->list, &c->fsck_errors); return s; } @@ -108,9 +149,28 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + mutex_lock(&c->fsck_error_lock); s = fsck_err_get(c, fmt); if (s) { + /* + * We may be called multiple times for the same error on + * transaction restart - this memoizes instead of asking the user + * multiple times for the same error: + */ + if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { + ret = s->ret; + mutex_unlock(&c->fsck_error_lock); + printbuf_exit(&buf); + return ret; + } + + kfree(s->last_msg); + s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + if (c->opts.ratelimit_errors && !(flags & FSCK_NO_RATELIMIT) && s->nr >= FSCK_ERR_RATELIMIT_NR) { @@ -120,8 +180,6 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) print = false; } - printbuf_reset(&s->buf); - out = &s->buf; s->nr++; } @@ -130,10 +188,6 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) prt_printf(out, bch2_log_msg(c, "")); #endif - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { @@ -147,18 +201,32 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) prt_str(out, ", continuing"); ret = -BCH_ERR_fsck_ignore; } - } else if (c->opts.fix_errors == FSCK_OPT_EXIT) { + } else if (c->opts.fix_errors == FSCK_FIX_exit) { prt_str(out, ", exiting"); ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { - if (c->opts.fix_errors == FSCK_OPT_ASK) { + int fix = s && s->fix + ? s->fix + : c->opts.fix_errors; + + if (fix == FSCK_FIX_ask) { + int ask; + prt_str(out, ": fix?"); bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - ret = ask_yn() + + ask = bch2_fsck_ask_yn(); + + if (ask >= YN_ALLNO && s) + s->fix = ask == YN_ALLNO + ? FSCK_FIX_no + : FSCK_FIX_yes; + + ret = ask & 1 ? -BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; - } else if (c->opts.fix_errors == FSCK_OPT_YES || + } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { prt_str(out, ", fixing"); @@ -173,7 +241,7 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) } if (ret == -BCH_ERR_fsck_ignore && - (c->opts.fix_errors == FSCK_OPT_EXIT || + (c->opts.fix_errors == FSCK_FIX_exit || !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; @@ -187,6 +255,9 @@ int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
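The FSCK_FIX_ask branch above relies on the ordering of the new enum ask_yn: the "yes" answers are odd, so "ask & 1" decides whether to fix, while "ask >= YN_ALLNO" decides whether to remember the answer in s->fix for later errors with the same format string. A tiny standalone check of that encoding (hypothetical test, not part of the patch):

#include <assert.h>

enum ask_yn { YN_NO, YN_YES, YN_ALLNO, YN_ALLYES };

int main(void)
{
	/* odd values answer "yes, fix this error" */
	assert((YN_YES & 1) && (YN_ALLYES & 1));
	assert(!(YN_NO & 1) && !(YN_ALLNO & 1));

	/* the capitalized answers also apply to all future errors of this type */
	assert(YN_ALLNO >= YN_ALLNO && YN_ALLYES >= YN_ALLNO);
	assert(YN_NO < YN_ALLNO && YN_YES < YN_ALLNO);

	return 0;
}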
else if (suppressing) bch_err(c, "Ratelimiting new instances of previous error"); + if (s) + s->ret = ret; + mutex_unlock(&c->fsck_error_lock); printbuf_exit(&buf); @@ -211,11 +282,11 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_lock(&c->fsck_error_lock); list_for_each_entry_safe(s, n, &c->fsck_errors, list) { - if (s->ratelimited) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf.buf); + if (s->ratelimited && s->last_msg) + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); list_del(&s->list); - printbuf_exit(&s->buf); + kfree(s->last_msg); kfree(s); } diff --git a/libbcachefs/error.h b/libbcachefs/error.h index dae7262..7ce9540 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -73,8 +73,8 @@ do { \ #define bch2_trans_inconsistent(trans, ...) \ ({ \ bch_err(trans->c, __VA_ARGS__); \ - bch2_inconsistent_error(trans->c); \ bch2_dump_trans_updates(trans); \ + bch2_inconsistent_error(trans->c); \ }) #define bch2_trans_inconsistent_on(cond, trans, ...) \ @@ -91,19 +91,14 @@ do { \ * be able to repair: */ -enum fsck_err_opts { - FSCK_OPT_EXIT, - FSCK_OPT_YES, - FSCK_OPT_NO, - FSCK_OPT_ASK, -}; - struct fsck_err_state { struct list_head list; const char *fmt; u64 nr; bool ratelimited; - struct printbuf buf; + int ret; + int fix; + char *last_msg; }; #define FSCK_CAN_FIX (1 << 0) diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 2fd5d96..21af6fb 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -73,8 +73,7 @@ static int count_iters_for_insert(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), BTREE_ITER_SLOTS, r_k, ret2) { - if (bkey_cmp(bkey_start_pos(r_k.k), - POS(0, idx + sectors)) >= 0) + if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) break; /* extent_update_to_keys(), for the reflink_v update */ @@ -129,14 +128,10 @@ int bch2_extent_atomic_end(struct btree_trans *trans, bch2_trans_copy_iter(©, iter); - for_each_btree_key_continue_norestart(copy, 0, k, ret) { + for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { unsigned offset = 0; - if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) - break; - - if (bkey_cmp(bkey_start_pos(&insert->k), - bkey_start_pos(k.k)) > 0) + if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 1274d50..c13e0af 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -22,12 +22,9 @@ #include "replicas.h" #include "super.h" #include "super-io.h" +#include "trace.h" #include "util.h" -#include - -static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); - static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -166,15 +163,16 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { prt_printf(err, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, @@ -184,30 
+182,16 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { - struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); - - if (bkey_val_bytes(k.k) <= sizeof(*bp.v)) { - prt_printf(err, "value too small (%zu <= %zu)", - bkey_val_bytes(k.k), sizeof(*bp.v)); - return -EINVAL; - } - if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { prt_printf(err, "value too big (%zu > %zu)", bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); - return -EINVAL; - } - - if (c->sb.version < bcachefs_metadata_version_snapshot && - bp.v->min_key.snapshot) { - prt_printf(err, "invalid min_key.snapshot (%u != 0)", - bp.v->min_key.snapshot); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } - return bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, @@ -234,8 +218,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); if (version < bcachefs_metadata_version_inode_btree_change && - btree_node_type_is_extents(btree_id) && - bkey_cmp(bp.v->min_key, POS_MIN)) + btree_id_is_extents(btree_id) && + !bkey_eq(bp.v->min_key, POS_MIN)) bp.v->min_key = write ? bpos_nosnap_predecessor(bp.v->min_key) : bpos_nosnap_successor(bp.v->min_key); @@ -389,20 +373,15 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(*r.v)); - return -EINVAL; - } - if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { prt_printf(err, "invalid nr_replicas (%u)", r.v->nr_replicas); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -512,7 +491,7 @@ restart_narrow_pointers: bkey_for_each_ptr_decode(&k->k, ptrs, p, i) if (can_narrow_crc(p.crc, n)) { - __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); p.ptr.offset += p.crc.offset; p.crc = n; bch2_extent_ptr_decoded_append(k, &p); @@ -538,7 +517,7 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, switch (type) { case BCH_EXTENT_ENTRY_crc32: set_common_fields(dst->crc32, src); - dst->crc32.csum = *((__le32 *) &src.csum.lo); + memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum)); break; case BCH_EXTENT_ENTRY_crc64: set_common_fields(dst->crc64, src); @@ -665,9 +644,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return replicas; } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { - unsigned durability = 0; struct bch_dev *ca; if (p->ptr.cached) @@ -675,13 +653,28 @@ unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded ca = bch_dev_bkey_exists(c, p->ptr.dev); - if (ca->mi.state != BCH_MEMBER_STATE_failed) - durability = max_t(unsigned, durability, ca->mi.durability); + return ca->mi.durability + + (p->has_ec + ? 
p->ec.redundancy + : 0); +} + +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca; - if (p->has_ec) - durability += p->ec.redundancy; + if (p->ptr.cached) + return 0; - return durability; + ca = bch_dev_bkey_exists(c, p->ptr.dev); + + if (ca->mi.state == BCH_MEMBER_STATE_failed) + return 0; + + return ca->mi.durability + + (p->has_ec + ? p->ec.redundancy + : 0); } unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) @@ -692,53 +685,32 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) unsigned durability = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c,& p); + durability += bch2_extent_ptr_durability(c, &p); return durability; } -void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) -{ - union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); - union bch_extent_entry *next = extent_entry_next(entry); - - memmove_u64s(entry, next, (u64 *) end - (u64 *) next); - k->k.u64s -= extent_entry_u64s(entry); -} - -void bch2_bkey_append_ptr(struct bkey_i *k, - struct bch_extent_ptr ptr) +static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) { - EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); - - switch (k->k.type) { - case KEY_TYPE_btree_ptr: - case KEY_TYPE_btree_ptr_v2: - case KEY_TYPE_extent: - EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; - ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) + durability += bch2_extent_ptr_durability(c, &p); - memcpy((void *) &k->v + bkey_val_bytes(&k->k), - &ptr, - sizeof(ptr)); - k->u64s++; - break; - default: - BUG(); - } + return durability; } -static inline void __extent_entry_insert(struct bkey_i *k, - union bch_extent_entry *dst, - union bch_extent_entry *new) +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) { union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); - memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - k->k.u64s += extent_entry_u64s(new); - memcpy(dst, new, extent_entry_bytes(new)); + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); } void bch2_extent_ptr_decoded_append(struct bkey_i *k, @@ -800,8 +772,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) /* * Returns pointer to the next entry after the one being dropped: */ -static union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k, - struct bch_extent_ptr *ptr) +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, + struct bch_extent_ptr *ptr) { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry = to_entry(ptr), *next; @@ -844,7 +816,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, { bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; union bch_extent_entry *ret = - __bch2_bkey_drop_ptr(k, ptr); + bch2_bkey_drop_ptr_noerror(k, ptr); /* * If we deleted all the dirty pointers and there's still cached @@ -875,14 +847,13 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) { - struct bch_extent_ptr 
*ptr = (void *) bch2_bkey_has_device(k.s_c, dev); + struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); if (ptr) - __bch2_bkey_drop_ptr(k, ptr); + bch2_bkey_drop_ptr_noerror(k, ptr); } -const struct bch_extent_ptr * -bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; @@ -957,11 +928,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) } } -bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, - struct bkey_s_c k2) +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) { - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); - const union bch_extent_entry *entry2; + struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); + union bch_extent_entry *entry2; struct extent_ptr_decoded p2; bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) @@ -969,9 +940,32 @@ bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, p1.ptr.gen == p2.ptr.gen && (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) - return true; + return &entry2->ptr; - return false; + return NULL; +} + +void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + union bch_extent_entry *ec = NULL; + + bkey_extent_entry_for_each(ptrs, entry) { + if (&entry->ptr == ptr) { + ptr->cached = true; + if (ec) + extent_entry_drop(k, ec); + return; + } + + if (extent_entry_is_stripe_ptr(entry)) + ec = entry; + else if (extent_entry_is_ptr(entry)) + ec = NULL; + } + + BUG(); } /* @@ -1004,6 +998,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca; bool first = true; + if (c) + prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); + bkey_extent_entry_for_each(ptrs, entry) { if (!first) prt_printf(out, " "); @@ -1075,14 +1072,14 @@ static int extent_ptr_invalid(const struct bch_fs *c, if (!bch2_dev_exists2(c, ptr->dev)) { prt_printf(err, "pointer to invalid device (%u)", ptr->dev); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } ca = bch_dev_bkey_exists(c, ptr->dev); bkey_for_each_ptr(ptrs, ptr2) if (ptr != ptr2 && ptr->dev == ptr2->dev) { prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); @@ -1090,26 +1087,27 @@ static int extent_ptr_invalid(const struct bch_fs *c, if (bucket >= ca->mi.nbuckets) { prt_printf(err, "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { prt_printf(err, "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bucket_offset + size_ondisk > ca->mi.bucket_size) { prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", bucket_offset, size_ondisk, ca->mi.bucket_size); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; } int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ 
-1117,7 +1115,7 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, unsigned size_ondisk = k.k->size; unsigned nonce = UINT_MAX; unsigned nr_ptrs = 0; - bool unwritten = false; + bool unwritten = false, have_ec = false, crc_since_last_ptr = false; int ret; if (bkey_is_btree_ptr(k.k)) @@ -1127,13 +1125,13 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { prt_printf(err, "invalid extent entry type (got %u, max %u)", __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bkey_is_btree_ptr(k.k) && !extent_entry_is_ptr(entry)) { prt_printf(err, "has non ptr field"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } switch (extent_entry_type(entry)) { @@ -1145,15 +1143,22 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (nr_ptrs && unwritten != entry->ptr.unwritten) { prt_printf(err, "extent with unwritten and written ptrs"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { prt_printf(err, "has unwritten ptrs"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; + } + + if (entry->ptr.cached && have_ec) { + prt_printf(err, "cached, erasure coded ptr"); + return -BCH_ERR_invalid_bkey; } unwritten = entry->ptr.unwritten; + have_ec = false; + crc_since_last_ptr = false; nr_ptrs++; break; case BCH_EXTENT_ENTRY_crc32: @@ -1164,19 +1169,19 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, if (crc.offset + crc.live_size > crc.uncompressed_size) { prt_printf(err, "checksum offset + key size > uncompressed size"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } size_ondisk = crc.compressed_size; if (!bch2_checksum_type_valid(c, crc.csum_type)) { prt_printf(err, "invalid checksum type"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { prt_printf(err, "invalid compression type"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bch2_csum_type_is_encryption(crc.csum_type)) { @@ -1184,18 +1189,46 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, nonce = crc.offset + crc.nonce; else if (nonce != crc.offset + crc.nonce) { prt_printf(err, "incorrect nonce"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } } + + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + crc_since_last_ptr = true; break; case BCH_EXTENT_ENTRY_stripe_ptr: + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } + have_ec = true; + break; + case BCH_EXTENT_ENTRY_rebalance: break; } } + if (!nr_ptrs) { + prt_str(err, "no ptrs"); + return -BCH_ERR_invalid_bkey; + } + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { prt_str(err, "too many ptrs"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; + } + + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; } return 0; @@ -1233,6 +1266,8 @@ void bch2_ptr_swab(struct bkey_s k) break; case BCH_EXTENT_ENTRY_stripe_ptr: break; + case BCH_EXTENT_ENTRY_rebalance: + break; } } } @@ -1245,10 +1280,10 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) int val_u64s_delta; u64 sub; - if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) + if (bkey_le(where, bkey_start_pos(k.k))) return 0; - EBUG_ON(bkey_cmp(where, 
k.k->p) > 0); + EBUG_ON(bkey_gt(where, k.k->p)); sub = where.offset - bkey_start_offset(k.k); @@ -1283,6 +1318,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) break; case BCH_EXTENT_ENTRY_stripe_ptr: break; + case BCH_EXTENT_ENTRY_rebalance: + break; } if (extent_entry_is_crc(entry)) @@ -1325,10 +1362,10 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k) int val_u64s_delta; u64 len = 0; - if (bkey_cmp(where, k.k->p) >= 0) + if (bkey_ge(where, k.k->p)) return 0; - EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); + EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); len = where.offset - bkey_start_offset(k.k); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 84737af..6e9d23a 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -8,6 +8,7 @@ struct bch_fs; struct btree_trans; +enum bkey_invalid_flags; /* extent entries: */ @@ -76,6 +77,18 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) return extent_entry_bytes(entry) / sizeof(u64); } +static inline void __extent_entry_insert(struct bkey_i *k, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); + memcpy_u64s_small(dst, new, extent_entry_u64s(new)); +} + static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) { return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; @@ -142,11 +155,7 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) common_fields(crc->crc32), }; - *((__le32 *) &ret.csum.lo) = crc->crc32.csum; - - memcpy(&ret.csum.lo, &crc->crc32.csum, - sizeof(crc->crc32.csum)); - + memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum)); return ret; } case BCH_EXTENT_ENTRY_crc64: { @@ -156,8 +165,8 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) .csum.lo = (__force __le64) crc->crc64.csum_lo, }; - *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; - + u16 hi = crc->crc64.csum_hi; + memcpy(&ret.csum.hi, &hi, sizeof(hi)); return ret; } case BCH_EXTENT_ENTRY_crc128: { @@ -306,6 +315,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) (_ptr).ec = _entry->stripe_ptr; \ (_ptr).has_ec = true; \ break; \ + default: \ + /* nothing */ \ + break; \ } \ out: \ _entry < (_end); \ @@ -371,11 +383,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ -int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -395,6 +409,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, .compat = bch2_btree_ptr_v2_compat, \ .trans_trigger = bch2_trans_mark_extent, \ .atomic_trigger = bch2_mark_extent, \ + .min_val_size = 40, \ }) /* KEY_TYPE_extent: */ @@ -414,7 +429,7 @@ bool bch2_extent_merge(struct bch_fs *, struct 
bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -424,6 +439,7 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_merge = bch2_reservation_merge, \ .trans_trigger = bch2_trans_mark_reservation, \ .atomic_trigger = bch2_mark_reservation, \ + .min_val_size = 8, \ }) /* Extent checksum entries: */ @@ -596,13 +612,50 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); +void bch2_bkey_drop_device(struct bkey_s, unsigned); +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); + +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); + +static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) +{ + return (void *) bch2_bkey_has_device_c(k.s_c, dev); +} + +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); -void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); + +static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) +{ + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->k.u64s++; + break; + default: + BUG(); + } +} + void bch2_extent_ptr_decoded_append(struct bkey_i *, struct extent_ptr_decoded *); +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, + struct bch_extent_ptr *); union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, struct bch_extent_ptr *); @@ -623,21 +676,19 @@ do { \ } \ } while (0) -void bch2_bkey_drop_device(struct bkey_s, unsigned); -void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); -const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); -bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); - bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, struct bch_extent_ptr, u64); bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); -bool bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s_c); +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); + +void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_ptr_swab(struct bkey_s); @@ -654,9 +705,8 @@ enum bch_extent_overlap { static inline enum bch_extent_overlap 
bch2_extent_overlap(const struct bkey *k, const struct bkey *m) { - int cmp1 = bkey_cmp(k->p, m->p) < 0; - int cmp2 = bkey_cmp(bkey_start_pos(k), - bkey_start_pos(m)) > 0; + int cmp1 = bkey_lt(k->p, m->p); + int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); return (cmp1 << 1) + cmp2; } diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 1f2e1fc..bb53054 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -281,7 +281,7 @@ int bch2_unlink_trans(struct btree_trans *trans, } if (deleting_snapshot && !inode_u->bi_subvol) { - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_not_subvol; goto err; } diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index dc2352d..d433f4d 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -19,6 +19,7 @@ #include "keylist.h" #include "quota.h" #include "reflink.h" +#include "trace.h" #include #include @@ -32,9 +33,114 @@ #include #include -#include #include +static int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); + +struct folio_vec { + struct folio *fv_folio; + size_t fv_offset; + size_t fv_len; +}; + +static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) +{ + + struct folio *folio = page_folio(bv.bv_page); + size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + + bv.bv_offset; + size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); + + return (struct folio_vec) { + .fv_folio = folio, + .fv_offset = offset, + .fv_len = len, + }; +} + +static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, + struct bvec_iter iter) +{ + return biovec_to_foliovec(bio_iter_iovec(bio, iter)); +} + +#define __bio_for_each_folio(bvl, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ + bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) + +/** + * bio_for_each_folio - iterate over folios within a bio + * + * Like other non-_all versions, this iterates over what bio->bi_iter currently + * points to. This version is for drivers, where the bio may have previously + * been split or cloned. + */ +#define bio_for_each_folio(bvl, bio, iter) \ + __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) + +/* + * Use u64 for the end pos and sector helpers because if the folio covers the + * max supported range of the mapping, the start offset of the next folio + * overflows loff_t. This breaks much of the range based processing in the + * buffered write path. 
+ */ +static inline u64 folio_end_pos(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + +static inline size_t folio_sectors(struct folio *folio) +{ + return PAGE_SECTORS << folio_order(folio); +} + +static inline loff_t folio_sector(struct folio *folio) +{ + return folio_pos(folio) >> 9; +} + +static inline u64 folio_end_sector(struct folio *folio) +{ + return folio_end_pos(folio) >> 9; +} + +typedef DARRAY(struct folio *) folios; + +static int filemap_get_contig_folios_d(struct address_space *mapping, + loff_t start, u64 end, + int fgp_flags, gfp_t gfp, + folios *folios) +{ + struct folio *f; + u64 pos = start; + int ret = 0; + + while (pos < end) { + if ((u64) pos >= (u64) start + (1ULL << 20)) + fgp_flags &= ~FGP_CREAT; + + ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); + if (ret) + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); + if (IS_ERR_OR_NULL(f)) + break; + + BUG_ON(folios->nr && folio_pos(f) != pos); + + pos = folio_end_pos(f); + darray_push(folios, f); + } + + if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) + ret = -ENOMEM; + + return folios->nr ? 0 : ret; +} + struct nocow_flush { struct closure *cl; struct bch_dev *ca; @@ -229,6 +335,9 @@ static int bch2_quota_reservation_add(struct bch_fs *c, { int ret; + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) + return 0; + mutex_lock(&inode->ei_quota_lock); ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); @@ -310,7 +419,9 @@ static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA - if (quota_res && sectors > 0) { + if (quota_res && + !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && + sectors > 0) { BUG_ON(sectors > quota_res->sectors); BUG_ON(sectors > inode->ei_quota_reserved); @@ -336,7 +447,66 @@ static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* stored in page->private: */ -struct bch_page_sector { +#define BCH_FOLIO_SECTOR_STATE() \ + x(unallocated) \ + x(reserved) \ + x(dirty) \ + x(dirty_reserved) \ + x(allocated) + +enum bch_folio_sector_state { +#define x(n) SECTOR_##n, + BCH_FOLIO_SECTOR_STATE() +#undef x +}; + +static const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x + NULL +}; + +static inline enum bch_folio_sector_state +folio_sector_dirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_dirty; + case SECTOR_reserved: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_undirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_dirty: + return SECTOR_unallocated; + case SECTOR_dirty_reserved: + return SECTOR_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_reserve(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_reserved; + case SECTOR_dirty: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ unsigned nr_replicas:4; @@ -344,112 +514,143 @@ struct bch_page_sector { unsigned replicas_reserved:4; /* i_sectors: */ - enum { - SECTOR_UNALLOCATED, - SECTOR_RESERVED, - SECTOR_DIRTY, - SECTOR_DIRTY_RESERVED, - SECTOR_ALLOCATED, - } state:8; + enum 
bch_folio_sector_state state:8; }; -struct bch_page_state { +struct bch_folio { spinlock_t lock; atomic_t write_count; + /* + * Is the sector state up to date with the btree? + * (Not the data itself) + */ bool uptodate; - struct bch_page_sector s[PAGE_SECTORS]; + struct bch_folio_sector s[]; }; -static inline struct bch_page_state *__bch2_page_state(struct page *page) +static inline void folio_sector_set(struct folio *folio, + struct bch_folio *s, + unsigned i, unsigned n) +{ + s->s[i].state = n; +} + +/* file offset (to folio offset) to bch_folio_sector index */ +static inline int folio_pos_to_s(struct folio *folio, loff_t pos) +{ + u64 f_offset = pos - folio_pos(folio); + BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); + return f_offset >> SECTOR_SHIFT; +} + +static inline struct bch_folio *__bch2_folio(struct folio *folio) { - return page_has_private(page) - ? (struct bch_page_state *) page_private(page) + return folio_has_private(folio) + ? (struct bch_folio *) folio_get_private(folio) : NULL; } -static inline struct bch_page_state *bch2_page_state(struct page *page) +static inline struct bch_folio *bch2_folio(struct folio *folio) { - EBUG_ON(!PageLocked(page)); + EBUG_ON(!folio_test_locked(folio)); - return __bch2_page_state(page); + return __bch2_folio(folio); } -/* for newly allocated pages: */ -static void __bch2_page_state_release(struct page *page) +/* for newly allocated folios: */ +static void __bch2_folio_release(struct folio *folio) { - kfree(detach_page_private(page)); + kfree(folio_detach_private(folio)); } -static void bch2_page_state_release(struct page *page) +static void bch2_folio_release(struct folio *folio) { - EBUG_ON(!PageLocked(page)); - __bch2_page_state_release(page); + EBUG_ON(!folio_test_locked(folio)); + __bch2_folio_release(folio); } -/* for newly allocated pages: */ -static struct bch_page_state *__bch2_page_state_create(struct page *page, - gfp_t gfp) +/* for newly allocated folios: */ +static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) { - struct bch_page_state *s; + struct bch_folio *s; - s = kzalloc(sizeof(*s), GFP_NOFS|gfp); + s = kzalloc(sizeof(*s) + + sizeof(struct bch_folio_sector) * + folio_sectors(folio), gfp); if (!s) return NULL; spin_lock_init(&s->lock); - attach_page_private(page, s); + folio_attach_private(folio, s); return s; } -static struct bch_page_state *bch2_page_state_create(struct page *page, - gfp_t gfp) +static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) { - return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); } static unsigned bkey_to_sector_state(struct bkey_s_c k) { if (bkey_extent_is_reservation(k)) - return SECTOR_RESERVED; + return SECTOR_reserved; if (bkey_extent_is_allocation(k.k)) - return SECTOR_ALLOCATED; - return SECTOR_UNALLOCATED; + return SECTOR_allocated; + return SECTOR_unallocated; } -static void __bch2_page_state_set(struct page *page, - unsigned pg_offset, unsigned pg_len, - unsigned nr_ptrs, unsigned state) +static void __bch2_folio_set(struct folio *folio, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) { - struct bch_page_state *s = bch2_page_state_create(page, __GFP_NOFAIL); - unsigned i; + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); 
spin_lock(&s->lock); for (i = pg_offset; i < pg_offset + pg_len; i++) { - s->s[i].nr_replicas = nr_ptrs; - s->s[i].state = state; + s->s[i].nr_replicas = nr_ptrs; + folio_sector_set(folio, s, i, state); } - if (i == PAGE_SECTORS) + if (i == sectors) s->uptodate = true; spin_unlock(&s->lock); } -static int bch2_page_state_set(struct bch_fs *c, subvol_inum inum, - struct page **pages, unsigned nr_pages) +/* + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the + * extents btree: + */ +static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + struct folio **folios, unsigned nr_folios) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - u64 offset = pages[0]->index << PAGE_SECTORS_SHIFT; - unsigned pg_idx = 0; + struct bch_folio *s; + u64 offset = folio_sector(folios[0]); + unsigned folio_idx; u32 snapshot; + bool need_set = false; int ret; + for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); + if (!s) + return -ENOMEM; + + need_set |= !s->uptodate; + } + + if (!need_set) + return 0; + + folio_idx = 0; bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); @@ -464,25 +665,25 @@ retry: unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); - while (pg_idx < nr_pages) { - struct page *page = pages[pg_idx]; - u64 pg_start = page->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (page->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(bkey_start_offset(k.k), pg_start) - pg_start; - unsigned pg_len = min(k.k->p.offset, pg_end) - pg_offset - pg_start; + while (folio_idx < nr_folios) { + struct folio *folio = folios[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; - BUG_ON(k.k->p.offset < pg_start); - BUG_ON(bkey_start_offset(k.k) > pg_end); + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) - __bch2_page_state_set(page, pg_offset, pg_len, nr_ptrs, state); + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); - if (k.k->p.offset < pg_end) + if (k.k->p.offset < folio_end) break; - pg_idx++; + folio_idx++; } - if (pg_idx == nr_pages) + if (folio_idx == nr_folios) break; } @@ -499,14 +700,16 @@ err: static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; - struct bio_vec bv; + struct folio_vec fv; unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ? 
0 : bch2_bkey_nr_ptrs_fully_allocated(k); unsigned state = bkey_to_sector_state(k); - bio_for_each_segment(bv, bio, iter) - __bch2_page_state_set(bv.bv_page, bv.bv_offset >> 9, - bv.bv_len >> 9, nr_ptrs, state); + bio_for_each_folio(fv, bio, iter) + __bch2_folio_set(fv.fv_folio, + fv.fv_offset >> 9, + fv.fv_len >> 9, + nr_ptrs, state); } static void mark_pagecache_unallocated(struct bch_inode_info *inode, @@ -526,22 +729,20 @@ static void mark_pagecache_unallocated(struct bch_inode_info *inode, &index, end_index, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(start, pg_start) - pg_start; - unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; - struct bch_page_state *s; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; - BUG_ON(end <= pg_start); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(end <= folio_start); folio_lock(folio); - s = bch2_page_state(&folio->page); + s = bch2_folio(folio); if (s) { spin_lock(&s->lock); - for (j = pg_offset; j < pg_offset + pg_len; j++) + for (j = folio_offset; j < folio_offset + folio_len; j++) s->s[j].nr_replicas = 0; spin_unlock(&s->lock); } @@ -572,33 +773,23 @@ static void mark_pagecache_reserved(struct bch_inode_info *inode, &index, end_index, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - u64 pg_start = folio->index << PAGE_SECTORS_SHIFT; - u64 pg_end = (folio->index + 1) << PAGE_SECTORS_SHIFT; - unsigned pg_offset = max(start, pg_start) - pg_start; - unsigned pg_len = min(end, pg_end) - pg_offset - pg_start; - struct bch_page_state *s; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; - BUG_ON(end <= pg_start); - BUG_ON(pg_offset >= PAGE_SECTORS); - BUG_ON(pg_offset + pg_len > PAGE_SECTORS); + BUG_ON(end <= folio_start); folio_lock(folio); - s = bch2_page_state(&folio->page); + s = bch2_folio(folio); if (s) { spin_lock(&s->lock); - for (j = pg_offset; j < pg_offset + pg_len; j++) - switch (s->s[j].state) { - case SECTOR_UNALLOCATED: - s->s[j].state = SECTOR_RESERVED; - break; - case SECTOR_DIRTY: - s->s[j].state = SECTOR_DIRTY_RESERVED; - i_sectors_delta--; - break; - default: - break; - } + for (j = folio_offset; j < folio_offset + folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); + } spin_unlock(&s->lock); } @@ -619,28 +810,28 @@ static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info : c->opts.data_replicas; } -static inline unsigned sectors_to_reserve(struct bch_page_sector *s, - unsigned nr_replicas) +static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, + unsigned nr_replicas) { return max(0, (int) nr_replicas - s->nr_replicas - s->replicas_reserved); } -static int bch2_get_page_disk_reservation(struct bch_fs *c, +static int bch2_get_folio_disk_reservation(struct bch_fs *c, struct bch_inode_info *inode, - struct page *page, bool 
check_enospc) + struct folio *folio, bool check_enospc) { - struct bch_page_state *s = bch2_page_state_create(page, 0); + struct bch_folio *s = bch2_folio_create(folio, 0); unsigned nr_replicas = inode_nr_replicas(c, inode); struct disk_reservation disk_res = { 0 }; - unsigned i, disk_res_sectors = 0; + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; int ret; if (!s) return -ENOMEM; - for (i = 0; i < ARRAY_SIZE(s->s); i++) + for (i = 0; i < sectors; i++) disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); if (!disk_res_sectors) @@ -654,41 +845,42 @@ static int bch2_get_page_disk_reservation(struct bch_fs *c, if (unlikely(ret)) return ret; - for (i = 0; i < ARRAY_SIZE(s->s); i++) + for (i = 0; i < sectors; i++) s->s[i].replicas_reserved += sectors_to_reserve(&s->s[i], nr_replicas); return 0; } -struct bch2_page_reservation { +struct bch2_folio_reservation { struct disk_reservation disk; struct quota_res quota; }; -static void bch2_page_reservation_init(struct bch_fs *c, +static void bch2_folio_reservation_init(struct bch_fs *c, struct bch_inode_info *inode, - struct bch2_page_reservation *res) + struct bch2_folio_reservation *res) { memset(res, 0, sizeof(*res)); res->disk.nr_replicas = inode_nr_replicas(c, inode); } -static void bch2_page_reservation_put(struct bch_fs *c, +static void bch2_folio_reservation_put(struct bch_fs *c, struct bch_inode_info *inode, - struct bch2_page_reservation *res) + struct bch2_folio_reservation *res) { bch2_disk_reservation_put(c, &res->disk); bch2_quota_reservation_put(c, inode, &res->quota); } -static int bch2_page_reservation_get(struct bch_fs *c, - struct bch_inode_info *inode, struct page *page, - struct bch2_page_reservation *res, +static int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, unsigned offset, unsigned len) { - struct bch_page_state *s = bch2_page_state_create(page, 0); + struct bch_folio *s = bch2_folio_create(folio, 0); unsigned i, disk_sectors = 0, quota_sectors = 0; int ret; @@ -702,7 +894,7 @@ static int bch2_page_reservation_get(struct bch_fs *c, i++) { disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas); - quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; + quota_sectors += s->s[i].state == SECTOR_unallocated; } if (disk_sectors) { @@ -728,55 +920,49 @@ static int bch2_page_reservation_get(struct bch_fs *c, return 0; } -static void bch2_clear_page_bits(struct page *page) +static void bch2_clear_folio_bits(struct folio *folio) { - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_page_state *s = bch2_page_state(page); + struct bch_folio *s = bch2_folio(folio); struct disk_reservation disk_res = { 0 }; - int i, dirty_sectors = 0; + int i, sectors = folio_sectors(folio), dirty_sectors = 0; if (!s) return; - EBUG_ON(!PageLocked(page)); - EBUG_ON(PageWriteback(page)); + EBUG_ON(!folio_test_locked(folio)); + EBUG_ON(folio_test_writeback(folio)); - for (i = 0; i < ARRAY_SIZE(s->s); i++) { + for (i = 0; i < sectors; i++) { disk_res.sectors += s->s[i].replicas_reserved; s->s[i].replicas_reserved = 0; - switch (s->s[i].state) { - case SECTOR_DIRTY: - s->s[i].state = SECTOR_UNALLOCATED; - --dirty_sectors; - break; - case SECTOR_DIRTY_RESERVED: - s->s[i].state = SECTOR_RESERVED; - break; - default: - break; - } + dirty_sectors -= s->s[i].state == SECTOR_dirty; + 
folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); } bch2_disk_reservation_put(c, &disk_res); i_sectors_acct(c, inode, NULL, dirty_sectors); - bch2_page_state_release(page); + bch2_folio_release(folio); } -static void bch2_set_page_dirty(struct bch_fs *c, - struct bch_inode_info *inode, struct page *page, - struct bch2_page_reservation *res, +static void bch2_set_folio_dirty(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, unsigned offset, unsigned len) { - struct bch_page_state *s = bch2_page_state(page); + struct bch_folio *s = bch2_folio(folio); unsigned i, dirty_sectors = 0; - WARN_ON((u64) page_offset(page) + offset + len > + WARN_ON((u64) folio_pos(folio) + offset + len > round_up((u64) i_size_read(&inode->v), block_bytes(c))); + BUG_ON(!s->uptodate); + spin_lock(&s->lock); for (i = round_down(offset, block_bytes(c)) >> 9; @@ -794,25 +980,17 @@ static void bch2_set_page_dirty(struct bch_fs *c, s->s[i].replicas_reserved += sectors; res->disk.sectors -= sectors; - switch (s->s[i].state) { - case SECTOR_UNALLOCATED: - s->s[i].state = SECTOR_DIRTY; - dirty_sectors++; - break; - case SECTOR_RESERVED: - s->s[i].state = SECTOR_DIRTY_RESERVED; - break; - default: - break; - } + dirty_sectors += s->s[i].state == SECTOR_unallocated; + + folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); } spin_unlock(&s->lock); i_sectors_acct(c, inode, &res->quota, dirty_sectors); - if (!PageDirty(page)) - __set_page_dirty_nobuffers(page); + if (!folio_test_dirty(folio)) + filemap_dirty_folio(inode->v.i_mapping, folio); } vm_fault_t bch2_page_fault(struct vm_fault *vmf) @@ -821,7 +999,7 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct address_space *fdm = faults_disabled_mapping(); struct bch_inode_info *inode = file_bch_inode(file); - int ret; + vm_fault_t ret; if (fdm == mapping) return VM_FAULT_SIGBUS; @@ -855,17 +1033,17 @@ got_lock: vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct file *file = vmf->vma->vm_file; struct bch_inode_info *inode = file_bch_inode(file); struct address_space *mapping = file->f_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation res; + struct bch2_folio_reservation res; unsigned len; loff_t isize; - int ret; + vm_fault_t ret; - bch2_page_reservation_init(c, inode, &res); + bch2_folio_reservation_init(c, inode, &res); sb_start_pagefault(inode->v.i_sb); file_update_time(file); @@ -878,35 +1056,28 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) */ bch2_pagecache_add_get(inode); - lock_page(page); + folio_lock(folio); isize = i_size_read(&inode->v); - if (page->mapping != mapping || page_offset(page) >= isize) { - unlock_page(page); + if (folio->mapping != mapping || folio_pos(folio) >= isize) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } - len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - if (bch2_page_state_set(c, inode_inum(inode), &page, 1)) { - unlock_page(page); - ret = VM_FAULT_SIGBUS; - goto out; - } - } - - if (bch2_page_reservation_get(c, inode, page, &res, 0, len)) { - unlock_page(page); + if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: + bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + folio_unlock(folio); ret = 
VM_FAULT_SIGBUS; goto out; } - bch2_set_page_dirty(c, inode, page, &res, 0, len); - bch2_page_reservation_put(c, inode, &res); + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_folio_reservation_put(c, inode, &res); - wait_for_stable_page(page); + folio_wait_stable(folio); ret = VM_FAULT_LOCKED; out: bch2_pagecache_add_put(inode); @@ -920,7 +1091,7 @@ void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) if (offset || length < folio_size(folio)) return; - bch2_clear_page_bits(&folio->page); + bch2_clear_folio_bits(folio); } bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) @@ -928,7 +1099,7 @@ bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) if (folio_test_dirty(folio) || folio_test_writeback(folio)) return false; - bch2_clear_page_bits(&folio->page); + bch2_clear_folio_bits(folio); return true; } @@ -936,19 +1107,16 @@ bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) static void bch2_readpages_end_io(struct bio *bio) { - struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) { - struct page *page = bv->bv_page; + struct folio_iter fi; + bio_for_each_folio_all(fi, bio) { if (!bio->bi_status) { - SetPageUptodate(page); + folio_mark_uptodate(fi.folio); } else { - ClearPageUptodate(page); - SetPageError(page); + folio_clear_uptodate(fi.folio); + folio_set_error(fi.folio); } - unlock_page(page); + folio_unlock(fi.folio); } bio_put(bio); @@ -956,44 +1124,48 @@ static void bch2_readpages_end_io(struct bio *bio) struct readpages_iter { struct address_space *mapping; - struct page **pages; - unsigned nr_pages; unsigned idx; - pgoff_t offset; + folios folios; }; static int readpages_iter_init(struct readpages_iter *iter, struct readahead_control *ractl) { - unsigned i, nr_pages = readahead_count(ractl); + struct folio **fi; + int ret; memset(iter, 0, sizeof(*iter)); - iter->mapping = ractl->mapping; - iter->offset = readahead_index(ractl); - iter->nr_pages = nr_pages; + iter->mapping = ractl->mapping; - iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!iter->pages) - return -ENOMEM; + ret = filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, mapping_gfp_mask(iter->mapping), + &iter->folios); + if (ret) + return ret; - nr_pages = __readahead_batch(ractl, iter->pages, nr_pages); - for (i = 0; i < nr_pages; i++) { - __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); - put_page(iter->pages[i]); + darray_for_each(iter->folios, fi) { + ractl->_nr_pages -= 1U << folio_order(*fi); + __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); + folio_put(*fi); + folio_put(*fi); } return 0; } -static inline struct page *readpage_iter_next(struct readpages_iter *iter) +static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) { - if (iter->idx >= iter->nr_pages) + if (iter->idx >= iter->folios.nr) return NULL; + return iter->folios.data[iter->idx]; +} - EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); - - return iter->pages[iter->idx]; +static inline void readpage_iter_advance(struct readpages_iter *iter) +{ + iter->idx++; } static bool extent_partial_reads_expensive(struct bkey_s_c k) @@ -1008,52 +1180,57 @@ static bool extent_partial_reads_expensive(struct bkey_s_c k) return false; } -static void readpage_bio_extend(struct readpages_iter *iter, - struct bio *bio, - unsigned sectors_this_extent, - bool get_more) +static int readpage_bio_extend(struct btree_trans 
*trans, + struct readpages_iter *iter, + struct bio *bio, + unsigned sectors_this_extent, + bool get_more) { + /* Don't hold btree locks while allocating memory: */ + bch2_trans_unlock(trans); + while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { - pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; - struct page *page = readpage_iter_next(iter); + struct folio *folio = readpage_iter_peek(iter); int ret; - if (page) { - if (iter->offset + iter->idx != page_offset) - break; - - iter->idx++; + if (folio) { + readpage_iter_advance(iter); } else { + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + if (!get_more) break; - page = xa_load(&iter->mapping->i_pages, page_offset); - if (page && !xa_is_value(page)) + folio = xa_load(&iter->mapping->i_pages, folio_offset); + if (folio && !xa_is_value(folio)) break; - page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); - if (!page) + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + if (!folio) break; - if (!__bch2_page_state_create(page, 0)) { - put_page(page); + if (!__bch2_folio_create(folio, GFP_KERNEL)) { + folio_put(folio); break; } - ret = add_to_page_cache_lru(page, iter->mapping, - page_offset, GFP_NOFS); + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); if (ret) { - __bch2_page_state_release(page); - put_page(page); + __bch2_folio_release(folio); + folio_put(folio); break; } - put_page(page); + folio_put(folio); } - BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); } + + return bch2_trans_relock(trans); } static void bchfs_read(struct btree_trans *trans, @@ -1121,9 +1298,12 @@ retry: sectors = min(sectors, k.k->size - offset_into_extent); - if (readpages_iter) - readpage_bio_extend(readpages_iter, &rbio->bio, sectors, - extent_partial_reads_expensive(k)); + if (readpages_iter) { + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); + if (ret) + break; + } bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); @@ -1168,12 +1348,14 @@ void bch2_readahead(struct readahead_control *ractl) { struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct bch_io_opts opts; struct btree_trans trans; - struct page *page; + struct folio *folio; struct readpages_iter readpages_iter; int ret; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + ret = readpages_iter_init(&readpages_iter, ractl); BUG_ON(ret); @@ -1181,70 +1363,72 @@ void bch2_readahead(struct readahead_control *ractl) bch2_pagecache_add_get(inode); - while ((page = readpage_iter_next(&readpages_iter))) { - pgoff_t index = readpages_iter.offset + readpages_iter.idx; + while ((folio = readpage_iter_peek(&readpages_iter))) { unsigned n = min_t(unsigned, - readpages_iter.nr_pages - + readpages_iter.folios.nr - readpages_iter.idx, BIO_MAX_VECS); struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, - GFP_NOFS, &c->bio_read), + GFP_KERNEL, &c->bio_read), opts); - readpages_iter.idx++; + readpage_iter_advance(&readpages_iter); - rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTORS_SHIFT; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); rbio->bio.bi_end_io = bch2_readpages_end_io; - BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 
0)); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(&trans, rbio, inode_inum(inode), &readpages_iter); + bch2_trans_unlock(&trans); } bch2_pagecache_add_put(inode); bch2_trans_exit(&trans); - kfree(readpages_iter.pages); + darray_exit(&readpages_iter.folios); } -static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, - subvol_inum inum, struct page *page) +static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum, struct folio *folio) { struct btree_trans trans; - bch2_page_state_create(page, __GFP_NOFAIL); + bch2_folio_create(folio, __GFP_NOFAIL); - bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); - rbio->bio.bi_iter.bi_sector = - (sector_t) page->index << PAGE_SECTORS_SHIFT; - BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bch2_trans_init(&trans, c, 0, 0); bchfs_read(&trans, rbio, inum, NULL); bch2_trans_exit(&trans); } -static void bch2_read_single_page_end_io(struct bio *bio) +static void bch2_read_single_folio_end_io(struct bio *bio) { complete(bio->bi_private); } -static int bch2_read_single_page(struct page *page, - struct address_space *mapping) +static int bch2_read_single_folio(struct folio *folio, + struct address_space *mapping) { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; + struct bch_io_opts opts; int ret; DECLARE_COMPLETION_ONSTACK(done); - rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, &c->bio_read), - io_opts(c, &inode->ei_inode)); + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), + opts); rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_page_end_io; + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - __bchfs_readpage(c, rbio, inode_inum(inode), page); + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -1253,16 +1437,15 @@ static int bch2_read_single_page(struct page *page, if (ret < 0) return ret; - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } int bch2_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; int ret; - ret = bch2_read_single_page(page, page->mapping); + ret = bch2_read_single_folio(folio, folio->mapping); folio_unlock(folio); return bch2_err_class(ret); } @@ -1272,14 +1455,17 @@ int bch2_read_folio(struct file *file, struct folio *folio) struct bch_writepage_state { struct bch_writepage_io *io; struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; }; static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, struct bch_inode_info *inode) { - return (struct bch_writepage_state) { - .opts = io_opts(c, &inode->ei_inode) - }; + struct bch_writepage_state ret = { 0 }; + + bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); + return ret; } static void bch2_writepage_io_done(struct bch_write_op *op) @@ -1288,34 +1474,33 @@ static void bch2_writepage_io_done(struct bch_write_op *op) container_of(op, struct bch_writepage_io, op); struct bch_fs *c = io->op.c; struct bio *bio = &io->op.wbio.bio; - struct bvec_iter_all iter; - struct bio_vec *bvec; + struct folio_iter fi; unsigned i; if (io->op.error) { 
set_bit(EI_INODE_ERROR, &io->inode->ei_flags); - bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s; + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; - SetPageError(bvec->bv_page); - mapping_set_error(bvec->bv_page->mapping, -EIO); + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, -EIO); - s = __bch2_page_state(bvec->bv_page); + s = __bch2_folio(fi.folio); spin_lock(&s->lock); - for (i = 0; i < PAGE_SECTORS; i++) + for (i = 0; i < folio_sectors(fi.folio); i++) s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } } if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { - bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s; + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; - s = __bch2_page_state(bvec->bv_page); + s = __bch2_folio(fi.folio); spin_lock(&s->lock); - for (i = 0; i < PAGE_SECTORS; i++) + for (i = 0; i < folio_sectors(fi.folio); i++) s->s[i].nr_replicas = 0; spin_unlock(&s->lock); } @@ -1340,11 +1525,11 @@ static void bch2_writepage_io_done(struct bch_write_op *op) */ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); - bio_for_each_segment_all(bvec, bio, iter) { - struct bch_page_state *s = __bch2_page_state(bvec->bv_page); + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); if (atomic_dec_and_test(&s->write_count)) - end_page_writeback(bvec->bv_page); + folio_end_writeback(fi.folio); } bio_put(&io->op.wbio.bio); @@ -1373,7 +1558,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, REQ_OP_WRITE, - GFP_NOFS, + GFP_KERNEL, &c->writepage_bioset), struct bch_writepage_io, op.wbio.bio); @@ -1392,56 +1577,63 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } -static int __bch2_writepage(struct page *page, +static int __bch2_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct bch_inode_info *inode = to_bch_ei(page->mapping->host); + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; - struct bch_page_state *s, orig; - unsigned i, offset, nr_replicas_this_write = U32_MAX; + struct bch_folio *s; + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; loff_t i_size = i_size_read(&inode->v); - pgoff_t end_index = i_size >> PAGE_SHIFT; int ret; - EBUG_ON(!PageUptodate(page)); + EBUG_ON(!folio_test_uptodate(folio)); - /* Is the page fully inside i_size? */ - if (page->index < end_index) + /* Is the folio fully inside i_size? */ + if (folio_end_pos(folio) <= i_size) goto do_io; - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE - 1); - if (page->index > end_index || !offset) { - unlock_page(page); + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); return 0; } /* - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and + * in multiples of the folio size. For a file that is not a multiple of + * the folio size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." 
*/ - zero_user_segment(page, offset, PAGE_SIZE); + folio_zero_segment(folio, + i_size - folio_pos(folio), + folio_size(folio)); do_io: - s = bch2_page_state_create(page, __GFP_NOFAIL); + f_sectors = folio_sectors(folio); + s = bch2_folio(folio); + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); + w->tmp = kzalloc(sizeof(struct bch_folio_sector) * + f_sectors, __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } /* * Things get really hairy with errors during writeback: */ - ret = bch2_get_page_disk_reservation(c, inode, page, false); + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); BUG_ON(ret); /* Before unlocking the page, get copy of reservations: */ spin_lock(&s->lock); - orig = *s; - spin_unlock(&s->lock); + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); - for (i = 0; i < PAGE_SECTORS; i++) { - if (s->s[i].state < SECTOR_DIRTY) + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) continue; nr_replicas_this_write = @@ -1450,50 +1642,51 @@ do_io: s->s[i].replicas_reserved); } - for (i = 0; i < PAGE_SECTORS; i++) { - if (s->s[i].state < SECTOR_DIRTY) + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) continue; s->s[i].nr_replicas = w->opts.compression ? 0 : nr_replicas_this_write; s->s[i].replicas_reserved = 0; - s->s[i].state = SECTOR_ALLOCATED; + folio_sector_set(folio, s, i, SECTOR_allocated); } + spin_unlock(&s->lock); BUG_ON(atomic_read(&s->write_count)); atomic_set(&s->write_count, 1); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); - unlock_page(page); + folio_unlock(folio); offset = 0; while (1) { unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; u64 sector; - while (offset < PAGE_SECTORS && - orig.s[offset].state < SECTOR_DIRTY) + while (offset < f_sectors && + w->tmp[offset].state < SECTOR_dirty) offset++; - if (offset == PAGE_SECTORS) + if (offset == f_sectors) break; - while (offset + sectors < PAGE_SECTORS && - orig.s[offset + sectors].state >= SECTOR_DIRTY) { - reserved_sectors += orig.s[offset + sectors].replicas_reserved; - dirty_sectors += orig.s[offset + sectors].state == SECTOR_DIRTY; + while (offset + sectors < f_sectors && + w->tmp[offset + sectors].state >= SECTOR_dirty) { + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; sectors++; } BUG_ON(!sectors); - sector = ((u64) page->index << PAGE_SECTORS_SHIFT) + offset; + sector = folio_sector(folio) + offset; if (w->io && (w->io->op.res.nr_replicas != nr_replicas_this_write || - bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || + bio_full(&w->io->op.wbio.bio, sectors << 9) || w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= (BIO_MAX_VECS * PAGE_SIZE) || bio_end_sector(&w->io->op.wbio.bio) != sector)) @@ -1506,7 +1699,7 @@ do_io: atomic_inc(&s->write_count); BUG_ON(inode != w->io->inode); - BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); /* Check for writing past i_size: */ @@ -1526,7 +1719,7 @@ do_io: } if (atomic_dec_and_test(&s->write_count)) - end_page_writeback(page); + folio_end_writeback(folio); return 0; } @@ -1544,6 +1737,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc if (w.io) bch2_writepage_do_io(&w); blk_finish_plug(&plug); + kfree(w.tmp); return bch2_err_class(ret); } @@ -1555,61 +1749,63 @@ int bch2_write_begin(struct file *file, struct 
address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation *res; - pgoff_t index = pos >> PAGE_SHIFT; - unsigned offset = pos & (PAGE_SIZE - 1); - struct page *page; + struct bch2_folio_reservation *res; + struct folio *folio; + unsigned offset; int ret = -ENOMEM; res = kmalloc(sizeof(*res), GFP_KERNEL); if (!res) return -ENOMEM; - bch2_page_reservation_init(c, inode, res); + bch2_folio_reservation_init(c, inode, res); *fsdata = res; bch2_pagecache_add_get(inode); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (IS_ERR_OR_NULL(folio)) goto err_unlock; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - /* If we're writing entire page, don't need to read it in first: */ - if (len == PAGE_SIZE) + offset = pos - folio_pos(folio); + len = min_t(size_t, len, folio_end_pos(folio) - pos); + + /* If we're writing entire folio, don't need to read it in first: */ + if (!offset && len == folio_size(folio)) goto out; if (!offset && pos + len >= inode->v.i_size) { - zero_user_segment(page, len, PAGE_SIZE); - flush_dcache_page(page); + folio_zero_segment(folio, len, folio_size(folio)); + flush_dcache_folio(folio); goto out; } - if (index > inode->v.i_size >> PAGE_SHIFT) { - zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); - flush_dcache_page(page); + if (folio_pos(folio) >= inode->v.i_size) { + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); + flush_dcache_folio(folio); goto out; } readpage: - ret = bch2_read_single_page(page, mapping); + ret = bch2_read_single_folio(folio, mapping); if (ret) goto err; out: - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - ret = bch2_page_state_set(c, inode_inum(inode), &page, 1); - if (ret) - goto err; - } + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto err; - ret = bch2_page_reservation_get(c, inode, page, res, offset, len); + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); if (ret) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* - * If the page hasn't been read in, we won't know if we + * If the folio hasn't been read in, we won't know if we * actually need a reservation - we don't actually need - * to read here, we just need to check if the page is + * to read here, we just need to check if the folio is * fully backed by uncompressed data: */ goto readpage; @@ -1618,11 +1814,11 @@ out: goto err; } - *pagep = page; + *pagep = &folio->page; return 0; err: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); *pagep = NULL; err_unlock: bch2_pagecache_add_put(inode); @@ -1637,19 +1833,21 @@ int bch2_write_end(struct file *file, struct address_space *mapping, { struct bch_inode_info *inode = to_bch_ei(mapping->host); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch2_page_reservation *res = fsdata; - unsigned offset = pos & (PAGE_SIZE - 1); + struct bch2_folio_reservation *res = fsdata; + struct folio *folio = page_folio(page); + unsigned offset = pos - folio_pos(folio); lockdep_assert_held(&inode->v.i_rwsem); + BUG_ON(offset + copied > folio_size(folio)); - if (unlikely(copied < len && !PageUptodate(page))) { + if (unlikely(copied < len && !folio_test_uptodate(folio))) { /* - * The page needs to be read in, but that would destroy + * The folio 
needs to be read in, but that would destroy * our partial write - simplest thing is to just force * userspace to redo the write: */ - zero_user(page, 0, PAGE_SIZE); - flush_dcache_page(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); copied = 0; } @@ -1659,25 +1857,33 @@ int bch2_write_end(struct file *file, struct address_space *mapping, spin_unlock(&inode->v.i_lock); if (copied) { - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); - bch2_set_page_dirty(c, inode, page, res, offset, copied); + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); inode->ei_last_dirtied = (unsigned long) current; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); bch2_pagecache_add_put(inode); - bch2_page_reservation_put(c, inode, res); + bch2_folio_reservation_put(c, inode, res); kfree(res); return copied; } -#define WRITE_BATCH_PAGES 32 +static noinline void folios_trunc(folios *folios, struct folio **fi) +{ + while (folios->data + folios->nr > fi) { + struct folio *f = darray_pop(folios); + + folio_unlock(f); + folio_put(f); + } +} static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, @@ -1685,64 +1891,57 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, loff_t pos, unsigned len) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct page *pages[WRITE_BATCH_PAGES]; - struct bch2_page_reservation res; - unsigned long index = pos >> PAGE_SHIFT; - unsigned offset = pos & (PAGE_SIZE - 1); - unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - unsigned i, reserved = 0, set_dirty = 0; - unsigned copied = 0, nr_pages_copied = 0; + struct bch2_folio_reservation res; + folios folios; + struct folio **fi, *f; + unsigned copied = 0, f_offset; + u64 end = pos + len, f_pos; + loff_t last_folio_pos = inode->v.i_size; int ret = 0; BUG_ON(!len); - BUG_ON(nr_pages > ARRAY_SIZE(pages)); - bch2_page_reservation_init(c, inode, &res); + bch2_folio_reservation_init(c, inode, &res); + darray_init(&folios); - for (i = 0; i < nr_pages; i++) { - pages[i] = grab_cache_page_write_begin(mapping, index + i); - if (!pages[i]) { - nr_pages = i; - if (!i) { - ret = -ENOMEM; - goto out; - } - len = min_t(unsigned, len, - nr_pages * PAGE_SIZE - offset); - break; - } - } + ret = filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &folios); + if (ret) + goto out; + + BUG_ON(!folios.nr); - if (offset && !PageUptodate(pages[0])) { - ret = bch2_read_single_page(pages[0], mapping); + f = darray_first(folios); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } - if ((pos + len) & (PAGE_SIZE - 1) && - !PageUptodate(pages[nr_pages - 1])) { - if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { - zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); + f = darray_last(folios); + end = min(end, folio_end_pos(f)); + last_folio_pos = folio_pos(f); + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { + if (end >= inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); } else { - ret = bch2_read_single_page(pages[nr_pages - 1], mapping); + ret = bch2_read_single_folio(f, mapping); if (ret) goto out; } } - while (reserved < len) { - unsigned i = (offset + reserved) >> PAGE_SHIFT; - struct page *page = pages[i]; - unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); - unsigned pg_len = 
min_t(unsigned, len - reserved, - PAGE_SIZE - pg_offset); + ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); + if (ret) + goto out; - if (!bch2_page_state_create(page, __GFP_NOFAIL)->uptodate) { - ret = bch2_page_state_set(c, inode_inum(inode), - pages + i, nr_pages - i); - if (ret) - goto out; - } + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; /* * XXX: per POSIX and fstests generic/275, on -ENOSPC we're @@ -1752,79 +1951,98 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, * we aren't completely out of disk space - we don't do that * yet: */ - ret = bch2_page_reservation_get(c, inode, page, &res, - pg_offset, pg_len); + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); if (unlikely(ret)) { - if (!reserved) + folios_trunc(&folios, fi); + if (!folios.nr) goto out; + + end = min(end, folio_end_pos(darray_last(folios))); break; } - reserved += pg_len; + f_pos = folio_end_pos(f); + f_offset = 0; } if (mapping_writably_mapped(mapping)) - for (i = 0; i < nr_pages; i++) - flush_dcache_page(pages[i]); - - while (copied < reserved) { - struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; - unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); - unsigned pg_len = min_t(unsigned, reserved - copied, - PAGE_SIZE - pg_offset); - unsigned pg_copied = copy_page_from_iter_atomic(page, - pg_offset, pg_len, iter); - - if (!pg_copied) + darray_for_each(folios, fi) + flush_dcache_folio(*fi); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + + if (!f_copied) { + folios_trunc(&folios, fi); break; + } - if (!PageUptodate(page) && - pg_copied != PAGE_SIZE && - pos + copied + pg_copied < inode->v.i_size) { - zero_user(page, 0, PAGE_SIZE); + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&folios, fi); break; } - flush_dcache_page(page); - copied += pg_copied; + flush_dcache_folio(f); + copied += f_copied; - if (pg_copied != pg_len) + if (f_copied != f_len) { + folios_trunc(&folios, fi + 1); break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; } if (!copied) goto out; + end = pos + copied; + spin_lock(&inode->v.i_lock); - if (pos + copied > inode->v.i_size) - i_size_write(&inode->v, pos + copied); + if (end > inode->v.i_size) + i_size_write(&inode->v, end); spin_unlock(&inode->v.i_lock); - while (set_dirty < copied) { - struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; - unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); - unsigned pg_len = min_t(unsigned, copied - set_dirty, - PAGE_SIZE - pg_offset); + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; - if (!PageUptodate(page)) - SetPageUptodate(page); + if (!folio_test_uptodate(f)) + folio_mark_uptodate(f); - bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); - unlock_page(page); - put_page(page); + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); - set_dirty += pg_len; + f_pos = folio_end_pos(f); + f_offset = 0; } - nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); 
inode->ei_last_dirtied = (unsigned long) current; out: - for (i = nr_pages_copied; i < nr_pages; i++) { - unlock_page(pages[i]); - put_page(pages[i]); + darray_for_each(folios, fi) { + folio_unlock(*fi); + folio_put(*fi); } - bch2_page_reservation_put(c, inode, &res); + /* + * If the last folio added to the mapping starts beyond current EOF, we + * performed a short write but left around at least one post-EOF folio. + * Clean up the mapping before we return. + */ + if (last_folio_pos >= inode->v.i_size) + truncate_pagecache(&inode->v, inode->v.i_size); + + darray_exit(&folios); + bch2_folio_reservation_put(c, inode, &res); return copied ?: ret; } @@ -1842,8 +2060,7 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) do { unsigned offset = pos & (PAGE_SIZE - 1); - unsigned bytes = min_t(unsigned long, iov_iter_count(iter), - PAGE_SIZE * WRITE_BATCH_PAGES - offset); + unsigned bytes = iov_iter_count(iter); again: /* * Bring in the user page that we will copy from _first_. @@ -1945,7 +2162,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct file *file = req->ki_filp; struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; loff_t offset = req->ki_pos; @@ -1953,6 +2170,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) size_t shorten; ssize_t ret; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + if ((offset|iter->count) & (block_bytes(c) - 1)) return -EINVAL; @@ -2009,7 +2228,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) &c->bio_read); bio->bi_end_io = bch2_direct_IO_read_split_endio; start: - bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); + bio->bi_opf = REQ_OP_READ|REQ_SYNC; bio->bi_iter.bi_sector = offset >> 9; bio->bi_private = dio; @@ -2109,7 +2328,7 @@ retry: for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, SPOS(inum.inum, offset, snapshot), BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) break; if (k.k->p.snapshot != snapshot || @@ -2145,10 +2364,29 @@ static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) static void bch2_dio_write_loop_async(struct bch_write_op *); static __always_inline long bch2_dio_write_done(struct dio_write *dio); +/* + * We're going to return -EIOCBQUEUED, but we haven't finished consuming the + * iov_iter yet, so we need to stash a copy of the iovec: it might be on the + * caller's stack, we're not guaranteed that it will live for the duration of + * the IO: + */ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) { struct iovec *iov = dio->inline_vecs; + /* + * iov_iter has a single embedded iovec - nothing to do: + */ + if (iter_is_ubuf(&dio->iter)) + return 0; + + /* + * We don't currently handle non-iovec iov_iters here - return an error, + * and we'll fall back to doing the IO synchronously: + */ + if (!iter_is_iovec(&dio->iter)) + return -1; + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), GFP_KERNEL); @@ -2158,8 +2396,8 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) dio->free_iov = true; } - memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); - dio->iter.iov = iov; + memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); + 
dio->iter.__iov = iov; return 0; } @@ -2219,7 +2457,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) bch2_pagecache_block_put(inode); if (dio->free_iov) - kfree(dio->iter.iov); + kfree(dio->iter.__iov); ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); @@ -2243,8 +2481,6 @@ static __always_inline void bch2_dio_write_end(struct dio_write *dio) struct kiocb *req = dio->req; struct bch_inode_info *inode = dio->inode; struct bio *bio = &dio->op.wbio.bio; - struct bvec_iter_all iter; - struct bio_vec *bv; req->ki_pos += (u64) dio->op.written << 9; dio->written += dio->op.written; @@ -2263,25 +2499,26 @@ static __always_inline void bch2_dio_write_end(struct dio_write *dio) mutex_unlock(&inode->ei_quota_lock); } - if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF))) - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); + bio_release_pages(bio, false); if (unlikely(dio->op.error)) set_bit(EI_INODE_ERROR, &inode->ei_flags); } -static long bch2_dio_write_loop(struct dio_write *dio) +static __always_inline long bch2_dio_write_loop(struct dio_write *dio) { struct bch_fs *c = dio->op.c; struct kiocb *req = dio->req; struct address_space *mapping = dio->mapping; struct bch_inode_info *inode = dio->inode; + struct bch_io_opts opts; struct bio *bio = &dio->op.wbio.bio; unsigned unaligned, iter_count; bool sync = dio->sync, dropped_locks; long ret; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + while (1) { iter_count = dio->iter.count; @@ -2329,7 +2566,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) goto err; } - bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); + bch2_write_op_init(&dio->op, c, opts); dio->op.end_io = sync ? NULL : bch2_dio_write_loop_async; @@ -2381,30 +2618,16 @@ out: err: dio->op.error = ret; - if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { - struct bvec_iter_all iter; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, bio, iter) - put_page(bv->bv_page); - } + bio_release_pages(bio, false); bch2_quota_reservation_put(c, inode, &dio->quota_res); goto out; } -static void bch2_dio_write_loop_async(struct bch_write_op *op) +static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) { - struct dio_write *dio = container_of(op, struct dio_write, op); struct mm_struct *mm = dio->mm; - bch2_dio_write_end(dio); - - if (likely(!dio->iter.count) || dio->op.error) { - bch2_dio_write_done(dio); - return; - } - bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); if (mm) @@ -2414,6 +2637,18 @@ static void bch2_dio_write_loop_async(struct bch_write_op *op) kthread_unuse_mm(mm); } +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + bch2_dio_write_done(dio); + else + bch2_dio_write_continue(dio); +} + static noinline ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) { @@ -2592,15 +2827,11 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (bkey_extent_is_data(k.k)) { + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) + if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { ret = 1; break; } - } start = iter.pos; bch2_trans_iter_exit(&trans, &iter); err: @@ -2611,128 +2842,141 @@ err: return ret; } -static int __bch2_truncate_page(struct 
bch_inode_info *inode, - pgoff_t index, loff_t start, loff_t end) +static int __bch2_truncate_folio(struct bch_inode_info *inode, + pgoff_t index, loff_t start, loff_t end) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; - struct bch_page_state *s; + struct bch_folio *s; unsigned start_offset = start & (PAGE_SIZE - 1); unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; unsigned i; - struct page *page; + struct folio *folio; s64 i_sectors_delta = 0; int ret = 0; + u64 end_pos; - /* Page boundary? Nothing to do */ - if (!((index == start >> PAGE_SHIFT && start_offset) || - (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) - return 0; - - /* Above i_size? */ - if (index << PAGE_SHIFT >= inode->v.i_size) - return 0; - - page = find_lock_page(mapping, index); - if (!page) { + folio = filemap_lock_folio(mapping, index); + if (IS_ERR_OR_NULL(folio)) { /* * XXX: we're doing two index lookups when we end up reading the - * page + * folio */ ret = range_has_data(c, inode->ei_subvol, - POS(inode->v.i_ino, index << PAGE_SECTORS_SHIFT), - POS(inode->v.i_ino, (index + 1) << PAGE_SECTORS_SHIFT)); + POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), + POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); if (ret <= 0) return ret; - page = find_or_create_page(mapping, index, GFP_KERNEL); - if (unlikely(!page)) { + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT, GFP_KERNEL); + if (unlikely(IS_ERR_OR_NULL(folio))) { ret = -ENOMEM; goto out; } } - s = bch2_page_state_create(page, 0); + BUG_ON(start >= folio_end_pos(folio)); + BUG_ON(end <= folio_pos(folio)); + + start_offset = max(start, folio_pos(folio)) - folio_pos(folio); + end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); + + /* Folio boundary? Nothing to do */ + if (start_offset == 0 && + end_offset == folio_size(folio)) { + ret = 0; + goto unlock; + } + + s = bch2_folio_create(folio, 0); if (!s) { ret = -ENOMEM; goto unlock; } - if (!PageUptodate(page)) { - ret = bch2_read_single_page(page, mapping); + if (!folio_test_uptodate(folio)) { + ret = bch2_read_single_folio(folio, mapping); if (ret) goto unlock; } - if (index != start >> PAGE_SHIFT) - start_offset = 0; - if (index != end >> PAGE_SHIFT) - end_offset = PAGE_SIZE; + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto unlock; for (i = round_up(start_offset, block_bytes(c)) >> 9; i < round_down(end_offset, block_bytes(c)) >> 9; i++) { s->s[i].nr_replicas = 0; - if (s->s[i].state == SECTOR_DIRTY) - i_sectors_delta--; - s->s[i].state = SECTOR_UNALLOCATED; + + i_sectors_delta -= s->s[i].state == SECTOR_dirty; + folio_sector_set(folio, s, i, SECTOR_unallocated); } i_sectors_acct(c, inode, NULL, i_sectors_delta); /* - * Caller needs to know whether this page will be written out by + * Caller needs to know whether this folio will be written out by * writeback - doing an i_size update if necessary - or whether it will - * be responsible for the i_size update: + * be responsible for the i_size update. + * + * Note that we shouldn't ever see a folio beyond EOF, but check and + * warn if so. This has been observed by failure to clean up folios + * after a short write and there's still a chance reclaim will fix + * things up. 
*/ - ret = s->s[(min_t(u64, inode->v.i_size - (index << PAGE_SHIFT), - PAGE_SIZE) - 1) >> 9].state >= SECTOR_DIRTY; + WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); + end_pos = folio_end_pos(folio); + if (inode->v.i_size > folio_pos(folio)) + end_pos = min_t(u64, inode->v.i_size, end_pos); + ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; - zero_user_segment(page, start_offset, end_offset); + folio_zero_segment(folio, start_offset, end_offset); /* * Bit of a hack - we don't want truncate to fail due to -ENOSPC. * - * XXX: because we aren't currently tracking whether the page has actual + * XXX: because we aren't currently tracking whether the folio has actual * data in it (vs. just 0s, or only partially written) this wrong. ick. */ - BUG_ON(bch2_get_page_disk_reservation(c, inode, page, false)); + BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); /* * This removes any writeable userspace mappings; we need to force * .page_mkwrite to be called again before any mmapped writes, to * redirty the full page: */ - page_mkclean(page); - __set_page_dirty_nobuffers(page); + folio_mkclean(folio); + filemap_dirty_folio(mapping, folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out: return ret; } -static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) +static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) { - return __bch2_truncate_page(inode, from >> PAGE_SHIFT, - from, round_up(from, PAGE_SIZE)); + return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, + from, ANYSINT_MAX(loff_t)); } -static int bch2_truncate_pages(struct bch_inode_info *inode, - loff_t start, loff_t end) +static int bch2_truncate_folios(struct bch_inode_info *inode, + loff_t start, loff_t end) { - int ret = __bch2_truncate_page(inode, start >> PAGE_SHIFT, - start, end); + int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, + start, end); if (ret >= 0 && start >> PAGE_SHIFT != end >> PAGE_SHIFT) - ret = __bch2_truncate_page(inode, - end >> PAGE_SHIFT, - start, end); + ret = __bch2_truncate_folio(inode, + (end - 1) >> PAGE_SHIFT, + start, end); return ret; } -static int bch2_extend(struct user_namespace *mnt_userns, +static int bch2_extend(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct bch_inode_unpacked *inode_u, struct iattr *iattr) @@ -2751,7 +2995,7 @@ static int bch2_extend(struct user_namespace *mnt_userns, truncate_setsize(&inode->v, iattr->ia_size); - return bch2_setattr_nonsize(mnt_userns, inode, iattr); + return bch2_setattr_nonsize(idmap, inode, iattr); } static int bch2_truncate_finish_fn(struct bch_inode_info *inode, @@ -2772,7 +3016,7 @@ static int bch2_truncate_start_fn(struct bch_inode_info *inode, return 0; } -int bch2_truncate(struct user_namespace *mnt_userns, +int bch2_truncate(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; @@ -2817,13 +3061,13 @@ int bch2_truncate(struct user_namespace *mnt_userns, (u64) inode->v.i_size, inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { - ret = bch2_extend(mnt_userns, inode, &inode_u, iattr); + ret = bch2_extend(idmap, inode, &inode_u, iattr); goto err; } iattr->ia_valid &= ~ATTR_SIZE; - ret = bch2_truncate_page(inode, iattr->ia_size); + ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; @@ -2875,7 +3119,7 @@ int bch2_truncate(struct user_namespace *mnt_userns, ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, 
NULL, 0); mutex_unlock(&inode->ei_update_lock); - ret = bch2_setattr_nonsize(mnt_userns, inode, iattr); + ret = bch2_setattr_nonsize(idmap, inode, iattr); err: bch2_pagecache_block_put(inode); return bch2_err_class(ret); @@ -2901,7 +3145,7 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len bool truncated_last_page; int ret = 0; - ret = bch2_truncate_pages(inode, offset, end); + ret = bch2_truncate_folios(inode, offset, end); if (unlikely(ret < 0)) goto err; @@ -3023,7 +3267,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, k = insert ? bch2_btree_iter_peek_prev(&src) - : bch2_btree_iter_peek(&src); + : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX)); if ((ret = bkey_err(k))) continue; @@ -3031,13 +3275,13 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, break; if (insert && - bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) + bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9))) break; reassemble: bch2_bkey_buf_reassemble(&copy, c, k); if (insert && - bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) + bkey_lt(bkey_start_pos(k.k), move_pos)) bch2_cut_front(move_pos, copy.k); copy.k->k.p.offset += shift >> 9; @@ -3047,7 +3291,7 @@ reassemble: if (ret) continue; - if (bkey_cmp(atomic_end, copy.k->k.p)) { + if (!bkey_eq(atomic_end, copy.k->k.p)) { if (insert) { move_pos = atomic_end; move_pos.offset -= shift >> 9; @@ -3116,20 +3360,23 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, struct btree_trans trans; struct btree_iter iter; struct bpos end_pos = POS(inode->v.i_ino, end_sector); - struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct bch_io_opts opts; int ret = 0; + bch2_inode_opts_get(&opts, c, &inode->ei_inode); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, POS(inode->v.i_ino, start_sector), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - while (!ret && bkey_cmp(iter.pos, end_pos) < 0) { + while (!ret && bkey_lt(iter.pos, end_pos)) { s64 i_sectors_delta = 0; struct quota_res quota_res = { 0 }; struct bkey_s_c k; unsigned sectors; + bool is_allocation; + u64 hole_start, hole_end; u32 snapshot; bch2_trans_begin(&trans); @@ -3145,6 +3392,10 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if ((ret = bkey_err(k))) goto bkey_err; + hole_start = iter.pos.offset; + hole_end = bpos_min(k.k->p, end_pos).offset; + is_allocation = bkey_extent_is_allocation(k.k); + /* already reserved */ if (bkey_extent_is_reservation(k) && bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { @@ -3158,17 +3409,30 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, continue; } - /* - * XXX: for nocow mode, we should promote shared extents to - * unshared here - */ + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + if (bch2_clamp_data_hole(&inode->v, + &hole_start, + &hole_end, + opts.data_replicas, true)) + ret = drop_locks_do(&trans, + (bch2_clamp_data_hole(&inode->v, + &hole_start, + &hole_end, + opts.data_replicas, false), 0)); + bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + + if (ret) + goto bkey_err; - sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset; + if (hole_start == hole_end) + continue; + } - if (!bkey_extent_is_allocation(k.k)) { + sectors = hole_end - hole_start; + + if (!is_allocation) { ret = bch2_quota_reservation_add(c, inode, - &quota_res, - sectors, true); + &quota_res, sectors, true); if (unlikely(ret)) goto bkey_err; } @@ -3180,15 +3444,15 @@ static int 
__bchfs_fallocate(struct bch_inode_info *inode, int mode, goto bkey_err; i_sectors_acct(c, inode, &quota_res, i_sectors_delta); + + drop_locks_do(&trans, + (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); bkey_err: bch2_quota_reservation_put(c, inode, &quota_res); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; } - bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ - mark_pagecache_reserved(inode, start_sector, iter.pos.offset); - if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { struct quota_res quota_res = { 0 }; s64 i_sectors_delta = 0; @@ -3221,7 +3485,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode, } if (mode & FALLOC_FL_ZERO_RANGE) { - ret = bch2_truncate_pages(inode, offset, end); + ret = bch2_truncate_folios(inode, offset, end); if (unlikely(ret < 0)) return ret; @@ -3268,7 +3532,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) return -EROFS; inode_lock(&inode->v); @@ -3292,11 +3556,15 @@ long bch2_fallocate_dispatch(struct file *file, int mode, err: bch2_pagecache_block_put(inode); inode_unlock(&inode->v); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); return bch2_err_class(ret); } +/* + * Take a quota reservation for unallocated blocks in a given file range + * Does not check pagecache + */ static int quota_reserve_range(struct bch_inode_info *inode, struct quota_res *res, u64 start, u64 end) @@ -3432,22 +3700,26 @@ err: /* fseek: */ -static int page_data_offset(struct page *page, unsigned offset) +static int folio_data_offset(struct folio *folio, loff_t pos, + unsigned min_replicas) { - struct bch_page_state *s = bch2_page_state(page); - unsigned i; + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); if (s) - for (i = offset >> 9; i < PAGE_SECTORS; i++) - if (s->s[i].state >= SECTOR_DIRTY) - return i << 9; + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) + if (s->s[i].state >= SECTOR_dirty && + s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) + return i << SECTOR_SHIFT; return -1; } static loff_t bch2_seek_pagecache_data(struct inode *vinode, loff_t start_offset, - loff_t end_offset) + loff_t end_offset, + unsigned min_replicas, + bool nonblock) { struct folio_batch fbatch; pgoff_t start_index = start_offset >> PAGE_SHIFT; @@ -3464,21 +3736,23 @@ static loff_t bch2_seek_pagecache_data(struct inode *vinode, for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; - folio_lock(folio); + if (!nonblock) { + folio_lock(folio); + } else if (!folio_trylock(folio)) { + folio_batch_release(&fbatch); + return -EAGAIN; + } - offset = page_data_offset(&folio->page, - folio->index == start_index - ? 
start_offset & (PAGE_SIZE - 1) - : 0); + offset = folio_data_offset(folio, + max(folio_pos(folio), start_offset), + min_replicas); if (offset >= 0) { - ret = clamp(((loff_t) folio->index << PAGE_SHIFT) + - offset, + ret = clamp(folio_pos(folio) + offset, start_offset, end_offset); folio_unlock(folio); folio_batch_release(&fbatch); return ret; } - folio_unlock(folio); } folio_batch_release(&fbatch); @@ -3512,11 +3786,11 @@ retry: if (ret) goto err; - for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, - SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { - if (k.k->p.inode != inode->v.i_ino) { - break; - } else if (bkey_extent_is_data(k.k)) { + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), + POS(inode->v.i_ino, U64_MAX), + 0, k, ret) { + if (bkey_extent_is_data(k.k)) { next_data = max(offset, bkey_start_offset(k.k) << 9); break; } else if (k.k->p.offset >> 9 > isize) @@ -3533,7 +3807,7 @@ err: if (next_data > offset) next_data = bch2_seek_pagecache_data(&inode->v, - offset, next_data); + offset, next_data, 0, false); if (next_data >= isize) return -ENXIO; @@ -3541,58 +3815,86 @@ err: return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); } -static int __page_hole_offset(struct page *page, unsigned offset) +static int folio_hole_offset(struct address_space *mapping, loff_t *offset, + unsigned min_replicas, bool nonblock) { - struct bch_page_state *s = bch2_page_state(page); - unsigned i; - - if (!s) - return 0; - - for (i = offset >> 9; i < PAGE_SECTORS; i++) - if (s->s[i].state < SECTOR_DIRTY) - return i << 9; - - return -1; -} + struct folio *folio; + struct bch_folio *s; + unsigned i, sectors; + bool ret = true; -static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) -{ - pgoff_t index = offset >> PAGE_SHIFT; - struct page *page; - int pg_offset; - loff_t ret = -1; + folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, + !nonblock ? 
FGP_LOCK : 0, 0); + if (IS_ERR_OR_NULL(folio)) + return true; - page = find_lock_page(mapping, index); - if (!page) - return offset; + if (nonblock && !folio_trylock(folio)) { + folio_put(folio); + return -EAGAIN; + } - pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); - if (pg_offset >= 0) - ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; + s = bch2_folio(folio); + if (!s) + goto unlock; - unlock_page(page); + sectors = folio_sectors(folio); + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) + if (s->s[i].state < SECTOR_dirty || + s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { + *offset = max(*offset, + folio_pos(folio) + (i << SECTOR_SHIFT)); + goto unlock; + } + *offset = folio_end_pos(folio); + ret = false; +unlock: + folio_unlock(folio); + folio_put(folio); return ret; } static loff_t bch2_seek_pagecache_hole(struct inode *vinode, loff_t start_offset, - loff_t end_offset) + loff_t end_offset, + unsigned min_replicas, + bool nonblock) { struct address_space *mapping = vinode->i_mapping; - loff_t offset = start_offset, hole; + loff_t offset = start_offset; - while (offset < end_offset) { - hole = page_hole_offset(mapping, offset); - if (hole >= 0 && hole <= end_offset) - return max(start_offset, hole); + while (offset < end_offset && + !folio_hole_offset(mapping, &offset, min_replicas, nonblock)) + ; - offset += PAGE_SIZE; - offset &= PAGE_MASK; - } + return min(offset, end_offset); +} - return end_offset; +static int bch2_clamp_data_hole(struct inode *inode, + u64 *hole_start, + u64 *hole_end, + unsigned min_replicas, + bool nonblock) +{ + loff_t ret; + + ret = bch2_seek_pagecache_hole(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_start = ret; + + if (*hole_start == *hole_end) + return 0; + + ret = bch2_seek_pagecache_data(inode, + *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; + if (ret < 0) + return ret; + + *hole_end = ret; + return 0; } static loff_t bch2_seek_hole(struct file *file, u64 offset) @@ -3624,12 +3926,12 @@ retry: BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE); + offset, MAX_LFS_FILESIZE, 0, false); break; } else if (!bkey_extent_is_data(k.k)) { next_hole = bch2_seek_pagecache_hole(&inode->v, max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9); + k.k->p.offset << 9, 0, false); if (next_hole < k.k->p.offset << 9) break; @@ -3686,25 +3988,26 @@ void bch2_fs_fsio_exit(struct bch_fs *c) int bch2_fs_fsio_init(struct bch_fs *c) { - int ret = 0; - - pr_verbose_init(c->opts, ""); - if (bioset_init(&c->writepage_bioset, 4, offsetof(struct bch_writepage_io, op.wbio.bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->dio_read_bioset, + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_writepage_bioset_init; + + if (bioset_init(&c->dio_read_bioset, 4, offsetof(struct dio_read, rbio.bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->dio_write_bioset, + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_read_bioset_init; + + if (bioset_init(&c->dio_write_bioset, 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->nocow_flush_bioset, + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_write_bioset_init; + + if (bioset_init(&c->nocow_flush_bioset, 1, offsetof(struct nocow_flush, bio), 0)) - ret = -ENOMEM; + return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + return 0; } #endif /* 
NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index a883529..af90533 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -30,7 +30,7 @@ ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); int bch2_fsync(struct file *, loff_t, loff_t, int); -int bch2_truncate(struct user_namespace *, +int bch2_truncate(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 2bb6808..dfa1bf7 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -93,7 +93,7 @@ static int bch2_ioc_setflags(struct bch_fs *c, return ret; inode_lock(&inode->v); - if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { + if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { ret = -EACCES; goto setflags_out; } @@ -172,7 +172,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, return ret; inode_lock(&inode->v); - if (!inode_owner_or_capable(file_mnt_user_ns(file), &inode->v)) { + if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { ret = -EACCES; goto err; } @@ -382,7 +382,7 @@ retry: dir = dst_path.dentry->d_inode; if (IS_DEADDIR(dir)) { - error = -ENOENT; + error = -BCH_ERR_ENOENT_directory_dead; goto err3; } @@ -393,7 +393,7 @@ retry: goto err3; } - error = inode_permission(file_mnt_user_ns(filp), + error = inode_permission(file_mnt_idmap(filp), dir, MAY_WRITE | MAY_EXEC); if (error) goto err3; @@ -409,7 +409,7 @@ retry: !arg.src_ptr) snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; - inode = __bch2_create(file_mnt_user_ns(filp), to_bch_ei(dir), + inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), dst_dentry, arg.mode|S_IFDIR, 0, snapshot_src, create_flags); error = PTR_ERR_OR_ZERO(inode); @@ -451,19 +451,20 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, return ret; if (path.dentry->d_sb->s_fs_info != c) { - path_put(&path); - return -EXDEV; + ret = -EXDEV; + goto err; } dir = path.dentry->d_parent->d_inode; ret = __bch2_unlink(dir, path.dentry, true); - if (!ret) { - fsnotify_rmdir(dir, path.dentry); - d_delete(path.dentry); - } - path_put(&path); + if (ret) + goto err; + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); +err: + path_put(&path); return ret; } diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 8621738..8d2f388 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -105,6 +105,11 @@ retry: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, + "inode %u:%llu not found when updating", + inode_inum(inode).subvol, + inode_inum(inode).inum); + bch2_trans_exit(&trans); return ret < 0 ? ret : 0; } @@ -201,13 +206,17 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) return ERR_PTR(ret); } + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + unlock_new_inode(&inode->v); return &inode->v; } struct bch_inode_info * -__bch2_create(struct user_namespace *mnt_userns, +__bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *dir, struct dentry *dentry, umode_t mode, dev_t rdev, subvol_inum snapshot_src, unsigned flags) @@ -253,8 +262,8 @@ retry: inode_inum(dir), &dir_u, &inode_u, !(flags & BCH_CREATE_TMPFILE) ? 
&dentry->d_name : NULL, - from_kuid(mnt_userns, current_fsuid()), - from_kgid(mnt_userns, current_fsgid()), + from_kuid(i_user_ns(&dir->v), current_fsuid()), + from_kgid(i_user_ns(&dir->v), current_fsgid()), mode, rdev, default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, @@ -314,6 +323,9 @@ err_before_quota: inode = old; } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); /* * we really don't want insert_inode_locked2() to be setting * I_NEW... @@ -358,12 +370,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, return d_splice_alias(vinode, dentry); } -static int bch2_mknod(struct user_namespace *mnt_userns, +static int bch2_mknod(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, rdev, + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) @@ -373,11 +385,11 @@ static int bch2_mknod(struct user_namespace *mnt_userns, return 0; } -static int bch2_create(struct user_namespace *mnt_userns, +static int bch2_create(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode, bool excl) { - return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFREG, 0); + return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); } static int __bch2_link(struct bch_fs *c, @@ -442,19 +454,27 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_trans_init(&trans, c, 4, 1024); ret = commit_do(&trans, NULL, NULL, - BTREE_INSERT_NOFAIL, - bch2_unlink_trans(&trans, - inode_inum(dir), &dir_u, - &inode_u, &dentry->d_name, - deleting_snapshot)); + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name, + deleting_snapshot)); + if (unlikely(ret)) + goto err; - if (likely(!ret)) { - bch2_inode_update_after_write(&trans, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(&trans, inode, &inode_u, - ATTR_MTIME); - } + bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, + ATTR_MTIME); + if (inode_u.bi_subvol) { + /* + * Subvolume deletion is asynchronous, but we still want to tell + * the VFS that it's been deleted here: + */ + set_nlink(&inode->v, 0); + } +err: bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); @@ -466,7 +486,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) return __bch2_unlink(vdir, dentry, false); } -static int bch2_symlink(struct user_namespace *mnt_userns, +static int bch2_symlink(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, const char *symname) { @@ -474,7 +494,7 @@ static int bch2_symlink(struct user_namespace *mnt_userns, struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = __bch2_create(mnt_userns, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); @@ -501,13 +521,13 @@ err: return ret; } -static int bch2_mkdir(struct user_namespace *mnt_userns, +static int bch2_mkdir(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, umode_t mode) { - return bch2_mknod(mnt_userns, vdir, dentry, mode|S_IFDIR, 0); + return bch2_mknod(idmap, vdir, 
dentry, mode|S_IFDIR, 0); } -static int bch2_rename2(struct user_namespace *mnt_userns, +static int bch2_rename2(struct mnt_idmap *idmap, struct inode *src_vdir, struct dentry *src_dentry, struct inode *dst_vdir, struct dentry *dst_dentry, unsigned flags) @@ -614,7 +634,7 @@ err: return ret; } -static void bch2_setattr_copy(struct user_namespace *mnt_userns, +static void bch2_setattr_copy(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, struct iattr *attr) @@ -623,9 +643,9 @@ static void bch2_setattr_copy(struct user_namespace *mnt_userns, unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) - bi->bi_uid = from_kuid(mnt_userns, attr->ia_uid); + bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); if (ia_valid & ATTR_GID) - bi->bi_gid = from_kgid(mnt_userns, attr->ia_gid); + bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); if (ia_valid & ATTR_SIZE) bi->bi_size = attr->ia_size; @@ -644,13 +664,13 @@ static void bch2_setattr_copy(struct user_namespace *mnt_userns, : inode->v.i_gid; if (!in_group_p(gid) && - !capable_wrt_inode_uidgid(mnt_userns, &inode->v, CAP_FSETID)) + !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) mode &= ~S_ISGID; bi->bi_mode = mode; } } -int bch2_setattr_nonsize(struct user_namespace *mnt_userns, +int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct bch_inode_info *inode, struct iattr *attr) { @@ -667,10 +687,10 @@ int bch2_setattr_nonsize(struct user_namespace *mnt_userns, qid = inode->ei_qid; if (attr->ia_valid & ATTR_UID) - qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); if (attr->ia_valid & ATTR_GID) - qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); ret = bch2_fs_quota_transfer(c, inode, qid, ~0, KEY_TYPE_QUOTA_PREALLOC); @@ -688,7 +708,7 @@ retry: if (ret) goto btree_err; - bch2_setattr_copy(mnt_userns, inode, &inode_u, attr); + bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, @@ -720,7 +740,7 @@ err: return bch2_err_class(ret); } -static int bch2_getattr(struct user_namespace *mnt_userns, +static int bch2_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned query_flags) { @@ -761,7 +781,7 @@ static int bch2_getattr(struct user_namespace *mnt_userns, return 0; } -static int bch2_setattr(struct user_namespace *mnt_userns, +static int bch2_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); @@ -769,28 +789,29 @@ static int bch2_setattr(struct user_namespace *mnt_userns, lockdep_assert_held(&inode->v.i_rwsem); - ret = setattr_prepare(mnt_userns, dentry, iattr); + ret = setattr_prepare(idmap, dentry, iattr); if (ret) return ret; return iattr->ia_valid & ATTR_SIZE - ? bch2_truncate(mnt_userns, inode, iattr) - : bch2_setattr_nonsize(mnt_userns, inode, iattr); + ? 
bch2_truncate(idmap, inode, iattr) + : bch2_setattr_nonsize(idmap, inode, iattr); } -static int bch2_tmpfile(struct user_namespace *mnt_userns, - struct inode *vdir, struct dentry *dentry, umode_t mode) +static int bch2_tmpfile(struct mnt_idmap *idmap, + struct inode *vdir, struct file *file, umode_t mode) { struct bch_inode_info *inode = - __bch2_create(mnt_userns, to_bch_ei(vdir), dentry, mode, 0, + __bch2_create(idmap, to_bch_ei(vdir), + file->f_path.dentry, mode, 0, (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return bch2_err_class(PTR_ERR(inode)); - d_mark_tmpfile(dentry, &inode->v); - d_instantiate(dentry, &inode->v); - return 0; + d_mark_tmpfile(file, &inode->v); + d_instantiate(file->f_path.dentry, &inode->v); + return finish_open_simple(file, 0); } static int bch2_fill_extent(struct bch_fs *c, @@ -922,6 +943,7 @@ retry: cur.k->k.p.offset += cur.k->k.size; if (have_extent) { + bch2_trans_unlock(&trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), 0); if (ret) @@ -940,9 +962,11 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; - if (!ret && have_extent) + if (!ret && have_extent) { + bch2_trans_unlock(&trans); ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); + } bch2_trans_exit(&trans); bch2_bkey_buf_exit(&cur, c); @@ -990,7 +1014,7 @@ static const struct file_operations bch_file_operations = { .mmap = bch2_mmap, .open = generic_file_open, .fsync = bch2_fsync, - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = bch2_fallocate_dispatch, .unlocked_ioctl = bch2_fs_file_ioctl, @@ -1240,14 +1264,14 @@ retry: goto err; if (k.k->type != KEY_TYPE_dirent) { - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; goto err; } d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); if (ret > 0) - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; if (ret) goto err; @@ -1348,6 +1372,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, inode->v.i_op = &bch_special_inode_operations; break; } + + mapping_set_large_folios(inode->v.i_mapping); } static struct inode *bch2_alloc_inode(struct super_block *sb) @@ -1361,6 +1387,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); mutex_init(&inode->ei_quota_lock); return &inode->v; @@ -1425,53 +1452,74 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode_inum(inode)); } + + mutex_lock(&c->vfs_inodes_lock); + list_del_init(&inode->ei_vfs_inode_list); + mutex_unlock(&c->vfs_inodes_lock); } -void bch2_evict_subvolume_inodes(struct bch_fs *c, - snapshot_id_list *s) +void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) { - struct super_block *sb = c->vfs_sb; - struct inode *inode; + struct bch_inode_info *inode, **i; + DARRAY(struct bch_inode_info *) grabbed; + bool clean_pass = false, this_pass_clean; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) - continue; + /* + * Initially, we scan for inodes without I_DONTCACHE, then mark them to + * be pruned with d_mark_dontcache(). 
+ * + * Once we've had a clean pass where we didn't find any inodes without + * I_DONTCACHE, we wait for them to be freed: + */ - d_mark_dontcache(inode); - d_prune_aliases(inode); - } - spin_unlock(&sb->s_inode_list_lock); + darray_init(&grabbed); + darray_make_room(&grabbed, 1024); again: cond_resched(); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) + this_pass_clean = true; + + mutex_lock(&c->vfs_inodes_lock); + list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { + if (!snapshot_list_has_id(s, inode->ei_subvol)) continue; - if (!(inode->i_state & I_DONTCACHE)) { - d_mark_dontcache(inode); - d_prune_aliases(inode); - } + if (!(inode->v.i_state & I_DONTCACHE) && + !(inode->v.i_state & I_FREEING) && + igrab(&inode->v)) { + this_pass_clean = false; + + if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { + iput(&inode->v); + break; + } + } else if (clean_pass && this_pass_clean) { + wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); - spin_lock(&inode->i_lock); - if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && - !(inode->i_state & I_FREEING)) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + mutex_unlock(&c->vfs_inodes_lock); + schedule(); finish_wait(wq, &wait.wq_entry); goto again; } + } + mutex_unlock(&c->vfs_inodes_lock); + + darray_for_each(grabbed, i) { + inode = *i; + d_mark_dontcache(&inode->v); + d_prune_aliases(&inode->v); + iput(&inode->v); + } + grabbed.nr = 0; - spin_unlock(&inode->i_lock); + if (!clean_pass || !this_pass_clean) { + clean_pass = this_pass_clean; + goto again; } - spin_unlock(&sb->s_inode_list_lock); + + darray_exit(&grabbed); } static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1769,8 +1817,11 @@ got_sb: kfree(devs[0]); kfree(devs); - if (IS_ERR(sb)) - return ERR_CAST(sb); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + ret = bch2_err_class(ret); + return ERR_PTR(ret); + } c = sb->s_fs_info; @@ -1844,7 +1895,7 @@ out: err_put_super: deactivate_locked_super(sb); - return ERR_PTR(ret); + return ERR_PTR(bch2_err_class(ret)); } static void bch2_kill_sb(struct super_block *sb) @@ -1875,7 +1926,7 @@ int __init bch2_vfs_init(void) { int ret = -ENOMEM; - bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); if (!bch2_inode_cache) goto err; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index 6b91bbe..6170d21 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -13,12 +13,12 @@ struct bch_inode_info { struct inode v; + struct list_head ei_vfs_inode_list; unsigned long ei_flags; struct mutex ei_update_lock; u64 ei_quota_reserved; unsigned long ei_last_dirtied; - two_state_lock_t ei_pagecache_lock; struct mutex ei_quota_lock; @@ -149,7 +149,7 @@ struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS struct bch_inode_info * -__bch2_create(struct user_namespace *, struct bch_inode_info *, +__bch2_create(struct mnt_idmap *, struct bch_inode_info *, struct dentry *, umode_t, dev_t, subvol_inum, unsigned); int bch2_fs_quota_transfer(struct bch_fs *, @@ -184,7 +184,7 @@ void bch2_inode_update_after_write(struct btree_trans *, int __must_check 
bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); -int bch2_setattr_nonsize(struct user_namespace *, +int bch2_setattr_nonsize(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); int __bch2_unlink(struct inode *, struct dentry *, bool); @@ -196,6 +196,8 @@ int bch2_vfs_init(void); #else +#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) + static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index ffc2671..0852dbe 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "bkey_buf.h" #include "btree_update.h" +#include "buckets.h" #include "darray.h" #include "dirent.h" #include "error.h" @@ -31,14 +32,12 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, u64 sectors = 0; int ret; - for_each_btree_key(trans, iter, BTREE_ID_extents, - SPOS(inum, 0, snapshot), 0, k, ret) { - if (k.k->p.inode != inum) - break; - + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) if (bkey_extent_is_allocation(k.k)) sectors += k.k->size; - } bch2_trans_iter_exit(trans, &iter); @@ -54,11 +53,10 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, u64 subdirs = 0; int ret; - for_each_btree_key(trans, iter, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), 0, k, ret) { - if (k.k->p.inode != inum) - break; - + for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; @@ -66,7 +64,6 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, if (d.v->d_type == DT_DIR) subdirs++; } - bch2_trans_iter_exit(trans, &iter); return ret ?: subdirs; @@ -75,26 +72,14 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvol) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS(0, snapshot), 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_snapshot) { + struct bch_snapshot s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, + POS(0, snapshot), 0, + snapshot, &s); + if (!ret) + *subvol = le32_to_cpu(s.subvol); + else if (bch2_err_matches(ret, ENOENT)) bch_err(trans->c, "snapshot %u not fonud", snapshot); - ret = -ENOENT; - goto err; - } - - *subvol = le32_to_cpu(bkey_s_c_to_snapshot(k).v->subvol); -err: - bch2_trans_iter_exit(trans, &iter); return ret; } @@ -133,8 +118,8 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, if (ret) goto err; - if (!k.k || bkey_cmp(k.k->p, POS(0, inode_nr))) { - ret = -ENOENT; + if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -155,16 +140,15 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, *snapshot), 0); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, *snapshot), 0); ret = bkey_err(k); if (ret) goto err; ret = bkey_is_inode(k.k) ? 
bch2_inode_unpack(k, inode) - : -ENOENT; + : -BCH_ERR_ENOENT_inode; if (!ret) *snapshot = iter.pos.snapshot; err: @@ -207,17 +191,18 @@ static int __write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode, u32 snapshot) { - struct btree_iter iter; - int ret; + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode->bi_inum, snapshot), - BTREE_ITER_INTENT); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_inode_write(trans, &iter, inode); - bch2_trans_iter_exit(trans, &iter); - return ret; + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); } static int write_inode(struct btree_trans *trans, @@ -234,71 +219,6 @@ static int write_inode(struct btree_trans *trans, return ret; } -static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; - struct bkey_s_c k; - int ret; - - do { - ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL) ?: - bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, - SPOS(inum, 0, snapshot), - SPOS(inum, U64_MAX, snapshot), - 0, NULL); - } while (ret == -BCH_ERR_transaction_restart_nested); - if (ret) - goto err; -retry: - bch2_trans_begin(trans); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, inum, snapshot), BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - - ret = bkey_err(k); - if (ret) - goto err; - - if (!bkey_is_inode(k.k)) { - bch2_fs_inconsistent(c, - "inode %llu:%u not found when deleting", - inum, snapshot); - ret = -EIO; - goto err; - } - - bch2_inode_unpack(k, &inode_u); - - /* Subvolume root? 
*/ - if (inode_u.bi_subvol) - bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL); -err: - bch2_trans_iter_exit(trans, &iter); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - goto retry; - - return ret ?: -BCH_ERR_transaction_restart_nested; -} - static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -321,7 +241,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_exit(trans, &iter); err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -351,7 +271,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, ret = __lookup_dirent(trans, root_hash_info, root_inum, &lostfound_str, &inum, &d_type); - if (ret == -ENOENT) { + if (bch2_err_matches(ret, ENOENT)) { bch_notice(c, "creating lost+found"); goto create_lostfound; } @@ -367,7 +287,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol, } /* - * The check_dirents pass has already run, dangling dirents + * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ return __lookup_inode(trans, inum, lostfound, &snapshot); @@ -456,22 +376,14 @@ static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { struct btree_iter iter; - struct bkey_s_c k; + struct bkey_s_c_dirent d; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, - POS(inode->bi_dir, inode->bi_dir_offset), 0); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k->type != KEY_TYPE_dirent) { - ret = -ENOENT; - goto out; - } - - ret = __remove_dirent(trans, k.k->p); -out: + d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0, + dirent); + ret = bkey_err(d) ?: + __remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -496,28 +408,6 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) memset(s, 0, sizeof(*s)); } -static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) -{ - struct snapshots_seen_entry *i, n = { id, id }; - int ret; - - darray_for_each(s->ids, i) { - if (n.equiv < i->equiv) - break; - - if (i->equiv == n.equiv) { - bch_err(c, "%s(): adding duplicate snapshot", __func__); - return -EINVAL; - } - } - - ret = darray_insert_item(&s->ids, i - s->ids.data, n); - if (ret) - bch_err(c, "error reallocating snapshots_seen table (size %zu)", - s->ids.size); - return ret; -} - static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, enum btree_id btree_id, struct bpos pos) { @@ -527,30 +417,34 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, }; int ret = 0; - if (bkey_cmp(s->pos, pos)) + if (!bkey_eq(s->pos, pos)) s->ids.nr = 0; - pos.snapshot = n.equiv; s->pos = pos; + s->pos.snapshot = n.equiv; - darray_for_each(s->ids, i) - if (i->equiv == n.equiv) { - if (fsck_err_on(i->id != n.id, c, - "snapshot deletion did not run correctly:\n" - " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", - bch2_btree_ids[btree_id], - pos.inode, 
pos.offset, - i->id, n.id, n.equiv)) - return -BCH_ERR_need_snapshot_cleanup; - + darray_for_each(s->ids, i) { + if (i->id == n.id) return 0; + + /* + * We currently don't rigorously track for snapshot cleanup + * needing to be run, so it shouldn't be a fsck error yet: + */ + if (i->equiv == n.equiv) { + bch_err(c, "snapshot deletion did not finish:\n" + " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", + bch2_btree_ids[btree_id], + pos.inode, pos.offset, + i->id, n.id, n.equiv); + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); } + } ret = darray_push(&s->ids, n); if (ret) bch_err(c, "error reallocating snapshots_seen table (size %zu)", s->ids.size); -fsck_err: return ret; } @@ -564,15 +458,14 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see u32 id, u32 ancestor) { ssize_t i; - u32 top = seen->ids.nr ? seen->ids.data[seen->ids.nr - 1].equiv : 0; - BUG_ON(id > ancestor); - BUG_ON(!bch2_snapshot_is_equiv(c, id)); - BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); + EBUG_ON(id > ancestor); + EBUG_ON(!bch2_snapshot_is_equiv(c, id)); + EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); /* @ancestor should be the snapshot most recently added to @seen */ - BUG_ON(ancestor != seen->pos.snapshot); - BUG_ON(ancestor != top); + EBUG_ON(ancestor != seen->pos.snapshot); + EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); if (id == ancestor) return true; @@ -580,11 +473,20 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see if (!bch2_snapshot_is_ancestor(c, id, ancestor)) return false; + /* + * We know that @id is a descendant of @ancestor, we're checking if + * we've seen a key that overwrote @ancestor - i.e. also a descendent of + * @ascestor and with @id as a descendent. 
+ * + * But we already know that we're scanning IDs between @id and @ancestor + * numerically, since snapshot ID lists are kept sorted, so if we find + * an id that's an ancestor of @id we're done: + */ + for (i = seen->ids.nr - 2; i >= 0 && seen->ids.data[i].equiv >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && - bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) return false; return true; @@ -609,6 +511,20 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, : bch2_snapshot_is_ancestor(c, src, dst); } +static int ref_visible2(struct bch_fs *c, + u32 src, struct snapshots_seen *src_seen, + u32 dst, struct snapshots_seen *dst_seen) +{ + src = bch2_snapshot_equiv(c, src); + dst = bch2_snapshot_equiv(c, dst); + + if (dst > src) { + swap(dst, src); + swap(dst_seen, src_seen); + } + return key_visible_in_snapshot(c, src_seen, dst, src); +} + #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ (_i)->snapshot <= (_snapshot); _i++) \ @@ -617,12 +533,14 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, struct inode_walker_entry { struct bch_inode_unpacked inode; u32 snapshot; + bool seen_this_pos; u64 count; }; struct inode_walker { bool first_this_inode; - u64 cur_inum; + bool recalculate_sums; + struct bpos last_pos; DARRAY(struct inode_walker_entry) inodes; }; @@ -650,28 +568,21 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, })); } -static int __walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos) +static int get_inodes_all_snapshots(struct btree_trans *trans, + struct inode_walker *w, u64 inum) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; u32 restart_count = trans->restart_count; - unsigned i; int ret; - pos.snapshot = bch2_snapshot_equiv(c, pos.snapshot); - - if (pos.inode == w->cur_inum) { - w->first_this_inode = false; - goto lookup_snapshot; - } - + w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { - if (k.k->p.offset != pos.inode) + if (k.k->p.offset != inum) break; if (bkey_is_inode(k.k)) @@ -682,40 +593,76 @@ static int __walk_inode(struct btree_trans *trans, if (ret) return ret; - w->cur_inum = pos.inode; - w->first_this_inode = true; + w->first_this_inode = true; if (trans_was_restarted(trans, restart_count)) return -BCH_ERR_transaction_restart_nested; -lookup_snapshot: - for (i = 0; i < w->inodes.nr; i++) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) + return 0; +} + +static struct inode_walker_entry * +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, + u32 snapshot, bool is_whiteout) +{ + struct inode_walker_entry *i; + + snapshot = bch2_snapshot_equiv(c, snapshot); + + darray_for_each(w->inodes, i) + if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; - return INT_MAX; + + return NULL; found: - BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); + BUG_ON(snapshot > i->snapshot); - if (pos.snapshot != w->inodes.data[i].snapshot) { - struct inode_walker_entry e = w->inodes.data[i]; + if (snapshot != i->snapshot && !is_whiteout) { + struct inode_walker_entry new = *i; + size_t pos; + int ret; - e.snapshot = pos.snapshot; - e.count = 0; + new.snapshot = 
snapshot; + new.count = 0; bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - pos.inode, pos.snapshot, w->inodes.data[i].snapshot); + w->last_pos.inode, snapshot, i->snapshot); - while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) + while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; - ret = darray_insert_item(&w->inodes, i, e); + pos = i - w->inodes.data; + ret = darray_insert_item(&w->inodes, pos, new); if (ret) - return ret; + return ERR_PTR(ret); + + i = w->inodes.data + pos; } return i; } +static struct inode_walker_entry *walk_inode(struct btree_trans *trans, + struct inode_walker *w, struct bpos pos, + bool is_whiteout) +{ + if (w->last_pos.inode != pos.inode) { + int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (ret) + return ERR_PTR(ret); + } else if (bkey_cmp(w->last_pos, pos)) { + struct inode_walker_entry *i; + + darray_for_each(w->inodes, i) + i->seen_this_pos = false; + + } + + w->last_pos = pos; + + return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); +} + static int __get_visible_inodes(struct btree_trans *trans, struct inode_walker *w, struct snapshots_seen *s, @@ -779,12 +726,10 @@ static int hash_redo_key(struct btree_trans *trans, if (IS_ERR(delete)) return PTR_ERR(delete); - tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + tmp = bch2_bkey_make_mut_noupdate(trans, k); if (IS_ERR(tmp)) return PTR_ERR(tmp); - bkey_reassemble(tmp, k); - bkey_init(&delete->k); delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: @@ -823,9 +768,9 @@ static int hash_check_key(struct btree_trans *trans, goto bad_hash; for_each_btree_key_norestart(trans, iter, desc.btree_id, - POS(hash_k.k->p.inode, hash), + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), BTREE_ITER_SLOTS, k, ret) { - if (!bkey_cmp(k.k->p, hash_k.k->p)) + if (bkey_eq(k.k->p, hash_k.k->p)) break; if (fsck_err_on(k.k->type == desc.key_type && @@ -853,10 +798,10 @@ bad_hash: (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); - if (ret) { + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); + if (ret) return ret; - } ret = -BCH_ERR_transaction_restart_nested; } fsck_err: @@ -890,7 +835,7 @@ static int check_inode(struct btree_trans *trans, * particular is not atomic, so on the internal snapshot nodes * we can see inodes marked for deletion after a clean shutdown */ - if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) + if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot)) return 0; if (!bkey_is_inode(k.k)) @@ -921,7 +866,7 @@ static int check_inode(struct btree_trans *trans, bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); - ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); + ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error in fsck: error while deleting inode: %s", bch2_err_str(ret)); @@ -946,11 +891,11 @@ static int check_inode(struct btree_trans *trans, iter->pos.snapshot), POS(u.bi_inum, U64_MAX), 0, NULL); - if (ret) { + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err(c, "error in fsck: error truncating inode: %s", bch2_err_str(ret)); + if (ret) return ret; - } /* * We truncated without our normal sector accounting hook, just @@ -999,13 +944,14 @@ static int check_inode(struct btree_trans *trans, err: fsck_err: if (ret) - bch_err(c, "%s(): 
error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } noinline_for_stack -static int check_inodes(struct bch_fs *c, bool full) +int bch2_check_inodes(struct bch_fs *c) { + bool full = c->opts.fsck; struct btree_trans trans; struct btree_iter iter; struct bch_inode_unpacked prev = { 0 }; @@ -1025,69 +971,15 @@ static int check_inodes(struct bch_fs *c, bool full) bch2_trans_exit(&trans); snapshots_seen_exit(&s); if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - return ret; -} - -/* - * Checking for overlapping extents needs to be reimplemented - */ -#if 0 -static int fix_overlapping_extent(struct btree_trans *trans, - struct bkey_s_c k, struct bpos cut_at) -{ - struct btree_iter iter; - struct bkey_i *u; - int ret; - - u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - - bkey_reassemble(u, k); - bch2_cut_front(cut_at, u); - - - /* - * We don't want to go through the extent_handle_overwrites path: - * - * XXX: this is going to screw up disk accounting, extent triggers - * assume things about extent overwrites - we should be running the - * triggers manually here - */ - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, - BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); - - BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: - bch2_trans_commit(trans, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - bch2_trans_iter_exit(trans, &iter); + bch_err_fn(c, ret); return ret; } -#endif static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, struct btree_iter *iter, struct bpos pos) { - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); - if (!ret && k.k->type != KEY_TYPE_dirent) - ret = -ENOENT; - if (ret) { - bch2_trans_iter_exit(trans, iter); - return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }; - } - - return bkey_s_c_to_dirent(k); + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); } static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, @@ -1115,9 +1007,9 @@ static int inode_backpointer_exists(struct btree_trans *trans, d = dirent_get_by_pos(trans, &iter, SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); - ret = bkey_err(d.s_c); + ret = bkey_err(d); if (ret) - return ret == -ENOENT ? 0 : ret; + return bch2_err_matches(ret, ENOENT) ? 
0 : ret; ret = dirent_points_to_inode(d, inode); bch2_trans_iter_exit(trans, &iter); @@ -1136,19 +1028,20 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) if (i->inode.bi_sectors == i->count) continue; - count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); + count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); - if (i->count != count2) { - bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", - i->count, count2); + if (w->recalculate_sums) i->count = count2; - if (i->inode.bi_sectors == i->count) - continue; + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); + return -BCH_ERR_internal_fsck_err; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->cur_inum, i->snapshot, + w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; ret = write_inode(trans, &i->inode, i->snapshot); @@ -1158,185 +1051,372 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) } fsck_err: if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); if (!ret && trans_was_restarted(trans, restart_count)) ret = -BCH_ERR_transaction_restart_nested; return ret; } -static int check_extent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - struct inode_walker *inode, - struct snapshots_seen *s) +struct extent_end { + u32 snapshot; + u64 offset; + struct snapshots_seen seen; +}; + +struct extent_ends { + struct bpos last_pos; + DARRAY(struct extent_end) e; +}; + +static void extent_ends_reset(struct extent_ends *extent_ends) { - struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - struct printbuf buf = PRINTBUF; - struct bpos equiv; - int ret = 0; + struct extent_end *i; - ret = check_key_has_snapshot(trans, iter, k); - if (ret) { - ret = ret < 0 ? 
ret : 0; - goto out; + darray_for_each(extent_ends->e, i) + snapshots_seen_exit(&i->seen); + + extent_ends->e.nr = 0; +} + +static void extent_ends_exit(struct extent_ends *extent_ends) +{ + extent_ends_reset(extent_ends); + darray_exit(&extent_ends->e); +} + +static void extent_ends_init(struct extent_ends *extent_ends) +{ + memset(extent_ends, 0, sizeof(*extent_ends)); +} + +static int extent_ends_at(struct bch_fs *c, + struct extent_ends *extent_ends, + struct snapshots_seen *seen, + struct bkey_s_c k) +{ + struct extent_end *i, n = (struct extent_end) { + .offset = k.k->p.offset, + .snapshot = k.k->p.snapshot, + .seen = *seen, + }; + + n.seen.ids.data = kmemdup(seen->ids.data, + sizeof(seen->ids.data[0]) * seen->ids.size, + GFP_KERNEL); + if (!n.seen.ids.data) + return -BCH_ERR_ENOMEM_fsck_extent_ends_at; + + darray_for_each(extent_ends->e, i) { + if (i->snapshot == k.k->p.snapshot) { + snapshots_seen_exit(&i->seen); + *i = n; + return 0; + } + + if (i->snapshot >= k.k->p.snapshot) + break; } - equiv = k.k->p; - equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); +} - ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); +static int overlapping_extents_found(struct btree_trans *trans, + enum btree_id btree, + struct bpos pos1, struct bkey pos2, + bool *fixed, + struct extent_end *extent_end) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter iter1, iter2 = { NULL }; + struct bkey_s_c k1, k2; + int ret; + + BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); + + bch2_trans_iter_init(trans, &iter1, btree, pos1, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOT_EXTENTS); + k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k1); if (ret) goto err; - if (k.k->type == KEY_TYPE_whiteout) - goto out; + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k1); - if (inode->cur_inum != k.k->p.inode) { - ret = check_i_sectors(trans, inode); + if (!bpos_eq(pos1, k1.k->p)) { + prt_str(&buf, "\n wanted\n "); + bch2_bpos_to_text(&buf, pos1); + prt_str(&buf, "\n "); + bch2_bkey_to_text(&buf, &pos2); + + bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } + + bch2_trans_copy_iter(&iter2, &iter1); + + while (1) { + bch2_btree_iter_advance(&iter2); + + k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k2); if (ret) goto err; + + if (bpos_ge(k2.k->p, pos2.p)) + break; } - BUG_ON(!iter->path->should_be_locked); -#if 0 - if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { - char buf1[200]; - char buf2[200]; + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k2); - bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); - bch2_bkey_val_to_text(&PBUF(buf2), c, k); + if (bpos_gt(k2.k->p, pos2.p) || + pos2.size != k2.k->size) { + bch_err(c, "%s: error finding second overlapping extent when repairing%s", + __func__, buf.buf); + ret = -BCH_ERR_internal_fsck_err; + goto err; + } - if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { - ret = fix_overlapping_extent(trans, k, prev.k->k.p) - ?: -BCH_ERR_transaction_restart_nested; - goto out; + prt_printf(&buf, "\n overwriting %s extent", + pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); + + if (fsck_err(c, "overlapping extents%s", buf.buf)) { + struct btree_iter *old_iter = &iter1; + struct disk_reservation res = { 0 }; + + if (pos1.snapshot < pos2.p.snapshot) { + old_iter = &iter2; + swap(k1, k2); } + + trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); + + ret = bch2_trans_update_extent_overwrite(trans, old_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, + k1, k2) ?: + bch2_trans_commit(trans, &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &res); + + if (ret) + goto err; + + *fixed = true; + + if (pos1.snapshot == pos2.p.snapshot) + extent_end->offset = bkey_start_offset(&pos2); + else + ret = -BCH_ERR_transaction_restart_nested; } -#endif - ret = __walk_inode(trans, inode, equiv); - if (ret < 0) +fsck_err: +err: + bch2_trans_iter_exit(trans, &iter2); + bch2_trans_iter_exit(trans, &iter1); + printbuf_exit(&buf); + return ret; +} + +static int check_overlapping_extents(struct btree_trans *trans, + struct snapshots_seen *seen, + struct extent_ends *extent_ends, + struct bkey_s_c k, + u32 equiv, + struct btree_iter *iter, + bool *fixed) +{ + struct bch_fs *c = trans->c; + struct extent_end *i; + int ret = 0; + + /* transaction restart, running again */ + if (bpos_eq(extent_ends->last_pos, k.k->p)) + return 0; + + if (extent_ends->last_pos.inode != k.k->p.inode) + extent_ends_reset(extent_ends); + + darray_for_each(extent_ends->e, i) { + if (i->offset <= bkey_start_offset(k.k)) + continue; + + if (!ref_visible2(c, + k.k->p.snapshot, seen, + i->snapshot, &i->seen)) + continue; + + ret = overlapping_extents_found(trans, iter->btree_id, + SPOS(iter->pos.inode, + i->offset, + i->snapshot), + *k.k, fixed, i); + if (ret) + goto err; + } + + ret = extent_ends_at(c, extent_ends, seen, k); + if (ret) goto err; - if (fsck_err_on(ret == INT_MAX, c, - "extent in missing inode:\n %s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + extent_ends->last_pos = k.k->p; +err: + return ret; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct inode_walker *inode, + struct snapshots_seen *s, + struct extent_ends *extent_ends) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; + struct bpos equiv = k.k->p; + int ret = 0; + + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) { + ret = ret < 0 ? 
ret : 0; goto out; } - if (ret == INT_MAX) { - ret = 0; - goto out; + if (inode->last_pos.inode != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) + goto err; } - i = inode->inodes.data + ret; - ret = 0; + i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); + ret = PTR_ERR_OR_ZERO(i); + if (ret) + goto err; - if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && - !S_ISLNK(i->inode.bi_mode), c, - "extent in non regular inode mode %o:\n %s", - i->inode.bi_mode, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - goto out; + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_whiteout) { + if (fsck_err_on(!i, c, + "extent in missing inode:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; + + if (fsck_err_on(i && + !S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + goto delete; + + ret = check_overlapping_extents(trans, s, extent_ends, k, + equiv.snapshot, iter, + &inode->recalculate_sums); + if (ret) + goto err; } /* - * Check inodes in reverse order, from oldest snapshots to newest, so - * that we emit the fewest number of whiteouts necessary: + * Check inodes in reverse order, from oldest snapshots to newest, + * starting from the inode that matches this extent's snapshot. If we + * didn't have one, iterate over all inodes: */ - for (i = inode->inodes.data + inode->inodes.nr - 1; - i >= inode->inodes.data; + if (!i) + i = inode->inodes.data + inode->inodes.nr - 1; + + for (; + inode->inodes.data && i >= inode->inodes.data; --i) { if (i->snapshot > equiv.snapshot || !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) continue; - if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && - !bkey_extent_is_reservation(k), c, - "extent type past end of inode %llu:%u, i_size %llu\n %s", - i->inode.bi_inum, i->snapshot, i->inode.bi_size, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct btree_iter iter2; - - bch2_trans_copy_iter(&iter2, iter); - bch2_btree_iter_set_snapshot(&iter2, i->snapshot); - ret = bch2_btree_iter_traverse(&iter2) ?: - bch2_btree_delete_at(trans, &iter2, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - bch2_trans_iter_exit(trans, &iter2); - if (ret) - goto err; - - if (i->snapshot != equiv.snapshot) { - ret = snapshots_seen_add(c, s, i->snapshot); + if (k.k->type != KEY_TYPE_whiteout) { + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + !bkey_extent_is_reservation(k), c, + "extent type past end of inode %llu:%u, i_size %llu\n %s", + i->inode.bi_inum, i->snapshot, i->inode.bi_size, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); + bch2_btree_iter_set_snapshot(&iter2, i->snapshot); + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_btree_delete_at(trans, &iter2, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter2); if (ret) goto err; + + iter->k.type = KEY_TYPE_whiteout; } - } - } - if (bkey_extent_is_allocation(k.k)) - for_each_visible_inode(c, s, inode, equiv.snapshot, i) - i->count += k.k->size; -#if 0 - 
bch2_bkey_buf_reassemble(&prev, c, k); -#endif + if (bkey_extent_is_allocation(k.k)) + i->count += k.k->size; + } + i->seen_this_pos = true; + } out: err: fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; +delete: + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; } /* * Walk extents: verify that extents have a corresponding S_ISREG inode, and * that i_size an i_sectors are consistent */ -noinline_for_stack -static int check_extents(struct bch_fs *c) +int bch2_check_extents(struct bch_fs *c) { struct inode_walker w = inode_walker_init(); struct snapshots_seen s; struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct extent_ends extent_ends; + struct disk_reservation res = { 0 }; int ret = 0; -#if 0 - struct bkey_buf prev; - bch2_bkey_buf_init(&prev); - prev.k->k = KEY(0, 0, 0); -#endif snapshots_seen_init(&s); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - bch_verbose(c, "checking extents"); + extent_ends_init(&extent_ends); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, - BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_extent(&trans, &iter, k, &w, &s)); -#if 0 - bch2_bkey_buf_exit(&prev, c); -#endif + &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ + bch2_disk_reservation_put(c, &res); + check_extent(&trans, &iter, k, &w, &s, &extent_ends); + })) ?: + check_i_sectors(&trans, &w); + + bch2_disk_reservation_put(c, &res); + extent_ends_exit(&extent_ends); inode_walker_exit(&w); bch2_trans_exit(&trans); snapshots_seen_exit(&s); if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1352,7 +1432,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) if (i->inode.bi_nlink == i->count) continue; - count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); + count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); if (count2 < 0) return count2; @@ -1366,7 +1446,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) if (fsck_err_on(i->inode.bi_nlink != i->count, c, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { + w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; ret = write_inode(trans, &i->inode, i->snapshot); if (ret) @@ -1375,7 +1455,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) } fsck_err: if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); if (!ret && trans_was_restarted(trans, restart_count)) ret = -BCH_ERR_transaction_restart_nested; return ret; @@ -1496,7 +1576,7 @@ fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1530,7 +1610,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (dir->cur_inum != k.k->p.inode) { + if (dir->last_pos.inode != k.k->p.inode) { ret = check_subdir_count(trans, dir); if (ret) goto err; @@ -1538,11 +1618,16 @@ static int check_dirent(struct 
btree_trans *trans, struct btree_iter *iter, BUG_ON(!iter->path->should_be_locked); - ret = __walk_inode(trans, dir, equiv); + i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); + ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; - if (fsck_err_on(ret == INT_MAX, c, + if (dir->first_this_inode && dir->inodes.nr) + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); + dir->first_this_inode = false; + + if (fsck_err_on(!i, c, "dirent in nonexisting directory:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -1551,13 +1636,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - if (ret == INT_MAX) { - ret = 0; + if (!i) goto out; - } - - i = dir->inodes.data + ret; - ret = 0; if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, "dirent in non directory inode type %s:\n%s", @@ -1568,11 +1648,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - if (dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); - - ret = hash_check_key(trans, bch2_dirent_hash_desc, - hash_info, iter, k); + ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -1594,19 +1670,19 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, ret = __subvol_lookup(trans, target_subvol, &target_snapshot, &target_inum); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) goto err; if (fsck_err_on(ret, c, - "dirent points to missing subvolume %llu", - le64_to_cpu(d.v->d_child_subvol))) { + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) { ret = __remove_dirent(trans, d.k->p); goto err; } ret = __lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) goto err; if (fsck_err_on(ret, c, @@ -1666,7 +1742,7 @@ fsck_err: printbuf_exit(&buf); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1674,8 +1750,7 @@ fsck_err: * Walk dirents: verify that they all have a corresponding S_ISDIR inode, * validate d_type */ -noinline_for_stack -static int check_dirents(struct bch_fs *c) +int bch2_check_dirents(struct bch_fs *c) { struct inode_walker dir = inode_walker_init(); struct inode_walker target = inode_walker_init(); @@ -1686,8 +1761,6 @@ static int check_dirents(struct bch_fs *c) struct bkey_s_c k; int ret = 0; - bch_verbose(c, "checking dirents"); - snapshots_seen_init(&s); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -1705,7 +1778,7 @@ static int check_dirents(struct bch_fs *c) inode_walker_exit(&target); if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1715,41 +1788,41 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker *inode) { struct bch_fs *c = trans->c; + struct inode_walker_entry *i; int ret; ret = check_key_has_snapshot(trans, iter, k); if (ret) return ret; - ret = __walk_inode(trans, inode, k.k->p); - if (ret < 0) + i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); + ret = PTR_ERR_OR_ZERO(i); + if (ret) return ret; - if (fsck_err_on(ret == INT_MAX, c, + if (inode->first_this_inode && inode->inodes.nr) + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); + inode->first_this_inode = false; + + if 
(fsck_err_on(!i, c, "xattr for missing inode %llu", k.k->p.inode)) return bch2_btree_delete_at(trans, iter, 0); - if (ret == INT_MAX) + if (!i) return 0; - ret = 0; - - if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); - ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } /* * Walk xattrs: verify that they all have a corresponding inode */ -noinline_for_stack -static int check_xattrs(struct bch_fs *c) +int bch2_check_xattrs(struct bch_fs *c) { struct inode_walker inode = inode_walker_init(); struct bch_hash_info hash_info; @@ -1758,8 +1831,6 @@ static int check_xattrs(struct bch_fs *c) struct bkey_s_c k; int ret = 0; - bch_verbose(c, "checking xattrs"); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, @@ -1773,7 +1844,7 @@ static int check_xattrs(struct bch_fs *c) bch2_trans_exit(&trans); if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -1786,7 +1857,7 @@ static int check_root_trans(struct btree_trans *trans) int ret; ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) return ret; if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { @@ -1803,7 +1874,8 @@ static int check_root_trans(struct btree_trans *trans) ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, - __bch2_btree_insert(trans, BTREE_ID_subvolumes, &root_subvol.k_i)); + __bch2_btree_insert(trans, BTREE_ID_subvolumes, + &root_subvol.k_i, 0)); if (ret) { bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); goto err; @@ -1812,7 +1884,7 @@ static int check_root_trans(struct btree_trans *trans) } ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) return ret; if (mustfix_fsck_err_on(ret, c, "root directory missing") || @@ -1832,15 +1904,18 @@ fsck_err: } /* Get root directory, create if it doesn't exist: */ -noinline_for_stack -static int check_root(struct bch_fs *c) +int bch2_check_root(struct bch_fs *c) { - bch_verbose(c, "checking root directory"); + int ret; - return bch2_trans_do(c, NULL, NULL, + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW, check_root_trans(&trans)); + + if (ret) + bch_err_fn(c, ret); + return ret; } struct pathbuf_entry { @@ -1911,15 +1986,15 @@ static int check_path(struct btree_trans *trans, PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, SPOS(inode->bi_dir, inode->bi_dir_offset, parent_snapshot))).k)); - if (ret && ret != -ENOENT) + if (ret && !bch2_err_matches(ret, ENOENT)) break; if (!ret && !dirent_points_to_inode(d, inode)) { bch2_trans_iter_exit(trans, &dirent_iter); - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; } - if (ret == -ENOENT) { + if (bch2_err_matches(ret, ENOENT)) { if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", inode->bi_inum, snapshot, bch2_d_type_str(inode_d_type(inode)), @@ -1977,17 +2052,16 @@ static int check_path(struct btree_trans *trans, } fsck_err: if (ret) - bch_err(c, "%s: err %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } /* * Check for unreachable inodes, as well as loops in the 
directory structure: - * After check_dirents(), if an inode backpointer doesn't exist that means it's + * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's * unreachable: */ -noinline_for_stack -static int check_directory_structure(struct bch_fs *c) +int bch2_check_directory_structure(struct bch_fs *c) { struct btree_trans trans; struct btree_iter iter; @@ -2020,13 +2094,16 @@ static int check_directory_structure(struct bch_fs *c) break; } bch2_trans_iter_exit(&trans, &iter); - + bch2_trans_exit(&trans); darray_exit(&path); - bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } +/* check_nlink pass: */ + struct nlink_table { size_t nr; size_t size; @@ -2048,7 +2125,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, if (!d) { bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", new_size); - return -ENOMEM; + return -BCH_ERR_ENOMEM_fsck_add_nlink; } if (t->d) @@ -2131,7 +2208,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, * Backpointer and directory structure checks are sufficient for * directories, since they can't have hardlinks: */ - if (S_ISDIR(le16_to_cpu(u.bi_mode))) + if (S_ISDIR(u.bi_mode)) continue; if (!u.bi_nlink) @@ -2217,7 +2294,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite BUG_ON(bch2_inode_unpack(k, &u)); - if (S_ISDIR(le16_to_cpu(u.bi_mode))) + if (S_ISDIR(u.bi_mode)) return 0; if (!u.bi_nlink) @@ -2269,15 +2346,12 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, return 0; } -noinline_for_stack -static int check_nlinks(struct bch_fs *c) +int bch2_check_nlinks(struct bch_fs *c) { struct nlink_table links = { 0 }; u64 this_iter_range_start, next_iter_range_start = 0; int ret = 0; - bch_verbose(c, "checking inode nlinks"); - do { this_iter_range_start = next_iter_range_start; next_iter_range_start = U64_MAX; @@ -2303,6 +2377,8 @@ static int check_nlinks(struct bch_fs *c) kvfree(links.d); + if (ret) + bch_err_fn(c, ret); return ret; } @@ -2333,10 +2409,8 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); } -noinline_for_stack -static int fix_reflink_p(struct bch_fs *c) +int bch2_fix_reflink_p(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; @@ -2344,52 +2418,15 @@ static int fix_reflink_p(struct bch_fs *c) if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) return 0; - bch_verbose(c, "fixing reflink_p keys"); - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - - ret = for_each_btree_key_commit(&trans, iter, - BTREE_ID_extents, POS_MIN, - BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - fix_reflink_p_key(&trans, &iter, k)); - - bch2_trans_exit(&trans); - return ret; -} - -/* - * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
- * Doesn't fix them yet, mainly because they haven't yet been observed: - */ -int bch2_fsck_full(struct bch_fs *c) -{ - int ret; -again: - ret = bch2_fs_check_snapshots(c) ?: - bch2_fs_check_subvols(c) ?: - bch2_delete_dead_snapshots(c) ?: - check_inodes(c, true) ?: - check_extents(c) ?: - check_dirents(c) ?: - check_xattrs(c) ?: - check_root(c) ?: - check_directory_structure(c) ?: - check_nlinks(c) ?: - fix_reflink_p(c); - - if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - goto again; - } + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); return ret; } - -int bch2_fsck_walk_inodes_only(struct bch_fs *c) -{ - return bch2_fs_check_snapshots(c) ?: - bch2_fs_check_subvols(c) ?: - bch2_delete_dead_snapshots(c) ?: - check_inodes(c, false); -} diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h index 264f270..90c87b5 100644 --- a/libbcachefs/fsck.h +++ b/libbcachefs/fsck.h @@ -2,7 +2,13 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H -int bch2_fsck_full(struct bch_fs *); -int bch2_fsck_walk_inodes_only(struct bch_fs *); +int bch2_check_inodes(struct bch_fs *); +int bch2_check_extents(struct bch_fs *); +int bch2_check_dirents(struct bch_fs *); +int bch2_check_xattrs(struct bch_fs *); +int bch2_check_root(struct bch_fs *); +int bch2_check_directory_structure(struct bch_fs *); +int bch2_check_nlinks(struct bch_fs *); +int bch2_fix_reflink_p(struct bch_fs *); #endif /* _BCACHEFS_FSCK_H */ diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index a91465e..fea21e1 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" +#include "btree_write_buffer.h" #include "bkey_methods.h" #include "btree_update.h" #include "buckets.h" @@ -269,6 +270,8 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { + memset(unpacked, 0, sizeof(*unpacked)); + switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); @@ -327,15 +330,14 @@ int bch2_inode_peek(struct btree_trans *trans, if (ret) return ret; - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - flags|BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) - goto err; + return ret; - ret = bkey_is_inode(k.k) ? 0 : -ENOENT; + ret = bkey_is_inode(k.k) ? 
0 : -BCH_ERR_ENOENT_inode; if (ret) goto err; @@ -364,22 +366,25 @@ int bch2_inode_write(struct btree_trans *trans, return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); } -struct bkey_s_c bch2_inode_to_v3(struct btree_trans *trans, struct bkey_s_c k) +struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) { struct bch_inode_unpacked u; struct bkey_inode_buf *inode_p; int ret; + if (!bkey_is_inode(&k->k)) + return ERR_PTR(-ENOENT); + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); if (IS_ERR(inode_p)) - return bkey_s_c_err(PTR_ERR(inode_p)); + return ERR_CAST(inode_p); - ret = bch2_inode_unpack(k, &u); + ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); if (ret) - return bkey_s_c_err(ret); + return ERR_PTR(ret); bch2_inode_pack(inode_p, &u); - return bkey_i_to_s_c(&inode_p->inode.k_i); + return &inode_p->inode.k_i; } static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) @@ -388,109 +393,94 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) if (k.k->p.inode) { prt_printf(err, "nonzero k.p.inode"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (k.k->p.offset < BLOCKDEV_INODE_MAX) { prt_printf(err, "fs inode in blockdev range"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (bch2_inode_unpack(k, &unpacked)) { prt_printf(err, "invalid variable length fields"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { prt_printf(err, "invalid data checksum type (%u >= %u", unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { prt_printf(err, "invalid data checksum type (%u >= %u)", unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && unpacked.bi_nlink != 0) { prt_printf(err, "flagged as unlinked but bi_nlink != 0"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { prt_printf(err, "subvolume root but not a directory"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; } int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); - if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*inode.v)); - return -EINVAL; - } - if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { prt_printf(err, "invalid str hash type (%llu >= %u)", INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return __bch2_inode_invalid(k, err); } int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); - if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*inode.v)); - return -EINVAL; - } - if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { prt_printf(err, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return __bch2_inode_invalid(k, err); } int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct 
printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); - if (bkey_val_bytes(k.k) < sizeof(*inode.v)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*inode.v)); - return -EINVAL; - } - if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", INODEv3_FIELDS_START(inode.v), INODEv3_FIELDS_START_INITIAL, bkey_val_u64s(inode.k)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { prt_printf(err, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return __bch2_inode_invalid(k, err); @@ -530,18 +520,91 @@ void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c __bch2_inode_unpacked_to_text(out, &inode); } +static inline bool bkey_is_deleted_inode(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return bkey_s_c_to_inode(k).v->bi_flags & + cpu_to_le32(BCH_INODE_UNLINKED); + case KEY_TYPE_inode_v2: + return bkey_s_c_to_inode_v2(k).v->bi_flags & + cpu_to_le32(BCH_INODE_UNLINKED); + case KEY_TYPE_inode_v3: + return bkey_s_c_to_inode_v3(k).v->bi_flags & + cpu_to_le64(BCH_INODE_UNLINKED); + default: + return false; + } +} + +int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + bool old_deleted = bkey_is_deleted_inode(old); + bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); + + if (nr) { + int ret = bch2_replicas_deltas_realloc(trans, 0); + struct replicas_delta_list *d = trans->fs_usage_deltas; + + if (ret) + return ret; + + d->nr_inodes += nr; + } + + if (old_deleted != new_deleted) { + int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); + if (ret) + return ret; + } + + return 0; +} + +int bch2_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage; + u64 journal_seq = trans->journal_res.seq; + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; + + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_inode_v3); + + v->bi_journal_seq = cpu_to_le64(journal_seq); + } + + if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); + preempt_disable(); + + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + } + return 0; +} + int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (k.k->p.inode) { prt_printf(err, "nonzero k.p.inode"); - return -EINVAL; - } - - if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_inode_generation)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -657,20 +720,9 @@ int bch2_inode_create(struct btree_trans *trans, again: while ((k = 
bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && - bkey_cmp(k.k->p, POS(0, max)) < 0) { - while (pos < iter->pos.offset) { - if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) - goto found_slot; - - pos++; - } - - if (k.k->p.snapshot == snapshot && - !bkey_is_inode(k.k) && - !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) { - bch2_btree_iter_advance(iter); - continue; - } + bkey_lt(k.k->p, POS(0, max))) { + if (pos < iter->pos.offset) + goto found_slot; /* * We don't need to iterate over keys in every snapshot once @@ -680,12 +732,8 @@ again: bch2_btree_iter_set_pos(iter, POS(0, pos)); } - while (!ret && pos < max) { - if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos))) - goto found_slot; - - pos++; - } + if (!ret && pos < max) + goto found_slot; if (!ret && start == min) ret = -BCH_ERR_ENOSPC_inode_create; @@ -708,11 +756,6 @@ found_slot: return ret; } - /* We may have raced while the iterator wasn't pointing at pos: */ - if (bkey_is_inode(k.k) || - bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p)) - goto again; - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; inode_u->bi_generation = bkey_generation(k); @@ -729,11 +772,11 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, int ret = 0; /* - * We're never going to be deleting extents, no need to use an extent - * iterator: + * We're never going to be deleting partial extents, no need to use an + * extent iterator: */ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), - BTREE_ITER_INTENT); + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); while (1) { bch2_trans_begin(trans); @@ -755,14 +798,6 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bkey_init(&delete.k); delete.k.p = iter.pos; - if (iter.flags & BTREE_ITER_IS_EXTENTS) { - bch2_key_resize(&delete.k, k.k->p.offset - iter.pos.offset); - - ret = bch2_extent_trim_atomic(trans, &iter, &delete); - if (ret) - goto err; - } - ret = bch2_trans_update(trans, &iter, &delete, 0) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); @@ -807,28 +842,23 @@ retry: if (ret) goto err; - bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - SPOS(0, inum.inum, snapshot), - BTREE_ITER_INTENT|BTREE_ITER_CACHED); - k = bch2_btree_iter_peek_slot(&iter); - + k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); ret = bkey_err(k); if (ret) goto err; if (!bkey_is_inode(k.k)) { bch2_fs_inconsistent(trans.c, - "inode %llu not found when deleting", - inum.inum); + "inode %llu:%u not found when deleting", + inum.inum, snapshot); ret = -EIO; goto err; } bch2_inode_unpack(k, &inode_u); - /* Subvolume root? 
*/ - BUG_ON(inode_u.bi_subvol); - bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); @@ -897,3 +927,175 @@ void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked * else bi->bi_flags |= BCH_INODE_UNLINKED; } + +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) +{ + struct bch_opts ret = { 0 }; +#define x(_name, _bits) \ + if (inode->bi_##_name) \ + opt_set(ret, _name, inode->bi_##_name - 1); + BCH_INODE_OPTS() +#undef x + return ret; +} + +void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, + struct bch_inode_unpacked *inode) +{ +#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); + BCH_INODE_OPTS() +#undef x + + if (opts->nocow) + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; +} + +int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + int ret; + + do { + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + } while (ret == -BCH_ERR_transaction_restart_nested); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? */ + if (inode_u.bi_subvol) + bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + return ret ?: -BCH_ERR_transaction_restart_nested; +} + +static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + if (bch2_snapshot_is_internal_node(c, pos.snapshot)) + return 0; + + if (!fsck_err_on(c->sb.clean, c, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k) ? 
0 : -BCH_ERR_ENOENT_inode; + if (fsck_err_on(!bkey_is_inode(k.k), c, + "nonexistent inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + ret = bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, + "non-deleted inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + + return 1; +err: +fsck_err: + return ret; +delete: + return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); +} + +int bch2_delete_dead_inodes(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + ret = bch2_btree_write_buffer_flush_sync(&trans); + if (ret) + goto err; + + /* + * Weird transaction restart handling here because on successful delete, + * bch2_inode_rm_snapshot() will return a nested transaction restart, + * but we can't retry because the btree write buffer won't have been + * flushed and we'd spin: + */ + for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p)); + if (ret < 0) + break; + + if (ret) { + if (!test_bit(BCH_FS_RW, &c->flags)) { + bch2_trans_unlock(&trans); + bch2_fs_lazy_rw(c); + } + + ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); +err: + bch2_trans_exit(&trans); + + return ret; +} diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index be01604..22b2440 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -5,18 +5,28 @@ #include "bkey.h" #include "opts.h" +enum bkey_invalid_flags; extern const char * const bch2_inode_opts[]; -int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); -int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + #define bch2_bkey_ops_inode ((struct bkey_ops) { \ .key_invalid = bch2_inode_invalid, \ .val_to_text = bch2_inode_to_text, \ .trans_trigger = bch2_trans_mark_inode, \ .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 16, \ }) #define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ @@ -24,6 +34,7 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_inode_to_text, \ .trans_trigger = bch2_trans_mark_inode, \ .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 32, \ }) #define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ @@ -31,6 +42,7 @@ void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_inode_to_text, \ .trans_trigger = bch2_trans_mark_inode, 
\ .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 48, \ }) static inline bool bkey_is_inode(const struct bkey *k) @@ -41,12 +53,13 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ .key_invalid = bch2_inode_generation_invalid, \ .val_to_text = bch2_inode_generation_to_text, \ + .min_val_size = 8, \ }) #if 0 @@ -82,7 +95,7 @@ struct bkey_inode_buf { void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); -struct bkey_s_c bch2_inode_to_v3(struct btree_trans *, struct bkey_s_c); +struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); @@ -110,17 +123,8 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, struct bch_inode_unpacked *); -static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) -{ - struct bch_io_opts ret = { 0 }; - -#define x(_name, _bits) \ - if (inode->bi_##_name) \ - opt_set(ret, _name, inode->bi_##_name - 1); - BCH_INODE_OPTS() -#undef x - return ret; -} +#define inode_opt_get(_c, _inode, _name) \ + ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, enum inode_opt_id id, u64 v) @@ -151,17 +155,6 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } -static inline struct bch_io_opts -io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) -{ - struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); - - bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); - if (opts.nocow) - opts.compression = opts.background_compression = opts.data_checksum = opts.erasure_code; - return opts; -} - static inline u8 mode_to_type(umode_t mode) { return (mode >> 12) & 15; @@ -201,4 +194,11 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); +void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); + +int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); +int bch2_delete_dead_inodes(struct bch_fs *); + #endif /* _BCACHEFS_INODE_H */ diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 0ff835e..5bacc6a 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -27,17 +27,18 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "nocow_locking.h" #include "rebalance.h" #include "subvolume.h" #include "super.h" #include "super-io.h" +#include "trace.h" #include +#include #include #include -#include - const char *bch2_blk_status_to_str(blk_status_t status) { if (status == BLK_STS_REMOVED) @@ -45,6 +46,8 @@ const char *bch2_blk_status_to_str(blk_status_t status) return blk_status_to_str(status); } +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + static bool bch2_target_congested(struct bch_fs *c, u16 target) { const struct bch_devs_mask *devs; @@ -133,6 +136,15 @@ void bch2_latency_acct(struct 
bch_dev *ca, u64 submit_time, int rw) __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + /* Allocate, free from mempool: */ void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) @@ -151,7 +163,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) struct page *page; if (likely(!*using_mempool)) { - page = alloc_page(GFP_NOIO); + page = alloc_page(GFP_NOFS); if (unlikely(!page)) { mutex_lock(&c->bio_bounce_pages_lock); *using_mempool = true; @@ -160,7 +172,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) } } else { pool_alloc: - page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); } return page; @@ -205,7 +217,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) { + for_each_btree_key_upto_continue_norestart(iter, + new->k.p, BTREE_ITER_SLOTS, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), bkey_start_offset(old.k)); @@ -225,7 +238,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, (!new_compressed && bch2_bkey_sectors_compressed(old)))) *usage_increasing = true; - if (bkey_cmp(old.k->p, new->k.p) >= 0) + if (bkey_ge(old.k->p, new->k.p)) break; } @@ -233,57 +246,54 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, return ret; } -static int bch2_extent_update_i_size_sectors(struct btree_trans *trans, - struct btree_iter *extent_iter, - u64 new_i_size, - s64 i_sectors_delta) +static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + struct btree_iter *extent_iter, + u64 new_i_size, + s64 i_sectors_delta) { struct btree_iter iter; - struct bkey_s_c inode_k; - struct bkey_s_c_inode_v3 inode; - struct bkey_i_inode_v3 *new_inode; + struct bkey_i *k; + struct bkey_i_inode_v3 *inode; + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, - extent_iter->pos.inode, - extent_iter->snapshot), - BTREE_ITER_INTENT|BTREE_ITER_CACHED); - inode_k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(inode_k); + k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_CACHED); + ret = PTR_ERR_OR_ZERO(k); if (unlikely(ret)) - goto err; - - ret = bkey_is_inode(inode_k.k) ? 
0 : -ENOENT; - if (unlikely(ret)) - goto err; + return ret; - if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) { - inode_k = bch2_inode_to_v3(trans, inode_k); - ret = bkey_err(inode_k); + if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { + k = bch2_inode_to_v3(trans, k); + ret = PTR_ERR_OR_ZERO(k); if (unlikely(ret)) goto err; } - inode = bkey_s_c_to_inode_v3(inode_k); - - new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k)); - ret = PTR_ERR_OR_ZERO(new_inode); - if (unlikely(ret)) - goto err; - - bkey_reassemble(&new_inode->k_i, inode.s_c); + inode = bkey_i_to_inode_v3(k); - if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > le64_to_cpu(inode.v->bi_size)) - new_inode->v.bi_size = cpu_to_le64(new_i_size); + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > le64_to_cpu(inode->v.bi_size)) { + inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; + } - le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta); + if (i_sectors_delta) { + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } - new_inode->k.p.snapshot = iter.snapshot; + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } - ret = bch2_trans_update(trans, &iter, &new_inode->k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_trans_update(trans, &iter, &inode->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + inode_update_flags); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -373,6 +383,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, struct open_buckets open_buckets; struct bkey_s_c k; struct bkey_buf old, new; + unsigned sectors_allocated; bool have_reservation = false; bool unwritten = opts.nocow && c->sb.version >= bcachefs_metadata_version_unwritten_extents; @@ -383,6 +394,8 @@ int bch2_extent_fallocate(struct btree_trans *trans, closure_init_stack(&cl); open_buckets.nr = 0; retry: + sectors_allocated = 0; + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -438,16 +451,17 @@ retry: &devs_have, opts.data_replicas, opts.data_replicas, - RESERVE_none, 0, &cl, &wp); - if (ret == -EAGAIN) { + BCH_WATERMARK_normal, 0, &cl, &wp); + if (ret) { bch2_trans_unlock(trans); closure_sync(&cl); - goto retry; - } - if (ret) + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + goto retry; return ret; + } sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; bch2_key_resize(&e->k, sectors); @@ -474,6 +488,9 @@ out: goto retry; } + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); bch2_bkey_buf_exit(&new, c); @@ -513,11 +530,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(iter, snapshot); - k = bch2_btree_iter_peek(iter); - if (bkey_cmp(iter->pos, end_pos) >= 0) { - bch2_btree_iter_set_pos(iter, end_pos); + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) break; - } ret = bkey_err(k); if (ret) @@ -608,7 +626,7 @@ static int bch2_write_index_default(struct bch_write_op *op) if (ret) break; - if (bkey_cmp(iter.pos, k->k.p) >= 0) + if (bkey_ge(iter.pos, k->k.p)) bch2_keylist_pop_front(&op->insert_keys); else bch2_cut_front(iter.pos, k); @@ -642,7 +660,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, if (to_entry(ptr + 1) < ptrs.end) { 
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, - GFP_NOIO, &ca->replica_set)); + GFP_NOFS, &ca->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; @@ -671,6 +689,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, bio_sectors(&n->bio)); bio_set_dev(&n->bio, ca->disk_sb.bdev); + + if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { + bio_endio(&n->bio); + continue; + } + submit_bio(&n->bio); } else { n->bio.bi_status = BLK_STS_REMOVED; @@ -687,11 +711,13 @@ static void bch2_write_done(struct closure *cl) struct bch_fs *c = op->c; bch2_disk_reservation_put(c, &op->res); - percpu_ref_put(&c->writes); + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + EBUG_ON(cl->parent); closure_debug_destroy(cl); if (op->end_io) op->end_io(op); @@ -715,7 +741,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) } if (dst != src) - memmove_u64s_down(dst, src, src->u64s); + memmove_u64s_down(dst, src, src->k.u64s); dst = bkey_next(dst); } @@ -745,14 +771,9 @@ static void __bch2_write_index(struct bch_write_op *op) * particularly want to plumb io_opts all the way through the btree * update stack right now */ - for_each_keylist_key(keys, k) { + for_each_keylist_key(keys, k) bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); - if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) - bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); - - } - if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); @@ -765,15 +786,17 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); - if (ret) { + if (ret && !bch2_err_matches(ret, EROFS)) { struct bkey_i *k = bch2_keylist_front(&op->insert_keys); bch_err_inum_offset_ratelimited(c, k->k.p.inode, k->k.p.offset << 9, "write error while doing btree update: %s", bch2_err_str(ret)); - goto err; } + + if (ret) + goto err; } out: /* If some a bucket wasn't written, we can't erasure code it: */ @@ -789,17 +812,61 @@ err: goto out; } +static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) +{ + if (state != wp->state) { + u64 now = ktime_get_ns(); + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) + wp->time[wp->state] += now - wp->last_state_change; + wp->state = state; + wp->last_state_change = now; + } +} + +static inline void wp_update_state(struct write_point *wp, bool running) +{ + enum write_point_state state; + + state = running ? WRITE_POINT_running : + !list_empty(&wp->writes) ? 
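The new __wp_update_state()/wp_update_state() helpers above account how long a write point spends in each state by charging the interval since the last transition to the outgoing state. A self-contained sketch of the same accounting, with invented names (enum wp_state, struct fake_wp, wp_set_state, now_ns) standing in for the bcachefs types:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

enum wp_state { WP_STOPPED, WP_WAITING_IO, WP_WAITING_WORK, WP_RUNNING, WP_NR };

struct fake_wp {
	enum wp_state	state;
	uint64_t	last_change_ns;
	uint64_t	time_ns[WP_NR];		/* total time spent in each state */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t) ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void wp_set_state(struct fake_wp *wp, enum wp_state state)
{
	if (state == wp->state)
		return;

	uint64_t now = now_ns();

	/* charge the interval since the last transition to the old state */
	if (wp->last_change_ns && now > wp->last_change_ns)
		wp->time_ns[wp->state] += now - wp->last_change_ns;

	wp->state = state;
	wp->last_change_ns = now;
}

int main(void)
{
	struct fake_wp wp = { .state = WP_STOPPED };

	wp_set_state(&wp, WP_RUNNING);
	wp_set_state(&wp, WP_WAITING_IO);
	wp_set_state(&wp, WP_STOPPED);

	printf("ns running: %llu\n", (unsigned long long) wp.time_ns[WP_RUNNING]);
	return 0;
}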
WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + + __wp_update_state(wp, state); +} + static void bch2_write_index(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct write_point *wp = op->wp; struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; + + if ((op->flags & BCH_WRITE_DONE) && + (op->flags & BCH_WRITE_MOVE)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + + spin_lock_irqsave(&wp->writes_lock, flags); + if (wp->state == WRITE_POINT_waiting_io) + __wp_update_state(wp, WRITE_POINT_waiting_work); + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock_irqrestore (&wp->writes_lock, flags); - barrier(); - op->btree_update_ready = true; queue_work(wq, &wp->index_update_work); } +static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) +{ + op->wp = wp; + + if (wp->state == WRITE_POINT_stopped) { + spin_lock_irq(&wp->writes_lock); + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock_irq(&wp->writes_lock); + } +} + void bch2_write_point_do_index_updates(struct work_struct *work) { struct write_point *wp = @@ -807,17 +874,18 @@ void bch2_write_point_do_index_updates(struct work_struct *work) struct bch_write_op *op; while (1) { - spin_lock(&wp->writes_lock); + spin_lock_irq(&wp->writes_lock); op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op && !op->btree_update_ready) - op = NULL; if (op) list_del(&op->wp_list); - spin_unlock(&wp->writes_lock); + wp_update_state(wp, op != NULL); + spin_unlock_irq(&wp->writes_lock); if (!op) break; + op->flags |= BCH_WRITE_IN_WORKER; + __bch2_write_index(op); if (!(op->flags & BCH_WRITE_DONE)) @@ -859,12 +927,10 @@ static void bch2_write_endio(struct bio *bio) if (wbio->put_bio) bio_put(bio); - if (parent) { + if (parent) bio_endio(&parent->bio); - return; - } - - closure_put(cl); + else + closure_put(cl); } static void init_append_extent(struct bch_write_op *op, @@ -872,7 +938,6 @@ static void init_append_extent(struct bch_write_op *op, struct bversion version, struct bch_extent_crc_unpacked crc) { - struct bch_fs *c = op->c; struct bkey_i_extent *e; op->pos.offset += crc.uncompressed_size; @@ -887,7 +952,7 @@ static void init_append_extent(struct bch_write_op *op, crc.nonce) bch2_extent_crc_append(&e->k_i, crc); - bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size, + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, op->flags & BCH_WRITE_CACHED); bch2_keylist_push(&op->insert_keys); @@ -911,7 +976,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, pages = min(pages, BIO_MAX_VECS); bio = bio_alloc_bioset(NULL, pages, 0, - GFP_NOIO, &c->bio_write); + GFP_NOFS, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = true; /* copy WRITE_SYNC flag */ @@ -1013,11 +1078,12 @@ static enum prep_encoded_ret { /* Can we just write the entire extent as is? 
*/ if (op->crc.uncompressed_size == op->crc.live_size && op->crc.compressed_size <= wp->sectors_free && - (op->crc.compression_type == op->compression_type || + (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { if (!crc_is_compressed(op->crc) && op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type)) + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; return PREP_ENCODED_DO_WRITE; @@ -1037,7 +1103,7 @@ static enum prep_encoded_ret { csum = bch2_checksum_bio(c, op->crc.csum_type, extent_nonce(op->version, op->crc), bio); - if (bch2_crc_cmp(op->crc.csum, csum)) + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) @@ -1055,13 +1121,14 @@ static enum prep_encoded_ret { */ if ((op->crc.live_size != op->crc.uncompressed_size || op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type)) + bch2_write_rechecksum(c, op, op->csum_type) && + !c->opts.no_data_io) return PREP_ENCODED_CHECKSUM_ERR; /* * If we want to compress the data, it has to be decrypted: */ - if ((op->compression_type || + if ((op->compression_opt || bch2_csum_type_is_encryption(op->crc.csum_type) != bch2_csum_type_is_encryption(op->csum_type)) && bch2_write_decrypt(op)) @@ -1108,7 +1175,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } if (ec_buf || - op->compression_type || + op->compression_opt || (op->csum_type && !(op->flags & BCH_WRITE_PAGES_STABLE)) || (bch2_csum_type_is_encryption(op->csum_type) && @@ -1131,16 +1198,16 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, dst->bi_iter.bi_size < c->opts.encoded_extent_max) break; - BUG_ON(op->compression_type && + BUG_ON(op->compression_opt && (op->flags & BCH_WRITE_DATA_ENCODED) && bch2_csum_type_is_encryption(op->crc.csum_type)); - BUG_ON(op->compression_type && !bounce); + BUG_ON(op->compression_opt && !bounce); crc.compression_type = op->incompressible ? BCH_COMPRESSION_TYPE_incompressible - : op->compression_type + : op->compression_opt ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_type) + op->compression_opt) : 0; if (!crc_is_compressed(crc)) { dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); @@ -1249,7 +1316,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, BUG_ON(total_output != total_input); dst = bio_split(src, total_input >> 9, - GFP_NOIO, &c->bio_write); + GFP_NOFS, &c->bio_write); wbio_init(dst)->put_bio = true; /* copy WRITE_SYNC flag */ dst->bi_opf = src->bi_opf; @@ -1328,13 +1395,11 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, return 0; } - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; - bkey_reassemble(new, k); - bch2_cut_front(bkey_start_pos(&orig->k), new); bch2_cut_back(orig->k.p, new); @@ -1367,23 +1432,23 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) bch2_trans_init(&trans, c, 0, 0); for_each_keylist_key(&op->insert_keys, orig) { - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, - bkey_start_pos(&orig->k), + ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_INTENT, k, NULL, NULL, BTREE_INSERT_NOFAIL, ({ - if (bkey_cmp(bkey_start_pos(k.k), orig->k.p) >= 0) - break; - bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); })); - if (ret) { + if (ret && !bch2_err_matches(ret, EROFS)) { struct bkey_i *k = bch2_keylist_front(&op->insert_keys); bch_err_inum_offset_ratelimited(c, k->k.p.inode, k->k.p.offset << 9, "write error while doing btree update: %s", bch2_err_str(ret)); + } + + if (ret) { op->error = ret; break; } @@ -1417,9 +1482,15 @@ static void bch2_nocow_write(struct bch_write_op *op) struct btree_iter iter; struct bkey_s_c k; struct bkey_ptrs_c ptrs; - const struct bch_extent_ptr *ptr, *ptr2; + const struct bch_extent_ptr *ptr; + struct { + struct bpos b; + unsigned gen; + struct nocow_lock_bucket *l; + } buckets[BCH_REPLICAS_MAX]; + unsigned nr_buckets = 0; u32 snapshot; - int ret; + int ret, i; if (op->flags & BCH_WRITE_MOVE) return; @@ -1438,6 +1509,8 @@ retry: while (1) { struct bio *bio = &op->wbio.bio; + nr_buckets = 0; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) @@ -1456,27 +1529,47 @@ retry: /* Get iorefs before dropping btree locks: */ ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr) + bkey_for_each_ptr(ptrs, ptr) { + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); + buckets[nr_buckets].gen = ptr->gen; + buckets[nr_buckets].l = + bucket_nocow_lock(&c->nocow_locks, + bucket_to_u64(buckets[nr_buckets].b)); + + prefetch(buckets[nr_buckets].l); + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) goto err_get_ioref; + nr_buckets++; + + if (ptr->unwritten) + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + } + /* Unlock before taking nocow locks, doing IO: */ bkey_reassemble(op->insert_keys.top, k); bch2_trans_unlock(&trans); bch2_cut_front(op->pos, op->insert_keys.top); - bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); - ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(op->insert_keys.top)); - bkey_for_each_ptr(ptrs, ptr) { - bch2_bucket_nocow_lock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr), - BUCKET_NOCOW_LOCK_UPDATE); - if 
(unlikely(ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) - goto err_bucket_stale; + for (i = 0; i < nr_buckets; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); + struct nocow_lock_bucket *l = buckets[i].l; + bool stale; - if (ptr->unwritten) - op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + __bch2_bucket_nocow_lock(&c->nocow_locks, l, + bucket_to_u64(buckets[i].b), + BUCKET_NOCOW_LOCK_UPDATE); + + rcu_read_lock(); + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + rcu_read_unlock(); + + if (unlikely(stale)) + goto err_bucket_stale; } bio = &op->wbio.bio; @@ -1540,29 +1633,21 @@ err: } return; err_get_ioref: - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; - - percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); - } + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); /* Fall back to COW path: */ goto out; err_bucket_stale: - bkey_for_each_ptr(ptrs, ptr2) { + while (--i >= 0) bch2_bucket_nocow_unlock(&c->nocow_locks, - PTR_BUCKET_POS(c, ptr2), + buckets[i].b, BUCKET_NOCOW_LOCK_UPDATE); - if (ptr2 == ptr) - break; - } - - bkey_for_each_ptr(ptrs, ptr2) - percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); /* We can retry this: */ - ret = BCH_ERR_transaction_restart; + ret = -BCH_ERR_transaction_restart; goto out; } @@ -1576,14 +1661,13 @@ static void __bch2_write(struct bch_write_op *op) nofs_flags = memalloc_nofs_save(); - if (unlikely(op->opts.nocow)) { + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); if (op->flags & BCH_WRITE_DONE) goto out_nofs_restore; } again: memset(&op->failed, 0, sizeof(op->failed)); - op->btree_update_ready = false; do { struct bkey_i *key_to_write; @@ -1614,33 +1698,33 @@ again: &op->devs_have, op->nr_replicas, op->nr_replicas_required, - op->alloc_reserve, + op->watermark, op->flags, (op->flags & (BCH_WRITE_ALLOC_NOWAIT| BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : &op->cl, &wp)); if (unlikely(ret)) { - if (unlikely(ret != -EAGAIN)) { - op->error = ret; - op->flags |= BCH_WRITE_DONE; - } + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; - break; + goto err; } + EBUG_ON(!wp); + bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); - bch2_alloc_sectors_done(c, wp); - - if (ret < 0) { - op->error = ret; + bch2_alloc_sectors_done_inlined(c, wp); +err: + if (ret <= 0) { op->flags |= BCH_WRITE_DONE; - break; - } - if (!ret) - op->flags |= BCH_WRITE_DONE; + if (ret < 0) { + op->error = ret; + break; + } + } bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; @@ -1662,7 +1746,9 @@ again: * synchronously here if we weren't able to submit all of the IO at * once, as that signals backpressure to the caller. 
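The reworked nocow path earlier in this hunk records each bucket's position and generation in a small array before dropping btree locks, then takes the nocow locks and unwinds whatever it already acquired when it hits a stale generation or a failed ioref (the err_get_ioref / err_bucket_stale labels). A sketch of that acquire-or-unwind shape, using plain pthread mutexes and invented names (struct bucket_lock, lock_buckets); it is not the bcachefs locking code itself:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_BUCKETS 4

struct bucket_lock {
	pthread_mutex_t	lock;
	unsigned	gen;		/* current generation of the bucket */
};

/* Returns true if every lock was taken and no generation moved underneath us;
 * on failure, every lock taken so far is dropped before returning. */
static bool lock_buckets(struct bucket_lock *b, const unsigned *want_gen, int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		pthread_mutex_lock(&b[i].lock);

		if (b[i].gen != want_gen[i]) {
			/* stale: unlock what we hold, newest first */
			pthread_mutex_unlock(&b[i].lock);
			while (--i >= 0)
				pthread_mutex_unlock(&b[i].lock);
			return false;
		}
	}
	return true;
}

int main(void)
{
	struct bucket_lock b[MAX_BUCKETS];
	unsigned want[MAX_BUCKETS] = { 0, 0, 1, 0 };	/* gen mismatch at index 2 */

	for (int i = 0; i < MAX_BUCKETS; i++) {
		pthread_mutex_init(&b[i].lock, NULL);
		b[i].gen = 0;
	}

	printf("locked: %d\n", lock_buckets(b, want, MAX_BUCKETS));	/* prints 0 */
	return 0;
}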
*/ - if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) { + if ((op->flags & BCH_WRITE_SYNC) || + (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER))) { closure_sync(&op->cl); __bch2_write_index(op); @@ -1670,11 +1756,7 @@ again: goto again; bch2_write_done(&op->cl); } else { - spin_lock(&wp->writes_lock); - op->wp = wp; - list_add_tail(&op->wp_list, &wp->writes); - spin_unlock(&wp->writes_lock); - + bch2_write_queue(op, wp); continue_at(&op->cl, bch2_write_index, NULL); } out_nofs_restore: @@ -1689,6 +1771,9 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) unsigned sectors; int ret; + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, @@ -1716,9 +1801,6 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) set_bkey_val_bytes(&id->k, data_len); bch2_keylist_push(&op->insert_keys); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_DONE; - __bch2_write_index(op); err: bch2_write_done(&op->cl); @@ -1750,7 +1832,7 @@ void bch2_write(struct closure *cl) EBUG_ON(op->cl.parent); BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); - BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + BUG_ON(bkey_eq(op->pos, POS_MAX)); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); @@ -1765,9 +1847,14 @@ void bch2_write(struct closure *cl) goto err; } - if (c->opts.nochanges || - !percpu_ref_tryget_live(&c->writes)) { - op->error = -EROFS; + if (c->opts.nochanges) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + if (!(op->flags & BCH_WRITE_MOVE) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; goto err; } @@ -1793,6 +1880,34 @@ err: op->end_io(op); } +static const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x + NULL +}; + +void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) +{ + prt_str(out, "pos: "); + bch2_bpos_to_text(out, op->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "started: "); + bch2_pr_time_units(out, local_clock() - op->start_time); + prt_newline(out); + + prt_str(out, "flags: "); + prt_bitflags(out, bch2_write_flags, op->flags); + prt_newline(out); + + prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + /* Cache promotion on read */ struct promote_op { @@ -1845,10 +1960,12 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) { int ret; + bch2_data_update_exit(&op->write); + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params); BUG_ON(ret); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } @@ -1860,8 +1977,6 @@ static void promote_done(struct bch_write_op *wop) bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - - bch2_data_update_exit(&op->write); promote_free(c, op); } @@ -1882,7 +1997,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) bch2_data_update_read_done(&op->write, rbio->pick.crc); } -static struct promote_op *__promote_alloc(struct bch_fs *c, +static struct promote_op *__promote_alloc(struct btree_trans *trans, enum btree_id btree_id, struct bkey_s_c k, struct bpos pos, @@ -1891,15 +2006,16 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, 
unsigned sectors, struct bch_read_bio **rbio) { + struct bch_fs *c = trans->c; struct promote_op *op = NULL; struct bio *bio; unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!percpu_ref_tryget_live(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return NULL; - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); if (!op) goto err; @@ -1912,7 +2028,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, */ *rbio = kzalloc(sizeof(struct bch_read_bio) + sizeof(struct bio_vec) * pages, - GFP_NOIO); + GFP_NOFS); if (!*rbio) goto err; @@ -1920,7 +2036,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, - GFP_NOIO)) + GFP_NOFS)) goto err; (*rbio)->bounce = true; @@ -1934,7 +2050,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, bio = &op->write.op.wbio.bio; bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - ret = bch2_data_update_init(c, &op->write, + ret = bch2_data_update_init(trans, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, (struct data_update_opts) { @@ -1943,7 +2059,17 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, }, btree_id, k); - BUG_ON(ret); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ + if (ret) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + op->write.op.end_io = promote_done; return op; @@ -1953,21 +2079,22 @@ err: kfree(*rbio); *rbio = NULL; kfree(op); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); return NULL; } noinline -static struct promote_op *promote_alloc(struct bch_fs *c, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned flags, - struct bch_read_bio **rbio, - bool *bounce, - bool *read_full) +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) { + struct bch_fs *c = trans->c; bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full @@ -1981,7 +2108,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, + promote = __promote_alloc(trans, k.k->type == KEY_TYPE_reflink_v ? 
BTREE_ID_reflink : BTREE_ID_extents, @@ -2189,9 +2316,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); if ((ret = bkey_err(k))) goto out; @@ -2267,7 +2393,7 @@ static void __bch2_read_endio(struct work_struct *work) } csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) goto csum_err; /* @@ -2292,7 +2418,8 @@ static void __bch2_read_endio(struct work_struct *work) if (ret) goto decrypt_err; - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) goto decompression_err; } else { /* don't need to decrypt the entire bio: */ @@ -2427,10 +2554,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans, reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; - bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), - BTREE_ITER_SLOTS); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), 0); ret = bkey_err(k); if (ret) goto err; @@ -2588,7 +2713,7 @@ retry_pick: } if (orig->opts.promote_target) - promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, &rbio, &bounce, &read_full); if (!read_full) { @@ -2627,7 +2752,7 @@ get_bio: rbio = rbio_init(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), 0, - GFP_NOIO, + GFP_NOFS, &c->bio_read_split), orig->opts); @@ -2643,7 +2768,7 @@ get_bio: * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO, + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, &c->bio_read_split), orig->opts); rbio->bio.bi_iter = iter; @@ -2718,10 +2843,21 @@ get_bio: bio_sectors(&rbio->bio)); bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - if (likely(!(flags & BCH_READ_IN_RETRY))) - submit_bio(&rbio->bio); - else - submit_bio_wait(&rbio->bio); + if (unlikely(c->opts.no_data_io)) { + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(c, rbio)) { @@ -2897,24 +3033,27 @@ void bch2_fs_io_exit(struct bch_fs *c) int bch2_fs_io_init(struct bch_fs *c) { - unsigned i; + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_init; - for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) - two_state_lock_init(&c->nocow_locks.l[i]); + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_split_init; - if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - 
BIOSET_NEED_BVECS) || - bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS) || - mempool_init_page_pool(&c->bio_bounce_pages, + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_write_init; + + if (mempool_init_page_pool(&c->bio_bounce_pages, max_t(unsigned, c->opts.btree_node_size, c->opts.encoded_extent_max) / - PAGE_SIZE, 0) || - rhashtable_init(&c->promote_table, &bch_promote_params)) - return -ENOMEM; + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) + return -BCH_ERR_ENOMEM_promote_table_init; return 0; } diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 68e4d76..1476380 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -15,7 +15,11 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); @@ -24,27 +28,38 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, const char *bch2_blk_status_to_str(blk_status_t); +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + enum bch_write_flags { - BCH_WRITE_ALLOC_NOWAIT = (1 << 0), - BCH_WRITE_CACHED = (1 << 1), - BCH_WRITE_DATA_ENCODED = (1 << 2), - BCH_WRITE_PAGES_STABLE = (1 << 3), - BCH_WRITE_PAGES_OWNED = (1 << 4), - BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5), - BCH_WRITE_WROTE_DATA_INLINE = (1 << 6), - BCH_WRITE_CHECK_ENOSPC = (1 << 7), - BCH_WRITE_SYNC = (1 << 8), - BCH_WRITE_MOVE = (1 << 9), - - /* Internal: */ - BCH_WRITE_DONE = (1 << 10), - BCH_WRITE_IO_ERROR = (1 << 11), - BCH_WRITE_CONVERT_UNWRITTEN = (1 << 12), +#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x }; static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { - return op->alloc_reserve == RESERVE_movinggc + return op->watermark == BCH_WATERMARK_copygc ? 
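The hand-numbered bch_write_flags enum below is replaced by a BCH_WRITE_FLAGS() x-macro, which expands once into bit indices, once into bit masks, and once into the bch2_write_flags[] name table that prt_bitflags() uses in bch2_write_op_to_text(). A standalone sketch of the technique with made-up EXAMPLE_* flags:

#include <stdio.h>

#define EXAMPLE_FLAGS()	\
	x(SYNC)		\
	x(CACHED)	\
	x(DONE)

enum example_flag_bits {
#define x(f) __EXAMPLE_##f,
	EXAMPLE_FLAGS()
#undef x
	__EXAMPLE_NR,
};

enum example_flags {
#define x(f) EXAMPLE_##f = 1u << __EXAMPLE_##f,
	EXAMPLE_FLAGS()
#undef x
};

static const char * const example_flag_names[] = {
#define x(f) #f,
	EXAMPLE_FLAGS()
#undef x
	NULL
};

/* The single flag list keeps the printed names in sync with the enum. */
static void print_flags(unsigned flags)
{
	for (unsigned i = 0; i < __EXAMPLE_NR; i++)
		if (flags & (1u << i))
			printf("%s ", example_flag_names[i]);
	printf("\n");
}

int main(void)
{
	print_flags(EXAMPLE_SYNC | EXAMPLE_DONE);	/* prints "SYNC DONE" */
	return 0;
}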
op->c->copygc_wq : op->c->btree_update_wq; } @@ -71,10 +86,10 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->written = 0; op->error = 0; op->csum_type = bch2_data_checksum_type(c, opts); - op->compression_type = bch2_compression_opt_to_type[opts.compression]; + op->compression_opt = opts.compression; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; - op->alloc_reserve = RESERVE_none; + op->watermark = BCH_WATERMARK_normal; op->incompressible = 0; op->open_buckets.nr = 0; op->devs_have.nr = 0; @@ -98,10 +113,12 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) { struct bch_write_bio *wbio = to_wbio(bio); - memset(wbio, 0, offsetof(struct bch_write_bio, bio)); + memset(&wbio->wbio, 0, sizeof(wbio->wbio)); return wbio; } +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 4e5d310..737f16d 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -83,6 +83,7 @@ struct bch_read_bio { }; struct bch_write_bio { + struct_group(wbio, struct bch_fs *c; struct bch_write_bio *parent; @@ -99,6 +100,7 @@ struct bch_write_bio { nocow:1, used_mempool:1, first_btree_write:1; + ); struct bio bio; }; @@ -113,13 +115,13 @@ struct bch_write_op { u16 flags; s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ + unsigned compression_opt:8; unsigned csum_type:4; - unsigned compression_type:4; unsigned nr_replicas:4; unsigned nr_replicas_required:4; - unsigned alloc_reserve:3; + unsigned watermark:3; unsigned incompressible:1; - unsigned btree_update_ready:1; + unsigned stripe_waited:1; struct bch_devs_list devs_have; u16 target; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 95c2922..80a612c 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -17,20 +17,14 @@ #include "journal_reclaim.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" - -#include - -#define x(n) #n, -static const char * const bch2_journal_watermarks[] = { - JOURNAL_WATERMARKS() - NULL -}; +#include "trace.h" static const char * const bch2_journal_errors[] = { +#define x(n) #n, JOURNAL_ERRORS() +#undef x NULL }; -#undef x static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { @@ -68,13 +62,75 @@ journal_seq_to_buf(struct journal *j, u64 seq) static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->key_cache_list); + unsigned i; + for (i = 0; i < ARRAY_SIZE(p->list); i++) + INIT_LIST_HEAD(&p->list[i]); INIT_LIST_HEAD(&p->flushed); atomic_set(&p->count, count); p->devs.nr = 0; } +/* + * Detect stuck journal conditions and trigger shutdown. Technically the journal + * can end up stuck for a variety of reasons, such as a blocked I/O, journal + * reservation lockup, etc. Since this is a fatal error with potentially + * unpredictable characteristics, we want to be fairly conservative before we + * decide to shut things down. + * + * Consider the journal stuck when it appears full with no ability to commit + * btree transactions, to discard journal buckets, nor acquire priority + * (reserved watermark) reservation. 
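The io_types.h hunk above wraps the leading members of struct bch_write_bio in struct_group(wbio, ...), so wbio_init() can zero just that prefix with one sized memset while leaving the embedded bio untouched. A simplified stand-in for how such a grouping macro works, assuming a home-made MY_GROUP() and an invented struct wbio_like rather than the kernel's struct_group():

#include <stdio.h>
#include <string.h>

/* Union of an anonymous struct and a named struct with the same members:
 * members stay directly addressable, and the named copy gives the group a
 * size and address for memset()/memcpy(). */
#define MY_GROUP(NAME, ...)			\
	union {					\
		struct { __VA_ARGS__ };		\
		struct { __VA_ARGS__ } NAME;	\
	}

struct wbio_like {
	MY_GROUP(hdr,
		void		*c;
		unsigned	submit_time;
		unsigned	bounce:1,
				put_bio:1;
	);
	char		payload[64];	/* stands in for the embedded struct bio */
};

int main(void)
{
	struct wbio_like w;

	memset(w.payload, 'x', sizeof(w.payload));
	memset(&w.hdr, 0, sizeof(w.hdr));	/* zero only the prefix group */

	w.bounce = 1;				/* members still addressable directly */
	printf("bounce=%u payload[0]=%c\n", (unsigned) w.bounce, w.payload[0]);
	return 0;
}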
+ */ +static inline bool +journal_error_check_stuck(struct journal *j, int error, unsigned flags) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool stuck = false; + struct printbuf buf = PRINTBUF; + + if (!(error == JOURNAL_ERR_journal_full || + error == JOURNAL_ERR_journal_pin_full) || + nr_unwritten_journal_entries(j) || + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) + return stuck; + + spin_lock(&j->lock); + + if (j->can_discard) { + spin_unlock(&j->lock); + return stuck; + } + + stuck = true; + + /* + * The journal shutdown path will set ->err_seq, but do it here first to + * serialize against concurrent failures and avoid duplicate error + * reports. + */ + if (j->err_seq) { + spin_unlock(&j->lock); + return stuck; + } + j->err_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", + bch2_journal_errors[error]); + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); + + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + dump_stack(); + + return stuck; +} + /* journal entry close/open: */ void __bch2_journal_buf_put(struct journal *j) @@ -162,6 +218,7 @@ void bch2_journal_halt(struct journal *j) __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); if (!j->err_seq) j->err_seq = journal_cur_seq(j); + journal_wake(j); spin_unlock(&j->lock); } @@ -199,12 +256,6 @@ static bool journal_entry_close(struct journal *j) /* * should _only_ called from journal_res_get() - when we actually want a * journal reservation - journal entry is open means journal is dirty: - * - * returns: - * 0: success - * -ENOSPC: journal currently full, must invoke reclaim - * -EAGAIN: journal blocked, must wait - * -EROFS: insufficient rw devices or journal error */ static int journal_entry_open(struct journal *j) { @@ -231,7 +282,7 @@ static int journal_entry_open(struct journal *j) if (!fifo_free(&j->pin)) return JOURNAL_ERR_journal_pin_full; - if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) - 1) + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; BUG_ON(!j->cur_entry_sectors); @@ -250,7 +301,7 @@ static int journal_entry_open(struct journal *j) journal_entry_overhead(j); u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); - if (u64s <= 0) + if (u64s <= (ssize_t) j->early_journal_entries.nr) return JOURNAL_ERR_journal_full; if (fifo_empty(&j->pin) && j->reclaim_thread) @@ -275,6 +326,12 @@ static int journal_entry_open(struct journal *j) buf->data->seq = cpu_to_le64(journal_cur_seq(j)); buf->data->u64s = 0; + if (j->early_journal_entries.nr) { + memcpy(buf->data->_data, j->early_journal_entries.data, + j->early_journal_entries.nr * sizeof(u64)); + le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); + } + /* * Must be set before marking the journal entry as open: */ @@ -291,7 +348,9 @@ static int journal_entry_open(struct journal *j) BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); journal_state_inc(&new); - new.cur_entry_offset = 0; + + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); @@ -304,6 +363,9 @@ static int journal_entry_open(struct journal *j) &j->write_work, msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); + + if (j->early_journal_entries.nr) + 
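These journal hunks replace raw -EROFS/-EAGAIN/-ENOMEM returns with private codes such as -BCH_ERR_erofs_journal_err and -BCH_ERR_journal_res_get_blocked, matched with bch2_err_matches(). A rough sketch of that private-error-code idea, with invented MYFS_* names and a deliberately simplified matcher that is not the real bch2_err_matches() implementation:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MYFS_ERR_START	2048		/* past any standard errno value */

#define MYFS_ERRORS()				\
	x(EROFS,  erofs_journal_err)		\
	x(EROFS,  erofs_no_writes)		\
	x(EAGAIN, journal_res_get_blocked)

enum myfs_errcode {
	MYFS_ERR_START_ = MYFS_ERR_START,
#define x(class, name) MYFS_ERR_##name,
	MYFS_ERRORS()
#undef x
};

/* Each private code remembers which standard errno it stands for: */
static const int myfs_err_class[] = {
#define x(class, name) [MYFS_ERR_##name - MYFS_ERR_START - 1] = class,
	MYFS_ERRORS()
#undef x
};

static bool myfs_err_matches(int ret, int class)
{
	ret = -ret;
	if (ret == class)
		return true;
	if (ret > MYFS_ERR_START)
		return myfs_err_class[ret - MYFS_ERR_START - 1] == class;
	return false;
}

int main(void)
{
	int ret = -MYFS_ERR_erofs_no_writes;

	printf("matches EROFS:  %d\n", myfs_err_matches(ret, EROFS));	/* 1 */
	printf("matches EAGAIN: %d\n", myfs_err_matches(ret, EAGAIN));	/* 0 */
	return 0;
}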
darray_exit(&j->early_journal_entries); return 0; } @@ -353,10 +415,16 @@ retry: return 0; if (bch2_journal_error(j)) - return -EROFS; + return -BCH_ERR_erofs_journal_err; spin_lock(&j->lock); + /* check once more in case somebody else shut things down... */ + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return -BCH_ERR_erofs_journal_err; + } + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call journal_entry_close() @@ -367,7 +435,7 @@ retry: return 0; } - if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { /* * Don't want to close current journal entry, just need to * invoke reclaim: @@ -404,28 +472,8 @@ unlock: if (!ret) goto retry; - - if ((ret == JOURNAL_ERR_journal_full || - ret == JOURNAL_ERR_journal_pin_full) && - !can_discard && - !nr_unwritten_journal_entries(j) && - (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { - struct printbuf buf = PRINTBUF; - - bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (ret %s)", - bch2_journal_errors[ret]); - - bch2_journal_debug_to_text(&buf, j); - bch_err(c, "%s", buf.buf); - - printbuf_reset(&buf); - bch2_journal_pins_to_text(&buf, j); - bch_err(c, "Journal pins:\n%s", buf.buf); - - printbuf_exit(&buf); - bch2_fatal_error(c); - dump_stack(); - } + if (journal_error_check_stuck(j, ret, flags)) + ret = -BCH_ERR_journal_res_get_blocked; /* * Journal is full - can't rely on reclaim from work item due to @@ -445,7 +493,9 @@ unlock: } } - return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN; + return ret == JOURNAL_ERR_insufficient_devices + ? -BCH_ERR_erofs_journal_err + : -BCH_ERR_journal_res_get_blocked; } /* @@ -464,7 +514,8 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, int ret; closure_wait_event(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -EAGAIN || + (ret = __journal_res_get(j, res, flags)) != + -BCH_ERR_journal_res_get_blocked|| (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } @@ -720,39 +771,6 @@ int bch2_journal_meta(struct journal *j) return bch2_journal_flush_seq(j, res.seq); } -int bch2_journal_log_msg(struct journal *j, const char *fmt, ...) 
-{ - struct jset_entry_log *entry; - struct journal_res res = { 0 }; - unsigned msglen, u64s; - va_list args; - int ret; - - va_start(args, fmt); - msglen = vsnprintf(NULL, 0, fmt, args) + 1; - va_end(args); - - u64s = jset_u64s(DIV_ROUND_UP(msglen, sizeof(u64))); - - ret = bch2_journal_res_get(j, &res, u64s, 0); - if (ret) - return ret; - - entry = container_of(journal_res_entry(j, &res), - struct jset_entry_log, entry); - memset(entry, 0, u64s * sizeof(u64)); - entry->entry.type = BCH_JSET_ENTRY_log; - entry->entry.u64s = u64s - 1; - - va_start(args, fmt); - vsnprintf(entry->d, INT_MAX, fmt, args); - va_end(args); - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq); -} - /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) @@ -783,26 +801,18 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, u64 *new_bucket_seq = NULL, *new_buckets = NULL; struct open_bucket **ob = NULL; long *bu = NULL; - unsigned i, nr_got = 0, nr_want = nr - ja->nr; - unsigned old_nr = ja->nr; - unsigned old_discard_idx = ja->discard_idx; - unsigned old_dirty_idx_ondisk = ja->dirty_idx_ondisk; - unsigned old_dirty_idx = ja->dirty_idx; - unsigned old_cur_idx = ja->cur_idx; + unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; int ret = 0; - if (c) { - bch2_journal_flush_all_pins(&c->journal); - bch2_journal_block(&c->journal); - } + BUG_ON(nr <= ja->nr); bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = -ENOMEM; - goto err_unblock; + ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + goto err_free; } for (nr_got = 0; nr_got < nr_want; nr_got++) { @@ -813,12 +823,18 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, break; } } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, - false, cl); - if (IS_ERR(ob[nr_got])) { - ret = cl - ? 
-EAGAIN - : -BCH_ERR_ENOSPC_bucket_alloc; + ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + ob[nr_got]->bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (ret) { + bch2_open_bucket_put(c, ob[nr_got]); + bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret)); break; } @@ -827,76 +843,77 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } if (!nr_got) - goto err_unblock; + goto err_free; - /* - * We may be called from the device add path, before the new device has - * actually been added to the running filesystem: - */ - if (!new_fs) - spin_lock(&c->journal.lock); + /* Don't return an error if we successfully allocated some buckets: */ + ret = 0; + + if (c) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_block(&c->journal); + mutex_lock(&c->sb_lock); + } memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); + + BUG_ON(ja->discard_idx > ja->nr); + + pos = ja->discard_idx ?: ja->nr; + + memmove(new_buckets + pos + nr_got, + new_buckets + pos, + sizeof(new_buckets[0]) * (ja->nr - pos)); + memmove(new_bucket_seq + pos + nr_got, + new_bucket_seq + pos, + sizeof(new_bucket_seq[0]) * (ja->nr - pos)); for (i = 0; i < nr_got; i++) { - unsigned pos = ja->discard_idx ?: ja->nr; - long b = bu[i]; - - __array_insert_item(ja->buckets, ja->nr, pos); - __array_insert_item(ja->bucket_seq, ja->nr, pos); - ja->nr++; - - ja->buckets[pos] = b; - ja->bucket_seq[pos] = 0; - - if (pos <= ja->discard_idx) - ja->discard_idx = (ja->discard_idx + 1) % ja->nr; - if (pos <= ja->dirty_idx_ondisk) - ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - if (pos <= ja->dirty_idx) - ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; - if (pos <= ja->cur_idx) - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + new_buckets[pos + i] = bu[i]; + new_bucket_seq[pos + i] = 0; } - ret = bch2_journal_buckets_to_sb(c, ca); - if (ret) { - /* Revert: */ - swap(new_buckets, ja->buckets); - swap(new_bucket_seq, ja->bucket_seq); - ja->nr = old_nr; - ja->discard_idx = old_discard_idx; - ja->dirty_idx_ondisk = old_dirty_idx_ondisk; - ja->dirty_idx = old_dirty_idx; - ja->cur_idx = old_cur_idx; - } + nr = ja->nr + nr_got; + + ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); + if (ret) + goto err_unblock; if (!new_fs) - spin_unlock(&c->journal.lock); + bch2_write_super(c); + /* Commit: */ if (c) - bch2_journal_unblock(&c->journal); + spin_lock(&c->journal.lock); - if (ret) - goto err; + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = nr; - if (!new_fs) { - for (i = 0; i < nr_got; i++) { - ret = bch2_trans_run(c, - bch2_trans_mark_metadata_bucket(&trans, ca, - bu[i], BCH_DATA_journal, - ca->mi.bucket_size)); - if (ret) { - bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret); - goto err; - } - } + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; + if (pos <= ja->dirty_idx_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; + if (pos <= ja->dirty_idx) + ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; + + if (c) + spin_unlock(&c->journal.lock); +err_unblock: + if (c) { + bch2_journal_unblock(&c->journal); + 
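Instead of inserting journal buckets one at a time with __array_insert_item(), the rewritten __bch2_set_nr_journal_buckets() above opens a gap at the discard index with memmove(), copies the new buckets in, and then bumps every ring index at or past the insertion point. A small standalone sketch of that insertion step, with illustrative names only (insert_buckets, cur_idx):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_NR 16

static void insert_buckets(uint64_t *buckets, unsigned *nr, unsigned pos,
			   const uint64_t *new, unsigned nr_new)
{
	/* move the tail [pos, *nr) up by nr_new slots to open a gap */
	memmove(buckets + pos + nr_new,
		buckets + pos,
		sizeof(buckets[0]) * (*nr - pos));

	for (unsigned i = 0; i < nr_new; i++)
		buckets[pos + i] = new[i];

	*nr += nr_new;
}

int main(void)
{
	uint64_t buckets[MAX_NR] = { 10, 11, 12, 13 };
	uint64_t new[2] = { 100, 101 };
	unsigned nr = 4, cur_idx = 2;
	unsigned pos = 1;			/* e.g. the discard index */

	insert_buckets(buckets, &nr, pos, new, 2);

	/* indices at or past the insertion point move up by nr_new */
	if (pos <= cur_idx)
		cur_idx = (cur_idx + 2) % nr;

	for (unsigned i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long) buckets[i]);
	printf("\ncur_idx=%u\n", cur_idx);	/* 10 100 101 11 12 13 / cur_idx=4 */
	return 0;
}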
mutex_unlock(&c->sb_lock); } -err: - if (ob && !new_fs) + + if (ret && !new_fs) + for (i = 0; i < nr_got; i++) + bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + bu[i], BCH_DATA_free, 0)); +err_free: + if (!new_fs) for (i = 0; i < nr_got; i++) bch2_open_bucket_put(c, ob[i]); @@ -904,12 +921,7 @@ err: kfree(new_buckets); kfree(ob); kfree(bu); - return ret; -err_unblock: - if (c) - bch2_journal_unblock(&c->journal); - goto err; } /* @@ -921,46 +933,49 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, { struct journal_device *ja = &ca->journal; struct closure cl; - unsigned current_nr; int ret = 0; - /* don't handle reducing nr of buckets yet: */ - if (nr < ja->nr) - return 0; - closure_init_stack(&cl); - while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) { - struct disk_reservation disk_res = { 0, 0 }; + down_write(&c->state_lock); - closure_sync(&cl); + /* don't handle reducing nr of buckets yet: */ + if (nr < ja->nr) + goto unlock; - mutex_lock(&c->sb_lock); - current_nr = ja->nr; + while (ja->nr < nr) { + struct disk_reservation disk_res = { 0, 0 }; /* * note: journal buckets aren't really counted as _sectors_ used yet, so * we don't need the disk reservation to avoid the BUG_ON() in buckets.c * when space used goes up without a reservation - but we do need the * reservation to ensure we'll actually be able to allocate: + * + * XXX: that's not right, disk reservations only ensure a + * filesystem-wide allocation will succeed, this is a device + * specific allocation - we can hang here: */ ret = bch2_disk_reservation_get(c, &disk_res, bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) { - mutex_unlock(&c->sb_lock); - return ret; - } + if (ret) + break; ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); bch2_disk_reservation_put(c, &disk_res); - if (ja->nr != current_nr) - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + closure_sync(&cl); + + if (ret && ret != -BCH_ERR_bucket_alloc_blocked) + break; } + if (ret) + bch_err_fn(c, ret); +unlock: + up_write(&c->state_lock); return ret; } @@ -969,8 +984,10 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) unsigned nr; int ret; - if (dynamic_fault("bcachefs:add:journal_alloc")) - return -ENOMEM; + if (dynamic_fault("bcachefs:add:journal_alloc")) { + ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + goto err; + } /* 1/128th of the device by default: */ nr = ca->mi.nbuckets >> 7; @@ -984,14 +1001,10 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - if (ca->fs) - mutex_lock(&ca->fs->sb_lock); - ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); - - if (ca->fs) - mutex_unlock(&ca->fs->sb_lock); - +err: + if (ret) + bch_err_fn(ca, ret); return ret; } @@ -1008,7 +1021,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) seq++) { struct journal_buf *buf = journal_seq_to_buf(j, seq); - if (bch2_bkey_has_device(bkey_i_to_s_c(&buf->key), dev_idx)) + if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) ret = true; } spin_unlock(&j->lock); @@ -1070,7 +1083,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); if (!j->pin.data) { bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_pin_fifo; } } @@ -1164,19 +1177,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) - return 
-ENOMEM; + return -BCH_ERR_ENOMEM_dev_journal_init; nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!ca->journal.bio) - return -ENOMEM; + return -BCH_ERR_ENOMEM_dev_journal_init; bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) - return -ENOMEM; + return -BCH_ERR_ENOMEM_dev_journal_init; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); @@ -1198,6 +1211,8 @@ void bch2_fs_journal_exit(struct journal *j) { unsigned i; + darray_exit(&j->early_journal_entries); + for (i = 0; i < ARRAY_SIZE(j->buf); i++) kvpfree(j->buf[i].data, j->buf[i].buf_size); free_fifo(&j->pin); @@ -1205,12 +1220,8 @@ void bch2_fs_journal_exit(struct journal *j) int bch2_fs_journal_init(struct journal *j) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); static struct lock_class_key res_key; unsigned i; - int ret = 0; - - pr_verbose_init(c->opts, ""); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); @@ -1227,24 +1238,18 @@ int bch2_fs_journal_init(struct journal *j) ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { - ret = -ENOMEM; - goto out; - } + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) + return -BCH_ERR_ENOMEM_journal_pin_fifo; for (i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); - if (!j->buf[i].data) { - ret = -ENOMEM; - goto out; - } + if (!j->buf[i].data) + return -BCH_ERR_ENOMEM_journal_buf; } j->pin.front = j->pin.back = 1; -out: - pr_verbose_init(c->opts, "ret %i", ret); - return ret; + return 0; } /* debug: */ @@ -1272,7 +1277,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); - prt_printf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); + prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); @@ -1382,6 +1387,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; + unsigned i; spin_lock(&j->lock); *seq = max(*seq, j->pin.front); @@ -1399,15 +1405,11 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 prt_newline(out); printbuf_indent_add(out, 2); - list_for_each_entry(pin, &pin_list->list, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } - - list_for_each_entry(pin, &pin_list->key_cache_list, list) { - prt_printf(out, "\t%px %ps", pin, pin->flush); - prt_newline(out); - } + for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + list_for_each_entry(pin, &pin_list->list[i], list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } if (!list_empty(&pin_list->flushed)) { prt_printf(out, "flushed:"); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 51d29a0..008a2e2 100644 --- a/libbcachefs/journal.h 
+++ b/libbcachefs/journal.h @@ -110,7 +110,6 @@ */ #include -#include #include "journal_types.h" @@ -295,9 +294,14 @@ static inline void bch2_journal_res_put(struct journal *j, int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, unsigned); -/* First two bits for JOURNAL_WATERMARK: */ -#define JOURNAL_RES_GET_NONBLOCK (1 << 2) -#define JOURNAL_RES_GET_CHECK (1 << 3) +/* First bits for BCH_WATERMARK: */ +enum journal_res_flags { + __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, + __JOURNAL_RES_GET_CHECK, +}; + +#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) +#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -305,34 +309,23 @@ static inline int journal_res_get_fast(struct journal *j, { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); - unsigned u64s, offset; do { old.v = new.v = v; - /* - * Round up the end of the journal reservation to the next - * cacheline boundary: - */ - u64s = res->u64s; - offset = sizeof(struct jset) / sizeof(u64) + - new.cur_entry_offset + u64s; - u64s += ((offset - 1) & ((SMP_CACHE_BYTES / sizeof(u64)) - 1)) + 1; - - /* * Check if there is still room in the current journal * entry: */ - if (new.cur_entry_offset + u64s > j->cur_entry_u64s) + if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; EBUG_ON(!journal_state_count(new, new.idx)); - if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) + if ((flags & BCH_WATERMARK_MASK) < j->watermark) return 0; - new.cur_entry_offset += u64s; + new.cur_entry_offset += res->u64s; journal_state_inc(&new); /* @@ -349,15 +342,8 @@ static inline int journal_res_get_fast(struct journal *j, res->ref = true; res->idx = old.idx; - res->u64s = u64s; res->offset = old.cur_entry_offset; res->seq = le64_to_cpu(j->buf[old.idx].data->seq); - - offset = res->offset; - while (offset < res->offset + res->u64s) { - prefetchw(vstruct_idx(j->buf[res->idx].data, offset)); - offset += SMP_CACHE_BYTES / sizeof(u64); - } return 1; } @@ -392,17 +378,17 @@ out: static inline void journal_set_watermark(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - unsigned watermark = JOURNAL_WATERMARK_any; + unsigned watermark = BCH_WATERMARK_stripe; if (fifo_free(&j->pin) < j->pin.size / 4) - watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); if (fifo_free(&j->pin) < j->pin.size / 8) - watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); if (s.reserved > s.remaining) - watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); if (!s.remaining) - watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); if (watermark == j->watermark) return; @@ -445,13 +431,14 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, int d = new_u64s - res->u64s; union journal_preres_state old, new; u64 v = atomic64_read(&j->prereserved.counter); + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; int ret; do { old.v = new.v = v; ret = 0; - if ((flags & JOURNAL_WATERMARK_reserved) || + if (watermark == BCH_WATERMARK_reclaim || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; @@ -479,7 +466,7 @@ static inline int 
bch2_journal_preres_get(struct journal *j, return 0; if (flags & JOURNAL_RES_GET_NONBLOCK) - return -EAGAIN; + return -BCH_ERR_journal_preres_get_blocked; return __bch2_journal_preres_get(j, res, new_u64s, flags); } @@ -497,7 +484,6 @@ int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64); int bch2_journal_meta(struct journal *); -int bch2_journal_log_msg(struct journal *, const char *, ...); void bch2_journal_halt(struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index fb7d0bf..f861ae2 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -14,8 +14,7 @@ #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "replicas.h" - -#include +#include "trace.h" static struct nonce journal_nonce(const struct jset *jset) { @@ -119,7 +118,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, le64_to_cpu(j->seq)), GFP_KERNEL); if (!_i) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_entry_add; /* * Duplicate journal entries? If so we want the one that didn't have a @@ -149,12 +148,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, replace: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_entry_add; i->nr_ptrs = 0; i->csum_good = entry_ptr.csum_good; i->ignore = false; - memcpy(&i->j, j, bytes); + unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { @@ -341,7 +340,8 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, int ret = journal_validate_key(c, jset, entry, entry->level, entry->btree_id, - k, version, big_endian, write); + k, version, big_endian, + write|BKEY_INVALID_JOURNAL); if (ret == FSCK_DELETED_KEY) continue; @@ -357,7 +357,7 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs struct bkey_i *k; bool first = true; - vstruct_for_each(entry, k) { + jset_entry_for_each_key(entry, k) { if (!first) { prt_newline(out); prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); @@ -662,7 +662,8 @@ static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset_entry *entry, unsigned version, int big_endian, int write) { - return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write); + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -745,14 +746,12 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, - c, jset, NULL, - "%s sector %llu seq %llu: unknown journal entry version %u", + if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + "%s sector %llu seq %llu: incompatible journal entry version %u.%u", ca ? 
ca->name : c->name, sector, le64_to_cpu(jset->seq), - version)) { + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version))) { /* don't try to continue: */ return -EINVAL; } @@ -796,14 +795,12 @@ static int jset_validate_early(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max, - c, jset, NULL, - "%s sector %llu seq %llu: unknown journal entry version %u", + if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + "%s sector %llu seq %llu: unknown journal entry version %u.%u", ca ? ca->name : c->name, sector, le64_to_cpu(jset->seq), - version)) { + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version))) { /* don't try to continue: */ return -EINVAL; } @@ -835,12 +832,12 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, /* the bios are sized for this many pages, max: */ if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_read_buf_realloc; new_size = roundup_pow_of_two(new_size); n = kvpmalloc(new_size, GFP_KERNEL); if (!n) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_read_buf_realloc; kvpfree(b->data, b->size); b->data = n; @@ -987,7 +984,6 @@ static void bch2_journal_read_device(struct closure *cl) struct journal_replay *r, **_r; struct genradix_iter iter; struct journal_read_buf buf = { NULL, 0 }; - u64 min_seq = U64_MAX; unsigned i; int ret = 0; @@ -1006,45 +1002,27 @@ static void bch2_journal_read_device(struct closure *cl) goto err; } - /* Find the journal bucket with the highest sequence number: */ - for (i = 0; i < ja->nr; i++) { - if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) - ja->cur_idx = i; - - min_seq = min(ja->bucket_seq[i], min_seq); - } - - /* - * If there's duplicate journal entries in multiple buckets (which - * definitely isn't supposed to happen, but...) 
- make sure to start - * cur_idx at the last of those buckets, so we don't deadlock trying to - * allocate - */ - while (ja->bucket_seq[ja->cur_idx] > min_seq && - ja->bucket_seq[ja->cur_idx] == - ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; mutex_lock(&jlist->lock); - genradix_for_each(&c->journal_entries, iter, _r) { + genradix_for_each_reverse(&c->journal_entries, iter, _r) { r = *_r; if (!r) continue; for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx && - sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) { + if (r->ptrs[i].dev == ca->dev_idx) { unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + vstruct_sectors(&r->j, c->block_bits); - ja->sectors_free = min(ja->sectors_free, - ca->mi.bucket_size - wrote); + ja->cur_idx = r->ptrs[i].bucket; + ja->sectors_free = ca->mi.bucket_size - wrote; + goto found; } } } +found: mutex_unlock(&jlist->lock); if (ja->bucket_seq[ja->cur_idx] && @@ -1099,7 +1077,10 @@ void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, } } -int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) +int bch2_journal_read(struct bch_fs *c, + u64 *last_seq, + u64 *blacklist_seq, + u64 *start_seq) { struct journal_list jlist; struct journal_replay *i, **_i, *prev = NULL; @@ -1107,9 +1088,8 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) struct bch_dev *ca; unsigned iter; struct printbuf buf = PRINTBUF; - size_t keys = 0, entries = 0; - bool degraded = false; - u64 seq, last_seq = 0; + bool degraded = false, last_write_torn = false; + u64 seq; int ret = 0; closure_init_stack(&jlist.cl); @@ -1138,36 +1118,46 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (jlist.ret) return jlist.ret; - *start_seq = 0; + *last_seq = 0; + *start_seq = 0; + *blacklist_seq = 0; /* * Find most recent flush entry, and ignore newer non flush entries - * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { + int write = READ; + i = *_i; if (!i || i->ignore) continue; if (!*start_seq) - *start_seq = le64_to_cpu(i->j.seq) + 1; - - if (!JSET_NO_FLUSH(&i->j)) { - int write = READ; - if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), - c, &i->j, NULL, - "invalid journal entry: last_seq > seq (%llu > %llu)", - le64_to_cpu(i->j.last_seq), - le64_to_cpu(i->j.seq))) - i->j.last_seq = i->j.seq; - - last_seq = le64_to_cpu(i->j.last_seq); - *blacklist_seq = le64_to_cpu(i->j.seq) + 1; - break; + *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; + + if (JSET_NO_FLUSH(&i->j)) { + i->ignore = true; + continue; } - journal_replay_free(c, i); + if (!last_write_torn && !i->csum_good) { + last_write_torn = true; + i->ignore = true; + continue; + } + + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, &i->j, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) + i->j.last_seq = i->j.seq; + + *last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; } if (!*start_seq) { @@ -1175,12 +1165,18 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) return 0; } - if (!last_seq) { + if (!*last_seq) { fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); - ret = -1; - goto err; + return 0; } + bch_info(c, "journal read done, 
replaying entries %llu-%llu", + *last_seq, *blacklist_seq - 1); + + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); + /* Drop blacklisted entries and entries older than last_seq: */ genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; @@ -1189,7 +1185,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) continue; seq = le64_to_cpu(i->j.seq); - if (seq < last_seq) { + if (seq < *last_seq) { journal_replay_free(c, i); continue; } @@ -1197,13 +1193,12 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (bch2_journal_seq_is_blacklisted(c, seq, true)) { fsck_err_on(!JSET_NO_FLUSH(&i->j), c, "found blacklisted journal entry %llu", seq); - - journal_replay_free(c, i); + i->ignore = true; } } /* Check for missing entries: */ - seq = last_seq; + seq = *last_seq; genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; @@ -1241,7 +1236,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) " prev at %s\n" " next at %s", missing_start, missing_end, - last_seq, *blacklist_seq - 1, + *last_seq, *blacklist_seq - 1, buf1.buf, buf2.buf); printbuf_exit(&buf1); @@ -1253,8 +1248,6 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) } genradix_for_each(&c->journal_entries, radix_iter, _i) { - struct jset_entry *entry; - struct bkey_i *k, *_n; struct bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, .e.nr_required = 1, @@ -1304,18 +1297,7 @@ int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq) if (ret) goto err; } - - for_each_jset_key(k, _n, entry, &i->j) - keys++; - entries++; } - - bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", - keys, entries, *start_seq); - - if (*start_seq != *blacklist_seq) - bch_info(c, "dropped unflushed entries %llu-%llu", - *blacklist_seq, *start_seq - 1); err: fsck_err: printbuf_exit(&buf); @@ -1353,8 +1335,7 @@ static void __journal_write_alloc(struct journal *j, if (!ca->mi.durability || ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || - bch2_bkey_has_device(bkey_i_to_s_c(&w->key), - ca->dev_idx) || + bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || sectors > ja->sectors_free) continue; @@ -1454,7 +1435,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; - new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1478,7 +1459,6 @@ static void journal_write_done(struct closure *cl) struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_last_unwritten_buf(j); - struct bch_replicas_padded replicas; union journal_res_state old, new; u64 v, seq; int err = 0; @@ -1490,13 +1470,7 @@ static void journal_write_done(struct closure *cl) if (!w->devs_written.nr) { bch_err(c, "unable to write journal to sufficient devices"); err = -EIO; - } else { - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, - w->devs_written); - if (bch2_mark_replicas(c, &replicas.e)) - err = -EIO; } - if (err) bch2_fatal_error(c); @@ -1513,6 +1487,8 @@ static void journal_write_done(struct closure *cl) bch2_do_discards(c); closure_wake_up(&c->freelist_wait); + + bch2_reset_alloc_cursors(c); } } else if (!j->err_seq || seq < j->err_seq) j->err_seq = seq; @@ -1526,7 +1502,7 @@ static void 
journal_write_done(struct closure *cl) * Must come before signaling write completion, for * bch2_fs_journal_stop(): */ - if (j->watermark) + if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); /* also must come before signalling write completion: */ @@ -1634,12 +1610,59 @@ static void do_journal_write(struct closure *cl) return; } +static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) +{ + struct jset_entry *i, *next, *prev = NULL; + + /* + * Simple compaction, dropping empty jset_entries (from journal + * reservations that weren't fully used) and merging jset_entries that + * can be. + * + * If we wanted to be really fancy here, we could sort all the keys in + * the jset and drop keys that were overwritten - probably not worth it: + */ + vstruct_for_each_safe(jset, i, next) { + unsigned u64s = le16_to_cpu(i->u64s); + + /* Empty entry: */ + if (!u64s) + continue; + + if (i->type == BCH_JSET_ENTRY_btree_root) + bch2_journal_entry_to_btree_root(c, i); + + /* Can we merge with previous entry? */ + if (prev && + i->btree_id == prev->btree_id && + i->level == prev->level && + i->type == prev->type && + i->type == BCH_JSET_ENTRY_btree_keys && + le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { + memmove_u64s_down(vstruct_next(prev), + i->_data, + u64s); + le16_add_cpu(&prev->u64s, u64s); + continue; + } + + /* Couldn't merge, move i into new position (after prev): */ + prev = prev ? vstruct_next(prev) : jset->start; + if (i != prev) + memmove_u64s_down(prev, i, jset_u64s(u64s)); + } + + prev = prev ? vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); +} + void bch2_journal_write(struct closure *cl) { struct journal *j = container_of(cl, struct journal, io); struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; struct jset_entry *start, *end; struct jset *jset; struct bio *bio; @@ -1656,20 +1679,42 @@ void bch2_journal_write(struct closure *cl) j->write_start_time = local_clock(); spin_lock(&j->lock); - if (bch2_journal_error(j) || - w->noflush || - (!w->must_flush && - (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && - test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { + + /* + * If the journal is in an error state - we did an emergency shutdown - + * we prefer to continue doing journal writes. We just mark them as + * noflush so they'll never be used, but they'll still be visible by the + * list_journal tool - this helps in debugging. + * + * There's a caveat: the first journal write after marking the + * superblock dirty must always be a flush write, because on startup + * from a clean shutdown we didn't necessarily read the journal and the + * new journal write might overwrite whatever was in the journal + * previously - we can't leave the journal without any flush writes in + * it. + * + * So if we're in an error state, and we're still starting up, we don't + * write anything at all. 
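The comment above describes three possible outcomes for a journal write once the journal may be in an error state. The helper below is a minimal standalone sketch of that decision, distilled from the condition that follows in bch2_journal_write(); the _example names and the plain bool parameters are illustrative stand-ins for the real flags and options, not part of the patch.

#include <stdbool.h>

enum journal_write_kind_example {
	JOURNAL_WRITE_NOFLUSH_EXAMPLE,	/* written, but marked JSET_NO_FLUSH so it is never replayed */
	JOURNAL_WRITE_FLUSH_EXAMPLE,	/* normal flush write; clears JOURNAL_NEED_FLUSH_WRITE */
	JOURNAL_WRITE_SKIP_EXAMPLE,	/* error state before the first flush write: write nothing */
};

enum journal_write_kind_example
classify_journal_write_example(bool need_flush_write,	/* JOURNAL_NEED_FLUSH_WRITE set */
			       bool journal_error,	/* bch2_journal_error(j) */
			       bool noflush,		/* w->noflush */
			       bool must_flush,		/* w->must_flush */
			       bool within_flush_delay,	/* last flush newer than journal_flush_delay */
			       bool may_skip_flush)	/* JOURNAL_MAY_SKIP_FLUSH set */
{
	if (!need_flush_write &&
	    (journal_error ||
	     noflush ||
	     (!must_flush && within_flush_delay && may_skip_flush)))
		return JOURNAL_WRITE_NOFLUSH_EXAMPLE;

	if (!journal_error)
		return JOURNAL_WRITE_FLUSH_EXAMPLE;

	return JOURNAL_WRITE_SKIP_EXAMPLE;
}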
+ */ + if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && + (bch2_journal_error(j) || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { w->noflush = true; SET_JSET_NO_FLUSH(jset, true); jset->last_seq = 0; w->last_seq = 0; j->nr_noflush_writes++; - } else { + } else if (!bch2_journal_error(j)) { j->last_flush_write = jiffies; j->nr_flush_writes++; + clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + } else { + spin_unlock(&j->lock); + goto err; } spin_unlock(&j->lock); @@ -1683,7 +1728,7 @@ void bch2_journal_write(struct closure *cl) * entry: */ - bch2_journal_entries_to_btree_roots(c, jset); + bch2_journal_entries_postprocess(c, jset); start = end = vstruct_last(jset); @@ -1695,12 +1740,19 @@ void bch2_journal_write(struct closure *cl) BUG_ON(u64s > j->entry_u64s_reserved); le32_add_cpu(&jset->u64s, u64s); - BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); + + sectors = vstruct_sectors(jset, c->block_bits); + bytes = vstruct_bytes(jset); + + if (sectors > w->sectors) { + bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", + vstruct_bytes(jset), w->sectors << 9, + u64s, w->u64s_reserved, j->entry_u64s_reserved); + goto err; + } jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber - ? cpu_to_le32(BCH_JSET_VERSION_OLD) - : cpu_to_le32(c->sb.version); + jset->version = cpu_to_le32(c->sb.version); SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); @@ -1732,10 +1784,6 @@ void bch2_journal_write(struct closure *cl) jset_validate(c, NULL, jset, 0, WRITE)) goto err; - sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > w->sectors); - - bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); retry_alloc: @@ -1768,9 +1816,7 @@ retry_alloc: bch_err(c, "Unable to allocate journal write:\n%s", journal_debug_buf.buf); printbuf_exit(&journal_debug_buf); - bch2_fatal_error(c); - continue_at(cl, journal_write_done, c->io_complete_wq); - return; + goto err; } w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); @@ -1784,6 +1830,16 @@ retry_alloc: if (nr_rw_members > 1) w->separate_flush = true; + /* + * Mark journal replicas before we submit the write to guarantee + * recovery will find the journal entries after a crash. 
+ */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) + goto err; + if (!JSET_NO_FLUSH(jset) && w->separate_flush) { for_each_rw_member(ca, c, i) { percpu_ref_get(&ca->io_ref); diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 2f8bbf0..8801e98 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -40,9 +40,14 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, (entry = __jset_entry_type_next(jset, entry, type)); \ entry = vstruct_next(entry)) -#define for_each_jset_key(k, _n, entry, jset) \ - for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ - vstruct_for_each_safe(entry, k, _n) +#define jset_entry_for_each_key(_e, _k) \ + for (_k = (_e)->start; \ + _k < vstruct_last(_e); \ + _k = bkey_next(_k)) + +#define for_each_jset_key(k, entry, jset) \ + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\ + jset_entry_for_each_key(entry, k) int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, int); @@ -52,7 +57,7 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, struct journal_replay *); -int bch2_journal_read(struct bch_fs *, u64 *, u64 *); +int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); void bch2_journal_write(struct closure *); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index e873ce2..8de83e1 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_key_cache.h" +#include "btree_update.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -9,10 +10,10 @@ #include "journal_reclaim.h" #include "replicas.h" #include "super.h" +#include "trace.h" #include #include -#include /* Free space calculations: */ @@ -209,24 +210,7 @@ void bch2_journal_space_available(struct journal *j) clean = j->space[journal_space_clean].total; total = j->space[journal_space_total].total; - if (!clean_ondisk && - journal_cur_seq(j) == j->seq_ondisk) { - struct printbuf buf = PRINTBUF; - - __bch2_journal_debug_to_text(&buf, j); - bch_err(c, "journal stuck\n%s", buf.buf); - printbuf_exit(&buf); - - /* - * Hack: bch2_fatal_error() calls bch2_journal_halt() which - * takes journal lock: - */ - spin_unlock(&j->lock); - bch2_fatal_error(c); - spin_lock(&j->lock); - - ret = JOURNAL_ERR_journal_stuck; - } else if (!j->space[journal_space_discarded].next_entry) + if (!j->space[journal_space_discarded].next_entry) ret = JOURNAL_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < @@ -287,7 +271,7 @@ void bch2_journal_do_discards(struct journal *j) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, ja->buckets[ja->discard_idx]), - ca->mi.bucket_size, GFP_NOIO); + ca->mi.bucket_size, GFP_NOFS); spin_lock(&j->lock); ja->discard_idx = (ja->discard_idx + 1) % ja->nr; @@ -318,9 +302,7 @@ static void bch2_journal_reclaim_fast(struct journal *j) */ while (!fifo_empty(&j->pin) && !atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); - BUG_ON(!fifo_pop(&j->pin, temp)); + fifo_pop(&j->pin, temp); popped = true; } @@ -347,13 +329,13 @@ void bch2_journal_pin_put(struct journal *j, u64 seq) } } -static inline void __journal_pin_drop(struct journal *j, +static inline bool 
__journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { struct journal_entry_pin_list *pin_list; if (!journal_pin_active(pin)) - return; + return false; if (j->flush_in_progress == pin) j->flush_in_progress_dropped = true; @@ -363,27 +345,39 @@ static inline void __journal_pin_drop(struct journal *j, list_del_init(&pin->list); /* - * Unpinning a journal entry may make journal_next_bucket() succeed if + * Unpinning a journal entry make make journal_next_bucket() succeed, if * writing a new last_seq will now make another bucket available: */ - if (atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin)) - bch2_journal_reclaim_fast(j); + return atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin); } void bch2_journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { spin_lock(&j->lock); - __journal_pin_drop(j, pin); + if (__journal_pin_drop(j, pin)) + bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); } +static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +{ + if (fn == bch2_btree_node_flush0 || + fn == bch2_btree_node_flush1) + return JOURNAL_PIN_btree; + else if (fn == bch2_btree_key_cache_journal_flush) + return JOURNAL_PIN_key_cache; + else + return JOURNAL_PIN_other; +} + void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { struct journal_entry_pin_list *pin_list; + bool reclaim; spin_lock(&j->lock); @@ -400,18 +394,19 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, pin_list = journal_seq_pin(j, seq); - __journal_pin_drop(j, pin); + reclaim = __journal_pin_drop(j, pin); atomic_inc(&pin_list->count); pin->seq = seq; pin->flush = flush_fn; - if (flush_fn == bch2_btree_key_cache_journal_flush) - list_add(&pin->list, &pin_list->key_cache_list); - else if (flush_fn) - list_add(&pin->list, &pin_list->list); + if (flush_fn) + list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); else list_add(&pin->list, &pin_list->flushed); + + if (reclaim) + bch2_journal_reclaim_fast(j); spin_unlock(&j->lock); /* @@ -442,37 +437,37 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) static struct journal_entry_pin * journal_get_next_pin(struct journal *j, - bool get_any, - bool get_key_cache, - u64 max_seq, u64 *seq) + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, + u64 *seq) { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *ret = NULL; + unsigned i; fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { - if (*seq > max_seq && !get_any && !get_key_cache) + if (*seq > seq_to_flush && !allowed_above_seq) break; - if (*seq <= max_seq || get_any) { - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) - return ret; - } - - if (*seq <= max_seq || get_any || get_key_cache) { - ret = list_first_entry_or_null(&pin_list->key_cache_list, - struct journal_entry_pin, list); - if (ret) - return ret; - } + for (i = 0; i < JOURNAL_PIN_NR; i++) + if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || + ((1U << i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->list[i], + struct journal_entry_pin, list); + if (ret) + return ret; + } } return NULL; } /* returns true if we did work */ -static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, +static size_t journal_flush_pins(struct journal *j, + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, unsigned 
min_any, unsigned min_key_cache) { @@ -485,15 +480,25 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush, lockdep_assert_held(&j->reclaim_lock); while (1) { + unsigned allowed_above = allowed_above_seq; + unsigned allowed_below = allowed_below_seq; + + if (min_any) { + allowed_above |= ~0; + allowed_below |= ~0; + } + + if (min_key_cache) { + allowed_above |= 1U << JOURNAL_PIN_key_cache; + allowed_below |= 1U << JOURNAL_PIN_key_cache; + } + cond_resched(); j->last_flushed = jiffies; spin_lock(&j->lock); - pin = journal_get_next_pin(j, - min_any != 0, - min_key_cache != 0, - seq_to_flush, &seq); + pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); if (pin) { BUG_ON(j->flush_in_progress); j->flush_in_progress = pin; @@ -652,6 +657,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) atomic_long_read(&c->btree_key_cache.nr_keys)); nr_flushed = journal_flush_pins(j, seq_to_flush, + ~0, 0, min_nr, min_key_cache); if (direct) @@ -703,7 +709,7 @@ static int bch2_journal_reclaim_thread(void *arg) j->next_reclaim = now + delay; while (1) { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); if (kthread_should_stop()) break; if (j->reclaim_kicked) @@ -714,9 +720,9 @@ static int bch2_journal_reclaim_thread(void *arg) spin_unlock(&j->lock); if (journal_empty) - freezable_schedule(); + schedule(); else if (time_after(j->next_reclaim, jiffies)) - freezable_schedule_timeout(j->next_reclaim - jiffies); + schedule_timeout(j->next_reclaim - jiffies); else break; } @@ -772,7 +778,11 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins(j, seq_to_flush, 0, 0)) + if (journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_key_cache)| + (1U << JOURNAL_PIN_other), 0, 0, 0) || + journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_btree), 0, 0, 0)) *did_work = true; spin_lock(&j->lock); @@ -827,8 +837,18 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) mutex_lock(&c->replicas_gc_lock); bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); - seq = 0; + /* + * Now that we've populated replicas_gc, write to the journal to mark + * active journal devices. This handles the case where the journal might + * be empty. Otherwise we could clear all journal replicas and + * temporarily put the fs into an unrecoverable state. Journal recovery + * expects to find devices marked for journal data on unclean mount. 
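journal_get_next_pin() now takes two bitmasks of journal pin types: one for types that may be flushed at or below seq_to_flush, and one for types that may be flushed regardless of sequence. Below is a minimal sketch of the per-pin test it applies, with _example names standing in for the in-tree enum and structures; it is illustrative only.

#include <stdbool.h>
#include <stdint.h>

enum journal_pin_type_example {
	JOURNAL_PIN_BTREE_EXAMPLE,
	JOURNAL_PIN_KEY_CACHE_EXAMPLE,
	JOURNAL_PIN_OTHER_EXAMPLE,
	JOURNAL_PIN_NR_EXAMPLE,
};

/*
 * A pin of the given type, pinning journal sequence number 'seq', may be
 * flushed if its type is in allowed_below_seq and seq is at or below
 * seq_to_flush, or if its type is in allowed_above_seq.
 */
bool journal_pin_may_flush_example(enum journal_pin_type_example type,
				   uint64_t seq, uint64_t seq_to_flush,
				   unsigned allowed_below_seq,
				   unsigned allowed_above_seq)
{
	return (((1U << type) & allowed_below_seq) && seq <= seq_to_flush) ||
	       ((1U << type) & allowed_above_seq);
}

For instance, journal_flush_done() above makes two passes with allowed_above_seq zero: first with key cache and other pins in allowed_below_seq, then with btree node pins, so btree nodes are flushed last.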
+ */ + ret = bch2_journal_meta(&c->journal); + if (ret) + goto err; + seq = 0; spin_lock(&j->lock); while (!ret) { struct bch_replicas_padded replicas; @@ -845,7 +865,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) spin_lock(&j->lock); } spin_unlock(&j->lock); - +err: ret = bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c index c19db04..cc41bff 100644 --- a/libbcachefs/journal_sb.c +++ b/libbcachefs/journal_sb.c @@ -22,7 +22,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, { struct bch_sb_field_journal *journal = field_to_type(f, journal); struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - int ret = -EINVAL; + int ret = -BCH_ERR_invalid_sb_journal; unsigned nr; unsigned i; u64 *b; @@ -33,7 +33,7 @@ static int bch2_sb_journal_validate(struct bch_sb *sb, b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); if (!b) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_journal_validate; for (i = 0; i < nr; i++) b[i] = le64_to_cpu(journal->buckets[i]); @@ -105,7 +105,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, { struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; - int ret = -EINVAL; + int ret = -BCH_ERR_invalid_sb_journal; unsigned nr; unsigned i; struct u64_range *b; @@ -116,7 +116,7 @@ static int bch2_sb_journal_v2_validate(struct bch_sb *sb, b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); if (!b) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_journal_v2_validate; for (i = 0; i < nr; i++) { b[i].start = le64_to_cpu(journal->d[i].start); @@ -175,46 +175,45 @@ const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { .to_text = bch2_sb_journal_v2_to_text, }; -int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca) +int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, + u64 *buckets, unsigned nr) { - struct journal_device *ja = &ca->journal; struct bch_sb_field_journal_v2 *j; - unsigned i, dst = 0, nr = 1; + unsigned i, dst = 0, nr_compacted = 1; if (c) lockdep_assert_held(&c->sb_lock); - if (!ja->nr) { + if (!nr) { bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); return 0; } - for (i = 0; i + 1 < ja->nr; i++) - if (ja->buckets[i] + 1 != ja->buckets[i + 1]) - nr++; + for (i = 0; i + 1 < nr; i++) + if (buckets[i] + 1 != buckets[i + 1]) + nr_compacted++; j = bch2_sb_resize_journal_v2(&ca->disk_sb, - (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64)); + (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) return -BCH_ERR_ENOSPC_sb_journal; bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); - j->d[dst].start = le64_to_cpu(ja->buckets[0]); - j->d[dst].nr = le64_to_cpu(1); + j->d[dst].start = cpu_to_le64(buckets[0]); + j->d[dst].nr = cpu_to_le64(1); - for (i = 1; i < ja->nr; i++) { - if (ja->buckets[i] == ja->buckets[i - 1] + 1) { + for (i = 1; i < nr; i++) { + if (buckets[i] == buckets[i - 1] + 1) { le64_add_cpu(&j->d[dst].nr, 1); } else { dst++; - j->d[dst].start = le64_to_cpu(ja->buckets[i]); - j->d[dst].nr = le64_to_cpu(1); + j->d[dst].start = cpu_to_le64(buckets[i]); + j->d[dst].nr = cpu_to_le64(1); } } - BUG_ON(dst + 1 != nr); - + BUG_ON(dst + 1 != nr_compacted); return 0; } diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h index a39192e..ba40a7e 100644 --- a/libbcachefs/journal_sb.h +++ b/libbcachefs/journal_sb.h @@ -21,4 
+21,4 @@ static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_j extern const struct bch_sb_field_ops bch_sb_field_ops_journal; extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; -int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *); +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index 5c555b3..d6b9f2c 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -103,7 +103,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, sb_blacklist_u64s(nr + 1)); if (!bl) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; goto out; } @@ -168,7 +168,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, GFP_KERNEL); if (!t) - return -ENOMEM; + return -BCH_ERR_ENOMEM_blacklist_table_init; t->nr = nr; @@ -203,7 +203,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, le64_to_cpu(e->end)) { prt_printf(err, "entry %u start >= end (%llu >= %llu)", i, le64_to_cpu(e->start), le64_to_cpu(e->end)); - return -EINVAL; + return -BCH_ERR_invalid_sb_journal_seq_blacklist; } if (i + 1 < nr && @@ -211,7 +211,7 @@ static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, le64_to_cpu(e[1].start)) { prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); - return -EINVAL; + return -BCH_ERR_invalid_sb_journal_seq_blacklist; } } diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index a6cdb88..42504e1 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -43,9 +43,15 @@ struct journal_buf { * flushed: */ +enum journal_pin_type { + JOURNAL_PIN_btree, + JOURNAL_PIN_key_cache, + JOURNAL_PIN_other, + JOURNAL_PIN_NR, +}; + struct journal_entry_pin_list { - struct list_head list; - struct list_head key_cache_list; + struct list_head list[JOURNAL_PIN_NR]; struct list_head flushed; atomic_t count; struct bch_devs_list devs; @@ -141,25 +147,13 @@ enum journal_space_from { journal_space_nr, }; -enum { +enum journal_flags { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NEED_FLUSH_WRITE, }; -#define JOURNAL_WATERMARKS() \ - x(any) \ - x(copygc) \ - x(reserved) - -enum journal_watermark { -#define x(n) JOURNAL_WATERMARK_##n, - JOURNAL_WATERMARKS() -#undef x -}; - -#define JOURNAL_WATERMARK_MASK 3 - /* Reasons we may fail to get a journal reservation: */ #define JOURNAL_ERRORS() \ x(ok) \ @@ -176,31 +170,42 @@ enum journal_errors { #undef x }; +typedef DARRAY(u64) darray_u64; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ - - unsigned long flags; + struct { union journal_res_state reservations; - enum journal_watermark watermark; + enum bch_watermark watermark; + + union journal_preres_state prereserved; + + } __aligned(SMP_CACHE_BYTES); + + unsigned long flags; /* Max size of current journal entry */ unsigned cur_entry_u64s; unsigned cur_entry_sectors; + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + + /* * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ enum journal_errors cur_entry_error; - union journal_preres_state prereserved; - - /* Reserved space in journal entry to be 
used just prior to write */ - unsigned entry_u64s_reserved; - unsigned buf_size_want; + /* + * We may queue up some things to be journalled (log messages) before + * the journal has actually started - stash them here: + */ + darray_u64 early_journal_entries; /* * Two journal entries -- one is currently open for new entries, the @@ -289,15 +294,15 @@ struct journal { u64 nr_flush_writes; u64 nr_noflush_writes; - struct time_stats *flush_write_time; - struct time_stats *noflush_write_time; - struct time_stats *blocked_time; - struct time_stats *flush_seq_time; + struct bch2_time_stats *flush_write_time; + struct bch2_time_stats *noflush_write_time; + struct bch2_time_stats *blocked_time; + struct bch2_time_stats *flush_seq_time; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map res_map; #endif -}; +} __aligned(SMP_CACHE_BYTES); /* * Embedded in struct bch_dev. First three fields refer to the array of journal diff --git a/libbcachefs/keylist.c b/libbcachefs/keylist.c index 5e85055..5699cd4 100644 --- a/libbcachefs/keylist.c +++ b/libbcachefs/keylist.c @@ -18,7 +18,7 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, (old_buf && roundup_pow_of_two(oldsize) == newsize)) return 0; - new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); + new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS); if (!new_keys) return -ENOMEM; @@ -31,22 +31,6 @@ int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, return 0; } -void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) -{ - struct bkey_i *where; - - for_each_keylist_key(l, where) - if (bkey_cmp(insert->k.p, where->k.p) < 0) - break; - - memmove_u64s_up((u64 *) where + insert->k.u64s, - where, - ((u64 *) l->top) - ((u64 *) where)); - - l->top_p += insert->k.u64s; - bkey_copy(where, insert); -} - void bch2_keylist_pop_front(struct keylist *l) { l->top_p -= bch2_keylist_front(l)->k.u64s; @@ -63,6 +47,6 @@ void bch2_verify_keylist_sorted(struct keylist *l) for_each_keylist_key(l, k) BUG_ON(bkey_next(k) != l->top && - bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0); + bpos_ge(k->k.p, bkey_next(k)->k.p)); } #endif diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h index 635efb7..fe759c7 100644 --- a/libbcachefs/keylist.h +++ b/libbcachefs/keylist.h @@ -5,7 +5,6 @@ #include "keylist_types.h" int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); void bch2_keylist_pop_front(struct keylist *); static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index 53e607d..3e8b8f2 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -4,19 +4,20 @@ #include "alloc_background.h" #include "btree_iter.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "error.h" #include "lru.h" #include "recovery.h" +/* KEY_TYPE_lru is obsolete: */ int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { - const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + if (!lru_pos_time(k.k->p)) { + prt_printf(err, "lru entry at time=0"); + return -BCH_ERR_invalid_bkey; - if (bkey_val_bytes(k.k) < sizeof(*lru)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*lru)); - return -EINVAL; } return 0; @@ -30,154 +31,110 @@ void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); } -int 
bch2_lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time, - struct bkey_s_c orig_k) +void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) { - struct btree_iter iter; - struct bkey_s_c k; - u64 existing_idx; - struct printbuf buf = PRINTBUF; - int ret = 0; - - if (!time) - return 0; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_lru, - POS(id, time), - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_lru) { - bch2_bkey_val_to_text(&buf, trans->c, orig_k); - bch2_trans_inconsistent(trans, - "pointer to nonexistent lru %llu:%llu\n%s", - id, time, buf.buf); - ret = -EIO; - goto err; - } - - existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx); - if (existing_idx != idx) { - bch2_bkey_val_to_text(&buf, trans->c, orig_k); - bch2_trans_inconsistent(trans, - "lru %llu:%llu with wrong backpointer: got %llu, should be %llu\n%s", - id, time, existing_idx, idx, buf.buf); - ret = -EIO; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; + prt_printf(out, "%llu:%llu -> %llu:%llu", + lru_pos_id(lru), + lru_pos_time(lru), + u64_to_bucket(lru.offset).inode, + u64_to_bucket(lru.offset).offset); } -int bch2_lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time) +static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, + u64 dev_bucket, u64 time, bool set) { - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i_lru *lru; - int ret = 0; - - if (!*time) - return 0; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_lru, - POS(lru_id, *time), - BTREE_ITER_SLOTS| - BTREE_ITER_INTENT| - BTREE_ITER_WITH_UPDATES, k, ret) - if (bkey_deleted(k.k)) - break; - - if (ret) - goto err; - - BUG_ON(iter.pos.inode != lru_id); - *time = iter.pos.offset; - - lru = bch2_trans_kmalloc(trans, sizeof(*lru)); - ret = PTR_ERR_OR_ZERO(lru); - if (ret) - goto err; + return time + ? 
bch2_btree_bit_mod(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) + : 0; +} - bkey_lru_init(&lru->k_i); - lru->k.p = iter.pos; - lru->v.idx = cpu_to_le64(idx); +int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +{ + return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); +} - ret = bch2_trans_update(trans, &iter, &lru->k_i, 0); - if (ret) - goto err; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; +int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +{ + return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); } -int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx, - u64 old_time, u64 *new_time, - struct bkey_s_c k) +int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) { - if (old_time == *new_time) + if (old_time == new_time) return 0; - return bch2_lru_delete(trans, id, idx, old_time, k) ?: - bch2_lru_set(trans, id, idx, new_time); + return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: + bch2_lru_set(trans, lru_id, dev_bucket, new_time); } +static const char * const bch2_lru_types[] = { +#define x(n) #n, + BCH_LRU_TYPES() +#undef x + NULL +}; + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, - struct bkey_s_c lru_k) + struct bkey_s_c lru_k, + struct bpos *last_flushed_pos) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct bch_alloc_v4 a; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; - struct bpos alloc_pos; + enum bch_lru_type type = lru_type(lru_k); + struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); + u64 idx; int ret; - alloc_pos = POS(lru_k.k->p.inode, - le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx)); - if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, "lru key points to nonexistent device:bucket %llu:%llu", alloc_pos.inode, alloc_pos.offset)) return bch2_btree_delete_at(trans, lru_iter, 0); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); ret = bkey_err(k); if (ret) goto err; - bch2_alloc_to_v4(k, &a); + a = bch2_alloc_to_v4(k, &a_convert); - if (fsck_err_on(a.data_type != BCH_DATA_cached || - a.io_time[READ] != lru_k.k->p.offset, c, - "incorrect lru entry %s\n" - " for %s", - (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), - (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.p = lru_iter->pos; + switch (type) { + case BCH_LRU_read: + idx = alloc_lru_idx_read(*a); + break; + case BCH_LRU_fragmentation: + idx = a->fragmentation_lru; + break; + } - ret = bch2_trans_update(trans, lru_iter, update, 0); - if (ret) - goto err; + if (lru_k.k->type != KEY_TYPE_set || + lru_pos_time(lru_k.k->p) != idx) { + if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { + *last_flushed_pos = lru_k.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect lru entry: lru %s time %llu\n" + " %s\n" + " for %s", + bch2_lru_types[type], + lru_pos_time(lru_k.k->p), + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + 
(bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) + ret = bch2_btree_delete_at(trans, lru_iter, 0); } +out: err: fsck_err: bch2_trans_iter_exit(trans, &iter); @@ -188,19 +145,18 @@ fsck_err: int bch2_check_lrus(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bpos last_flushed_pos = POS_MIN; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key_commit(&trans, iter, - BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, - NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, - bch2_check_lru_key(&trans, &iter, k)); - - bch2_trans_exit(&trans); + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos))); + if (ret) + bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h index 925c29b..be66bf9 100644 --- a/libbcachefs/lru.h +++ b/libbcachefs/lru.h @@ -2,17 +2,67 @@ #ifndef _BCACHEFS_LRU_H #define _BCACHEFS_LRU_H -int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + +static inline u64 lru_pos_id(struct bpos pos) +{ + return pos.inode >> LRU_TIME_BITS; +} + +static inline u64 lru_pos_time(struct bpos pos) +{ + return pos.inode & ~(~0ULL << LRU_TIME_BITS); +} + +static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) +{ + struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); + + EBUG_ON(time > LRU_TIME_MAX); + EBUG_ON(lru_pos_id(pos) != lru_id); + EBUG_ON(lru_pos_time(pos) != time); + EBUG_ON(pos.offset != dev_bucket); + + return pos; +} + +#define BCH_LRU_TYPES() \ + x(read) \ + x(fragmentation) + +enum bch_lru_type { +#define x(n) BCH_LRU_##n, + BCH_LRU_TYPES() +#undef x +}; + +#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) + +static inline enum bch_lru_type lru_type(struct bkey_s_c l) +{ + u16 lru_id = l.k->p.inode >> 48; + + if (lru_id == BCH_LRU_FRAGMENTATION_START) + return BCH_LRU_fragmentation; + return BCH_LRU_read; +} + +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_lru_pos_to_text(struct printbuf *, struct bpos); + #define bch2_bkey_ops_lru ((struct bkey_ops) { \ .key_invalid = bch2_lru_invalid, \ .val_to_text = bch2_lru_to_text, \ + .min_val_size = 8, \ }) -int bch2_lru_delete(struct btree_trans *, u64, u64, u64, struct bkey_s_c); -int bch2_lru_set(struct btree_trans *, u64, u64, u64 *); -int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *, struct bkey_s_c); +int bch2_lru_del(struct btree_trans *, u16, u64, u64); +int bch2_lru_set(struct btree_trans *, u16, u64, u64); +int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); int bch2_check_lrus(struct bch_fs *); diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 8b258d9..81c8cdb 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -46,16 +46,14 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct bkey_i *n; int ret; - if (!bch2_bkey_has_device(k, dev_idx)) + if (!bch2_bkey_has_device_c(k, dev_idx)) return 0; - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; - bkey_reassemble(n, k); 
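The lru.h changes above replace LRU entries that carried a bch_lru value with KEY_TYPE_set keys whose position encodes everything: the 16-bit LRU id in the top bits of the key's inode field, the 48-bit time in its low bits, and the device:bucket index in the offset field, with id (1U << 16) - 1 reserved for the fragmentation LRU. The following is a small standalone sketch of that packing; the _example names stand in for struct bpos and the in-tree helpers and are illustrative only.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define LRU_TIME_BITS_EXAMPLE	48

struct lru_pos_example {
	uint64_t inode;		/* lru_id in bits 48..63, time in bits 0..47 */
	uint64_t offset;	/* dev_bucket */
};

static struct lru_pos_example make_lru_pos_example(uint16_t lru_id, uint64_t dev_bucket, uint64_t time)
{
	struct lru_pos_example pos = {
		.inode	= ((uint64_t) lru_id << LRU_TIME_BITS_EXAMPLE) | time,
		.offset	= dev_bucket,
	};
	return pos;
}

static uint64_t lru_pos_id_example(struct lru_pos_example pos)
{
	return pos.inode >> LRU_TIME_BITS_EXAMPLE;
}

static uint64_t lru_pos_time_example(struct lru_pos_example pos)
{
	return pos.inode & ~(~0ULL << LRU_TIME_BITS_EXAMPLE);
}

int main(void)
{
	struct lru_pos_example p = make_lru_pos_example(0, 123456, 789);

	assert(lru_pos_id_example(p) == 0);	/* id 0: the read LRU */
	assert(lru_pos_time_example(p) == 789);
	assert(p.offset == 123456);

	printf("inode %llu offset %llu\n",
	       (unsigned long long) p.inode,
	       (unsigned long long) p.offset);
	return 0;
}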
- ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); if (ret) return ret; @@ -75,8 +73,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, */ if (bkey_deleted(&n->k)) n->k.size = 0; - - return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + return 0; } static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) @@ -132,8 +129,7 @@ retry: while (bch2_trans_begin(&trans), (b = bch2_btree_iter_peek_node(&iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { - if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), - dev_idx)) + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; bch2_bkey_buf_copy(&k, c, &b->key); @@ -145,7 +141,7 @@ retry: break; } - ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, false); + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 700f847..0527267 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -1,12 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" #include "backpointers.h" #include "bkey_buf.h" #include "btree_gc.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "disk_groups.h" #include "ec.h" #include "errcode.h" @@ -14,15 +16,47 @@ #include "inode.h" #include "io.h" #include "journal_reclaim.h" +#include "keylist.h" #include "move.h" #include "replicas.h" #include "super-io.h" -#include "keylist.h" +#include "trace.h" #include #include -#include +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_read_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_read(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_alloc_mem_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_alloc_mem_fail(c, buf.buf); + printbuf_exit(&buf); + } +} static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) { @@ -39,28 +73,36 @@ static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) } struct moving_io { - struct list_head list; - struct closure cl; - bool read_completed; + struct list_head read_list; + struct list_head io_list; + struct move_bucket_in_flight *b; + struct closure cl; + bool read_completed; - unsigned read_sectors; - unsigned write_sectors; + unsigned read_sectors; + unsigned write_sectors; - struct bch_read_bio rbio; + struct bch_read_bio rbio; - struct data_update write; + struct data_update write; /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[0]; + struct bio_vec bi_inline_vecs[0]; }; static void move_free(struct moving_io *io) { struct moving_context *ctxt = io->write.ctxt; - struct bch_fs *c = ctxt->c; + + if (io->b) + atomic_dec(&io->b->count); bch2_data_update_exit(&io->write); + + mutex_lock(&ctxt->lock); + list_del(&io->io_list); wake_up(&ctxt->wait); - percpu_ref_put(&c->writes); + mutex_unlock(&ctxt->lock); + 
kfree(io); } @@ -73,6 +115,7 @@ static void move_write_done(struct bch_write_op *op) ctxt->write_error = true; atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_dec(&io->write.ctxt->write_ios); move_free(io); closure_put(&ctxt->cl); } @@ -86,14 +129,15 @@ static void move_write(struct moving_io *io) closure_get(&io->write.ctxt->cl); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_inc(&io->write.ctxt->write_ios); bch2_data_update_read_done(&io->write, io->rbio.pick.crc); } -static inline struct moving_io *next_pending_write(struct moving_context *ctxt) +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) { struct moving_io *io = - list_first_entry_or_null(&ctxt->reads, struct moving_io, list); + list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); return io && io->read_completed ? io : NULL; } @@ -104,35 +148,27 @@ static void move_read_endio(struct bio *bio) struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->read_sectors, &ctxt->read_sectors); + atomic_dec(&ctxt->read_ios); io->read_completed = true; wake_up(&ctxt->wait); closure_put(&ctxt->cl); } -static void do_pending_writes(struct moving_context *ctxt, struct btree_trans *trans) +void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, + struct btree_trans *trans) { struct moving_io *io; if (trans) bch2_trans_unlock(trans); - while ((io = next_pending_write(ctxt))) { - list_del(&io->list); + while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { + list_del(&io->read_list); move_write(io); } } -#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ -do { \ - do_pending_writes(_ctxt, _trans); \ - \ - if (_cond) \ - break; \ - __wait_event((_ctxt)->wait, \ - next_pending_write(_ctxt) || (_cond)); \ -} while (1) - static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, struct btree_trans *trans) { @@ -145,17 +181,26 @@ static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, void bch2_moving_ctxt_exit(struct moving_context *ctxt) { + struct bch_fs *c = ctxt->c; + move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); closure_sync(&ctxt->cl); + EBUG_ON(atomic_read(&ctxt->write_sectors)); + EBUG_ON(atomic_read(&ctxt->write_ios)); + EBUG_ON(atomic_read(&ctxt->read_sectors)); + EBUG_ON(atomic_read(&ctxt->read_ios)); if (ctxt->stats) { - progress_list_del(ctxt->c, ctxt->stats); - - trace_move_data(ctxt->c, + progress_list_del(c, ctxt->stats); + trace_move_data(c, atomic64_read(&ctxt->stats->sectors_moved), atomic64_read(&ctxt->stats->keys_moved)); } + + mutex_lock(&c->moving_context_lock); + list_del(&ctxt->list); + mutex_unlock(&c->moving_context_lock); } void bch2_moving_ctxt_init(struct moving_context *ctxt, @@ -168,15 +213,23 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, memset(ctxt, 0, sizeof(*ctxt)); ctxt->c = c; + ctxt->fn = (void *) _RET_IP_; ctxt->rate = rate; ctxt->stats = stats; ctxt->wp = wp; ctxt->wait_on_copygc = wait_on_copygc; closure_init_stack(&ctxt->cl); + + mutex_init(&ctxt->lock); INIT_LIST_HEAD(&ctxt->reads); + INIT_LIST_HEAD(&ctxt->ios); init_waitqueue_head(&ctxt->wait); + mutex_lock(&c->moving_context_lock); + list_add(&ctxt->list, &c->moving_context_list); + mutex_unlock(&c->moving_context_lock); + if (stats) { progress_list_add(c, stats); stats->data_type = BCH_DATA_user; @@ -198,13 +251,11 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, struct bkey_i *n; int ret; - n = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + n = 
bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; - bkey_reassemble(n, k); - while (data_opts.kill_ptrs) { unsigned i = 0, drop = __fls(data_opts.kill_ptrs); struct bch_extent_ptr *ptr; @@ -229,13 +280,15 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans, if (bkey_deleted(&n->k)) n->k.size = 0; - return bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + return bch2_trans_relock(trans) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); } static int bch2_move_extent(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, struct bch_io_opts io_opts, enum btree_id btree_id, struct bkey_s_c k, @@ -249,6 +302,8 @@ static int bch2_move_extent(struct btree_trans *trans, unsigned sectors = k.k->size, pages; int ret = -ENOMEM; + trace_move_extent2(c, k); + bch2_data_update_opts_normalize(k, &data_opts); if (!data_opts.rewrite_ptrs && @@ -258,9 +313,6 @@ static int bch2_move_extent(struct btree_trans *trans, return 0; } - if (!percpu_ref_tryget_live(&c->writes)) - return -EROFS; - /* * Before memory allocations & taking nocow locks in * bch2_data_update_init(): @@ -277,6 +329,7 @@ static int bch2_move_extent(struct btree_trans *trans, if (!io) goto err; + INIT_LIST_HEAD(&io->io_list); io->write.ctxt = ctxt; io->read_sectors = k.k->size; io->write_sectors = k.k->size; @@ -296,21 +349,15 @@ static int bch2_move_extent(struct btree_trans *trans, bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); io->rbio.bio.bi_iter.bi_size = sectors << 9; - bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); + io->rbio.bio.bi_opf = REQ_OP_READ; io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); io->rbio.bio.bi_end_io = move_read_endio; - ret = bch2_data_update_init(c, &io->write, ctxt->wp, io_opts, - data_opts, btree_id, k); + ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, + io_opts, data_opts, btree_id, k); if (ret && ret != -BCH_ERR_unwritten_extent_update) goto err_free_pages; - io->write.ctxt = ctxt; - io->write.op.end_io = move_write_done; - - atomic64_inc(&ctxt->stats->keys_moved); - atomic64_add(k.k->size, &ctxt->stats->sectors_moved); - if (ret == -BCH_ERR_unwritten_extent_update) { bch2_update_unwritten_extent(trans, &io->write); move_free(io); @@ -319,12 +366,30 @@ static int bch2_move_extent(struct btree_trans *trans, BUG_ON(ret); + io->write.ctxt = ctxt; + io->write.op.end_io = move_write_done; + + if (ctxt->stats) { + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); + } + + if (bucket_in_flight) { + io->b = bucket_in_flight; + atomic_inc(&io->b->count); + } + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); - trace_move_extent_read(k.k); + trace_move_extent_read2(c, k); + mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); - list_add_tail(&io->list, &ctxt->reads); + atomic_inc(&ctxt->read_ios); + + list_add_tail(&io->read_list, &ctxt->reads); + list_add_tail(&io->io_list, &ctxt->ios); + mutex_unlock(&ctxt->lock); /* * dropped by move_read_endio() - guards against use after free of @@ -342,8 +407,8 @@ err_free_pages: err_free: kfree(io); err: - percpu_ref_put(&c->writes); - trace_and_count(c, move_extent_alloc_mem_fail, k.k); + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); 
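bch2_move_extent() now tracks in-flight IOs as well as in-flight sectors: counters are bumped when a read is issued, dropped in the completion hooks, and move_ratelimit() below waits for both the byte and IO counts to fall back under the move_bytes_in_flight and move_ios_in_flight limits. The sketch below shows a simplified form of that accounting pattern using C11 atomics; all _example names are illustrative and not part of the patch.

#include <stdatomic.h>
#include <stdbool.h>

struct inflight_example {
	atomic_uint	ios;
	unsigned	limit;
};

/* Bumped when a read or write is submitted; cf. atomic_inc(&ctxt->read_ios). */
static void inflight_issue_example(struct inflight_example *f)
{
	atomic_fetch_add(&f->ios, 1);
}

/* Dropped from the completion hook; cf. atomic_dec(&ctxt->read_ios). */
static void inflight_complete_example(struct inflight_example *f)
{
	atomic_fetch_sub(&f->ios, 1);
}

/* A simplified form of the condition move_ratelimit() waits on. */
static bool inflight_below_limit_example(struct inflight_example *f)
{
	return atomic_load(&f->ios) < f->limit;
}

int main(void)
{
	struct inflight_example f = { .limit = 32 };

	if (inflight_below_limit_example(&f))
		inflight_issue_example(&f);
	inflight_complete_example(&f);
	return 0;
}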
+ trace_move_extent_alloc_mem_fail2(c, k); return ret; } @@ -361,8 +426,8 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos, if (ret) goto err; - if (!k.k || bkey_cmp(k.k->p, pos)) { - ret = -ENOENT; + if (!k.k || !bkey_eq(k.k->p, pos)) { + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -413,13 +478,15 @@ static int move_ratelimit(struct btree_trans *trans, } } while (delay); + /* + * XXX: these limits really ought to be per device, SSDs and hard drives + * will want different limits + */ move_ctxt_wait_event(ctxt, trans, - atomic_read(&ctxt->write_sectors) < - c->opts.move_bytes_in_flight >> 9); - - move_ctxt_wait_event(ctxt, trans, - atomic_read(&ctxt->read_sectors) < - c->opts.move_bytes_in_flight >> 9); + atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && + atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); return 0; } @@ -434,8 +501,6 @@ static int move_get_io_opts(struct btree_trans *trans, if (*cur_inum == k.k->p.inode) return 0; - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); - ret = lookup_inode(trans, SPOS(0, k.k->p.inode, k.k->p.snapshot), &inode); @@ -443,8 +508,9 @@ static int move_get_io_opts(struct btree_trans *trans, return ret; if (!ret) - bch2_io_opts_apply(io_opts, bch2_inode_opts_get(&inode)); - + bch2_inode_opts_get(io_opts, trans->c, &inode); + else + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); *cur_inum = k.k->p.inode; return 0; } @@ -468,9 +534,11 @@ static int __bch2_move_data(struct moving_context *ctxt, bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - ctxt->stats->data_type = BCH_DATA_user; - ctxt->stats->btree_id = btree_id; - ctxt->stats->pos = start; + if (ctxt->stats) { + ctxt->stats->data_type = BCH_DATA_user; + ctxt->stats->btree_id = btree_id; + ctxt->stats->pos = start; + } bch2_trans_iter_init(&trans, &iter, btree_id, start, BTREE_ITER_PREFETCH| @@ -492,10 +560,11 @@ static int __bch2_move_data(struct moving_context *ctxt, if (ret) break; - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + if (bkey_ge(bkey_start_pos(k.k), end)) break; - ctxt->stats->pos = iter.pos; + if (ctxt->stats) + ctxt->stats->pos = iter.pos; if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; @@ -514,10 +583,9 @@ static int __bch2_move_data(struct moving_context *ctxt, */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, - btree_id, k, data_opts); + ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL, + io_opts, btree_id, k, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -535,7 +603,8 @@ static int __bch2_move_data(struct moving_context *ctxt, if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, k.k->size); next: - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: bch2_btree_iter_advance(&iter); } @@ -563,7 +632,7 @@ int bch2_move_data(struct bch_fs *c, bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); id++) { stats->btree_id = id; @@ -571,6 +640,9 @@ int bch2_move_data(struct bch_fs *c, id != BTREE_ID_reflink) continue; + if (!bch2_btree_id_root(c, id)->b) + continue; + ret = 
__bch2_move_data(&ctxt, id == start_btree_id ? start_pos : POS_MIN, id == end_btree_id ? end_pos : POS_MAX, @@ -584,104 +656,69 @@ int bch2_move_data(struct bch_fs *c, return ret; } -static int verify_bucket_evacuated(struct btree_trans *trans, struct bpos bucket, int gen) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - struct bch_backpointer bp; - u64 bp_offset = 0; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_CACHED); -again: - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - - if (!ret && k.k->type == KEY_TYPE_alloc_v4) { - struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - - if (a.v->gen == gen && - a.v->dirty_sectors) { - if (a.v->data_type == BCH_DATA_btree) { - bch2_trans_unlock(trans); - if (bch2_btree_interior_updates_flush(c)) - goto again; - goto failed_to_evacuate; - } - } - } - - bch2_trans_iter_exit(trans, &iter); - return ret; -failed_to_evacuate: - bch2_trans_iter_exit(trans, &iter); - - prt_printf(&buf, bch2_log_msg(c, "failed to evacuate bucket ")); - bch2_bkey_val_to_text(&buf, c, k); - - while (1) { - bch2_trans_begin(trans); - - ret = bch2_get_next_backpointer(trans, bucket, gen, - &bp_offset, &bp, - BTREE_ITER_CACHED); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - if (bp_offset == U64_MAX) - break; - - k = bch2_backpointer_get_key(trans, &iter, - bucket, bp_offset, bp); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - break; - if (!k.k) - continue; - prt_newline(&buf); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_iter_exit(trans, &iter); - } - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - return 0; -} - -int __bch2_evacuate_bucket(struct moving_context *ctxt, +int __bch2_evacuate_bucket(struct btree_trans *trans, + struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, struct bpos bucket, int gen, struct data_update_opts _data_opts) { struct bch_fs *c = ctxt->c; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_trans trans; struct btree_iter iter; struct bkey_buf sk; struct bch_backpointer bp; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct bkey_s_c k; struct data_update_opts data_opts; - u64 bp_offset = 0, cur_inum = U64_MAX; + unsigned dirty_sectors, bucket_size; + u64 fragmentation; + u64 cur_inum = U64_MAX; + struct bpos bp_pos = POS_MIN; int ret = 0; + trace_bucket_evacuate(c, &bucket); + bch2_bkey_buf_init(&sk); - bch2_trans_init(&trans, c, 0, 0); - while (!(ret = move_ratelimit(&trans, ctxt))) { - bch2_trans_begin(&trans); + /* + * We're not run in a context that handles transaction restarts: + */ + bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(&trans, bucket, gen, - &bp_offset, &bp, + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + bch2_trans_iter_exit(trans, &iter); + + if (ret) { + bch_err_msg(c, ret, "looking up alloc key"); + goto err; + } + + a = bch2_alloc_to_v4(k, &a_convert); + dirty_sectors = a->dirty_sectors; + bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; + fragmentation = a->fragmentation_lru; + + ret = bch2_btree_write_buffer_flush(trans); + if (ret) { + bch_err_msg(c, ret, "flushing btree write buffer"); + goto err; + } + + while (!(ret = move_ratelimit(trans, 
ctxt))) { + bch2_trans_begin(trans); + + ret = bch2_get_next_backpointer(trans, bucket, gen, + &bp_pos, &bp, BTREE_ITER_CACHED); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; - if (bp_offset == U64_MAX) + if (bkey_eq(bp_pos, POS_MAX)) break; if (!bp.level) { @@ -689,22 +726,21 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, struct bkey_s_c k; unsigned i = 0; - k = bch2_backpointer_get_key(&trans, &iter, - bucket, bp_offset, bp); + k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; if (!k.k) - continue; + goto next; bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); if (ret) { - bch2_trans_iter_exit(&trans, &iter); + bch2_trans_iter_exit(trans, &iter); continue; } @@ -713,20 +749,26 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.rewrite_ptrs = 0; bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == bucket.inode) + if (ptr->dev == bucket.inode) { data_opts.rewrite_ptrs |= 1U << i; + if (ptr->cached) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } + } i++; } - ret = bch2_move_extent(&trans, &iter, ctxt, io_opts, - bp.btree_id, k, data_opts); - bch2_trans_iter_exit(&trans, &iter); + ret = bch2_move_extent(trans, &iter, ctxt, + bucket_in_flight, + io_opts, bp.btree_id, k, data_opts); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt, &trans); + bch2_move_ctxt_wait_for_io(ctxt, trans); continue; } if (ret) @@ -734,12 +776,12 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, k.k->size); - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); } else { struct btree *b; - b = bch2_backpointer_get_node(&trans, &iter, - bucket, bp_offset, bp); + b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) continue; @@ -748,10 +790,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, if (ret) goto err; if (!b) - continue; + goto next; - ret = bch2_btree_node_rewrite(&trans, &iter, b, 0); - bch2_trans_iter_exit(&trans, &iter); + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -761,22 +803,17 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt, if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, c->opts.btree_node_size >> 9); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); - atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + if (ctxt->stats) { + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + } } - - bp_offset++; +next: + bp_pos = bpos_nosnap_successor(bp_pos); } - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && gen >= 0) { - bch2_trans_unlock(&trans); - move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); - closure_sync(&ctxt->cl); - if (!ctxt->write_error) - lockrestart_do(&trans, verify_bucket_evacuated(&trans, bucket, gen)); - 
} + trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); err: - bch2_trans_exit(&trans); bch2_bkey_buf_exit(&sk, c); return ret; } @@ -789,12 +826,15 @@ int bch2_evacuate_bucket(struct bch_fs *c, struct write_point_specifier wp, bool wait_on_copygc) { + struct btree_trans trans; struct moving_context ctxt; int ret; + bch2_trans_init(&trans, c, 0, 0); bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_evacuate_bucket(&ctxt, bucket, gen, data_opts); + ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts); bch2_moving_ctxt_exit(&ctxt); + bch2_trans_exit(&trans); return ret; } @@ -824,10 +864,13 @@ static int bch2_move_btree(struct bch_fs *c, stats->data_type = BCH_DATA_btree; for (id = start_btree_id; - id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); + id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); id++) { stats->btree_id = id; + if (!bch2_btree_id_root(c, id)->b) + continue; + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, BTREE_ITER_PREFETCH); retry: @@ -867,7 +910,7 @@ next: bch2_trans_exit(&trans); if (ret) - bch_err(c, "error in %s(): %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); bch2_btree_interior_updates_flush(c); @@ -992,6 +1035,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) mutex_unlock(&c->sb_lock); } + if (ret) + bch_err_fn(c, ret); return ret; } @@ -1057,3 +1102,67 @@ int bch2_data_job(struct bch_fs *c, return ret; } + +void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_move_stats *stats; + + mutex_lock(&c->data_progress_lock); + list_for_each_entry(stats, &c->data_progress_list, list) { + prt_printf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + prt_printf(out, "%s", "\n"); + } + mutex_unlock(&c->data_progress_lock); +} + +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) +{ + struct moving_io *io; + + prt_printf(out, "%ps:", ctxt->fn); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "reads: %u sectors %u", + atomic_read(&ctxt->read_ios), + atomic_read(&ctxt->read_sectors)); + prt_newline(out); + + prt_printf(out, "writes: %u sectors %u", + atomic_read(&ctxt->write_ios), + atomic_read(&ctxt->write_sectors)); + prt_newline(out); + + printbuf_indent_add(out, 2); + + mutex_lock(&ctxt->lock); + list_for_each_entry(io, &ctxt->ios, io_list) { + bch2_write_op_to_text(out, &io->write.op); + } + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); +} + +void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct moving_context *ctxt; + + mutex_lock(&c->moving_context_lock); + list_for_each_entry(ctxt, &c->moving_context_list, list) + bch2_moving_ctxt_to_text(out, ctxt); + mutex_unlock(&c->moving_context_lock); +} + +void bch2_fs_move_init(struct bch_fs *c) +{ + INIT_LIST_HEAD(&c->moving_context_list); + mutex_init(&c->moving_context_lock); + + INIT_LIST_HEAD(&c->data_progress_list); + mutex_init(&c->data_progress_lock); +} diff --git a/libbcachefs/move.h b/libbcachefs/move.h index b14f679..547ee7b 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -11,6 +11,9 @@ struct bch_read_bio; struct moving_context { struct bch_fs *c; + struct list_head list; + void *fn; + struct bch_ratelimit *rate; struct bch_move_stats *stats; struct write_point_specifier wp; @@ -19,15 +22,34 
@@ struct moving_context { /* For waiting on outstanding reads and writes: */ struct closure cl; + + struct mutex lock; struct list_head reads; + struct list_head ios; /* in flight sectors: */ atomic_t read_sectors; atomic_t write_sectors; + atomic_t read_ios; + atomic_t write_ios; wait_queue_head_t wait; }; +#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +do { \ + bool cond_finished = false; \ + bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ + \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond))); \ + if (cond_finished) \ + break; \ +} while (1) + typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); @@ -35,6 +57,9 @@ void bch2_moving_ctxt_exit(struct moving_context *); void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, struct bch_ratelimit *, struct bch_move_stats *, struct write_point_specifier, bool); +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); +void bch2_moving_ctxt_do_pending_writes(struct moving_context *, + struct btree_trans *); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); @@ -47,7 +72,9 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); -int __bch2_evacuate_bucket(struct moving_context *, +int __bch2_evacuate_bucket(struct btree_trans *, + struct moving_context *, + struct move_bucket_in_flight *, struct bpos, int, struct data_update_opts); int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, @@ -61,6 +88,9 @@ int bch2_data_job(struct bch_fs *, struct bch_ioctl_data); void bch2_move_stats_init(struct bch_move_stats *stats, char *name); +void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_move_init(struct bch_fs *); #endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h index 9df6d18..baf1f85 100644 --- a/libbcachefs/move_types.h +++ b/libbcachefs/move_types.h @@ -16,4 +16,21 @@ struct bch_move_stats { atomic64_t sectors_raced; }; +struct move_bucket_key { + struct bpos bucket; + u8 gen; +}; + +struct move_bucket { + struct move_bucket_key k; + unsigned sectors; +}; + +struct move_bucket_in_flight { + struct move_bucket_in_flight *next; + struct rhash_head hash; + struct move_bucket bucket; + atomic_t count; +}; + #endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 63bc692..5242f20 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -10,6 +10,7 @@ #include "alloc_foreground.h" #include "btree_iter.h" #include "btree_update.h" +#include "btree_write_buffer.h" #include "buckets.h" #include "clock.h" #include "disk_groups.h" @@ -19,11 +20,13 @@ #include "eytzinger.h" #include "io.h" #include "keylist.h" +#include "lru.h" #include "move.h" #include "movinggc.h" #include "super-io.h" +#include "trace.h" -#include +#include #include #include #include @@ -31,141 +34,216 @@ #include #include -static inline int fragmentation_cmp(copygc_heap *heap, - struct copygc_heap_entry l, - struct copygc_heap_entry r) +struct buckets_in_flight { + struct rhashtable table; + struct move_bucket_in_flight *first; + struct move_bucket_in_flight *last; + size_t nr; + size_t sectors; +}; + +static const struct rhashtable_params bch_move_bucket_params = { + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = 
offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), +}; + +static struct move_bucket_in_flight * +move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) { - return cmp_int(l.fragmentation, r.fragmentation); + struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); + int ret; + + if (!new) + return ERR_PTR(-ENOMEM); + + new->bucket = b; + + ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, + bch_move_bucket_params); + if (ret) { + kfree(new); + return ERR_PTR(ret); + } + + if (!list->first) + list->first = new; + else + list->last->next = new; + + list->last = new; + list->nr++; + list->sectors += b.sectors; + return new; } -static int find_buckets_to_copygc(struct bch_fs *c) +static int bch2_bucket_is_movable(struct btree_trans *trans, + struct move_bucket *b, u64 time) { - copygc_heap *h = &c->copygc_heap; - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - struct bch_alloc_v4 a; + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a; int ret; - bch2_trans_init(&trans, c, 0, 0); + if (bch2_bucket_is_open(trans->c, + b->k.bucket.inode, + b->k.bucket.offset)) + return 0; - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. - */ - h->used = 0; - - for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode); - struct copygc_heap_entry e; - - bch2_alloc_to_v4(k, &a); - - if ((a.data_type != BCH_DATA_btree && - a.data_type != BCH_DATA_user) || - a.dirty_sectors >= ca->mi.bucket_size || - bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset)) - continue; + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; - e = (struct copygc_heap_entry) { - .dev = iter.pos.inode, - .gen = a.gen, - .replicas = 1 + a.stripe_redundancy, - .fragmentation = div_u64((u64) a.dirty_sectors * (1ULL << 31), - ca->mi.bucket_size), - .sectors = a.dirty_sectors, - .bucket = iter.pos.offset, - }; - heap_add_or_replace(h, e, -fragmentation_cmp, NULL); + a = bch2_alloc_to_v4(k, &_a); + b->k.gen = a->gen; + b->sectors = a->dirty_sectors; - } - bch2_trans_iter_exit(&trans, &iter); + ret = data_type_movable(a->data_type) && + a->fragmentation_lru && + a->fragmentation_lru <= time; - bch2_trans_exit(&trans); + bch2_trans_iter_exit(trans, &iter); return ret; } -static int bch2_copygc(struct bch_fs *c) +static void move_buckets_wait(struct btree_trans *trans, + struct moving_context *ctxt, + struct buckets_in_flight *list, + bool flush) { - copygc_heap *h = &c->copygc_heap; - struct copygc_heap_entry e; - struct bch_move_stats move_stats; - struct bch_dev *ca; - unsigned dev_idx; - size_t heap_size = 0; - struct moving_context ctxt; - struct data_update_opts data_opts = { - .btree_insert_flags = BTREE_INSERT_USE_RESERVE|JOURNAL_WATERMARK_copygc, - }; - int ret = 0; + struct move_bucket_in_flight *i; + int ret; - bch2_move_stats_init(&move_stats, "copygc"); + while ((i = list->first)) { + if (flush) + move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); + + if (atomic_read(&i->count)) + break; - for_each_rw_member(ca, c, dev_idx) - heap_size += ca->mi.nbuckets >> 7; + list->first = i->next; + if (!list->first) + list->last = NULL; - if (h->size < heap_size) { - 
free_heap(&c->copygc_heap); - if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { - bch_err(c, "error allocating copygc heap"); - return 0; - } + list->nr--; + list->sectors -= i->bucket.sectors; + + ret = rhashtable_remove_fast(&list->table, &i->hash, + bch_move_bucket_params); + BUG_ON(ret); + kfree(i); } - ret = find_buckets_to_copygc(c); - if (ret) { - bch2_fs_fatal_error(c, "error walking buckets to copygc!"); + bch2_trans_unlock(trans); +} + +static bool bucket_in_flight(struct buckets_in_flight *list, + struct move_bucket_key k) +{ + return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); +} + +typedef DARRAY(struct move_bucket) move_buckets; + +static int bch2_copygc_get_buckets(struct btree_trans *trans, + struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight, + move_buckets *buckets) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); + size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; + int ret; + + move_buckets_wait(trans, ctxt, buckets_in_flight, false); + + ret = bch2_btree_write_buffer_flush(trans); + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + __func__, bch2_err_str(ret))) return ret; - } - if (!h->used) { - s64 wait = S64_MAX, dev_wait; - u64 dev_min_wait_fragmented = 0; - u64 dev_min_wait_allowed = 0; - int dev_min_wait = -1; - - for_each_rw_member(ca, c, dev_idx) { - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - s64 allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * - ca->mi.bucket_size) >> 1); - s64 fragmented = usage.d[BCH_DATA_user].fragmented; - - dev_wait = max(0LL, allowed - fragmented); - - if (dev_min_wait < 0 || dev_wait < wait) { - dev_min_wait = dev_idx; - dev_min_wait_fragmented = fragmented; - dev_min_wait_allowed = allowed; - } + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), + lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + 0, k, ({ + struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; + int ret = 0; + + saw++; + + if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) + not_movable++; + else if (bucket_in_flight(buckets_in_flight, b.k)) + in_flight++; + else { + ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + if (ret >= 0) + sectors += b.sectors; } + ret; + })); - bch_err_ratelimited(c, "copygc requested to run but found no buckets to move! dev %u fragmented %llu allowed %llu", - dev_min_wait, dev_min_wait_fragmented, dev_min_wait_allowed); - return 0; - } + pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", + buckets_in_flight->nr, buckets_in_flight->sectors, + saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); - heap_resort(h, fragmentation_cmp, NULL); + return ret < 0 ? 
ret : 0; +} - bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, - writepoint_ptr(&c->copygc_write_point), - false); +noinline +static int bch2_copygc(struct btree_trans *trans, + struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight) +{ + struct bch_fs *c = trans->c; + struct data_update_opts data_opts = { + .btree_insert_flags = BCH_WATERMARK_copygc, + }; + move_buckets buckets = { 0 }; + struct move_bucket_in_flight *f; + struct move_bucket *i; + u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + int ret = 0; + + ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); + if (ret) + goto err; + + darray_for_each(buckets, i) { + if (unlikely(freezing(current))) + break; - /* not correct w.r.t. device removal */ - while (h->used && !ret) { - BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); - ret = __bch2_evacuate_bucket(&ctxt, POS(e.dev, e.bucket), e.gen, - data_opts); + f = move_bucket_in_flight_add(buckets_in_flight, *i); + ret = PTR_ERR_OR_ZERO(f); + if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */ + continue; + if (ret == -ENOMEM) { /* flush IO, continue later */ + ret = 0; + break; + } + + ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket, + f->bucket.k.gen, data_opts); + if (ret) + goto err; } +err: + darray_exit(&buckets); - bch2_moving_ctxt_exit(&ctxt); + /* no entries in LRU btree found, or got to end: */ + if (bch2_err_matches(ret, ENOENT)) + ret = 0; - if (ret < 0 && ret != -EROFS) + if (ret < 0 && !bch2_err_matches(ret, EROFS)) bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); - trace_and_count(c, copygc, c, atomic64_read(&move_stats.sectors_moved), 0, 0, 0); + moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; + trace_and_count(c, copygc, c, moved, 0, 0, 0); return ret; } @@ -188,13 +266,18 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) struct bch_dev *ca; unsigned dev_idx; s64 wait = S64_MAX, fragmented_allowed, fragmented; + unsigned i; for_each_rw_member(ca, c, dev_idx) { struct bch_dev_usage usage = bch2_dev_usage_read(ca); - fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) * + fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * ca->mi.bucket_size) >> 1); - fragmented = usage.d[BCH_DATA_user].fragmented; + fragmented = 0; + + for (i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage.d[i].fragmented; wait = min(wait, max(0LL, fragmented_allowed - fragmented)); } @@ -202,27 +285,75 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) return wait; } +void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) +{ + prt_printf(out, "Currently waiting for: "); + prt_human_readable_u64(out, max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); + prt_newline(out); + + prt_printf(out, "Currently waiting since: "); + prt_human_readable_u64(out, max(0LL, + atomic64_read(&c->io_clock[WRITE].now) - + c->copygc_wait_at) << 9); + prt_newline(out); + + prt_printf(out, "Currently calculated wait: "); + prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); + prt_newline(out); +} + static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; + struct btree_trans trans; + struct moving_context ctxt; + struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; + struct buckets_in_flight move_buckets; u64 last, wait; int ret = 0; + memset(&move_buckets, 0, sizeof(move_buckets)); + + ret = 
rhashtable_init(&move_buckets.table, &bch_move_bucket_params); + if (ret) { + bch_err(c, "error allocating copygc buckets in flight: %s", + bch2_err_str(ret)); + return ret; + } + set_freezable(); + bch2_trans_init(&trans, c, 0, 0); + + bch2_move_stats_init(&move_stats, "copygc"); + bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, + writepoint_ptr(&c->copygc_write_point), + false); while (!ret && !kthread_should_stop()) { + bch2_trans_unlock(&trans); cond_resched(); - if (kthread_wait_freezable(c->copy_gc_enabled)) - break; + if (!c->copy_gc_enabled) { + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + kthread_wait_freezable(c->copy_gc_enabled); + } + + if (unlikely(freezing(current))) { + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + __refrigerator(false); + continue; + } last = atomic64_read(&clock->now); wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { - trace_and_count(c, copygc_wait, c, wait, last + wait); + c->copygc_wait_at = last; c->copygc_wait = last + wait; + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); continue; @@ -231,12 +362,17 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(c); + ret = bch2_copygc(&trans, &ctxt, &move_buckets); c->copygc_running = false; wake_up(&c->copygc_running_wq); } + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + rhashtable_destroy(&move_buckets.table); + bch2_trans_exit(&trans); + bch2_moving_ctxt_exit(&ctxt); + return 0; } diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h index e85c813..ea181fe 100644 --- a/libbcachefs/movinggc.h +++ b/libbcachefs/movinggc.h @@ -3,6 +3,8 @@ #define _BCACHEFS_MOVINGGC_H unsigned long bch2_copygc_wait_amount(struct bch_fs *); +void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); + void bch2_copygc_stop(struct bch_fs *); int bch2_copygc_start(struct bch_fs *); void bch2_fs_copygc_init(struct bch_fs *); diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c index 54e8669..396357c 100644 --- a/libbcachefs/nocow_locking.c +++ b/libbcachefs/nocow_locking.c @@ -4,13 +4,120 @@ #include "nocow_locking.h" #include "util.h" +#include + +bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket && atomic_read(&l->l[i])) + return true; + return false; +} + +#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) + +void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + int lock_val = flags ? 1 : -1; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) { + BUG_ON(sign(atomic_read(&l->l[i])) != lock_val); + + if (!atomic_sub_return(lock_val, &l->l[i])) + closure_wake_up(&l->wait); + return; + } + + BUG(); +} + +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) +{ + int v, lock_val = flags ? 
1 : -1; + unsigned i; + + spin_lock(&l->lock); + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) + goto got_entry; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (!atomic_read(&l->l[i])) { + l->b[i] = dev_bucket; + goto take_lock; + } +fail: + spin_unlock(&l->lock); + return false; +got_entry: + v = atomic_read(&l->l[i]); + if (lock_val > 0 ? v < 0 : v > 0) + goto fail; +take_lock: + atomic_add(lock_val, &l->l[i]); + spin_unlock(&l->lock); + return true; +} + void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) + struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) +{ + if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { + struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); + u64 start_time = local_clock(); + + __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + } +} + +void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) +{ + unsigned i, nr_zero = 0; + struct nocow_lock_bucket *l; + + for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) { + unsigned v = 0; + + for (i = 0; i < ARRAY_SIZE(l->l); i++) + v |= atomic_read(&l->l[i]); + + if (!v) { + nr_zero++; + continue; + } + + if (nr_zero) + prt_printf(out, "(%u empty entries)\n", nr_zero); + nr_zero = 0; + + for (i = 0; i < ARRAY_SIZE(l->l); i++) + if (atomic_read(&l->l[i])) + prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i])); + prt_newline(out); + } + + if (nr_zero) + prt_printf(out, "(%u empty entries)\n", nr_zero); +} + +int bch2_fs_nocow_locking_init(struct bch_fs *c) { - struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); - two_state_lock_t *l = bucket_nocow_lock(t, bucket); - u64 start_time = local_clock(); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) + spin_lock_init(&c->nocow_locks.l[i].lock); - bch2_two_state_lock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); - bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + return 0; } diff --git a/libbcachefs/nocow_locking.h b/libbcachefs/nocow_locking.h index 09ab85a..ff8e4af 100644 --- a/libbcachefs/nocow_locking.h +++ b/libbcachefs/nocow_locking.h @@ -2,54 +2,48 @@ #ifndef _BCACHEFS_NOCOW_LOCKING_H #define _BCACHEFS_NOCOW_LOCKING_H -#include "bcachefs_format.h" -#include "two_state_shared_lock.h" +#include "bcachefs.h" +#include "alloc_background.h" +#include "nocow_locking_types.h" -#include +#include -#define BUCKET_NOCOW_LOCKS (1U << 10) - -struct bucket_nocow_lock_table { - siphash_key_t key; - two_state_lock_t l[BUCKET_NOCOW_LOCKS]; -}; - -#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) - -static inline two_state_lock_t *bucket_nocow_lock(struct bucket_nocow_lock_table *t, - struct bpos bucket) +static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t, + u64 dev_bucket) { - u64 dev_bucket = bucket.inode << 56 | bucket.offset; - unsigned h = siphash_1u64(dev_bucket, &t->key); + unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); } -static inline bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, - struct bpos bucket) -{ - two_state_lock_t *l = bucket_nocow_lock(t, bucket); +#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) - return atomic_long_read(&l->v) != 0; -} +bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); +void bch2_bucket_nocow_unlock(struct 
bucket_nocow_lock_table *, struct bpos, int); +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, + struct nocow_lock_bucket *, u64, int); -static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, - struct bpos bucket, int flags) +static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) { - two_state_lock_t *l = bucket_nocow_lock(t, bucket); + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - bch2_two_state_unlock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); + __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); } -void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, struct bpos, int); - -static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, +static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) { - two_state_lock_t *l = bucket_nocow_lock(t, bucket); + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); - if (!bch2_two_state_trylock(l, flags & BUCKET_NOCOW_LOCK_UPDATE)) - __bch2_bucket_nocow_lock(t, bucket, flags); + return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); } +void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); + +int bch2_fs_nocow_locking_init(struct bch_fs *); + #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/libbcachefs/nocow_locking_types.h b/libbcachefs/nocow_locking_types.h new file mode 100644 index 0000000..bd12bf6 --- /dev/null +++ b/libbcachefs/nocow_locking_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H +#define _BCACHEFS_NOCOW_LOCKING_TYPES_H + +#define BUCKET_NOCOW_LOCKS_BITS 10 +#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) + +struct nocow_lock_bucket { + struct closure_waitlist wait; + spinlock_t lock; + u64 b[4]; + atomic_t l[4]; +} __aligned(SMP_CACHE_BYTES); + +struct bucket_nocow_lock_table { + struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS]; +}; + +#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */ + diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 407b221..960bb24 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -5,21 +5,25 @@ #include "bcachefs.h" #include "compress.h" #include "disk_groups.h" +#include "error.h" #include "opts.h" #include "super-io.h" #include "util.h" -#include +#define x(t, n, ...) 
[n] = #t, -#define x(t, n) [n] = #t, +const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() + NULL +}; -const char * const bch2_metadata_versions[] = { - BCH_METADATA_VERSIONS() +const char * const bch2_fsck_fix_opts[] = { + BCH_FIX_ERRORS_OPTS() NULL }; -const char * const bch2_error_actions[] = { - BCH_ERROR_ACTIONS() +const char * const bch2_version_upgrade_opts[] = { + BCH_VERSION_UPGRADE_OPTS() NULL }; @@ -91,6 +95,37 @@ const char * const bch2_fs_usage_types[] = { #undef x +static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, + struct printbuf *err) +{ + if (!val) { + *res = FSCK_FIX_yes; + } else { + int ret = match_string(bch2_fsck_fix_opts, -1, val); + + if (ret < 0 && err) + prt_str(err, "fix_errors: invalid selection"); + if (ret < 0) + return ret; + *res = ret; + } + + return 0; +} + +static void bch2_opt_fix_errors_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + prt_str(out, bch2_fsck_fix_opts[v]); +} + +#define bch2_opt_fix_errors (struct bch_opt_fn) { \ + .parse = bch2_opt_fix_errors_parse, \ + .to_text = bch2_opt_fix_errors_to_text, \ +} + const char * const bch2_d_types[BCH_DT_MAX] = { [DT_UNKNOWN] = "unknown", [DT_FIFO] = "fifo", @@ -169,11 +204,9 @@ const struct bch_option bch2_opt_table[] = { #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ .min = _min, .max = _max #define OPT_STR(_choices) .type = BCH_OPT_STR, \ - .min = 0, .max = ARRAY_SIZE(_choices),\ + .min = 0, .max = ARRAY_SIZE(_choices), \ .choices = _choices -#define OPT_FN(_fn) .type = BCH_OPT_FN, \ - .parse = _fn##_parse, \ - .to_text = _fn##_to_text +#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ @@ -269,44 +302,59 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: - ret = kstrtou64(val, 10, res); + if (val) { + ret = kstrtou64(val, 10, res); + } else { + ret = 0; + *res = 1; + } + if (ret < 0 || (*res != 0 && *res != 1)) { - prt_printf(err, "%s: must be bool", - opt->attr.name); + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); return ret; } break; case BCH_OPT_UINT: + if (!val) { + prt_printf(err, "%s: required value", + opt->attr.name); + return -EINVAL; + } + ret = opt->flags & OPT_HUMAN_READABLE ? 
bch2_strtou64_h(val, res) : kstrtou64(val, 10, res); if (ret < 0) { if (err) prt_printf(err, "%s: must be a number", - opt->attr.name); + opt->attr.name); return ret; } break; case BCH_OPT_STR: + if (!val) { + prt_printf(err, "%s: required value", + opt->attr.name); + return -EINVAL; + } + ret = match_string(opt->choices, -1, val); if (ret < 0) { if (err) prt_printf(err, "%s: invalid selection", - opt->attr.name); + opt->attr.name); return ret; } *res = ret; break; case BCH_OPT_FN: - if (!c) - return 0; - - ret = opt->parse(c, val, res); + ret = opt->fn.parse(c, val, res, err); if (ret < 0) { if (err) prt_printf(err, "%s: parse error", - opt->attr.name); + opt->attr.name); return ret; } } @@ -342,10 +390,10 @@ void bch2_opt_to_text(struct printbuf *out, if (flags & OPT_SHOW_FULL_LIST) prt_string_option(out, opt->choices, v); else - prt_printf(out, "%s", opt->choices[v]); + prt_str(out, opt->choices[v]); break; case BCH_OPT_FN: - opt->to_text(out, c, sb, v); + opt->fn.to_text(out, c, sb, v); break; default: BUG(); @@ -397,6 +445,13 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, if (!options) return 0; + /* + * sys_fsconfig() is now occasionally providing us with option lists + * starting with a comma - weird. + */ + if (*options == ',') + options++; + copied_opts = kstrdup(options, GFP_KERNEL); if (!copied_opts) return -1; @@ -406,31 +461,19 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, name = strsep(&opt, "="); val = opt; - if (val) { - id = bch2_mount_opt_lookup(name); - if (id < 0) - goto bad_opt; - - ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); - if (ret < 0) - goto bad_val; - } else { - id = bch2_mount_opt_lookup(name); - v = 1; - - if (id < 0 && - !strncmp("no", name, 2)) { - id = bch2_mount_opt_lookup(name + 2); - v = 0; - } + id = bch2_mount_opt_lookup(name); - if (id < 0) - goto bad_opt; - - if (bch2_opt_table[id].type != BCH_OPT_BOOL) - goto no_val; + /* Check for the form "noopt", negation of a boolean opt: */ + if (id < 0 && + !val && + !strncmp("no", name, 2)) { + id = bch2_mount_opt_lookup(name + 2); + val = "0"; } + if (id < 0) + goto bad_opt; + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; @@ -443,6 +486,10 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) goto bad_opt; + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret < 0) + goto bad_val; + bch2_opt_set_by_id(opts, id, v); } @@ -457,10 +504,6 @@ bad_val: pr_err("Invalid mount option %s", err.buf); ret = -1; goto out; -no_val: - pr_err("Mount option %s requires a value", name); - ret = -1; - goto out; out: kfree(copied_opts_start); printbuf_exit(&err); @@ -532,33 +575,11 @@ void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { - struct bch_io_opts ret = { 0 }; -#define x(_name, _bits) \ - if (opt_defined(src, _name)) \ - opt_set(ret, _name, src._name); - BCH_INODE_OPTS() -#undef x - return ret; -} - -struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) -{ - struct bch_opts ret = { 0 }; -#define x(_name, _bits) \ - if (opt_defined(src, _name)) \ - opt_set(ret, _name, src._name); - BCH_INODE_OPTS() -#undef x - return ret; -} - -void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) -{ -#define x(_name, _bits) \ - if (opt_defined(src, _name)) \ - opt_set(*dst, _name, src._name); + return (struct bch_io_opts) { +#define x(_name, _bits) ._name = src._name, 
BCH_INODE_OPTS() #undef x + }; } bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index ad0b13e..8a9db11 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -8,8 +8,11 @@ #include #include "bcachefs_format.h" -extern const char * const bch2_metadata_versions[]; +struct bch_fs; + extern const char * const bch2_error_actions[]; +extern const char * const bch2_fsck_fix_opts[]; +extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const bch2_btree_ids[]; @@ -67,6 +70,11 @@ enum opt_type { BCH_OPT_FN, }; +struct bch_opt_fn { + int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); +}; + /** * x(name, shortopt, type, in mem type, mode, sb_opt) * @@ -92,6 +100,24 @@ enum opt_type { #define RATELIMIT_ERRORS_DEFAULT false #endif +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCACHEFS_VERBOSE_DEFAULT true +#else +#define BCACHEFS_VERBOSE_DEFAULT false +#endif + +#define BCH_FIX_ERRORS_OPTS() \ + x(exit, 0) \ + x(yes, 1) \ + x(no, 2) \ + x(ask, 3) + +enum fsck_err_opts { +#define x(t, n) FSCK_FIX_##t, + BCH_FIX_ERRORS_OPTS() +#undef x +}; + #define BCH_OPTS() \ x(block_size, u16, \ OPT_FS|OPT_FORMAT| \ @@ -148,12 +174,12 @@ enum opt_type { NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_compression_opts), \ + OPT_FN(bch2_opt_compression), \ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(background_compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_compression_opts), \ + OPT_FN(bch2_opt_compression), \ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ NULL, NULL) \ x(str_hash, u8, \ @@ -206,6 +232,11 @@ enum opt_type { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(btree_write_buffer_size, u32, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(16, (1U << 20) - 1), \ + BCH2_NO_SB_OPT, 1U << 13, \ + NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ @@ -271,7 +302,7 @@ enum opt_type { x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ NULL, "Extra debugging information during mount/recovery")\ x(journal_flush_delay, u32, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ @@ -294,7 +325,12 @@ enum opt_type { OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1024, U32_MAX), \ BCH2_NO_SB_OPT, 1U << 20, \ - NULL, "Amount of IO in flight to keep in flight by the move path")\ + NULL, "Maximum Amount of IO to keep in flight by the move path")\ + x(move_ios_in_flight, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 1024), \ + BCH2_NO_SB_OPT, 32, \ + NULL, "Maximum number of IOs to keep in flight by the move path")\ x(fsck, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ @@ -302,8 +338,8 @@ enum opt_type { NULL, "Run fsck on mount") \ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_FN(bch2_opt_fix_errors), \ + BCH2_NO_SB_OPT, FSCK_FIX_exit, \ NULL, "Fix errors during fsck without asking") \ x(ratelimit_errors, u8, \ OPT_FS|OPT_MOUNT, \ @@ -319,27 +355,32 @@ enum opt_type { x(norecovery, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't 
replay the journal") \ x(keep_journal, u8, \ 0, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't free journal entries/keys after startup")\ x(read_entire_journal, u8, \ 0, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Read all journal entries, not just dirty ones")\ x(read_journal_only, u8, \ 0, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Only read the journal, skip the rest of recovery")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ + NULL, "Log transaction function names in journal") \ x(noexcl, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don't open device in exclusive mode") \ x(direct_io, u8, \ OPT_FS|OPT_MOUNT, \ @@ -349,38 +390,38 @@ enum opt_type { x(sb, u64, \ OPT_MOUNT, \ OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ + BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ "offset", "Sector offset of superblock") \ x(read_only, u8, \ OPT_FS, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(nostart, u8, \ 0, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Don\'t start filesystem, only open devices") \ x(reconstruct_alloc, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_STR(bch2_version_upgrade_opts), \ + BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ "allowing any new features to be used") \ x(buckets_nouse, u8, \ 0, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, "Allocate the buckets_nouse bitmap") \ x(project, u8, \ OPT_INODE, \ OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + BCH2_NO_SB_OPT, false, \ NULL, NULL) \ x(nocow, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ @@ -389,6 +430,18 @@ enum opt_type { NULL, "Nocow mode: Writes will be done in place when possible.\n"\ "Snapshots and reflink will still caused writes to be COW\n"\ "Implicitly disables data checksumming, compression and encryption")\ + x(nocow_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable nocow mode: enables runtime locking in\n"\ + "data move path needed if nocow will ever be in use\n")\ + x(no_data_io, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Skip submit_bio() for data reads and writes, " \ + "for performance testing purposes") \ x(fs_size, u64, \ OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ @@ -462,8 +515,8 @@ struct bch_option { u64 min, max; const char * const *choices; - int (*parse)(struct bch_fs *, const char *, u64 *); - void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + + struct bch_opt_fn fn; const char *hint; const char *help; @@ -499,18 +552,12 @@ int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); /* inode opts: */ struct bch_io_opts { -#define x(_name, _bits) unsigned _name##_defined:1; - BCH_INODE_OPTS() -#undef x - #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x }; struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); -void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); 
#endif /* _BCACHEFS_OPTS_H */ diff --git a/linux/printbuf.c b/libbcachefs/printbuf.c similarity index 72% rename from linux/printbuf.c rename to libbcachefs/printbuf.c index 5cf79d4..c41daa1 100644 --- a/linux/printbuf.c +++ b/libbcachefs/printbuf.c @@ -4,16 +4,17 @@ #include #include #include -#include #include #include +#include "printbuf.h" + static inline unsigned printbuf_linelen(struct printbuf *buf) { return buf->pos - buf->last_newline; } -int printbuf_make_room(struct printbuf *out, unsigned extra) +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) { unsigned new_size; char *buf; @@ -44,13 +45,46 @@ int printbuf_make_room(struct printbuf *out, unsigned extra) out->size = new_size; return 0; } -EXPORT_SYMBOL(printbuf_make_room); + +void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) +{ + int len; + + do { + va_list args2; + + va_copy(args2, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) +{ + va_list args; + int len; + + do { + va_start(args, fmt); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + va_end(args); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} /** * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null * terminated */ -const char *printbuf_str(const struct printbuf *buf) +const char *bch2_printbuf_str(const struct printbuf *buf) { /* * If we've written to a printbuf then it's guaranteed to be a null @@ -61,33 +95,29 @@ const char *printbuf_str(const struct printbuf *buf) ? buf->buf : ""; } -EXPORT_SYMBOL(printbuf_str); /** * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it * against accidental use. */ -void printbuf_exit(struct printbuf *buf) +void bch2_printbuf_exit(struct printbuf *buf) { if (buf->heap_allocated) { kfree(buf->buf); buf->buf = ERR_PTR(-EINTR); /* poison value */ } } -EXPORT_SYMBOL(printbuf_exit); -void printbuf_tabstops_reset(struct printbuf *buf) +void bch2_printbuf_tabstops_reset(struct printbuf *buf) { buf->nr_tabstops = 0; } -EXPORT_SYMBOL(printbuf_tabstops_reset); -void printbuf_tabstop_pop(struct printbuf *buf) +void bch2_printbuf_tabstop_pop(struct printbuf *buf) { if (buf->nr_tabstops) --buf->nr_tabstops; } -EXPORT_SYMBOL(printbuf_tabstop_pop); /* * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop @@ -99,7 +129,7 @@ EXPORT_SYMBOL(printbuf_tabstop_pop); * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start * of line. */ -int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) +int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) { unsigned prev_tabstop = buf->nr_tabstops ? 
buf->_tabstops[buf->nr_tabstops - 1] @@ -112,7 +142,6 @@ int printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) buf->has_indent_or_tabstops = true; return 0; } -EXPORT_SYMBOL(printbuf_tabstop_push); /** * printbuf_indent_add - add to the current indent level @@ -123,7 +152,7 @@ EXPORT_SYMBOL(printbuf_tabstop_push); * Subsequent lines, and the current line if the output position is at the start * of the current line, will be indented by @spaces more spaces. */ -void printbuf_indent_add(struct printbuf *buf, unsigned spaces) +void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) { if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) spaces = 0; @@ -133,7 +162,6 @@ void printbuf_indent_add(struct printbuf *buf, unsigned spaces) buf->has_indent_or_tabstops = true; } -EXPORT_SYMBOL(printbuf_indent_add); /** * printbuf_indent_sub - subtract from the current indent level @@ -144,7 +172,7 @@ EXPORT_SYMBOL(printbuf_indent_add); * Subsequent lines, and the current line if the output position is at the start * of the current line, will be indented by @spaces less spaces. */ -void printbuf_indent_sub(struct printbuf *buf, unsigned spaces) +void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) { if (WARN_ON_ONCE(spaces > buf->indent)) spaces = buf->indent; @@ -158,13 +186,12 @@ void printbuf_indent_sub(struct printbuf *buf, unsigned spaces) if (!buf->indent && !buf->nr_tabstops) buf->has_indent_or_tabstops = false; } -EXPORT_SYMBOL(printbuf_indent_sub); -void prt_newline(struct printbuf *buf) +void bch2_prt_newline(struct printbuf *buf) { unsigned i; - printbuf_make_room(buf, 1 + buf->indent); + bch2_printbuf_make_room(buf, 1 + buf->indent); __prt_char(buf, '\n'); @@ -178,7 +205,6 @@ void prt_newline(struct printbuf *buf) buf->last_field = buf->pos; buf->cur_tabstop = 0; } -EXPORT_SYMBOL(prt_newline); /* * Returns spaces from start of line, if set, or 0 if unset: @@ -207,14 +233,13 @@ static void __prt_tab(struct printbuf *out) * * Advance output to the next tabstop by printing spaces. */ -void prt_tab(struct printbuf *out) +void bch2_prt_tab(struct printbuf *out) { if (WARN_ON(!cur_tabstop(out))) return; __prt_tab(out); } -EXPORT_SYMBOL(prt_tab); static void __prt_tab_rjust(struct printbuf *buf) { @@ -222,7 +247,7 @@ static void __prt_tab_rjust(struct printbuf *buf) int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); if (pad > 0) { - printbuf_make_room(buf, pad); + bch2_printbuf_make_room(buf, pad); if (buf->last_field + pad < buf->size) memmove(buf->buf + buf->last_field + pad, @@ -250,14 +275,13 @@ static void __prt_tab_rjust(struct printbuf *buf) * Advance output to the next tabstop by inserting spaces immediately after the * previous tabstop, right justifying previously outputted text. 
*/ -void prt_tab_rjust(struct printbuf *buf) +void bch2_prt_tab_rjust(struct printbuf *buf) { if (WARN_ON(!cur_tabstop(buf))) return; __prt_tab_rjust(buf); } -EXPORT_SYMBOL(prt_tab_rjust); /** * prt_bytes_indented - Print an array of chars, handling embedded control characters @@ -271,7 +295,7 @@ EXPORT_SYMBOL(prt_tab_rjust); * \t: prt_tab advance to next tabstop * \r: prt_tab_rjust advance to next tabstop, with right justification */ -void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) +void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) { const char *unprinted_start = str; const char *end = str + count; @@ -286,7 +310,7 @@ void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) case '\n': prt_bytes(out, unprinted_start, str - unprinted_start); unprinted_start = str + 1; - prt_newline(out); + bch2_prt_newline(out); break; case '\t': if (likely(cur_tabstop(out))) { @@ -309,34 +333,31 @@ void prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) prt_bytes(out, unprinted_start, str - unprinted_start); } -EXPORT_SYMBOL(prt_bytes_indented); /** * prt_human_readable_u64 - Print out a u64 in human readable units * * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units */ -void prt_human_readable_u64(struct printbuf *buf, u64 v) +void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v) { - printbuf_make_room(buf, 10); + bch2_printbuf_make_room(buf, 10); buf->pos += string_get_size(v, 1, !buf->si_units, buf->buf + buf->pos, printbuf_remaining_size(buf)); } -EXPORT_SYMBOL(prt_human_readable_u64); /** * prt_human_readable_s64 - Print out a s64 in human readable units * * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units */ -void prt_human_readable_s64(struct printbuf *buf, s64 v) +void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v) { if (v < 0) prt_char(buf, '-'); - prt_human_readable_u64(buf, abs(v)); + bch2_prt_human_readable_u64(buf, abs(v)); } -EXPORT_SYMBOL(prt_human_readable_s64); /** * prt_units_u64 - Print out a u64 according to printbuf unit options @@ -344,14 +365,13 @@ EXPORT_SYMBOL(prt_human_readable_s64); * Units are either raw (default), or human reabable units (controlled via * @buf->human_readable_units) */ -void prt_units_u64(struct printbuf *out, u64 v) +void bch2_prt_units_u64(struct printbuf *out, u64 v) { if (out->human_readable_units) - prt_human_readable_u64(out, v); + bch2_prt_human_readable_u64(out, v); else - prt_printf(out, "%llu", v); + bch2_prt_printf(out, "%llu", v); } -EXPORT_SYMBOL(prt_units_u64); /** * prt_units_s64 - Print out a s64 according to printbuf unit options @@ -359,10 +379,37 @@ EXPORT_SYMBOL(prt_units_u64); * Units are either raw (default), or human reabable units (controlled via * @buf->human_readable_units) */ -void prt_units_s64(struct printbuf *out, s64 v) +void bch2_prt_units_s64(struct printbuf *out, s64 v) { if (v < 0) prt_char(out, '-'); - prt_units_u64(out, abs(v)); + bch2_prt_units_u64(out, abs(v)); +} + +void bch2_prt_string_option(struct printbuf *out, + const char * const list[], + size_t selected) +{ + size_t i; + + for (i = 0; list[i]; i++) + bch2_prt_printf(out, i == selected ? 
"[%s] " : "%s ", list[i]); +} + +void bch2_prt_bitflags(struct printbuf *out, + const char * const list[], u64 flags) +{ + unsigned bit, nr = 0; + bool first = true; + + while (list[nr]) + nr++; + + while (flags && (bit = __ffs(flags)) < nr) { + if (!first) + bch2_prt_printf(out, ","); + first = false; + bch2_prt_printf(out, "%s", list[bit]); + flags ^= 1 << bit; + } } -EXPORT_SYMBOL(prt_units_s64); diff --git a/include/linux/printbuf.h b/libbcachefs/printbuf.h similarity index 76% rename from include/linux/printbuf.h rename to libbcachefs/printbuf.h index 24e62e5..2191423 100644 --- a/include/linux/printbuf.h +++ b/libbcachefs/printbuf.h @@ -1,8 +1,8 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ /* Copyright (C) 2022 Kent Overstreet */ -#ifndef _LINUX_PRINTBUF_H -#define _LINUX_PRINTBUF_H +#ifndef _BCACHEFS_PRINTBUF_H +#define _BCACHEFS_PRINTBUF_H /* * Printbufs: Simple strings for printing to, with optional heap allocation @@ -71,7 +71,7 @@ enum printbuf_si { PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ }; -#define PRINTBUF_INLINE_TABSTOPS 4 +#define PRINTBUF_INLINE_TABSTOPS 6 struct printbuf { char *buf; @@ -100,26 +100,30 @@ struct printbuf { u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; }; -int printbuf_make_room(struct printbuf *, unsigned); -const char *printbuf_str(const struct printbuf *); -void printbuf_exit(struct printbuf *); +int bch2_printbuf_make_room(struct printbuf *, unsigned); +__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); +__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); +const char *bch2_printbuf_str(const struct printbuf *); +void bch2_printbuf_exit(struct printbuf *); -void printbuf_tabstops_reset(struct printbuf *); -void printbuf_tabstop_pop(struct printbuf *); -int printbuf_tabstop_push(struct printbuf *, unsigned); +void bch2_printbuf_tabstops_reset(struct printbuf *); +void bch2_printbuf_tabstop_pop(struct printbuf *); +int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); -void printbuf_indent_add(struct printbuf *, unsigned); -void printbuf_indent_sub(struct printbuf *, unsigned); +void bch2_printbuf_indent_add(struct printbuf *, unsigned); +void bch2_printbuf_indent_sub(struct printbuf *, unsigned); -void prt_newline(struct printbuf *); -void prt_tab(struct printbuf *); -void prt_tab_rjust(struct printbuf *); +void bch2_prt_newline(struct printbuf *); +void bch2_prt_tab(struct printbuf *); +void bch2_prt_tab_rjust(struct printbuf *); -void prt_bytes_indented(struct printbuf *, const char *, unsigned); -void prt_human_readable_u64(struct printbuf *, u64); -void prt_human_readable_s64(struct printbuf *, s64); -void prt_units_u64(struct printbuf *, u64); -void prt_units_s64(struct printbuf *, s64); +void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); +void bch2_prt_human_readable_u64(struct printbuf *, u64); +void bch2_prt_human_readable_s64(struct printbuf *, s64); +void bch2_prt_units_u64(struct printbuf *, u64); +void bch2_prt_units_s64(struct printbuf *, s64); +void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); +void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); /* Initializer for a heap allocated printbuf: */ #define PRINTBUF ((struct printbuf) { .heap_allocated = true }) @@ -163,7 +167,7 @@ static inline bool printbuf_overflowed(struct printbuf *out) static inline void printbuf_nul_terminate(struct printbuf *out) { - printbuf_make_room(out, 1); + bch2_printbuf_make_room(out, 1); if (out->pos < out->size) 
out->buf[out->pos] = 0; @@ -171,7 +175,7 @@ static inline void printbuf_nul_terminate(struct printbuf *out) out->buf[out->size - 1] = 0; } -/* Doesn't call printbuf_make_room(), doesn't nul terminate: */ +/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ static inline void __prt_char_reserved(struct printbuf *out, char c) { if (printbuf_remaining(out)) @@ -182,7 +186,7 @@ static inline void __prt_char_reserved(struct printbuf *out, char c) /* Doesn't nul terminate: */ static inline void __prt_char(struct printbuf *out, char c) { - printbuf_make_room(out, 1); + bch2_printbuf_make_room(out, 1); __prt_char_reserved(out, c); } @@ -203,7 +207,7 @@ static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n static inline void prt_chars(struct printbuf *out, char c, unsigned n) { - printbuf_make_room(out, n); + bch2_printbuf_make_room(out, n); __prt_chars_reserved(out, c, n); printbuf_nul_terminate(out); } @@ -212,7 +216,7 @@ static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) { unsigned i, can_print; - printbuf_make_room(out, n); + bch2_printbuf_make_room(out, n); can_print = min(n, printbuf_remaining(out)); @@ -230,12 +234,12 @@ static inline void prt_str(struct printbuf *out, const char *str) static inline void prt_str_indented(struct printbuf *out, const char *str) { - prt_bytes_indented(out, str, strlen(str)); + bch2_prt_bytes_indented(out, str, strlen(str)); } static inline void prt_hex_byte(struct printbuf *out, u8 byte) { - printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 2); __prt_char_reserved(out, hex_asc_hi(byte)); __prt_char_reserved(out, hex_asc_lo(byte)); printbuf_nul_terminate(out); @@ -243,7 +247,7 @@ static inline void prt_hex_byte(struct printbuf *out, u8 byte) static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) { - printbuf_make_room(out, 2); + bch2_printbuf_make_room(out, 2); __prt_char_reserved(out, hex_asc_upper_hi(byte)); __prt_char_reserved(out, hex_asc_upper_lo(byte)); printbuf_nul_terminate(out); @@ -277,30 +281,4 @@ static inline void printbuf_atomic_dec(struct printbuf *buf) buf->atomic--; } -/* - * This is used for the %pf(%p) sprintf format extension, where we pass a pretty - * printer and arguments to the pretty-printer to sprintf - * - * Instead of passing a pretty-printer function to sprintf directly, we pass it - * a pointer to a struct call_pp, so that sprintf can check that the magic - * number is present, which in turn ensures that the CALL_PP() macro has been - * used in order to typecheck the arguments to the pretty printer function - * - * Example usage: - * sprintf("%pf(%p)", CALL_PP(prt_bdev, bdev)); - */ -struct call_pp { - unsigned long magic; - void *fn; -}; - -#define PP_TYPECHECK(fn, ...) \ - ({ while (0) fn((struct printbuf *) NULL, ##__VA_ARGS__); }) - -#define CALL_PP_MAGIC (unsigned long) 0xce0b92d22f6b6be4 - -#define CALL_PP(fn, ...) 
\ - (PP_TYPECHECK(fn, ##__VA_ARGS__), \ - &((struct call_pp) { CALL_PP_MAGIC, fn })), ##__VA_ARGS__ - -#endif /* _LINUX_PRINTBUF_H */ +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c index 7f74c02..4f0654f 100644 --- a/libbcachefs/quota.c +++ b/libbcachefs/quota.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "btree_update.h" #include "errcode.h" +#include "error.h" #include "inode.h" #include "quota.h" #include "subvolume.h" @@ -26,7 +27,7 @@ static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, if (vstruct_bytes(&q->field) < sizeof(*q)) { prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&q->field), sizeof(*q)); - return -EINVAL; + return -BCH_ERR_invalid_sb_quota; } return 0; @@ -59,18 +60,13 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { if (k.k->p.inode >= QTYP_NR) { prt_printf(err, "invalid quota type (%llu >= %u)", k.k->p.inode, QTYP_NR); - return -EINVAL; - } - - if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_quota)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -485,13 +481,13 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, } if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) - mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer); + mq->c[Q_SPC].timer = qdq->d_spc_timer; if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) - mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns); + mq->c[Q_SPC].warns = qdq->d_spc_warns; if (qdq && qdq->d_fieldmask & QC_INO_TIMER) - mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer); + mq->c[Q_INO].timer = qdq->d_ino_timer; if (qdq && qdq->d_fieldmask & QC_INO_WARNS) - mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns); + mq->c[Q_INO].warns = qdq->d_ino_warns; mutex_unlock(&q->lock); } @@ -562,23 +558,32 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_inode_unpacked u; - struct bch_subvolume subvolume; + struct bch_snapshot_tree s_t; int ret; - ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); + ret = bch2_snapshot_tree_lookup(trans, + bch2_snapshot_tree(c, k.k->p.snapshot), &s_t); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, + snapshot_t(c, k.k->p.snapshot)->tree); if (ret) return ret; - /* - * We don't do quota accounting in snapshots: - */ - if (BCH_SUBVOLUME_SNAP(&subvolume)) + if (!s_t.master_subvol) goto advance; - if (!bkey_is_inode(k.k)) + ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { + le32_to_cpu(s_t.master_subvol), + k.k->p.offset, + }, &u); + /* + * Inode might be deleted in this snapshot - the easiest way to handle + * that is to just skip it here: + */ + if (bch2_err_matches(ret, ENOENT)) goto advance; - ret = bch2_inode_unpack(k, &u); if (ret) return ret; @@ -587,7 +592,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, KEY_TYPE_QUOTA_NOCHECK); advance: - bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); return 0; } @@ -617,10 +622,11 @@ int bch2_fs_quota_read(struct bch_fs *c) for_each_btree_key2(&trans, iter, BTREE_ID_inodes, POS_MIN, 
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, bch2_fs_quota_read_inode(&trans, &iter, k)); - if (ret) - bch_err(c, "err in quota_read: %s", bch2_err_str(ret)); bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); return ret; } @@ -709,7 +715,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_USR, 0), - POS(QTYP_USR + 1, 0), + POS(QTYP_USR, U64_MAX), 0, NULL); if (ret) return ret; @@ -721,7 +727,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_GRP, 0), - POS(QTYP_GRP + 1, 0), + POS(QTYP_GRP, U64_MAX), 0, NULL); if (ret) return ret; @@ -733,7 +739,7 @@ static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ret = bch2_btree_delete_range(c, BTREE_ID_quotas, POS(QTYP_PRJ, 0), - POS(QTYP_PRJ + 1, 0), + POS(QTYP_PRJ, U64_MAX), 0, NULL); if (ret) return ret; @@ -896,7 +902,7 @@ static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, ret = -ENOENT; found: mutex_unlock(&q->lock); - return ret; + return bch2_err_class(ret); } static int bch2_set_quota_trans(struct btree_trans *trans, @@ -907,10 +913,8 @@ static int bch2_set_quota_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_quotas, new_quota->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ret = bkey_err(k); if (unlikely(ret)) return ret; @@ -958,7 +962,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); - return ret; + return bch2_err_class(ret); } const struct quotactl_ops bch2_quotactl_operations = { diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h index 59bed11..2f46387 100644 --- a/libbcachefs/quota.h +++ b/libbcachefs/quota.h @@ -5,14 +5,17 @@ #include "inode.h" #include "quota_types.h" +enum bkey_invalid_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ .key_invalid = bch2_quota_invalid, \ .val_to_text = bch2_quota_to_text, \ + .min_val_size = 32, \ }) static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index 4df981b..c3d5772 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -5,6 +5,7 @@ #include "btree_iter.h" #include "buckets.h" #include "clock.h" +#include "compress.h" #include "disk_groups.h" #include "errcode.h" #include "extents.h" @@ -12,11 +13,11 @@ #include "move.h" #include "rebalance.h" #include "super-io.h" +#include "trace.h" #include #include #include -#include /* * Check if an extent should be moved: @@ -45,7 +46,7 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached && p.crc.compression_type != - bch2_compression_opt_to_type[io_opts->background_compression]) + bch2_compression_opt_to_type(io_opts->background_compression)) data_opts->rewrite_ptrs |= 1U << i; i++; } @@ -57,7 
+58,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, i = 0; bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && - !bch2_dev_in_target(c, ptr->dev, io_opts->background_target)) + !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && + bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target)) data_opts->rewrite_ptrs |= 1U << i; i++; } diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index fdcd70e..55a233c 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -129,12 +129,11 @@ search: if (!*idx) *idx = __bch2_journal_key_search(keys, btree_id, level, pos); - while (*idx < keys->nr && - (k = idx_to_key(keys, *idx), - k->btree_id == btree_id && - k->level == level && - bpos_cmp(k->k->k.p, end_pos) <= 0)) { - if (bpos_cmp(k->k->k.p, pos) >= 0 && + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + return NULL; + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && !k->overwritten) return k->k; @@ -229,7 +228,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (!new_keys.d) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_key_insert; } /* Since @keys was full, there was no gap: */ @@ -267,7 +266,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); if (!n) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_key_insert; bkey_copy(n, k); ret = bch2_journal_key_insert_take(c, id, level, n); @@ -296,7 +295,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, if (idx < keys->size && keys->d[idx].btree_id == btree && keys->d[idx].level == level && - !bpos_cmp(keys->d[idx].k->k.p, pos)) + bpos_eq(keys->d[idx].k->k.p, pos)) keys->d[idx].overwritten = true; } @@ -309,7 +308,7 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) } } -struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { struct journal_key *k = iter->keys->d + iter->idx; @@ -355,7 +354,7 @@ static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) { - if (!bpos_cmp(iter->pos, SPOS_MAX)) + if (bpos_eq(iter->pos, SPOS_MAX)) iter->at_end = true; else iter->pos = bpos_successor(iter->pos); @@ -369,19 +368,19 @@ again: return bkey_s_c_null; while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && - bpos_cmp(btree_k.k->p, iter->pos) < 0) + bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && - bpos_cmp(journal_k.k->p, iter->pos) < 0) + bpos_lt(journal_k.k->p, iter->pos)) bch2_journal_iter_advance(&iter->journal); ret = journal_k.k && - (!btree_k.k || bpos_cmp(journal_k.k->p, btree_k.k->p) <= 0) + (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) ? 
journal_k : btree_k; - if (ret.k && iter->b && bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) + if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) ret = bkey_s_c_null; if (ret.k) { @@ -477,15 +476,34 @@ void bch2_journal_keys_free(struct journal_keys *keys) keys->nr = keys->gap = keys->size = 0; } +static void __journal_keys_sort(struct journal_keys *keys) +{ + struct journal_key *src, *dst; + + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + bpos_eq(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; + } + + keys->nr = dst - keys->d; +} + static int journal_keys_sort(struct bch_fs *c) { struct genradix_iter iter; struct journal_replay *i, **_i; struct jset_entry *entry; - struct bkey_i *k, *_n; + struct bkey_i *k; struct journal_keys *keys = &c->journal_keys; - struct journal_key *src, *dst; - size_t nr_keys = 0; + size_t nr_keys = 0, nr_read = 0; genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -493,7 +511,7 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) continue; - for_each_jset_key(k, _n, entry, &i->j) + for_each_jset_key(k, entry, &i->j) nr_keys++; } @@ -503,8 +521,21 @@ static int journal_keys_sort(struct bch_fs *c) keys->size = roundup_pow_of_two(nr_keys); keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) - return -ENOMEM; + if (!keys->d) { + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", + nr_keys); + + do { + keys->size >>= 1; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + } while (!keys->d && keys->size > nr_keys / 8); + + if (!keys->d) { + bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", + keys->size); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; @@ -512,7 +543,19 @@ static int journal_keys_sort(struct bch_fs *c) if (!i || i->ignore) continue; - for_each_jset_key(k, _n, entry, &i->j) + cond_resched(); + + for_each_jset_key(k, entry, &i->j) { + if (keys->nr == keys->size) { + __journal_keys_sort(keys); + + if (keys->nr > keys->size * 7 / 8) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", + keys->nr, keys->size, nr_read, nr_keys); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + keys->d[keys->nr++] = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, @@ -520,23 +563,15 @@ static int journal_keys_sort(struct bch_fs *c) .journal_seq = le64_to_cpu(i->j.seq), .journal_offset = k->_data - i->j._data, }; - } - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); - - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - src[0].btree_id == src[1].btree_id && - src[0].level == src[1].level && - !bpos_cmp(src[0].k->k.p, src[1].k->k.p)) - src++; - - *dst++ = *src++; + nr_read++; + } } - keys->nr = dst - keys->d; + __journal_keys_sort(keys); keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); return 0; } @@ -559,10 +594,21 @@ static int bch2_journal_replay_key(struct btree_trans *trans, unsigned iter_flags = BTREE_ITER_INTENT| BTREE_ITER_NOT_EXTENTS; + unsigned update_flags = BTREE_TRIGGER_NORUN; int ret; + 
/* + * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * keep the key cache coherent with the underlying btree. Nothing + * besides the allocator is doing updates yet so we don't need key cache + * coherency for non-alloc btrees, and key cache fills for snapshots + * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * the snapshots recovery pass runs. + */ if (!k->level && k->btree_id == BTREE_ID_alloc) iter_flags |= BTREE_ITER_CACHED; + else + update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, @@ -575,7 +621,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (k->overwritten) goto out; - ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); + ret = bch2_trans_update(trans, &iter, k->k, update_flags); out: bch2_trans_iter_exit(trans, &iter); return ret; @@ -594,6 +640,8 @@ static int bch2_journal_replay(struct bch_fs *c) struct journal_keys *keys = &c->journal_keys; struct journal_key **keys_sorted, *k; struct journal *j = &c->journal; + u64 start_seq = c->journal_replay_seq_start; + u64 end_seq = c->journal_replay_seq_start; size_t i; int ret; @@ -602,7 +650,7 @@ static int bch2_journal_replay(struct bch_fs *c) keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL); if (!keys_sorted) - return -ENOMEM; + return -BCH_ERR_ENOMEM_journal_replay; for (i = 0; i < keys->nr; i++) keys_sorted[i] = &keys->d[i]; @@ -611,6 +659,13 @@ static int bch2_journal_replay(struct bch_fs *c) sizeof(keys_sorted[0]), journal_sort_seq_cmp, NULL); + if (keys->nr) { + ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", + keys->nr, start_seq, end_seq); + if (ret) + goto err; + } + for (i = 0; i < keys->nr; i++) { k = keys_sorted[i]; @@ -622,12 +677,12 @@ static int bch2_journal_replay(struct bch_fs *c) BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL| (!k->allocated - ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved + ? 
BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim : 0), bch2_journal_replay_key(&trans, k)); if (ret) { - bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", - ret, bch2_btree_ids[k->btree_id], k->level); + bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", + bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); goto err; } } @@ -640,9 +695,12 @@ static int bch2_journal_replay(struct bch_fs *c) ret = bch2_journal_error(j); if (keys->nr && !ret) - bch2_journal_log_msg(&c->journal, "journal replay finished"); + bch2_journal_log_msg(c, "journal replay finished"); err: kvfree(keys_sorted); + + if (ret) + bch_err_fn(c, ret); return ret; } @@ -657,13 +715,13 @@ static int journal_replay_entry_early(struct bch_fs *c, case BCH_JSET_ENTRY_btree_root: { struct btree_root *r; - if (entry->btree_id >= BTREE_ID_NR) { - bch_err(c, "filesystem has unknown btree type %u", - entry->btree_id); - return -EINVAL; + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { + ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); + if (ret) + return ret; } - r = &c->btree_roots[entry->btree_id]; + r = bch2_btree_id_root(c, entry->btree_id); if (entry->u64s) { r->level = entry->level; @@ -865,7 +923,7 @@ static int verify_superblock_clean(struct bch_fs *c, IS_ERR(k1) || IS_ERR(k2) || k1->k.u64s != k2->k.u64s || - memcmp(k1, k2, bkey_bytes(k1)) || + memcmp(k1, k2, bkey_bytes(&k1->k)) || l1 != l2, c, "superblock btree root %u doesn't match journal after clean shutdown\n" "sb: l=%u %s\n" @@ -899,7 +957,7 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) GFP_KERNEL); if (!clean) { mutex_unlock(&c->sb_lock); - return ERR_PTR(-ENOMEM); + return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); } ret = bch2_sb_clean_validate_late(c, clean, READ); @@ -923,6 +981,7 @@ static bool btree_id_is_alloc(enum btree_id id) case BTREE_ID_backpointers: case BTREE_ID_need_discard: case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: return true; default: return false; @@ -934,8 +993,8 @@ static int read_btree_roots(struct bch_fs *c) unsigned i; int ret = 0; - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); if (!r->alive) continue; @@ -962,70 +1021,80 @@ static int read_btree_roots(struct bch_fs *c) ? 
FSCK_CAN_IGNORE : 0, "error reading btree root %s", bch2_btree_ids[i]); - if (i == BTREE_ID_alloc) + if (btree_id_is_alloc(i)) c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); } } - for (i = 0; i < BTREE_ID_NR; i++) - if (!c->btree_roots[i].b) + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->b) { + r->alive = false; + r->level = 0; bch2_btree_root_alloc(c, i); + } + } fsck_err: return ret; } -static int bch2_fs_initialize_subvolumes(struct bch_fs *c) +static int bch2_initialize_subvolumes(struct bch_fs *c) { - struct bkey_i_snapshot root_snapshot; - struct bkey_i_subvolume root_volume; + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; int ret; + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + bkey_snapshot_init(&root_snapshot.k_i); root_snapshot.k.p.offset = U32_MAX; root_snapshot.v.flags = 0; root_snapshot.v.parent = 0; - root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; - root_snapshot.v.pad = 0; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); - ret = bch2_btree_insert(c, BTREE_ID_snapshots, - &root_snapshot.k_i, - NULL, NULL, 0); - if (ret) - return ret; - bkey_subvolume_init(&root_volume.k_i); root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; root_volume.v.flags = 0; root_volume.v.snapshot = cpu_to_le32(U32_MAX); root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); - ret = bch2_btree_insert(c, BTREE_ID_subvolumes, - &root_volume.k_i, - NULL, NULL, 0); + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, + &root_tree.k_i, + NULL, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, + &root_snapshot.k_i, + NULL, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, + &root_volume.k_i, + NULL, NULL, 0); if (ret) - return ret; - - return 0; + bch_err_fn(c, ret); + return ret; } -static int bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) { struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked inode; int ret; - bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, - SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); ret = bkey_err(k); if (ret) - goto err; + return ret; if (!bkey_is_inode(k.k)) { bch_err(trans->c, "root inode not found"); - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -1040,12 +1109,177 @@ err: return ret; } +/* set bi_subvol on root inode */ +noinline_for_stack +static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + __bch2_fs_upgrade_for_subvolumes(&trans)); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static void check_version_upgrade(struct bch_fs *c) +{ + unsigned latest_compatible = bch2_version_compatible(c->sb.version); + unsigned latest_version = bcachefs_metadata_version_current; + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = 0; + u64 recovery_passes; + + if (old_version < bcachefs_metadata_required_upgrade_below) { + if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || + latest_compatible < bcachefs_metadata_required_upgrade_below) + new_version 
= latest_version; + else + new_version = latest_compatible; + } else { + switch (c->opts.version_upgrade) { + case BCH_VERSION_UPGRADE_compatible: + new_version = latest_compatible; + break; + case BCH_VERSION_UPGRADE_incompatible: + new_version = latest_version; + break; + case BCH_VERSION_UPGRADE_none: + new_version = old_version; + break; + } + } + + if (new_version > old_version) { + struct printbuf buf = PRINTBUF; + + if (old_version < bcachefs_metadata_required_upgrade_below) + prt_str(&buf, "Version upgrade required:\n"); + + if (old_version != c->sb.version) { + prt_str(&buf, "Version upgrade from "); + bch2_version_to_text(&buf, c->sb.version_upgrade_complete); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, " incomplete\n"); + } + + prt_printf(&buf, "Doing %s version upgrade from ", + BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) + ? "incompatible" : "compatible"); + bch2_version_to_text(&buf, old_version); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, new_version); + prt_newline(&buf); + + recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); + if (recovery_passes) { + prt_str(&buf, "fsck required"); + + c->recovery_passes_explicit |= recovery_passes; + c->opts.fix_errors = FSCK_FIX_yes; + } + + bch_info(c, "%s", buf.buf); + + mutex_lock(&c->sb_lock); + bch2_sb_upgrade(c, new_version); + mutex_unlock(&c->sb_lock); + + printbuf_exit(&buf); + } +} + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + const char *name; + unsigned when; +}; + +static struct recovery_pass_fn recovery_passes[] = { +#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) + if (recovery_passes[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return true; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + return true; + if (p->when & PASS_ALWAYS) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + int ret; + + c->curr_recovery_pass = pass; + + if (should_run_recovery_pass(c, pass)) { + struct recovery_pass_fn *p = recovery_passes + pass; + + if (!(p->when & PASS_SILENT)) + printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + printk(KERN_CONT " done\n"); + } + + return 0; +} + +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; + + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) + continue; + if (ret) + break; + c->curr_recovery_pass++; + } + + return ret; +} + int bch2_fs_recovery(struct bch_fs *c) { - const char *err = "cannot allocate memory"; struct bch_sb_field_clean 
*clean = NULL; struct jset *last_journal_entry = NULL; - u64 blacklist_seq, journal_seq; + u64 last_seq, blacklist_seq, journal_seq; bool write_sb = false; int ret = 0; @@ -1080,23 +1314,8 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) { - bch_info(c, "alloc_v2 feature bit not set, fsck required"); - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } - - if (!c->opts.nochanges) { - if (c->sb.version < bcachefs_metadata_version_backpointers) { - bch_info(c, "version prior to backpointers, upgrade and fsck required"); - c->opts.version_upgrade = true; - c->opts.fsck = true; - c->opts.fix_errors = FSCK_OPT_YES; - } else if (c->sb.version < bcachefs_metadata_version_inode_v3) { - bch_info(c, "version prior to inode_v3, upgrade required"); - c->opts.version_upgrade = true; - } - } + if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) + check_version_upgrade(c); if (c->opts.fsck && c->opts.norecovery) { bch_err(c, "cannot select both norecovery and fsck"); @@ -1115,10 +1334,17 @@ int bch2_fs_recovery(struct bch_fs *c) struct journal_replay **i; bch_verbose(c, "starting journal read"); - ret = bch2_journal_read(c, &blacklist_seq, &journal_seq); + ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); if (ret) goto err; + /* + * note: cmd_list_journal needs the blacklist table fully up to date so + * it can asterisk ignored journal entries: + */ + if (c->opts.read_journal_only) + goto out; + genradix_for_each_reverse(&c->journal_entries, iter, i) if (*i && !(*i)->ignore) { last_journal_entry = &(*i)->j; @@ -1136,7 +1362,15 @@ int bch2_fs_recovery(struct bch_fs *c) if (!last_journal_entry) { fsck_err_on(!c->sb.clean, c, "no journal entries found"); - goto use_clean; + if (clean) + goto use_clean; + + genradix_for_each_reverse(&c->journal_entries, iter, i) + if (*i) { + last_journal_entry = &(*i)->j; + (*i)->ignore = false; + break; + } } ret = journal_keys_sort(c); @@ -1160,6 +1394,9 @@ use_clean: blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; } + c->journal_replay_seq_start = last_seq; + c->journal_replay_seq_end = blacklist_seq - 1;; + if (c->opts.reconstruct_alloc) { c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); drop_alloc_keys(&c->journal_keys); @@ -1182,7 +1419,9 @@ use_clean: journal_seq += 8; if (blacklist_seq != journal_seq) { - ret = bch2_journal_seq_blacklist_add(c, + ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", + blacklist_seq, journal_seq) ?: + bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { bch_err(c, "error creating new journal seq blacklist entry"); @@ -1190,17 +1429,15 @@ use_clean: } } - /* - * note: cmd_list_journal needs the blacklist table fully up to date so - * it can asterisk ignored journal entries: - */ - if (c->opts.read_journal_only) - goto out; - - ret = bch2_fs_journal_start(&c->journal, journal_seq); + ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", + journal_seq, last_seq, blacklist_seq - 1) ?: + bch2_fs_journal_start(&c->journal, journal_seq); if (ret) goto err; + if (c->opts.reconstruct_alloc) + bch2_journal_log_msg(c, "dropping alloc info"); + /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: @@ -1212,169 +1449,36 @@ use_clean: if (ret) goto err; - bch_verbose(c, "starting alloc read"); - err = "error reading allocation information"; - - down_read(&c->gc_lock); - ret = bch2_alloc_read(c); - 
up_read(&c->gc_lock); - - if (ret) - goto err; - bch_verbose(c, "alloc read done"); + if (c->opts.fsck && + (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || + BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))) + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); - bch_verbose(c, "starting stripes_read"); - err = "error reading stripes"; - ret = bch2_stripes_read(c); + ret = bch2_run_recovery_passes(c); if (ret) goto err; - bch_verbose(c, "stripes_read done"); - - bch2_stripes_heap_start(c); - if (c->opts.fsck) { - bool metadata_only = c->opts.norecovery; + /* If we fixed errors, verify that fs is actually clean now: */ + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && + !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && + !test_bit(BCH_FS_ERROR, &c->flags)) { + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); + clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); - bch_info(c, "checking allocations"); - err = "error checking allocations"; - ret = bch2_gc(c, true, metadata_only); - if (ret) - goto err; - bch_verbose(c, "done checking allocations"); + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - - bch_info(c, "checking need_discard and freespace btrees"); - err = "error checking need_discard and freespace btrees"; - ret = bch2_check_alloc_info(c); + ret = bch2_run_recovery_passes(c); if (ret) goto err; - bch_verbose(c, "done checking need_discard and freespace btrees"); - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - err = "error creating root snapshot node"; - ret = bch2_fs_initialize_subvolumes(c); - if (ret) - goto err; + if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || + test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + bch_err(c, "Second fsck run was not clean"); + set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); } - bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; - ret = bch2_fs_snapshots_start(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - - bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); - err = "journal replay failed"; - ret = bch2_journal_replay(c); - if (ret) - goto err; - if (c->opts.verbose || !c->sb.clean) - bch_info(c, "journal replay done"); - - bch_info(c, "checking lrus"); - err = "error checking lrus"; - ret = bch2_check_lrus(c); - if (ret) - goto err; - bch_verbose(c, "done checking lrus"); - set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); - - bch_info(c, "checking backpointers to alloc keys"); - err = "error checking backpointers to alloc keys"; - ret = bch2_check_btree_backpointers(c); - if (ret) - goto err; - bch_verbose(c, "done checking backpointers to alloc keys"); - - bch_info(c, "checking backpointers to extents"); - err = "error checking backpointers to extents"; - ret = bch2_check_backpointers_to_extents(c); - if (ret) - goto err; - bch_verbose(c, "done checking backpointers to extents"); - - bch_info(c, "checking extents to backpointers"); - err = "error checking extents to backpointers"; - ret = bch2_check_extents_to_backpointers(c); - if (ret) - goto err; - bch_verbose(c, "done checking extents to backpointers"); - set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); - - bch_info(c, "checking alloc to lru refs"); - err = "error checking alloc to lru refs"; - ret = bch2_check_alloc_to_lru_refs(c); - if (ret) - goto err; - bch_verbose(c, "done checking alloc to lru refs"); - 
set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); - } else { - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); - set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); - set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); - set_bit(BCH_FS_FSCK_DONE, &c->flags); - - if (c->opts.norecovery) - goto out; - - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - err = "error creating root snapshot node"; - ret = bch2_fs_initialize_subvolumes(c); - if (ret) - goto err; - } - - bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; - ret = bch2_fs_snapshots_start(c); - if (ret) - goto err; - bch_verbose(c, "reading snapshots done"); - - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - - bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); - err = "journal replay failed"; - ret = bch2_journal_replay(c); - if (ret) - goto err; - if (c->opts.verbose || !c->sb.clean) - bch_info(c, "journal replay done"); - } - - err = "error initializing freespace"; - ret = bch2_fs_freespace_init(c); - if (ret) - goto err; - - if (c->sb.version < bcachefs_metadata_version_snapshot_2) { - /* set bi_subvol on root inode */ - err = "error upgrade root inode for subvolumes"; - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, - bch2_fs_upgrade_for_subvolumes(&trans)); - if (ret) - goto err; - } - - if (c->opts.fsck) { - bch_info(c, "starting fsck"); - err = "error in fsck"; - ret = bch2_fsck_full(c); - if (ret) - goto err; - bch_verbose(c, "fsck done"); - } else if (!c->sb.clean) { - bch_verbose(c, "checking for deleted inodes"); - err = "error in recovery"; - ret = bch2_fsck_walk_inodes_only(c); - if (ret) - goto err; - bch_verbose(c, "check inodes done"); + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); } if (enabled_qtypes(c)) { @@ -1386,9 +1490,8 @@ use_clean: } mutex_lock(&c->sb_lock); - if (c->opts.version_upgrade) { - c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != c->sb.version) { + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, c->sb.version); write_sb = true; } @@ -1411,17 +1514,14 @@ use_clean: if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || - le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { + c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { struct bch_move_stats stats; bch2_move_stats_init(&stats, "recovery"); bch_info(c, "scanning for old btree nodes"); - ret = bch2_fs_read_write(c); - if (ret) - goto err; - - ret = bch2_scan_old_btree_nodes(c, &stats); + ret = bch2_fs_read_write(c) ?: + bch2_scan_old_btree_nodes(c, &stats); if (ret) goto err; bch_info(c, "scanning for old btree nodes done"); @@ -1449,9 +1549,7 @@ out: } if (ret) - bch_err(c, "Error in recovery: %s (%s)", err, bch2_err_str(ret)); - else - bch_verbose(c, "ret %s", bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; err: fsck_err: @@ -1464,7 +1562,6 @@ int bch2_fs_initialize(struct bch_fs *c) struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; struct qstr lostfound = QSTR("lost+found"); - const char *err = "cannot allocate memory"; struct bch_dev *ca; unsigned i; int ret; @@ -1475,20 +1572,16 @@ int bch2_fs_initialize(struct bch_fs *c) c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << 
BCH_COMPAT_extents_above_btree_updates_done); c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); - if (c->sb.version < bcachefs_metadata_version_inode_v3) - c->opts.version_upgrade = true; + bch2_sb_maybe_downgrade(c); - if (c->opts.version_upgrade) { - c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { + bch2_sb_upgrade(c, bcachefs_metadata_version_current); + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); bch2_write_super(c); } mutex_unlock(&c->sb_lock); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); - set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); - set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); - set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); + c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); set_bit(BCH_FS_MAY_GO_RW, &c->flags); set_bit(BCH_FS_FSCK_DONE, &c->flags); @@ -1498,7 +1591,6 @@ int bch2_fs_initialize(struct bch_fs *c) for_each_online_member(ca, c, i) bch2_dev_usage_init(ca); - err = "unable to allocate journal buckets"; for_each_online_member(ca, c, i) { ret = bch2_dev_journal_alloc(ca); if (ret) { @@ -1514,7 +1606,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_fs_journal_start(&c->journal, 1); bch2_journal_set_replay_done(&c->journal); - err = "error going read-write"; ret = bch2_fs_read_write_early(c); if (ret) goto err; @@ -1524,7 +1615,6 @@ int bch2_fs_initialize(struct bch_fs *c) * btree updates */ bch_verbose(c, "marking superblocks"); - err = "error marking superblock and journal"; for_each_member_device(ca, c, i) { ret = bch2_trans_mark_dev_sb(c, ca); if (ret) { @@ -1535,20 +1625,16 @@ int bch2_fs_initialize(struct bch_fs *c) ca->new_fs_bucket_idx = 0; } - bch_verbose(c, "initializing freespace"); - err = "error initializing freespace"; ret = bch2_fs_freespace_init(c); if (ret) goto err; - err = "error creating root snapshot node"; - ret = bch2_fs_initialize_subvolumes(c); + ret = bch2_initialize_subvolumes(c); if (ret) goto err; bch_verbose(c, "reading snapshots table"); - err = "error reading snapshots table"; - ret = bch2_fs_snapshots_start(c); + ret = bch2_snapshots_read(c); if (ret) goto err; bch_verbose(c, "reading snapshots done"); @@ -1559,16 +1645,16 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_pack(&packed_inode, &root_inode); packed_inode.inode.k.p.snapshot = U32_MAX; - err = "error creating root directory"; ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, NULL, 0); - if (ret) + if (ret) { + bch_err_msg(c, ret, "creating root directory"); goto err; + } bch2_inode_init_early(c, &lostfound_inode); - err = "error creating lost+found"; ret = bch2_trans_do(c, NULL, NULL, 0, bch2_create_trans(&trans, BCACHEFS_ROOT_SUBVOL_INUM, @@ -1577,7 +1663,7 @@ int bch2_fs_initialize(struct bch_fs *c) 0, 0, S_IFDIR|0700, 0, NULL, NULL, (subvol_inum) { 0 }, 0)); if (ret) { - bch_err(c, "error creating lost+found"); + bch_err_msg(c, ret, "creating lost+found"); goto err; } @@ -1587,10 +1673,11 @@ int bch2_fs_initialize(struct bch_fs *c) goto err; } - err = "error writing first journal entry"; ret = bch2_journal_flush(&c->journal); - if (ret) + if (ret) { + bch_err_msg(c, ret, "writing first journal entry"); goto err; + } mutex_lock(&c->sb_lock); SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); @@ -1601,6 +1688,6 @@ int bch2_fs_initialize(struct bch_fs *c) return 0; err: - pr_err("Error initializing new 
filesystem: %s (%i)", err, ret); + bch_err_fn(ca, ret); return ret; } diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h index 8c0348e..f8e796c 100644 --- a/libbcachefs/recovery.h +++ b/libbcachefs/recovery.h @@ -52,6 +52,8 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, void bch2_journal_keys_free(struct journal_keys *); void bch2_journal_entries_free(struct bch_fs *); +u64 bch2_fsck_recovery_passes(void); + int bch2_fs_recovery(struct bch_fs *); int bch2_fs_initialize(struct bch_fs *); diff --git a/libbcachefs/recovery_types.h b/libbcachefs/recovery_types.h new file mode 100644 index 0000000..abf1f83 --- /dev/null +++ b/libbcachefs/recovery_types.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_TYPES_H +#define _BCACHEFS_RECOVERY_TYPES_H + +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) +#define PASS_ALWAYS BIT(3) + +#define BCH_RECOVERY_PASSES() \ + x(alloc_read, PASS_ALWAYS) \ + x(stripes_read, PASS_ALWAYS) \ + x(initialize_subvolumes, 0) \ + x(snapshots_read, PASS_ALWAYS) \ + x(check_topology, 0) \ + x(check_allocations, PASS_FSCK) \ + x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, PASS_ALWAYS) \ + x(check_alloc_info, PASS_FSCK) \ + x(check_lrus, PASS_FSCK) \ + x(check_btree_backpointers, PASS_FSCK) \ + x(check_backpointers_to_extents,PASS_FSCK) \ + x(check_extents_to_backpointers,PASS_FSCK) \ + x(check_alloc_to_lru_refs, PASS_FSCK) \ + x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 0) \ + x(check_snapshot_trees, PASS_FSCK) \ + x(check_snapshots, PASS_FSCK) \ + x(check_subvols, PASS_FSCK) \ + x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ + x(fs_upgrade_for_subvolumes, 0) \ + x(check_inodes, PASS_FSCK) \ + x(check_extents, PASS_FSCK) \ + x(check_dirents, PASS_FSCK) \ + x(check_xattrs, PASS_FSCK) \ + x(check_root, PASS_FSCK) \ + x(check_directory_structure, PASS_FSCK) \ + x(check_nlinks, PASS_FSCK) \ + x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ + x(fix_reflink_p, 0) \ + +enum bch_recovery_pass { +#define x(n, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +#endif /* _BCACHEFS_RECOVERY_TYPES_H */ diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c index 94f2b30..39f711d 100644 --- a/libbcachefs/reflink.c +++ b/libbcachefs/reflink.c @@ -26,16 +26,11 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - if (bkey_val_bytes(p.k) != sizeof(*p.v)) { - prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(p.k), sizeof(*p.v)); - return -EINVAL; - } - if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { prt_printf(err, "idx < front_pad (%llu < %u)", @@ -78,17 +73,10 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r /* indirect extents */ int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - if (bkey_val_bytes(r.k) < sizeof(*r.v)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(r.k), sizeof(*r.v)); - return -EINVAL; - } - - return 
bch2_bkey_ptrs_invalid(c, k, rw, err); + return bch2_bkey_ptrs_invalid(c, k, flags, err); } void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, @@ -131,14 +119,9 @@ int bch2_trans_mark_reflink_v(struct btree_trans *trans, /* indirect inline data */ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { - if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_indirect_inline_data)); - return -EINVAL; - } - return 0; } @@ -187,24 +170,13 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (orig->k.type == KEY_TYPE_inline_data) bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); - for_each_btree_key_norestart(trans, reflink_iter, BTREE_ID_reflink, - POS(0, c->reflink_hint), - BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { - if (reflink_iter.pos.inode) { - bch2_btree_iter_set_pos(&reflink_iter, POS_MIN); - continue; - } - - if (bkey_deleted(k.k) && orig->k.size <= k.k->size) - break; - } - + bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_prev(&reflink_iter); + ret = bkey_err(k); if (ret) goto err; - /* rewind iter to start of hole, if necessary: */ - bch2_btree_iter_set_pos_to_extent_start(&reflink_iter); - r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) @@ -233,14 +205,19 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, orig->k.type = KEY_TYPE_reflink_p; r_p = bkey_i_to_reflink_p(orig); set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + + /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ +#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __underlying_memset(&r_p->v, 0, sizeof(r_p->v)); +#else memset(&r_p->v, 0, sizeof(r_p->v)); +#endif r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); err: - c->reflink_hint = reflink_iter.pos.offset; bch2_trans_iter_exit(trans, &reflink_iter); return ret; @@ -251,10 +228,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_continue_norestart(*iter, 0, k, ret) { - if (bkey_cmp(iter->pos, end) >= 0) - break; - + for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { if (bkey_extent_is_unwritten(k)) continue; @@ -262,7 +236,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) return k; } - if (bkey_cmp(iter->pos, end) >= 0) + if (bkey_ge(iter->pos, end)) bch2_btree_iter_set_pos(iter, end); return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; } @@ -285,8 +259,8 @@ s64 bch2_remap_range(struct bch_fs *c, u32 dst_snapshot, src_snapshot; int ret = 0, ret2 = 0; - if (!percpu_ref_tryget_live(&c->writes)) - return -EROFS; + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) + return -BCH_ERR_erofs_no_writes; bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -304,7 +278,7 @@ s64 bch2_remap_range(struct bch_fs *c, while ((ret == 0 || bch2_err_matches(ret, BCH_ERR_transaction_restart)) && - bkey_cmp(dst_iter.pos, dst_end) < 0) { + bkey_lt(dst_iter.pos, dst_end)) { struct disk_reservation disk_res = { 0 }; bch2_trans_begin(&trans); @@ -337,7 +311,7 @@ s64 bch2_remap_range(struct bch_fs *c, if (ret) continue; - if (bkey_cmp(src_want, src_iter.pos) < 0) { + if (bkey_lt(src_want, src_iter.pos)) { ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, min(dst_end.offset, dst_iter.pos.offset + @@ -389,8 +363,8 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_iter_exit(&trans, &dst_iter); bch2_trans_iter_exit(&trans, &src_iter); - BUG_ON(!ret && bkey_cmp(dst_iter.pos, dst_end)); - BUG_ON(bkey_cmp(dst_iter.pos, dst_end) > 0); + BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); + BUG_ON(bkey_gt(dst_iter.pos, dst_end)); dst_done = dst_iter.pos.offset - dst_start.offset; new_i_size = min(dst_iter.pos.offset << 9, new_i_size); @@ -419,7 +393,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_reflink); return dst_done ?: ret ?: ret2; } diff --git a/libbcachefs/reflink.h b/libbcachefs/reflink.h index ce0012a..fe52538 100644 --- a/libbcachefs/reflink.h +++ b/libbcachefs/reflink.h @@ -2,8 +2,10 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H +enum bkey_invalid_flags; + int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -14,10 +16,11 @@ bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); .key_merge = bch2_reflink_p_merge, \ .trans_trigger = bch2_trans_mark_reflink_p, \ .atomic_trigger = bch2_mark_reflink_p, \ + .min_val_size = 16, \ }) int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, @@ -29,10 +32,11 @@ int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, .swab = bch2_ptr_swab, \ .trans_trigger = bch2_trans_mark_reflink_v, \ .atomic_trigger = bch2_mark_extent, \ + .min_val_size = 8, \ }) int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, - int, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trans_mark_indirect_inline_data(struct btree_trans *, @@ -44,6 +48,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *, .key_invalid = bch2_indirect_inline_data_invalid, \ .val_to_text = bch2_indirect_inline_data_to_text, \ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ + .min_val_size = 8, \ }) static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) diff --git 
a/libbcachefs/replicas.c b/libbcachefs/replicas.c index fcf73d7..5b591c5 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -36,8 +36,8 @@ static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); } -void bch2_replicas_entry_v0_to_text(struct printbuf *out, - struct bch_replicas_entry_v0 *e) +static void bch2_replicas_entry_v0_to_text(struct printbuf *out, + struct bch_replicas_entry_v0 *e) { unsigned i; @@ -272,7 +272,7 @@ static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, { unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; struct bch_fs_usage *dst, *src = (void *) - bch2_acc_percpu_u64s((void *) src_p, src_nr); + bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); preempt_disable(); dst = this_cpu_ptr(dst_p); @@ -336,7 +336,7 @@ out: return ret; err: bch_err(c, "error updating replicas table: memory allocation failure"); - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_replicas_table; goto out; } @@ -383,14 +383,18 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, if (c->replicas_gc.entries && !__replicas_has_entry(&c->replicas_gc, new_entry)) { new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); - if (!new_gc.entries) + if (!new_gc.entries) { + ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; + } } if (!__replicas_has_entry(&c->replicas, new_entry)) { new_r = cpu_replicas_add_entry(&c->replicas, new_entry); - if (!new_r.entries) + if (!new_r.entries) { + ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; + } ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) @@ -425,8 +429,7 @@ out: return ret; err: - bch_err(c, "error adding replicas entry: memory allocation failure"); - ret = -ENOMEM; + bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret)); goto out; } @@ -457,36 +460,14 @@ int bch2_replicas_delta_list_mark(struct bch_fs *c, int bch2_replicas_gc_end(struct bch_fs *c, int ret) { - unsigned i; - lockdep_assert_held(&c->replicas_gc_lock); + if (ret) + goto err; + mutex_lock(&c->sb_lock); percpu_down_write(&c->mark_lock); - /* - * this is kind of crappy; the replicas gc mechanism needs to be ripped - * out - */ - - for (i = 0; i < c->replicas.nr; i++) { - struct bch_replicas_entry *e = - cpu_replicas_entry(&c->replicas, i); - struct bch_replicas_cpu n; - - if (!__replicas_has_entry(&c->replicas_gc, e) && - bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) { - n = cpu_replicas_add_entry(&c->replicas_gc, e); - if (!n.entries) { - ret = -ENOMEM; - goto err; - } - - swap(n, c->replicas_gc); - kfree(n.entries); - } - } - ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); if (ret) goto err; @@ -533,7 +514,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); - return -ENOMEM; + return -BCH_ERR_ENOMEM_replicas_gc; } for_each_cpu_replicas_entry(&c->replicas, e) @@ -547,8 +528,14 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) return 0; } -/* New much simpler mechanism for clearing out unneeded replicas entries: */ - +/* + * New much simpler mechanism for clearing out unneeded replicas entries - drop + * replicas entries that have 0 sectors used. + * + * However, we don't track sector counts for journal usage, so this doesn't drop + * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism + * is retained for that. 
+ */ int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; @@ -562,7 +549,7 @@ retry: new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); if (!new.entries) { bch_err(c, "error allocating c->replicas_gc"); - return -ENOMEM; + return -BCH_ERR_ENOMEM_replicas_gc; } mutex_lock(&c->sb_lock); @@ -621,7 +608,7 @@ int bch2_replicas_set_usage(struct bch_fs *c, n = cpu_replicas_add_entry(&c->replicas, r); if (!n.entries) - return -ENOMEM; + return -BCH_ERR_ENOMEM_cpu_replicas; ret = replicas_table_update(c, &n); if (ret) @@ -655,7 +642,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) - return -ENOMEM; + return -BCH_ERR_ENOMEM_cpu_replicas; cpu_r->nr = nr; cpu_r->entry_size = entry_size; @@ -687,7 +674,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) - return -ENOMEM; + return -BCH_ERR_ENOMEM_cpu_replicas; cpu_r->nr = nr; cpu_r->entry_size = entry_size; @@ -717,9 +704,8 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); - if (ret) - return -ENOMEM; + return ret; bch2_cpu_replicas_sort(&new_r); @@ -834,27 +820,27 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, if (e->data_type >= BCH_DATA_NR) { prt_printf(err, "invalid data type in entry "); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } if (!e->nr_devs) { prt_printf(err, "no devices in entry "); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } if (e->nr_required > 1 && e->nr_required >= e->nr_devs) { prt_printf(err, "bad nr_required in entry "); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } for (j = 0; j < e->nr_devs; j++) if (!bch2_dev_exists(sb, mi, e->devs[j])) { prt_printf(err, "invalid device %u in entry ", e->devs[j]); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } if (i + 1 < cpu_r->nr) { @@ -866,7 +852,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, if (!memcmp(e, n, cpu_r->entry_size)) { prt_printf(err, "duplicate replicas entry "); bch2_replicas_entry_to_text(err, e); - return -EINVAL; + return -BCH_ERR_invalid_sb_replicas; } } } @@ -881,8 +867,9 @@ static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, struct bch_replicas_cpu cpu_r; int ret; - if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) - return -ENOMEM; + ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); + if (ret) + return ret; ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); @@ -919,8 +906,9 @@ static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field * struct bch_replicas_cpu cpu_r; int ret; - if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) - return -ENOMEM; + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); + if (ret) + return ret; ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index cc34b38..4887675 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -27,22 +27,6 @@ bool bch2_replicas_marked(struct bch_fs *, struct 
bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); -struct replicas_delta { - s64 delta; - struct bch_replicas_entry r; -} __packed; - -struct replicas_delta_list { - unsigned size; - unsigned used; - - struct {} memset_start; - u64 nr_inodes; - u64 persistent_reserved[BCH_REPLICAS_MAX]; - struct {} memset_end; - struct replicas_delta d[0]; -}; - static inline struct replicas_delta * replicas_delta_next(struct replicas_delta *d) { diff --git a/libbcachefs/replicas_types.h b/libbcachefs/replicas_types.h index f12a35b..5cfff48 100644 --- a/libbcachefs/replicas_types.h +++ b/libbcachefs/replicas_types.h @@ -8,4 +8,20 @@ struct bch_replicas_cpu { struct bch_replicas_entry *entries; }; +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; + struct replicas_delta d[0]; +}; + #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/libbcachefs/seqmutex.h b/libbcachefs/seqmutex.h new file mode 100644 index 0000000..c1860d8 --- /dev/null +++ b/libbcachefs/seqmutex.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SEQMUTEX_H +#define _BCACHEFS_SEQMUTEX_H + +#include + +struct seqmutex { + struct mutex lock; + u32 seq; +}; + +#define seqmutex_init(_lock) mutex_init(&(_lock)->lock) + +static inline bool seqmutex_trylock(struct seqmutex *lock) +{ + return mutex_trylock(&lock->lock); +} + +static inline void seqmutex_lock(struct seqmutex *lock) +{ + mutex_lock(&lock->lock); +} + +static inline void seqmutex_unlock(struct seqmutex *lock) +{ + lock->seq++; + mutex_unlock(&lock->lock); +} + +static inline u32 seqmutex_seq(struct seqmutex *lock) +{ + return lock->seq; +} + +static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) +{ + if (lock->seq != seq || !mutex_trylock(&lock->lock)) + return false; + + if (lock->seq != seq) { + mutex_unlock(&lock->lock); + return false; + } + + return true; +} + +#endif /* _BCACHEFS_SEQMUTEX_H */ diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index 6178ae6..ae21a8c 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -181,7 +181,7 @@ bch2_hash_lookup(struct btree_trans *trans, } bch2_trans_iter_exit(trans, iter); - return ret ?: -ENOENT; + return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; } static __always_inline int @@ -288,7 +288,7 @@ found: not_found: if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { - ret = -ENOENT; + ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ret = -EEXIST; } else { diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 1133783..736afb6 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -8,38 +8,212 @@ #include "fs.h" #include "subvolume.h" +#include + +static int bch2_subvolume_delete(struct btree_trans *, u32); + +static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + + if (s->skip[2] <= ancestor) + return s->skip[2]; + if (s->skip[1] <= ancestor) + return s->skip[1]; + if (s->skip[0] <= ancestor) + return s->skip[0]; + return s->parent; +} + +bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + bool ret; + + EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + + rcu_read_lock(); + t = 
rcu_dereference(c->snapshots); + + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); + + ret = id && id < ancestor + ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) + : id == ancestor; + rcu_read_unlock(); + + return ret; +} + +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor) + id = __snapshot_t(t, id)->parent; + rcu_read_unlock(); + + return id == ancestor; +} + +static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) +{ + u32 depth; + + rcu_read_lock(); + depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; + rcu_read_unlock(); + + return depth; +} + +static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + size_t new_size; + struct snapshot_table *new, *old; + + new_size = max(16UL, roundup_pow_of_two(idx + 1)); + + new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + if (!new) + return NULL; + + old = rcu_dereference_protected(c->snapshots, true); + if (old) + memcpy(new->s, + rcu_dereference_protected(c->snapshots, true)->s, + sizeof(new->s[0]) * c->snapshot_table_size); + + rcu_assign_pointer(c->snapshots, new); + c->snapshot_table_size = new_size; + if (old) + kvfree_rcu(old); + + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; +} + +static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + + lockdep_assert_held(&c->snapshot_table_lock); + + if (likely(idx < c->snapshot_table_size)) + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + + return __snapshot_t_mut(c, id); +} + /* Snapshot tree: */ +void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); + + prt_printf(out, "subvol %u root snapshot %u", + le32_to_cpu(t.v->master_subvol), + le32_to_cpu(t.v->root_snapshot)); +} + +int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { + prt_printf(err, "bad pos"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot_tree *s) +{ + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + + if (bch2_err_matches(ret, ENOENT)) + ret = -BCH_ERR_ENOENT_snapshot_tree; + return ret; +} + +static struct bkey_i_snapshot_tree * +__snapshot_tree_create(struct btree_trans *trans) +{ + struct btree_iter iter; + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, U32_MAX)); + struct bkey_i_snapshot_tree *s_t; + + if (ret == -BCH_ERR_ENOSPC_btree_slot) + ret = -BCH_ERR_ENOSPC_snapshot_tree; + if (ret) + return ERR_PTR(ret); + + s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + bch2_trans_iter_exit(trans, &iter); + return ret ? 
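/*
 * Editor's standalone model, not patch content, of the ancestor lookup that
 * get_ancestor_below()/__bch2_snapshot_is_ancestor() above implement: IDs grow
 * towards the root, each node keeps a bitmap saying which of the next 128
 * larger IDs are ancestors, plus up to three "skip" pointers to more distant
 * ancestors so deep chains are climbed in a few hops.  Toy userspace C; all
 * names here are invented for the illustration.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ANCESTOR_BITS	128
#define MAX_ID		512

struct toy_snap {
	uint32_t parent;			/* 0 == root */
	uint32_t skip[3];			/* ancestors, sorted ascending; 0 for the root */
	uint8_t  is_ancestor[ANCESTOR_BITS / 8];/* bit (a - id - 1) set if a is an ancestor */
};

static struct toy_snap tbl[MAX_ID];

static void mark_ancestor(uint32_t id, uint32_t a)
{
	uint32_t bit = a - id - 1;

	assert(bit < ANCESTOR_BITS);
	tbl[id].is_ancestor[bit / 8] |= 1u << (bit % 8);
}

static bool test_ancestor(uint32_t id, uint32_t a)
{
	uint32_t bit = a - id - 1;

	return tbl[id].is_ancestor[bit / 8] & (1u << (bit % 8));
}

/* same idea as get_ancestor_below(): biggest hop that does not overshoot */
static uint32_t ancestor_below(uint32_t id, uint32_t ancestor)
{
	struct toy_snap *s = &tbl[id];

	if (s->skip[2] <= ancestor) return s->skip[2];
	if (s->skip[1] <= ancestor) return s->skip[1];
	if (s->skip[0] <= ancestor) return s->skip[0];
	return s->parent;
}

/* same shape as __bch2_snapshot_is_ancestor(), minus RCU */
static bool is_ancestor(uint32_t id, uint32_t ancestor)
{
	while (id && id + ANCESTOR_BITS < ancestor)
		id = ancestor_below(id, ancestor);

	return id && id < ancestor
		? test_ancestor(id, ancestor)
		: id == ancestor;
}

int main(void)
{
	/* chain: 5 -> 100 -> 400 (400 is the root) */
	tbl[100].parent = 400;
	tbl[100].skip[0] = tbl[100].skip[1] = tbl[100].skip[2] = 400;
	tbl[5].parent = 100;
	tbl[5].skip[0] = 100;
	tbl[5].skip[1] = tbl[5].skip[2] = 400;
	mark_ancestor(5, 100);	/* 400 is too far above 5 to fit in 5's bitmap */

	assert(is_ancestor(5, 5));
	assert(is_ancestor(5, 100));	/* answered from the bitmap */
	assert(is_ancestor(5, 400));	/* answered after one skip hop */
	assert(!is_ancestor(100, 5));
	return 0;
}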
ERR_PTR(ret) : s_t; +} + +static int snapshot_tree_create(struct btree_trans *trans, + u32 root_id, u32 subvol_id, u32 *tree_id) +{ + struct bkey_i_snapshot_tree *n_tree = + __snapshot_tree_create(trans); + + if (IS_ERR(n_tree)) + return PTR_ERR(n_tree); + + n_tree->v.master_subvol = cpu_to_le32(subvol_id); + n_tree->v.root_snapshot = cpu_to_le32(root_id); + *tree_id = n_tree->k.p.offset; + return 0; +} + +/* Snapshot nodes: */ + void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u", + prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", BCH_SNAPSHOT_SUBVOL(s.v), BCH_SNAPSHOT_DELETED(s.v), le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), - le32_to_cpu(s.v->subvol)); + le32_to_cpu(s.v->subvol), + le32_to_cpu(s.v->tree)); + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) + prt_printf(out, " depth %u skiplist %u %u %u", + le32_to_cpu(s.v->depth), + le32_to_cpu(s.v->skip[0]), + le32_to_cpu(s.v->skip[1]), + le32_to_cpu(s.v->skip[2])); } int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { struct bkey_s_c_snapshot s; u32 i, id; - if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0 || - bkey_cmp(k.k->p, POS(0, 1)) < 0) { + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { prt_printf(err, "bad pos"); - return -EINVAL; - } - - if (bkey_val_bytes(k.k) != sizeof(struct bch_snapshot)) { - prt_printf(err, "bad val size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_snapshot)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } s = bkey_s_c_to_snapshot(k); @@ -48,18 +222,18 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, if (id && id <= k.k->p.offset) { prt_printf(err, "bad parent node (%u <= %llu)", id, k.k->p.offset); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { prt_printf(err, "children not normalized"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (s.v->children[0] && s.v->children[0] == s.v->children[1]) { prt_printf(err, "duplicate child nodes"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } for (i = 0; i < 2; i++) { @@ -68,7 +242,26 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, if (id >= k.k->p.offset) { prt_printf(err, "bad child node (%u >= %llu)", id, k.k->p.offset); - return -EINVAL; + return -BCH_ERR_invalid_bkey; + } + } + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { + if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { + prt_printf(err, "skiplist not normalized"); + return -BCH_ERR_invalid_bkey; + } + + for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { + id = le32_to_cpu(s.v->skip[i]); + + if (!id != !s.v->parent || + (s.v->parent && + id <= k.k->p.offset)) { + prt_printf(err, "bad skiplist node %u)", id); + return -BCH_ERR_invalid_bkey; + } } } @@ -76,52 +269,66 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, } int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, unsigned flags) { struct bch_fs *c = trans->c; struct snapshot_t *t; + u32 id = new.k->p.offset; + int 
ret = 0; + + mutex_lock(&c->snapshot_table_lock); - t = genradix_ptr_alloc(&c->snapshots, - U32_MAX - new.k->p.offset, - GFP_KERNEL); - if (!t) - return -ENOMEM; + t = snapshot_t_mut(c, id); + if (!t) { + ret = -BCH_ERR_ENOMEM_mark_snapshot; + goto err; + } if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + u32 parent = id; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; + t->tree = le32_to_cpu(s.v->tree); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { + t->depth = le32_to_cpu(s.v->depth); + t->skip[0] = le32_to_cpu(s.v->skip[0]); + t->skip[1] = le32_to_cpu(s.v->skip[1]); + t->skip[2] = le32_to_cpu(s.v->skip[2]); + } else { + t->depth = 0; + t->skip[0] = 0; + t->skip[1] = 0; + t->skip[2] = 0; + } + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); + + if (BCH_SNAPSHOT_DELETED(s.v)) { + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); + } } else { - t->parent = 0; - t->children[0] = 0; - t->children[1] = 0; - t->subvol = 0; + memset(t, 0, sizeof(*t)); } - - return 0; +err: + mutex_unlock(&c->snapshot_table_lock); + return ret; } static int snapshot_lookup(struct btree_trans *trans, u32 id, struct bch_snapshot *s) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_WITH_UPDATES); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT; - - if (!ret) - *s = *bkey_s_c_to_snapshot(k).v; - - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot, s); } static int snapshot_live(struct btree_trans *trans, u32 id) @@ -133,7 +340,7 @@ static int snapshot_live(struct btree_trans *trans, u32 id) return 0; ret = snapshot_lookup(trans, id, &v); - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) bch_err(trans->c, "snapshot node %u not found", id); if (ret) return ret; @@ -167,21 +374,346 @@ static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) nr_live += ret; } - snapshot_t(c, id)->equiv = nr_live == 1 - ? snapshot_t(c, child[live_idx])->equiv + mutex_lock(&c->snapshot_table_lock); + + snapshot_t_mut(c, id)->equiv = nr_live == 1 + ? 
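/*
 * Editor's note, not patch content: the snapshot_lookup() change above is one
 * instance of a conversion that repeats throughout this patch.  Open-coded
 * lookups of the form
 *
 *	bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id),
 *			     BTREE_ITER_WITH_UPDATES);
 *	k = bch2_btree_iter_peek_slot(&iter);
 *	ret = bkey_err(k) ?: k.k->type == KEY_TYPE_snapshot ? 0 : -ENOENT;
 *	if (!ret)
 *		*s = *bkey_s_c_to_snapshot(k).v;
 *	bch2_trans_iter_exit(trans, &iter);
 *
 * become a single call to a typed accessor, with the generic -ENOENT usually
 * remapped to a context-specific -BCH_ERR_ENOENT_* code by the caller:
 *
 *	ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
 *				      BTREE_ITER_WITH_UPDATES, snapshot, s);
 *
 * bch2_bkey_get_mut_typed(), bch2_bkey_make_mut_typed() and bch2_bkey_alloc()
 * play the same role for updates later in this file.
 */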
snapshot_t_mut(c, child[live_idx])->equiv : id; + + mutex_unlock(&c->snapshot_table_lock); + return 0; } /* fsck: */ + +static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) +{ + return snapshot_t(c, id)->children[child]; +} + +static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 0); +} + +static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 1); +} + +static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) +{ + u32 n, parent; + + n = bch2_snapshot_left_child(c, id); + if (n) + return n; + + while ((parent = bch2_snapshot_parent(c, id))) { + n = bch2_snapshot_right_child(c, parent); + if (n && n != id) + return n; + id = parent; + } + + return 0; +} + +static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +{ + u32 id = snapshot_root; + u32 subvol = 0, s; + + while (id) { + s = snapshot_t(c, id)->subvol; + + if (s && (!subvol || s < subvol)) + subvol = s; + + id = bch2_snapshot_tree_next(c, id); + } + + return subvol; +} + +static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + u32 snapshot_root, u32 *subvol_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume s; + bool found = false; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + s = bkey_s_c_to_subvolume(k); + if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) + continue; + if (!BCH_SUBVOLUME_SNAP(s.v)) { + *subvol_id = s.k->p.offset; + found = true; + break; + } + } + + bch2_trans_iter_exit(trans, &iter); + + if (!ret && !found) { + struct bkey_i_subvolume *s; + + *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, *subvol_id), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, false); + } + + return ret; +} + +static int check_snapshot_tree(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot_tree st; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct printbuf buf = PRINTBUF; + u32 root_id; + int ret; + + if (k.k->type != KEY_TYPE_snapshot_tree) + return 0; + + st = bkey_s_c_to_snapshot_tree(k); + root_id = le32_to_cpu(st.v->root_snapshot); + + ret = snapshot_lookup(trans, root_id, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret || + root_id != bch2_snapshot_root(c, root_id) || + st.k->p.offset != le32_to_cpu(s.tree), + c, + "snapshot tree points to missing/incorrect snapshot:\n %s", + (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto err; + } + + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), + false, 0, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, c, + "snapshot tree points to missing subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + le32_to_cpu(subvol.snapshot), + root_id), c, + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, 
+ "snapshot tree points to snapshot subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + struct bkey_i_snapshot_tree *u; + u32 subvol_id; + + ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + if (ret) + goto err; + + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.master_subvol = cpu_to_le32(subvol_id); + st = snapshot_tree_i_to_s_c(u); + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * For each snapshot_tree, make sure it points to the root of a snapshot tree + * and that snapshot entry points back to it, or delete it. + * + * And, make sure it points to a subvolume within that snapshot tree, or correct + * it to point to the oldest subvolume within that snapshot tree. + */ +int bch2_check_snapshot_trees(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_snapshot_trees, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot_tree(&trans, &iter, k))); + + if (ret) + bch_err(c, "error %i checking snapshot trees", ret); + return ret; +} + +/* + * Look up snapshot tree for @tree_id and find root, + * make sure @snap_id is a descendent: + */ +static int snapshot_tree_ptr_good(struct btree_trans *trans, + u32 snap_id, u32 tree_id) +{ + struct bch_snapshot_tree s_t; + int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + + if (bch2_err_matches(ret, ENOENT)) + return 0; + if (ret) + return ret; + + return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); +} + +static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + + if (!id) + return 0; + + rcu_read_lock(); + s = snapshot_t(c, id); + if (s->parent) + id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); + rcu_read_unlock(); + + return id; +} + +static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) +{ + struct bch_snapshot a; + unsigned i; + int ret; + + for (i = 0; i < 3; i++) { + if (!s.parent != !s.skip[i]) + return false; + + if (!s.parent) + continue; + + ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); + if (bch2_err_matches(ret, ENOENT)) + return false; + if (ret) + return ret; + + if (a.tree != s.tree) + return false; + } + + return true; +} + +/* + * snapshot_tree pointer was incorrect: look up root snapshot node, make sure + * its snapshot_tree pointer is correct (allocate new one if necessary), then + * update this node's pointer to root node's pointer: + */ +static int snapshot_tree_ptr_repair(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_snapshot *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter root_iter; + struct bch_snapshot_tree s_t; + struct bkey_s_c_snapshot root; + struct bkey_i_snapshot *u; + u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; + int ret; + + root = bch2_bkey_get_iter_typed(trans, &root_iter, + BTREE_ID_snapshots, POS(0, root_id), + BTREE_ITER_WITH_UPDATES, snapshot); + ret = bkey_err(root); + if (ret) + goto err; + + tree_id = le32_to_cpu(root.v->tree); + + ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { + u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); 
+ ret = PTR_ERR_OR_ZERO(u) ?: + snapshot_tree_create(trans, root_id, + bch2_snapshot_tree_oldest_subvol(c, root_id), + &tree_id); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + if (k.k->p.offset == root_id) + *s = u->v; + } + + if (k.k->p.offset != root_id) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + *s = u->v; + } +err: + bch2_trans_iter_exit(trans, &root_iter); + return ret; +} + +static int cmp_le32(__le32 l, __le32 r) +{ + return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); +} + static int check_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot s; + struct bch_snapshot s; struct bch_subvolume subvol; struct bch_snapshot v; + struct bkey_i_snapshot *u; + u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); + u32 real_depth; struct printbuf buf = PRINTBUF; bool should_have_subvol; u32 i, id; @@ -190,105 +722,147 @@ static int check_snapshot(struct btree_trans *trans, if (k.k->type != KEY_TYPE_snapshot) return 0; - s = bkey_s_c_to_snapshot(k); - id = le32_to_cpu(s.v->parent); + memset(&s, 0, sizeof(s)); + memcpy(&s, k.v, bkey_val_bytes(k.k)); + + id = le32_to_cpu(s.parent); if (id) { ret = snapshot_lookup(trans, id, &v); - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot with nonexistent parent:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; - if (le32_to_cpu(v.children[0]) != s.k->p.offset && - le32_to_cpu(v.children[1]) != s.k->p.offset) { + if (le32_to_cpu(v.children[0]) != k.k->p.offset && + le32_to_cpu(v.children[1]) != k.k->p.offset) { bch_err(c, "snapshot parent %u missing pointer to child %llu", - id, s.k->p.offset); + id, k.k->p.offset); ret = -EINVAL; goto err; } } - for (i = 0; i < 2 && s.v->children[i]; i++) { - id = le32_to_cpu(s.v->children[i]); + for (i = 0; i < 2 && s.children[i]; i++) { + id = le32_to_cpu(s.children[i]); ret = snapshot_lookup(trans, id, &v); - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot node %llu has nonexistent child %u", - s.k->p.offset, id); + k.k->p.offset, id); if (ret) goto err; - if (le32_to_cpu(v.parent) != s.k->p.offset) { + if (le32_to_cpu(v.parent) != k.k->p.offset) { bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", - id, le32_to_cpu(v.parent), s.k->p.offset); + id, le32_to_cpu(v.parent), k.k->p.offset); ret = -EINVAL; goto err; } } - should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && - !BCH_SNAPSHOT_DELETED(s.v); + should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + !BCH_SNAPSHOT_DELETED(&s); if (should_have_subvol) { - id = le32_to_cpu(s.v->subvol); + id = le32_to_cpu(s.subvol); ret = bch2_subvolume_get(trans, id, 0, false, &subvol); - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); if (ret) goto err; - if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { + if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", - s.k->p.offset); + k.k->p.offset); ret = -EINVAL; goto err; } } else { - if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", - 
(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); - + if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); if (ret) goto err; - bkey_reassemble(&u->k_i, s.s_c); u->v.subvol = 0; - ret = bch2_trans_update(trans, iter, &u->k_i, 0); - if (ret) - goto err; + s = u->v; } } - if (BCH_SNAPSHOT_DELETED(s.v)) - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, k, &s); + if (ret) + goto err; + } + ret = 0; + + real_depth = bch2_snapshot_depth(c, parent_id); + + if (le32_to_cpu(s.depth) != real_depth && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", + real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.depth = cpu_to_le32(real_depth); + s = u->v; + } + + ret = snapshot_skiplist_good(trans, s); + if (ret < 0) + goto err; + + if (!ret && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) + u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); + + bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); + s = u->v; + } + ret = 0; err: fsck_err: printbuf_exit(&buf); return ret; } -int bch2_fs_check_snapshots(struct bch_fs *c) +int bch2_check_snapshots(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key_commit(&trans, iter, - BTREE_ID_snapshots, POS_MIN, + /* + * We iterate backwards as checking/fixing the depth field requires that + * the parent's depth already be correct: + */ + ret = bch2_trans_run(c, + for_each_btree_key_reverse_commit(&trans, iter, + BTREE_ID_snapshots, POS_MAX, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_snapshot(&trans, &iter, k)); - + check_snapshot(&trans, &iter, k))); if (ret) - bch_err(c, "error %i checking snapshots", ret); - - bch2_trans_exit(&trans); + bch_err_fn(c, ret); return ret; } @@ -296,10 +870,11 @@ static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; struct bch_snapshot snapshot; unsigned snapid; - int ret; + int ret = 0; if (k.k->type != KEY_TYPE_subvolume) return 0; @@ -308,66 +883,90 @@ static int check_subvol(struct btree_trans *trans, snapid = le32_to_cpu(subvol.v->snapshot); ret = snapshot_lookup(trans, snapid, &snapshot); - if (ret == -ENOENT) - bch_err(trans->c, "subvolume %llu points to nonexistent snapshot %u", + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", k.k->p.offset, snapid); if (ret) return 
ret; if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + bch2_fs_lazy_rw(c); + ret = bch2_subvolume_delete(trans, iter->pos.offset); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "error deleting subvolume %llu: %s", + if (ret) + bch_err(c, "error deleting subvolume %llu: %s", iter->pos.offset, bch2_err_str(ret)); + return ret ?: -BCH_ERR_transaction_restart_nested; + } + + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { + u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); + u32 snapshot_tree; + struct bch_snapshot_tree st; + + rcu_read_lock(); + snapshot_tree = snapshot_t(c, snapshot_root)->tree; + rcu_read_unlock(); + + ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, snapshot_tree); + if (ret) return ret; + + if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c, + "subvolume %llu is not set as snapshot but is not master subvolume", + k.k->p.offset)) { + struct bkey_i_subvolume *s = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, true); + } } - return 0; +fsck_err: + return ret; } -int bch2_fs_check_subvols(struct bch_fs *c) +int bch2_check_subvols(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret; - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key_commit(&trans, iter, + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, - check_subvol(&trans, &iter, k)); - - bch2_trans_exit(&trans); - + check_subvol(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); return ret; } void bch2_fs_snapshots_exit(struct bch_fs *c) { - genradix_free(&c->snapshots); + kfree(rcu_dereference_protected(c->snapshots, true)); } -int bch2_fs_snapshots_start(struct bch_fs *c) +int bch2_snapshots_read(struct bch_fs *c) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + ret = bch2_trans_run(c, + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - bch2_mark_snapshot(&trans, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, k)); - - bch2_trans_exit(&trans); - + bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(&trans, k))); if (ret) - bch_err(c, "error starting snapshots: %s", bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -377,40 +976,26 @@ int bch2_fs_snapshots_start(struct bch_fs *c) static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) { struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_snapshot *s; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_snapshot) { - bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); - ret = -ENOENT; - goto err; + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + trans->c, "missing snapshot %u", id); + return ret; } /* already deleted? 
*/ - if (BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)) - goto err; - - s = bch2_trans_kmalloc(trans, sizeof(*s)); - ret = PTR_ERR_OR_ZERO(s); - if (ret) + if (BCH_SNAPSHOT_DELETED(&s->v)) goto err; - bkey_reassemble(&s->k_i, k); SET_BCH_SNAPSHOT_DELETED(&s->v, true); SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); s->v.subvol = 0; - - ret = bch2_trans_update(trans, &iter, &s->k_i, 0); - if (ret) - goto err; err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -418,60 +1003,45 @@ err: static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { + struct bch_fs *c = trans->c; struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; - struct bkey_s_c k; + struct btree_iter tree_iter = (struct btree_iter) { NULL }; struct bkey_s_c_snapshot s; - struct bkey_i_snapshot *parent; u32 parent_id; unsigned i; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; + s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT, snapshot); + ret = bkey_err(s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", id); - if (k.k->type != KEY_TYPE_snapshot) { - bch2_fs_inconsistent(trans->c, "missing snapshot %u", id); - ret = -ENOENT; + if (ret) goto err; - } - - s = bkey_s_c_to_snapshot(k); BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); parent_id = le32_to_cpu(s.v->parent); if (parent_id) { - bch2_trans_iter_init(trans, &p_iter, BTREE_ID_snapshots, - POS(0, parent_id), - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&p_iter); - ret = bkey_err(k); - if (ret) - goto err; + struct bkey_i_snapshot *parent; - if (k.k->type != KEY_TYPE_snapshot) { - bch2_fs_inconsistent(trans->c, "missing snapshot %u", parent_id); - ret = -ENOENT; - goto err; - } - - parent = bch2_trans_kmalloc(trans, sizeof(*parent)); + parent = bch2_bkey_get_mut_typed(trans, &p_iter, + BTREE_ID_snapshots, POS(0, parent_id), + 0, snapshot); ret = PTR_ERR_OR_ZERO(parent); - if (ret) + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); goto err; - - bkey_reassemble(&parent->k_i, k); + } for (i = 0; i < 2; i++) if (le32_to_cpu(parent->v.children[i]) == id) break; if (i == 2) - bch_err(trans->c, "snapshot %u missing child pointer to %u", + bch_err(c, "snapshot %u missing child pointer to %u", parent_id, id); else parent->v.children[i] = 0; @@ -480,29 +1050,51 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) le32_to_cpu(parent->v.children[1])) swap(parent->v.children[0], parent->v.children[1]); + } else { + /* + * We're deleting the root of a snapshot tree: update the + * snapshot_tree entry to point to the new root, or delete it if + * this is the last snapshot ID in this tree: + */ + struct bkey_i_snapshot_tree *s_t; + + BUG_ON(s.v->children[1]); - ret = bch2_trans_update(trans, &p_iter, &parent->k_i, 0); + s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); if (ret) goto err; + + if (s.v->children[0]) { + s_t->v.root_snapshot = s.v->children[0]; + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); + } } ret = bch2_btree_delete_at(trans, &iter, 0); err: + bch2_trans_iter_exit(trans, &tree_iter); bch2_trans_iter_exit(trans, &p_iter); bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_snapshot_node_create(struct 
btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) +static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_i_snapshot *n; struct bkey_s_c k; - unsigned i; - int ret = 0; + unsigned i, j; + u32 depth = bch2_snapshot_depth(c, parent); + int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_INTENT); @@ -522,66 +1114,116 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, goto err; } - n = bch2_trans_kmalloc(trans, sizeof(*n)); + n = bch2_bkey_alloc(trans, &iter, 0, snapshot); ret = PTR_ERR_OR_ZERO(n); if (ret) goto err; - bkey_snapshot_init(&n->k_i); - n->k.p = iter.pos; n->v.flags = 0; n->v.parent = cpu_to_le32(parent); n->v.subvol = cpu_to_le32(snapshot_subvols[i]); - n->v.pad = 0; + n->v.tree = cpu_to_le32(tree); + n->v.depth = cpu_to_le32(depth); + + for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0) ?: - bch2_mark_snapshot(trans, bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); if (ret) goto err; new_snapids[i] = iter.pos.offset; } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} - if (parent) { - bch2_btree_iter_set_pos(&iter, POS(0, parent)); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; +/* + * Create new snapshot IDs as children of an existing snapshot ID: + */ +static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n_parent; + int ret = 0; - if (k.k->type != KEY_TYPE_snapshot) { + n_parent = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, parent), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(n_parent); + if (unlikely(ret)) { + if (bch2_err_matches(ret, ENOENT)) bch_err(trans->c, "snapshot %u not found", parent); - ret = -ENOENT; - goto err; - } - - n = bch2_trans_kmalloc(trans, sizeof(*n)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; + return ret; + } - bkey_reassemble(&n->k_i, k); + if (n_parent->v.children[0] || n_parent->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } - if (n->v.children[0] || n->v.children[1]) { - bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; - } + ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + goto err; - n->v.children[0] = cpu_to_le32(new_snapids[0]); - n->v.children[1] = cpu_to_le32(new_snapids[1]); - n->v.subvol = 0; - SET_BCH_SNAPSHOT_SUBVOL(&n->v, false); - ret = bch2_trans_update(trans, &iter, &n->k_i, 0); - if (ret) - goto err; - } + n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); + n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); + n_parent->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); err: bch2_trans_iter_exit(trans, &iter); return ret; } +/* + * Create a snapshot node that is the root of a new tree: + */ 
+static int bch2_snapshot_node_create_tree(struct btree_trans *trans, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bkey_i_snapshot_tree *n_tree; + int ret; + + n_tree = __snapshot_tree_create(trans); + ret = PTR_ERR_OR_ZERO(n_tree) ?: + create_snapids(trans, 0, n_tree->k.p.offset, + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + return ret; + + n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); + n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); + return 0; +} + +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + BUG_ON((parent == 0) != (nr_snapids == 1)); + BUG_ON((parent != 0) != (nr_snapids == 2)); + + return parent + ? bch2_snapshot_node_create_children(trans, parent, + new_snapids, snapshot_subvols, nr_snapids) + : bch2_snapshot_node_create_tree(trans, + new_snapids, snapshot_subvols, nr_snapids); + +} + static int snapshot_delete_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -590,9 +1232,9 @@ static int snapshot_delete_key(struct btree_trans *trans, struct bpos *last_pos) { struct bch_fs *c = trans->c; - u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - if (bkey_cmp(k.k->p, *last_pos)) + if (!bkey_eq(k.k->p, *last_pos)) equiv_seen->nr = 0; *last_pos = k.k->p; @@ -643,9 +1285,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) u32 i, id; int ret = 0; - if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) - return 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) { ret = bch2_fs_read_write_early(c); if (ret) { @@ -731,6 +1370,8 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) err: darray_exit(&deleted); bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); return ret; } @@ -738,17 +1379,16 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); - bch2_delete_dead_snapshots(c); - percpu_ref_put(&c->writes); + if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) + bch2_delete_dead_snapshots(c); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (!percpu_ref_tryget_live(&c->writes)) - return; - - if (!queue_work(system_long_wq, &c->snapshot_delete_work)) - percpu_ref_put(&c->writes); + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && + !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, @@ -758,7 +1398,7 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) return 0; bch2_delete_dead_snapshots_async(c); @@ -768,18 +1408,12 @@ static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, /* Subvolumes: */ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + unsigned flags, struct printbuf *err) { - if (bkey_cmp(k.k->p, SUBVOL_POS_MIN) < 0 || - bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) { + if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || + bkey_gt(k.k->p, SUBVOL_POS_MAX)) { prt_printf(err, "invalid pos"); - return -EINVAL; - } - - if (bkey_val_bytes(k.k) != sizeof(struct bch_subvolume)) { - 
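/*
 * Editor's sketch, not part of the patch: the two call shapes the reworked
 * bch2_snapshot_node_create() above now accepts, as pinned down by its
 * BUG_ON()s.  parent == 0 creates a single node that roots a brand-new
 * snapshot tree (a fresh subvolume); parent != 0 creates exactly two children
 * of an existing node (taking a snapshot).  Real callers fill
 * snapshot_subvols[] with the subvolume IDs involved, as
 * bch2_subvolume_create() does below; the wrapper name here is invented.
 */
static int snapshot_node_create_example(struct btree_trans *trans, u32 parent)
{
	u32 new_snapids[2], snapshot_subvols[2] = { 0, 0 };

	if (!parent)
		/* new subvolume: one snapshot node, root of its own tree */
		return bch2_snapshot_node_create(trans, 0, new_snapids,
						 snapshot_subvols, 1);

	/* snapshot of an existing subvolume: two children of 'parent' */
	return bch2_snapshot_node_create(trans, parent, new_snapids,
					 snapshot_subvols, 2);
}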
prt_printf(err, "incorrect value size (%zu != %zu)", - bkey_val_bytes(k.k), sizeof(struct bch_subvolume)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -791,8 +1425,25 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); prt_printf(out, "root %llu snapshot id %u", - le64_to_cpu(s.v->inode), - le32_to_cpu(s.v->snapshot)); + le64_to_cpu(s.v->inode), + le32_to_cpu(s.v->snapshot)); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) + prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); +} + +static __always_inline int +bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) +{ + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), + iter_flags, subvolume, s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && + inconsistent_if_not_found, + trans->c, "missing subvolume %u", subvol); + return ret; } int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, @@ -800,22 +1451,7 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, int iter_flags, struct bch_subvolume *s) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), - iter_flags); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 0 : -ENOENT; - - if (ret == -ENOENT && inconsistent_if_not_found) - bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); - if (!ret) - *s = *bkey_s_c_to_subvolume(k).v; - - bch2_trans_iter_exit(trans, &iter); - return ret; + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); } int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, @@ -830,47 +1466,88 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, u32 *snapid) { - struct bch_subvolume s; + struct btree_iter iter; + struct bkey_s_c k; int ret; - ret = bch2_subvolume_get(trans, subvol, true, - BTREE_ITER_CACHED| - BTREE_ITER_WITH_UPDATES, - &s); + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -BCH_ERR_ENOENT_subvolume; - *snapid = le32_to_cpu(s.snapshot); + if (likely(!ret)) + *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + else if (bch2_err_matches(ret, ENOENT)) + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); + bch2_trans_iter_exit(trans, &iter); return ret; } +static int bch2_subvolume_reparent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + u32 old_parent, u32 new_parent) +{ + struct bkey_i_subvolume *s; + int ret; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + return 0; + + s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.parent = cpu_to_le32(new_parent); + return 0; +} + +/* + * Scan for subvolumes with parent @subvolid_to_delete, reparent: + */ +static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_subvolume s; + + return lockrestart_do(trans, + bch2_subvolume_get(trans, subvolid_to_delete, true, + BTREE_ITER_CACHED, &s)) ?: + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_reparent(trans, &iter, k, + subvolid_to_delete, le32_to_cpu(s.parent))); +} + /* * Delete subvolume, mark snapshot ID as deleted, queue up snapshot * deletion/cleanup: */ -int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { struct btree_iter iter; - struct bkey_s_c k; struct bkey_s_c_subvolume subvol; struct btree_trans_commit_hook *h; u32 snapid; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, - POS(0, subvolid), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); + subvol = bch2_bkey_get_iter_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED|BTREE_ITER_INTENT, + subvolume); + ret = bkey_err(subvol); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); if (ret) - goto err; - - if (k.k->type != KEY_TYPE_subvolume) { - bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); - ret = -EIO; - goto err; - } + return ret; - subvol = bkey_s_c_to_subvolume(k); snapid = le32_to_cpu(subvol.v->snapshot); ret = bch2_btree_delete_at(trans, &iter, 0); @@ -893,7 +1570,14 @@ err: return ret; } -void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +{ + return bch2_subvolumes_reparent(trans, subvolid) ?: + commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_subvolume_delete(trans, subvolid)); +} + +static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); @@ -913,8 +1597,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) bch2_evict_subvolume_inodes(c, &s); for (id = s.data; id < s.data + s.nr; id++) { - ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, - bch2_subvolume_delete(&trans, *id)); + ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id)); if (ret) { bch_err(c, "error deleting subvolume %u: %s", *id, 
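/*
 * Editor's note, not patch content: together with the new ->parent field set
 * in bch2_subvolume_create() below (new_subvol->v.parent = src_subvolid), the
 * reparent pass above keeps the parent chain from dangling.  Example: if C was
 * snapshotted from B and B from A (C.parent == B, B.parent == A), deleting B
 * first rewrites C.parent to A via bch2_subvolumes_reparent(), and only then
 * commits __bch2_subvolume_delete(B).
 */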
bch2_err_str(ret)); break; @@ -924,7 +1607,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) darray_exit(&s); } - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); } struct subvolume_unlink_hook { @@ -932,7 +1615,7 @@ struct subvolume_unlink_hook { u32 subvol; }; -int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, +static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, struct btree_trans_commit_hook *_h) { struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); @@ -947,58 +1630,41 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, if (ret) return ret; - if (unlikely(!percpu_ref_tryget_live(&c->writes))) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) return -EROFS; - if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) - percpu_ref_put(&c->writes); + if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); return 0; } int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) { struct btree_iter iter; - struct bkey_s_c k; struct bkey_i_subvolume *n; struct subvolume_unlink_hook *h; int ret = 0; - bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, - POS(0, subvolid), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_subvolume) { - bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); - ret = -EIO; - goto err; - } - - n = bch2_trans_kmalloc(trans, sizeof(*n)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, k); - SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); - - ret = bch2_trans_update(trans, &iter, &n->k_i, 0); - if (ret) - goto err; - h = bch2_trans_kmalloc(trans, sizeof(*h)); ret = PTR_ERR_OR_ZERO(h); if (ret) - goto err; + return ret; h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; h->subvol = subvolid; bch2_trans_commit_hook(trans, &h->h); -err: + + n = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); + return ret; + } + + SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1013,54 +1679,32 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; struct bkey_i_subvolume *new_subvol = NULL; struct bkey_i_subvolume *src_subvol = NULL; - struct bkey_s_c k; u32 parent = 0, new_nodes[2], snapshot_subvols[2]; int ret = 0; - for_each_btree_key(trans, dst_iter, BTREE_ID_subvolumes, SUBVOL_POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (bkey_cmp(k.k->p, SUBVOL_POS_MAX) > 0) - break; - - /* - * bch2_subvolume_delete() doesn't flush the btree key cache - - * ideally it would but that's tricky - */ - if (bkey_deleted(k.k) && - !bch2_btree_key_cache_find(c, BTREE_ID_subvolumes, dst_iter.pos)) - goto found_slot; - } - - if (!ret) + ret = bch2_bkey_get_empty_slot(trans, &dst_iter, + BTREE_ID_subvolumes, POS(0, U32_MAX)); + if (ret == -BCH_ERR_ENOSPC_btree_slot) ret = -BCH_ERR_ENOSPC_subvolume_create; - goto err; -found_slot: + if (ret) + return 
ret; + snapshot_subvols[0] = dst_iter.pos.offset; snapshot_subvols[1] = src_subvolid; if (src_subvolid) { /* Creating a snapshot: */ - src_subvol = bch2_trans_kmalloc(trans, sizeof(*src_subvol)); - ret = PTR_ERR_OR_ZERO(src_subvol); - if (ret) - goto err; - bch2_trans_iter_init(trans, &src_iter, BTREE_ID_subvolumes, - POS(0, src_subvolid), - BTREE_ITER_CACHED| - BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(&src_iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_subvolume) { - bch_err(c, "subvolume %u not found", src_subvolid); - ret = -ENOENT; + src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, + BTREE_ID_subvolumes, POS(0, src_subvolid), + BTREE_ITER_CACHED, subvolume); + ret = PTR_ERR_OR_ZERO(src_subvol); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "subvolume %u not found", src_subvolid); goto err; } - bkey_reassemble(&src_subvol->k_i, k); parent = le32_to_cpu(src_subvol->v.snapshot); } @@ -1077,21 +1721,20 @@ found_slot: goto err; } - new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol)); + new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); ret = PTR_ERR_OR_ZERO(new_subvol); if (ret) goto err; - bkey_subvolume_init(&new_subvol->k_i); new_subvol->v.flags = 0; new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.parent = cpu_to_le32(src_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); - new_subvol->k.p = dst_iter.pos; - ret = bch2_trans_update(trans, &dst_iter, &new_subvol->k_i, 0); - if (ret) - goto err; *new_subvolid = new_subvol->k.p.offset; *new_snapshotid = new_nodes[0]; diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index c694c1c..6905e91 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -5,54 +5,160 @@ #include "darray.h" #include "subvolume_types.h" +enum bkey_invalid_flags; + +void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); + +#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_tree_invalid, \ + .val_to_text = bch2_snapshot_tree_to_text, \ + .min_val_size = 8, \ +}) + +int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + enum bkey_invalid_flags, struct printbuf *); +int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); #define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ .key_invalid = bch2_snapshot_invalid, \ .val_to_text = bch2_snapshot_to_text, \ + .atomic_trigger = bch2_mark_snapshot, \ + .min_val_size = 24, \ }) -int bch2_mark_snapshot(struct btree_trans *, struct bkey_s_c, - struct bkey_s_c, unsigned); +static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) +{ + return &t->s[U32_MAX - id]; +} + +static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return __snapshot_t(rcu_dereference(c->snapshots), id); +} -static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +static inline u32 
bch2_snapshot_tree(struct bch_fs *c, u32 id) { - return genradix_ptr(&c->snapshots, U32_MAX - id); + rcu_read_lock(); + id = snapshot_t(c, id)->tree; + rcu_read_unlock(); + + return id; } -static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { return snapshot_t(c, id)->parent; } -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent_early(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + u32 parent = snapshot_t(c, id)->parent; + + if (parent && + snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + panic("id %u depth=%u parent %u depth=%u\n", + id, snapshot_t(c, id)->depth, + parent, snapshot_t(c, parent)->depth); + + return parent; +#else + return snapshot_t(c, id)->parent; +#endif +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) +{ + rcu_read_lock(); + while (n--) + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) +{ + u32 parent; + + rcu_read_lock(); + while ((parent = __bch2_snapshot_parent(c, id))) + id = parent; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) { return snapshot_t(c, id)->equiv; } +static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_equiv(c, id); + rcu_read_unlock(); + + return id; +} + static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) { - return id == snapshot_t(c, id)->equiv; + return id == bch2_snapshot_equiv(c, id); } -static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) +static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - struct snapshot_t *s = snapshot_t(c, id); + const struct snapshot_t *s; + bool ret; - return s->children[0] || s->children[1]; + rcu_read_lock(); + s = snapshot_t(c, id); + ret = s->children[0]; + rcu_read_unlock(); + + return ret; +} + +static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +{ + return !bch2_snapshot_is_internal_node(c, id); } static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) { - struct snapshot_t *s; - u32 parent = bch2_snapshot_parent(c, id); + const struct snapshot_t *s; + u32 parent = __bch2_snapshot_parent(c, id); if (!parent) return 0; - s = snapshot_t(c, bch2_snapshot_parent(c, id)); + s = snapshot_t(c, __bch2_snapshot_parent(c, id)); if (id == s->children[0]) return s->children[1]; if (id == s->children[1]) @@ -60,12 +166,26 @@ static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) return 0; } +bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); + static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { - while (id && id < ancestor) - id = bch2_snapshot_parent(c, id); + return id == ancestor + ? 
true + : __bch2_snapshot_is_ancestor(c, id, ancestor); +} + +static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *t; + bool ret; + + rcu_read_lock(); + t = snapshot_t(c, id); + ret = (t->children[0]|t->children[1]) != 0; + rcu_read_unlock(); - return id == ancestor; + return ret; } static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) @@ -99,19 +219,21 @@ static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 i return ret; } -int bch2_fs_check_snapshots(struct bch_fs *); -int bch2_fs_check_subvols(struct bch_fs *); +int bch2_check_snapshot_trees(struct bch_fs *); +int bch2_check_snapshots(struct bch_fs *); +int bch2_check_subvols(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); -int bch2_fs_snapshots_start(struct bch_fs *); +int bch2_snapshots_read(struct bch_fs *); int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, - int rw, struct printbuf *); + unsigned, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ .val_to_text = bch2_subvolume_to_text, \ + .min_val_size = 16, \ }) int bch2_subvolume_get(struct btree_trans *, unsigned, @@ -127,7 +249,6 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); -int bch2_subvolume_delete(struct btree_trans *, u32); int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32 *, u32 *, bool); diff --git a/libbcachefs/subvolume_types.h b/libbcachefs/subvolume_types.h index f7562b5..8683344 100644 --- a/libbcachefs/subvolume_types.h +++ b/libbcachefs/subvolume_types.h @@ -6,4 +6,26 @@ typedef DARRAY(u32) snapshot_id_list; +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + u32 equiv; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + struct snapshot_t s[0]; +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 60c1f03..c9a5a7c 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -4,6 +4,7 @@ #include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" +#include "counters.h" #include "disk_groups.h" #include "ec.h" #include "error.h" @@ -12,18 +13,77 @@ #include "journal_io.h" #include "journal_sb.h" #include "journal_seq_blacklist.h" +#include "recovery.h" #include "replicas.h" #include "quota.h" #include "super-io.h" #include "super.h" +#include "trace.h" #include "vstructs.h" -#include "counters.h" #include -#include #include -#include +struct bch2_metadata_version { + u16 version; + const char *name; + u64 recovery_passes; +}; + +static const struct bch2_metadata_version bch2_metadata_versions[] = { +#define x(n, v, _recovery_passes) { \ + .version = v, \ + .name = #n, \ + .recovery_passes = _recovery_passes, \ +}, + BCH_METADATA_VERSIONS() +#undef x +}; + +void bch2_version_to_text(struct printbuf *out, unsigned v) +{ + const char *str = "(unknown version)"; + + for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) + if (bch2_metadata_versions[i].version == v) { + str = 
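/*
 * Editor's note, not patch content: struct snapshot_table above replaces the
 * old genradix with a flat, RCU-managed array.  New snapshot nodes are given
 * IDs below their parents' (bch2_snapshot_invalid() enforces parent > child),
 * starting from the top of the ID space, so __snapshot_t()'s index of
 * U32_MAX - id stays small and dense:
 *
 *	id == U32_MAX     -> s[0]
 *	id == U32_MAX - 1 -> s[1]
 *	...
 *
 * __snapshot_t_mut() earlier in the patch grows the array (and kvfree_rcu()s
 * the old copy) the first time an index falls outside c->snapshot_table_size.
 */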
bch2_metadata_versions[i].name; + break; + } + + prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); +} + +unsigned bch2_latest_compatible_version(unsigned v) +{ + if (!BCH_VERSION_MAJOR(v)) + return v; + + for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) + if (bch2_metadata_versions[i].version > v && + BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == + BCH_VERSION_MAJOR(v)) + v = bch2_metadata_versions[i].version; + + return v; +} + +u64 bch2_upgrade_recovery_passes(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + u64 ret = 0; + + for (const struct bch2_metadata_version *i = bch2_metadata_versions; + i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); + i++) + if (i->version > old_version && i->version <= new_version) { + if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) + ret |= bch2_fsck_recovery_passes(); + ret |= i->recovery_passes; + } + + return ret &= ~RECOVERY_PASS_ALL_FSCK; +} const char * const bch2_sb_fields[] = { #define x(name, nr) #name, @@ -137,14 +197,14 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) return 0; if (dynamic_fault("bcachefs:add:super_realloc")) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_realloc_injected; if (sb->have_bio) { unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_bio_realloc; bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); @@ -154,7 +214,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); if (!new_sb) - return -ENOMEM; + return -BCH_ERR_ENOMEM_sb_buf_realloc; sb->sb = new_sb; sb->buffer_size = new_buffer_size; @@ -201,35 +261,33 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, /* Superblock validate: */ -static inline void __bch2_sb_layout_size_assert(void) -{ - BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -} - static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) { u64 offset, prev_offset, max_sectors; unsigned i; - if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) { + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); + + if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && + !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { prt_printf(out, "Not a bcachefs superblock layout"); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout; } if (layout->layout_type != 0) { prt_printf(out, "Invalid superblock layout type %u", layout->layout_type); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout_type; } if (!layout->nr_superblocks) { prt_printf(out, "Invalid superblock layout: no superblocks"); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout_nr_superblocks; } if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { prt_printf(out, "Invalid superblock layout: too many superblocks"); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout_nr_superblocks; } max_sectors = 1 << layout->sb_max_size_bits; @@ -243,7 +301,7 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out prt_printf(out, "Invalid superblock layout: superblocks overlap\n" " (sb %u ends at %llu next starts at %llu", i - 1, prev_offset + max_sectors, offset); - return -EINVAL; + return -BCH_ERR_invalid_sb_layout_superblocks_overlap; } prev_offset = offset; } @@ -251,6 +309,44 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out return 0; } +static int bch2_sb_compatible(struct bch_sb 
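
/*
 * Self-contained sketch of the x-macro pattern used above to build
 * bch2_metadata_versions[] from BCH_METADATA_VERSIONS(): a single list
 * expands into both the version constants and the name table, so the two can
 * never drift apart.  This is a two-column variant of the three-column list
 * in the patch, and the version names and numbers below are invented.
 */
#include <stdio.h>

#define TOY_VERSIONS()			\
	x(bkey_renumber,	10)	\
	x(inode_v2,		11)	\
	x(snapshot_trees,	12)

struct toy_version {
	unsigned	version;
	const char	*name;
};

static const struct toy_version toy_versions[] = {
#define x(n, v) { .version = v, .name = #n },
	TOY_VERSIONS()
#undef x
};

static const char *toy_version_name(unsigned v)
{
	for (unsigned i = 0; i < sizeof(toy_versions) / sizeof(toy_versions[0]); i++)
		if (toy_versions[i].version == v)
			return toy_versions[i].name;
	return "(unknown version)";
}

int main(void)
{
	printf("%s\n", toy_version_name(11));	/* inode_v2 */
	printf("%s\n", toy_version_name(99));	/* (unknown version) */
	return 0;
}
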
*sb, struct printbuf *out) +{ + u16 version = le16_to_cpu(sb->version); + u16 version_min = le16_to_cpu(sb->version_min); + + if (!bch2_version_compatible(version)) { + prt_str(out, "Unsupported superblock version "); + bch2_version_to_text(out, version); + prt_str(out, " (min "); + bch2_version_to_text(out, bcachefs_metadata_version_min); + prt_str(out, ", max "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + prt_str(out, ")"); + return -BCH_ERR_invalid_sb_version; + } + + if (!bch2_version_compatible(version_min)) { + prt_str(out, "Unsupported superblock version_min "); + bch2_version_to_text(out, version_min); + prt_str(out, " (min "); + bch2_version_to_text(out, bcachefs_metadata_version_min); + prt_str(out, ", max "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + prt_str(out, ")"); + return -BCH_ERR_invalid_sb_version; + } + + if (version_min > version) { + prt_str(out, "Bad minimum version "); + bch2_version_to_text(out, version_min); + prt_str(out, ", greater than version field "); + bch2_version_to_text(out, version); + return -BCH_ERR_invalid_sb_version; + } + + return 0; +} + static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, int rw) { @@ -258,37 +354,17 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, struct bch_sb_field *f; struct bch_sb_field_members *mi; enum bch_opt_id opt_id; - u32 version, version_min; u16 block_size; int ret; - version = le16_to_cpu(sb->version); - version_min = version >= bcachefs_metadata_version_bkey_renumber - ? le16_to_cpu(sb->version_min) - : version; - - if (version >= bcachefs_metadata_version_max) { - prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", - version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -EINVAL; - } - - if (version_min < bcachefs_metadata_version_min) { - prt_printf(out, "Unsupported superblock version %u (min %u, max %u)", - version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -EINVAL; - } - - if (version_min > version) { - prt_printf(out, "Bad minimum version %u, greater than version field %u", - version_min, version); - return -EINVAL; - } + ret = bch2_sb_compatible(sb, out); + if (ret) + return ret; if (sb->features[1] || (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { prt_printf(out, "Filesystem has incompatible features"); - return -EINVAL; + return -BCH_ERR_invalid_sb_features; } block_size = le16_to_cpu(sb->block_size); @@ -296,49 +372,52 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, if (block_size > PAGE_SECTORS) { prt_printf(out, "Block size too big (got %u, max %u)", block_size, PAGE_SECTORS); - return -EINVAL; + return -BCH_ERR_invalid_sb_block_size; } - if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) { + if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { prt_printf(out, "Bad user UUID (got zeroes)"); - return -EINVAL; + return -BCH_ERR_invalid_sb_uuid; } - if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) { + if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { prt_printf(out, "Bad intenal UUID (got zeroes)"); - return -EINVAL; + return -BCH_ERR_invalid_sb_uuid; } if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { prt_printf(out, "Bad number of member devices %u (max %u)", sb->nr_devices, BCH_SB_MEMBERS_MAX); - return -EINVAL; + return -BCH_ERR_invalid_sb_too_many_members; } if (sb->dev_idx >= sb->nr_devices) { prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", 
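
/*
 * Sketch of the major.minor version scheme that bch2_sb_compatible() and
 * bch2_version_compatible() rely on: the superblock version is a packed u16,
 * and a filesystem is mountable when its major number is no newer than the
 * running code and the version is at least the supported minimum.  The 10/6
 * bit split below is an assumption made for illustration; the real field
 * layout is defined in bcachefs_format.h.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_VERSION_MINOR_BITS	10
#define TOY_VERSION_MAJOR(v)	((uint16_t) (v) >> TOY_VERSION_MINOR_BITS)
#define TOY_VERSION_MINOR(v)	((uint16_t) (v) & ((1U << TOY_VERSION_MINOR_BITS) - 1))
#define TOY_VERSION(major, minor) (((major) << TOY_VERSION_MINOR_BITS) | (minor))

static const uint16_t toy_version_min	   = TOY_VERSION(0, 10);
static const uint16_t toy_version_current = TOY_VERSION(1, 2);

static bool toy_version_compatible(uint16_t v)
{
	return TOY_VERSION_MAJOR(v) <= TOY_VERSION_MAJOR(toy_version_current) &&
		v >= toy_version_min;
}

int main(void)
{
	printf("1.0 compatible: %d\n", toy_version_compatible(TOY_VERSION(1, 0)));
	printf("2.0 compatible: %d\n", toy_version_compatible(TOY_VERSION(2, 0)));
	return 0;
}
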
sb->dev_idx, sb->nr_devices); - return -EINVAL; + return -BCH_ERR_invalid_sb_dev_idx; } if (!sb->time_precision || le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", le32_to_cpu(sb->time_precision), NSEC_PER_SEC); - return -EINVAL; + return -BCH_ERR_invalid_sb_time_precision; } if (rw == READ) { /* * Been seeing a bug where these are getting inexplicably - * zeroed, so we'r now validating them, but we have to be + * zeroed, so we're now validating them, but we have to be * careful not to preven people's filesystems from mounting: */ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); + + if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); } for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { @@ -363,15 +442,15 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, vstruct_for_each(sb, f) { if (!f->u64s) { - prt_printf(out, "Invalid superblock: optional with size 0 (type %u)", + prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", le32_to_cpu(f->type)); - return -EINVAL; + return -BCH_ERR_invalid_sb_field_size; } if (vstruct_next(f) > vstruct_last(sb)) { prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", le32_to_cpu(f->type)); - return -EINVAL; + return -BCH_ERR_invalid_sb_field_size; } } @@ -379,7 +458,7 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, mi = bch2_sb_get_members(sb); if (!mi) { prt_printf(out, "Invalid superblock: member info area missing"); - return -EINVAL; + return -BCH_ERR_invalid_sb_members_missing; } ret = bch2_sb_field_validate(sb, &mi->field, out); @@ -413,6 +492,7 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); c->sb.version_min = le16_to_cpu(src->version_min); + c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); @@ -432,7 +512,7 @@ static void bch2_sb_update(struct bch_fs *c) ca->mi = bch2_mi_to_cpu(mi->members + i); } -static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) +static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) { struct bch_sb_field *src_f, *dst_f; struct bch_sb *dst = dst_handle->sb; @@ -457,42 +537,45 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) memcpy(dst->compat, src->compat, sizeof(dst->compat)); for (i = 0; i < BCH_SB_FIELD_NR; i++) { + int d; + if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) continue; src_f = bch2_sb_field_get(src, i); dst_f = bch2_sb_field_get(dst, i); + + d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - + (dst_f ? le32_to_cpu(dst_f->u64s) : 0); + if (d > 0) { + int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d); + if (ret) + return ret; + + dst = dst_handle->sb; + dst_f = bch2_sb_field_get(dst, i); + } + dst_f = __bch2_sb_field_resize(dst_handle, dst_f, src_f ? le32_to_cpu(src_f->u64s) : 0); if (src_f) memcpy(dst_f, src_f, vstruct_bytes(src_f)); } + + return 0; } int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) { - struct bch_sb_field_journal *journal_buckets = - bch2_sb_get_journal(src); - unsigned journal_u64s = journal_buckets - ? 
le32_to_cpu(journal_buckets->field.u64s) - : 0; int ret; lockdep_assert_held(&c->sb_lock); - ret = bch2_sb_realloc(&c->disk_sb, - le32_to_cpu(src->u64s) - journal_u64s); - if (ret) - return ret; - - __copy_super(&c->disk_sb, src); - - ret = bch2_sb_replicas_to_cpu_replicas(c); - if (ret) - return ret; - - ret = bch2_sb_disk_groups_to_cpu(c); + ret = bch2_sb_realloc(&c->disk_sb, 0) ?: + __copy_super(&c->disk_sb, src) ?: + bch2_sb_replicas_to_cpu_replicas(c) ?: + bch2_sb_disk_groups_to_cpu(c); if (ret) return ret; @@ -502,21 +585,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) { - struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; - struct bch_sb_field_journal *journal_buckets = - bch2_sb_get_journal(dst); - unsigned journal_u64s = journal_buckets - ? le32_to_cpu(journal_buckets->field.u64s) - : 0; - unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; - int ret; - - ret = bch2_sb_realloc(&ca->disk_sb, u64s); - if (ret) - return ret; - - __copy_super(&ca->disk_sb, src); - return 0; + return __copy_super(&ca->disk_sb, c->disk_sb.sb); } /* read superblock: */ @@ -524,7 +593,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) { struct bch_csum csum; - u32 version, version_min; size_t bytes; int ret; reread: @@ -538,45 +606,34 @@ reread: return ret; } - if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) { + if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && + !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { prt_printf(err, "Not a bcachefs superblock"); - return -EINVAL; + return -BCH_ERR_invalid_sb_magic; } - version = le16_to_cpu(sb->sb->version); - version_min = version >= bcachefs_metadata_version_bkey_renumber - ? 
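
/*
 * The rewritten bch2_sb_to_fs() above chains its setup steps with the GNU C
 * "a ?: b" extension: the chain yields the first nonzero (error) return, and
 * each later step only runs if the earlier ones returned 0.  A self-contained
 * sketch with stand-in step functions (GCC/Clang only, since ?: without a
 * middle operand is a GNU extension):
 */
#include <stdio.h>

static int step_one(void)   { return 0; }
static int step_two(void)   { return -22; }	/* pretend this step fails */
static int step_three(void) { puts("step three ran"); return 0; }

int main(void)
{
	int ret = step_one() ?:
		  step_two() ?:
		  step_three();	/* never reached: step_two() failed */

	printf("ret = %d\n", ret);	/* -22 */
	return 0;
}
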
le16_to_cpu(sb->sb->version_min) - : version; - - if (version >= bcachefs_metadata_version_max) { - prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", - version, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -EINVAL; - } - - if (version_min < bcachefs_metadata_version_min) { - prt_printf(err, "Unsupported superblock version %u (min %u, max %u)", - version_min, bcachefs_metadata_version_min, bcachefs_metadata_version_max); - return -EINVAL; - } + ret = bch2_sb_compatible(sb->sb, err); + if (ret) + return ret; bytes = vstruct_bytes(sb->sb); if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", bytes, 512UL << sb->sb->layout.sb_max_size_bits); - return -EINVAL; + return -BCH_ERR_invalid_sb_too_big; } if (bytes > sb->buffer_size) { - if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) - return -ENOMEM; + ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); + if (ret) + return ret; goto reread; } if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); - return -EINVAL; + return -BCH_ERR_invalid_sb_csum_type; } /* XXX: verify MACs */ @@ -585,7 +642,7 @@ reread: if (bch2_crc_cmp(csum, sb->sb->csum)) { prt_printf(err, "bad checksum"); - return -EINVAL; + return -BCH_ERR_invalid_sb_csum; } sb->seq = le64_to_cpu(sb->sb->seq); @@ -601,13 +658,18 @@ int bch2_read_super(const char *path, struct bch_opts *opts, struct printbuf err = PRINTBUF; __le64 *i; int ret; - - pr_verbose_init(*opts, ""); - +#ifndef __KERNEL__ +retry: +#endif memset(sb, 0, sizeof(*sb)); sb->mode = FMODE_READ; sb->have_bio = true; +#ifndef __KERNEL__ + if (opt_get(*opts, direct_io) == false) + sb->mode |= FMODE_BUFFERED; +#endif + if (!opt_get(*opts, noexcl)) sb->mode |= FMODE_EXCL; @@ -692,11 +754,17 @@ int bch2_read_super(const char *path, struct bch_opts *opts, got_super: if (le16_to_cpu(sb->sb->block_size) << 9 < - bdev_logical_block_size(sb->bdev)) { + bdev_logical_block_size(sb->bdev) && + opt_get(*opts, direct_io)) { +#ifndef __KERNEL__ + opt_set(*opts, direct_io, false); + bch2_free_super(sb); + goto retry; +#endif prt_printf(&err, "block size (%u) smaller than device block size (%u)", le16_to_cpu(sb->sb->block_size) << 9, bdev_logical_block_size(sb->bdev)); - ret = -EINVAL; + ret = -BCH_ERR_block_size_too_small; goto err; } @@ -710,7 +778,6 @@ got_super: goto err_no_print; } out: - pr_verbose_init(*opts, "ret %i", ret); printbuf_exit(&err); return ret; err: @@ -802,6 +869,10 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); + /* Make sure we're using the new magic numbers: */ + c->disk_sb.sb->magic = BCHFS_MAGIC; + c->disk_sb.sb->layout.magic = BCHFS_MAGIC; + le64_add_cpu(&c->disk_sb.sb->seq, 1); if (test_bit(BCH_FS_ERROR, &c->flags)) @@ -856,7 +927,7 @@ int bch2_write_super(struct bch_fs *c) le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); - ret = -EROFS; + ret = -BCH_ERR_erofs_sb_err; goto out; } @@ -866,7 +937,7 @@ int bch2_write_super(struct bch_fs *c) le64_to_cpu(ca->sb_read_scratch->seq), ca->disk_sb.seq); percpu_ref_put(&ca->io_ref); - ret = -EROFS; + ret = -BCH_ERR_erofs_sb_err; goto out; } } @@ -946,7 +1017,7 @@ static int bch2_sb_members_validate(struct bch_sb *sb, if ((void *) (mi->members + sb->nr_devices) > vstruct_end(&mi->field)) { prt_printf(err, "too many devices for section size"); - return -EINVAL; + return 
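
/*
 * read_one_super() above keeps its "read, discover the real size, grow the
 * buffer, re-read" loop (the reread: label), now returning a typed error from
 * the realloc instead of a bare -ENOMEM.  A self-contained sketch of the same
 * pattern against an in-memory blob standing in for the device:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* pretend on-disk layout: first byte is the total length of the record */
static const unsigned char disk[] = { 9, 'b', 'c', 'a', 'c', 'h', 'e', 'f', 's' };

static int read_record(unsigned char **buf, size_t *buf_size)
{
	size_t need;
reread:
	memcpy(*buf, disk, *buf_size < sizeof(disk) ? *buf_size : sizeof(disk));

	need = (*buf)[0];
	if (need > *buf_size) {
		unsigned char *n = realloc(*buf, need);
		if (!n)
			return -1;
		*buf = n;
		*buf_size = need;
		goto reread;
	}
	return 0;
}

int main(void)
{
	size_t size = 4;			/* deliberately too small at first */
	unsigned char *buf = malloc(size);

	if (!buf || read_record(&buf, &size))
		return 1;
	printf("record is %zu bytes: %.8s\n", size, (const char *)(buf + 1));
	free(buf);
	return 0;
}
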
-BCH_ERR_invalid_sb_members; } for (i = 0; i < sb->nr_devices; i++) { @@ -958,28 +1029,28 @@ static int bch2_sb_members_validate(struct bch_sb *sb, if (le64_to_cpu(m->nbuckets) > LONG_MAX) { prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", i, le64_to_cpu(m->nbuckets), LONG_MAX); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } if (le64_to_cpu(m->nbuckets) - le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } if (le16_to_cpu(m->bucket_size) < le16_to_cpu(sb->block_size)) { prt_printf(err, "device %u: bucket size %u smaller than block size %u", i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } if (le16_to_cpu(m->bucket_size) < BCH_SB_BTREE_NODE_SIZE(sb)) { prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); - return -EINVAL; + return -BCH_ERR_invalid_sb_members; } } @@ -1111,12 +1182,12 @@ static int bch2_sb_crypt_validate(struct bch_sb *sb, if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&crypt->field), sizeof(*crypt)); - return -EINVAL; + return -BCH_ERR_invalid_sb_crypt; } if (BCH_CRYPT_KDF_TYPE(crypt)) { prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); - return -EINVAL; + return -BCH_ERR_invalid_sb_crypt; } return 0; @@ -1163,6 +1234,32 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle return 0; } +/* Downgrade if superblock is at a higher version than currently supported: */ +void bch2_sb_maybe_downgrade(struct bch_fs *c) +{ + lockdep_assert_held(&c->sb_lock); + + /* + * Downgrade, if superblock is at a higher version than currently + * supported: + */ + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + if (c->sb.version > bcachefs_metadata_version_current) + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + if (c->sb.version_min > bcachefs_metadata_version_current) + c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); +} + +void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) +{ + lockdep_assert_held(&c->sb_lock); + + c->disk_sb.sb->version = cpu_to_le16(new_version); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); +} + int bch2_fs_mark_dirty(struct bch_fs *c) { int ret; @@ -1174,8 +1271,10 @@ int bch2_fs_mark_dirty(struct bch_fs *c) mutex_lock(&c->sb_lock); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_sb_maybe_downgrade(c); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); - c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); + ret = bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -1254,7 +1353,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c, u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); - memcpy(&u->r, e, replicas_entry_bytes(e)); + unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), + "embedded variable length struct"); } for_each_member_device(ca, c, dev) { @@ -1352,7 +1452,7 @@ static int bch2_sb_clean_validate(struct bch_sb *sb, if 
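
/*
 * Most of the -EINVAL/-ENOMEM/-EROFS returns in this file become specific
 * -BCH_ERR_* codes, which bch2_err_str() can turn back into a readable name.
 * A self-contained sketch of that style of private, self-describing error
 * code; the numeric layout here is illustrative only, not what errcode.h
 * actually does.
 */
#include <stdio.h>

#define TOY_ERRCODES()				\
	x(invalid_sb_layout)			\
	x(invalid_sb_version)			\
	x(invalid_sb_members_missing)

#define TOY_ERR_START	2048		/* above the normal errno range */

enum toy_errcode {
	TOY_ERR_START_ = TOY_ERR_START - 1,
#define x(n) TOY_ERR_##n,
	TOY_ERRCODES()
#undef x
};

static const char * const toy_err_names[] = {
#define x(n) #n,
	TOY_ERRCODES()
#undef x
};

static const char *toy_err_str(int err)
{
	unsigned idx = -err - TOY_ERR_START;

	if (err >= -TOY_ERR_START ||
	    idx >= sizeof(toy_err_names) / sizeof(toy_err_names[0]))
		return "(foreign error)";
	return toy_err_names[idx];
}

int main(void)
{
	printf("%s\n", toy_err_str(-TOY_ERR_invalid_sb_version));
	printf("%s\n", toy_err_str(-22));	/* a plain -EINVAL stays opaque */
	return 0;
}
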
(vstruct_bytes(&clean->field) < sizeof(*clean)) { prt_printf(err, "wrong size (got %zu should be %zu)", vstruct_bytes(&clean->field), sizeof(*clean)); - return -EINVAL; + return -BCH_ERR_invalid_sb_clean; } return 0; @@ -1393,21 +1493,27 @@ static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { #undef x }; +static const struct bch_sb_field_ops bch2_sb_field_null_ops; + +static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) +{ + return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) + ? bch2_sb_field_ops[type] + : &bch2_sb_field_null_ops; +} + static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, struct printbuf *err) { unsigned type = le32_to_cpu(f->type); struct printbuf field_err = PRINTBUF; + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); int ret; - if (type >= BCH_SB_FIELD_NR) - return 0; - - ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); + ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; if (ret) { prt_printf(err, "Invalid superblock section %s: %s", - bch2_sb_fields[type], - field_err.buf); + bch2_sb_fields[type], field_err.buf); prt_newline(err); bch2_sb_field_to_text(err, sb, f); } @@ -1420,13 +1526,12 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { unsigned type = le32_to_cpu(f->type); - const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR - ? bch2_sb_field_ops[type] : NULL; + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - if (ops) + if (type < BCH_SB_FIELD_NR) prt_printf(out, "%s", bch2_sb_fields[type]); else prt_printf(out, "(unknown field %u)", type); @@ -1434,9 +1539,9 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, " (size %zu):", vstruct_bytes(f)); prt_newline(out); - if (ops && ops->to_text) { + if (ops->to_text) { printbuf_indent_add(out, 2); - bch2_sb_field_ops[type]->to_text(out, sb, f); + ops->to_text(out, sb, f); printbuf_indent_sub(out, 2); } } @@ -1507,12 +1612,17 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_str(out, "Version:"); prt_tab(out); - prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version)]); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_newline(out); + + prt_str(out, "Version upgrade complete:"); + prt_tab(out); + bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); prt_printf(out, "Oldest version on disk:"); prt_tab(out); - prt_printf(out, "%s", bch2_metadata_versions[le16_to_cpu(sb->version_min)]); + bch2_version_to_text(out, le16_to_cpu(sb->version_min)); prt_newline(out); prt_printf(out, "Created:"); diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 14a25f6..904adea 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -9,6 +9,19 @@ #include +static inline bool bch2_version_compatible(u16 version) +{ + return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && + version >= bcachefs_metadata_version_min; +} + +void bch2_version_to_text(struct printbuf *, unsigned); +unsigned bch2_latest_compatible_version(unsigned); + +u64 bch2_upgrade_recovery_passes(struct bch_fs *c, + unsigned, + unsigned); + struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, enum bch_sb_field_type, unsigned); @@ -79,7 +92,7 @@ static inline void bch2_check_set_feature(struct bch_fs 
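
/*
 * bch2_sb_field_type_ops() above replaces the scattered "if (type >= NR)
 * bail" checks with a static empty ops struct, so callers can always
 * dereference the returned pointer and only need to test the individual
 * hooks.  A minimal sketch of that null-ops fallback (names invented):
 */
#include <stdio.h>

struct toy_ops {
	int (*validate)(void);
	void (*to_text)(void);
};

static int validate_members(void) { puts("validating members"); return 0; }

static const struct toy_ops toy_field_ops[] = {
	{ .validate = validate_members },	/* type 0: has a validate hook */
	{ 0 },					/* type 1: no hooks */
};

static const struct toy_ops toy_null_ops;

static const struct toy_ops *toy_field_type_ops(unsigned type)
{
	return type < sizeof(toy_field_ops) / sizeof(toy_field_ops[0])
		? &toy_field_ops[type]
		: &toy_null_ops;
}

int main(void)
{
	for (unsigned type = 0; type < 3; type++) {
		const struct toy_ops *ops = toy_field_type_ops(type);

		/* ops is never NULL, so only the hook itself needs checking: */
		int ret = ops->validate ? ops->validate() : 0;

		printf("type %u -> %d\n", type, ret);
	}
	return 0;
}
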
*c, unsigned feat) static inline bool bch2_member_exists(struct bch_member *m) { - return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); } static inline bool bch2_dev_exists(struct bch_sb *sb, @@ -104,7 +117,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), - .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), + .valid = bch2_member_exists(mi), }; } @@ -115,6 +128,9 @@ void bch2_journal_super_entries_add_common(struct bch_fs *, int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); +void bch2_sb_maybe_downgrade(struct bch_fs *); +void bch2_sb_upgrade(struct bch_fs *, unsigned); + int bch2_fs_mark_dirty(struct bch_fs *); void bch2_fs_mark_clean(struct bch_fs *); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 5be4c40..eee5696 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -16,11 +16,13 @@ #include "btree_key_cache.h" #include "btree_update_interior.h" #include "btree_io.h" +#include "btree_write_buffer.h" #include "buckets_waiting_for_journal.h" #include "chardev.h" #include "checksum.h" #include "clock.h" #include "compress.h" +#include "counters.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -37,6 +39,7 @@ #include "move.h" #include "migrate.h" #include "movinggc.h" +#include "nocow_locking.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" @@ -45,7 +48,7 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" -#include "counters.h" +#include "trace.h" #include #include @@ -54,13 +57,10 @@ #include #include #include -#include #include #include #include -#include - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); @@ -109,7 +109,7 @@ static struct kset *bcachefs_kset; static LIST_HEAD(bch_fs_list); static DEFINE_MUTEX(bch_fs_list_lock); -static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); +DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); @@ -139,20 +139,20 @@ found: return c; } -static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) +static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) { struct bch_fs *c; lockdep_assert_held(&bch_fs_list_lock); list_for_each_entry(c, &bch_fs_list, list) - if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) + if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) return c; return NULL; } -struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) +struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) { struct bch_fs *c; @@ -205,11 +205,15 @@ static void __bch2_fs_read_only(struct bch_fs *c) unsigned i, clean_passes = 0; u64 seq = 0; + bch2_fs_ec_stop(c); + bch2_open_buckets_stop(c, NULL, true); bch2_rebalance_stop(c); bch2_copygc_stop(c); bch2_gc_thread_stop(c); + bch2_fs_ec_flush(c); - bch_verbose(c, "flushing journal and stopping allocators"); + bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", + journal_cur_seq(&c->journal)); do { clean_passes++; @@ -223,7 +227,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) } } while (clean_passes < 2); - bch_verbose(c, "flushing journal and stopping allocators complete"); + bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", + journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) @@ -237,13 +242,15 @@ static void 
__bch2_fs_read_only(struct bch_fs *c) bch2_dev_allocator_remove(c, ca); } +#ifndef BCH_WRITE_REF_DEBUG static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - wake_up(&bch_read_only_wait); + wake_up(&bch2_read_only_wait); } +#endif void bch2_fs_read_only(struct bch_fs *c) { @@ -258,9 +265,13 @@ void bch2_fs_read_only(struct bch_fs *c) * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ + set_bit(BCH_FS_GOING_RO, &c->flags); +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); - - cancel_work_sync(&c->ec_stripe_delete_work); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + bch2_write_ref_put(c, i); +#endif /* * If we're not doing an emergency shutdown, we want to wait on @@ -273,16 +284,17 @@ void bch2_fs_read_only(struct bch_fs *c) * we do need to wait on them before returning and signalling * that going RO is complete: */ - wait_event(bch_read_only_wait, + wait_event(bch2_read_only_wait, test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); __bch2_fs_read_only(c); - wait_event(bch_read_only_wait, + wait_event(bch2_read_only_wait, test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + clear_bit(BCH_FS_GOING_RO, &c->flags); if (!bch2_journal_error(&c->journal) && !test_bit(BCH_FS_ERROR, &c->flags) && @@ -290,6 +302,11 @@ void bch2_fs_read_only(struct bch_fs *c) test_bit(BCH_FS_STARTED, &c->flags) && test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && !c->opts.norecovery) { + BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); + BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); + BUG_ON(c->btree_write_buffer.state.nr); + bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); } @@ -319,7 +336,7 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); - wake_up(&bch_read_only_wait); + wake_up(&bch2_read_only_wait); return ret; } @@ -327,6 +344,19 @@ static int bch2_fs_read_write_late(struct bch_fs *c) { int ret; + /* + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. 
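
/*
 * Under BCH_WRITE_REF_DEBUG the single percpu_ref is replaced by one counter
 * per reason a writer might hold the filesystem open, which is what the
 * per-index puts in bch2_fs_read_only() and the write_refs sysfs file report
 * on.  A userspace sketch of that accounting; the reason names and the
 * going_ro check are illustrative, not the kernel implementation.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum toy_write_ref {
	TOY_WRITE_REF_journal,
	TOY_WRITE_REF_sysfs,
	TOY_WRITE_REF_NR,
};

static const char * const toy_write_ref_names[] = { "journal", "sysfs" };

struct toy_fs {
	bool		going_ro;
	atomic_long	writes[TOY_WRITE_REF_NR];
};

static bool toy_write_ref_tryget(struct toy_fs *c, enum toy_write_ref ref)
{
	if (c->going_ro)
		return false;
	atomic_fetch_add(&c->writes[ref], 1);
	return true;
}

static void toy_write_ref_put(struct toy_fs *c, enum toy_write_ref ref)
{
	atomic_fetch_sub(&c->writes[ref], 1);
}

int main(void)
{
	struct toy_fs c = { 0 };

	toy_write_ref_tryget(&c, TOY_WRITE_REF_journal);
	toy_write_ref_tryget(&c, TOY_WRITE_REF_sysfs);
	toy_write_ref_put(&c, TOY_WRITE_REF_sysfs);

	for (unsigned i = 0; i < TOY_WRITE_REF_NR; i++)
		printf("%-8s %ld\n", toy_write_ref_names[i],
		       atomic_load(&c.writes[i]));
	return 0;
}
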
+ * + * Ideally we'd start copygc/rebalance earlier instead of waiting for + * all of recovery/fsck to complete: + */ + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; + } + ret = bch2_rebalance_start(c); if (ret) { bch_err(c, "error starting rebalance thread"); @@ -344,20 +374,21 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); - return -EROFS; + return -BCH_ERR_erofs_unfixed_errors; } if (test_bit(BCH_FS_RW, &c->flags)) return 0; + if (c->opts.norecovery) + return -BCH_ERR_erofs_norecovery; + /* * nochanges is used for fsck -n mode - we have to allow going rw * during recovery for that to work: */ - if (c->opts.norecovery || - (c->opts.nochanges && - (!early || c->opts.read_only))) - return -EROFS; + if (c->opts.nochanges && (!early || c->opts.read_only)) + return -BCH_ERR_erofs_nochanges; bch_info(c, "going read-write"); @@ -367,6 +398,14 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + /* + * First journal write must be a flush write: after a clean shutdown we + * don't read the journal, so the first journal write may end up + * overwriting whatever was there previously, and there must always be + * at least one non-flush write in the journal or recovery will fail: + */ + set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + for_each_rw_member(ca, c, i) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); @@ -377,26 +416,27 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) return ret; } - ret = bch2_copygc_start(c); - if (ret) { - bch_err(c, "error starting copygc thread"); - return ret; - } - - schedule_work(&c->ec_stripe_delete_work); - - bch2_do_discards(c); - bch2_do_invalidates(c); - if (!early) { ret = bch2_fs_read_write_late(c); if (ret) goto err; } +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { + BUG_ON(atomic_long_read(&c->writes[i])); + atomic_long_inc(&c->writes[i]); + } +#endif set_bit(BCH_FS_RW, &c->flags); set_bit(BCH_FS_WAS_RW, &c->flags); + + bch2_do_discards(c); + bch2_do_invalidates(c); + bch2_do_stripe_deletes(c); + bch2_do_pending_node_rewrites(c); return 0; err: __bch2_fs_read_only(c); @@ -425,6 +465,7 @@ static void __bch2_fs_free(struct bch_fs *c) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); + bch2_free_pending_node_rewrites(c); bch2_fs_counters_exit(c); bch2_fs_snapshots_exit(c); bch2_fs_quota_exit(c); @@ -444,25 +485,30 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_fs_compress_exit(c); bch2_journal_keys_free(&c->journal_keys); bch2_journal_entries_free(c); + bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); + free_percpu(c->online_reserved); if (c->btree_paths_bufs) for_each_possible_cpu(cpu) kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); - free_percpu(c->online_reserved); + darray_exit(&c->btree_roots_extra); free_percpu(c->btree_paths_bufs); free_percpu(c->pcpu); mempool_exit(&c->large_bkey_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_exit(&c->writes); +#endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); kfree(c->unused_inode_hints); - free_heap(&c->copygc_heap); + if (c->write_ref_wq) + destroy_workqueue(c->write_ref_wq); if 
(c->io_complete_wq) destroy_workqueue(c->io_complete_wq); if (c->copygc_wq) @@ -523,9 +569,12 @@ void __bch2_fs_stop(struct bch_fs *c) cancel_work_sync(&c->read_only_work); - for (i = 0; i < c->sb.nr_devices; i++) - if (c->devs[i]) - bch2_free_super(&c->devs[i]->disk_sb); + for (i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + + if (ca) + bch2_free_super(&ca->disk_sb); + } } void bch2_fs_free(struct bch_fs *c) @@ -612,11 +661,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) unsigned i, iter_size; int ret = 0; - pr_verbose_init(opts, ""); - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { - c = ERR_PTR(-ENOMEM); + c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; } @@ -648,11 +695,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_interior_update_init_early(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); bch2_fs_quota_init(c); bch2_fs_ec_init_early(c); + bch2_fs_move_init(c); INIT_LIST_HEAD(&c->list); @@ -671,21 +720,15 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); - INIT_LIST_HEAD(&c->ec_stripe_head_list); - mutex_init(&c->ec_stripe_head_lock); - - INIT_LIST_HEAD(&c->ec_stripe_new_list); - mutex_init(&c->ec_stripe_new_lock); - - INIT_LIST_HEAD(&c->data_progress_list); - mutex_init(&c->data_progress_lock); - - spin_lock_init(&c->ec_stripes_heap_lock); - seqcount_init(&c->gc_pos_lock); seqcount_init(&c->usage_lock); + sema_init(&c->io_in_flight, 128); + + INIT_LIST_HEAD(&c->vfs_inodes_list); + mutex_init(&c->vfs_inodes_lock); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; @@ -714,16 +757,16 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) strscpy(c->name, name.buf, sizeof(c->name)); printbuf_exit(&name); - ret = name.allocation_failure ? -ENOMEM : 0; + ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; /* Compat: */ - if (sb->version <= bcachefs_metadata_version_inode_v2 && + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); - if (sb->version <= bcachefs_metadata_version_inode_v2 && + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); @@ -734,9 +777,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_opts_apply(&c->opts, opts); - /* key cache currently disabled for inodes, because of snapshots: */ - c->opts.inodes_use_key_cache = 0; - c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; @@ -757,29 +797,33 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0)) || +#ifndef BCH_WRITE_REF_DEBUG percpu_ref_init(&c->writes, bch2_writes_disabled, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +#endif mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || - !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || !(c->online_reserved = alloc_percpu(u64)) || + !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { - ret = -ENOMEM; + ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } @@ -793,8 +837,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_interior_update_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: bch2_fs_subvolumes_init(c) ?: bch2_fs_io_init(c) ?: + bch2_fs_nocow_locking_init(c) ?: bch2_fs_encryption_init(c) ?: bch2_fs_compress_init(c) ?: bch2_fs_ec_init(c) ?: @@ -825,7 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; out: - pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err: bch2_fs_free(c); @@ -840,9 +885,13 @@ static void print_mount_opts(struct bch_fs *c) struct printbuf p = PRINTBUF; bool first = true; + prt_str(&p, "mounting version "); + bch2_version_to_text(&p, c->sb.version); + if (c->opts.read_only) { - prt_printf(&p, "ro"); + prt_str(&p, " opts="); first = false; + prt_printf(&p, "ro"); } for (i = 0; i < bch2_opts_nr; i++) { @@ -855,16 +904,12 @@ static void print_mount_opts(struct bch_fs *c) if (v == 
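
/*
 * The compat checks in bch2_fs_alloc() above gained le16_to_cpu(): on-disk
 * superblock fields are little-endian, so comparing the raw value against a
 * host-order constant silently does the wrong thing on big-endian machines.
 * A self-contained sketch of the conversion; toy_le16_to_cpu() is only an
 * illustration of what the kernel helper does.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t toy_le16_to_cpu(const uint8_t le[2])
{
	/* assemble from explicit byte positions: endian-independent */
	return (uint16_t) (le[0] | (le[1] << 8));
}

int main(void)
{
	/* version 0x0102 as it sits on disk, least significant byte first */
	uint8_t on_disk[2] = { 0x02, 0x01 };
	uint16_t raw;

	memcpy(&raw, on_disk, sizeof(raw));	/* host-order reinterpretation */

	printf("raw load:    0x%04x (host dependent)\n", raw);
	printf("le16_to_cpu: 0x%04x (always 0x0102)\n", toy_le16_to_cpu(on_disk));
	return 0;
}
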
bch2_opt_get_by_id(&bch2_opts_default, i)) continue; - if (!first) - prt_printf(&p, ","); + prt_str(&p, first ? " opts=" : ","); first = false; bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } - if (!p.pos) - prt_printf(&p, "(null)"); - - bch_info(c, "mounted version=%s opts=%s", bch2_metadata_versions[c->sb.version], p.buf); + bch_info(c, "%s", p.buf); printbuf_exit(&p); } @@ -874,7 +919,9 @@ int bch2_fs_start(struct bch_fs *c) struct bch_dev *ca; time64_t now = ktime_get_real_seconds(); unsigned i; - int ret = -EINVAL; + int ret; + + print_mount_opts(c); down_write(&c->state_lock); @@ -911,9 +958,9 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - ret = -EINVAL; if (bch2_fs_init_fault("fs_start")) { bch_err(c, "fs_start fault injected"); + ret = -EINVAL; goto err; } @@ -929,53 +976,49 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - print_mount_opts(c); ret = 0; out: up_write(&c->state_lock); return ret; err: bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); - - if (ret < -BCH_ERR_START) - ret = -EINVAL; goto out; } -static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) +static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) { struct bch_sb_field_members *sb_mi; sb_mi = bch2_sb_get_members(sb); if (!sb_mi) - return "Invalid superblock: member info area missing"; + return -BCH_ERR_member_info_missing; if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return "mismatched block size"; + return -BCH_ERR_mismatched_block_size; if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return "new cache bucket size is too small"; + return -BCH_ERR_bucket_size_too_small; - return NULL; + return 0; } -static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) { struct bch_sb *newest = le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; struct bch_sb_field_members *mi = bch2_sb_get_members(newest); - if (uuid_le_cmp(fs->uuid, sb->uuid)) - return "device not a member of filesystem"; + if (!uuid_equal(&fs->uuid, &sb->uuid)) + return -BCH_ERR_device_not_a_member_of_filesystem; if (!bch2_dev_exists(newest, mi, sb->dev_idx)) - return "device has been removed"; + return -BCH_ERR_device_has_been_removed; if (fs->block_size != sb->block_size) - return "mismatched block size"; + return -BCH_ERR_mismatched_block_size; - return NULL; + return 0; } /* Device startup/shutdown: */ @@ -1144,8 +1187,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) struct bch_dev *ca = NULL; int ret = 0; - pr_verbose_init(c->opts, ""); - if (bch2_fs_init_fault("dev_alloc")) goto err; @@ -1156,14 +1197,11 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; bch2_dev_attach(c, ca, dev_idx); -out: - pr_verbose_init(c->opts, "ret %i", ret); return ret; err: if (ca) bch2_dev_free(ca); - ret = -ENOMEM; - goto out; + return -BCH_ERR_ENOMEM_dev_alloc; } static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) @@ -1173,23 +1211,17 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", sb->sb->dev_idx); - return -EINVAL; + return -BCH_ERR_device_already_online; } if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { bch_err(ca, "cannot online: device too small"); - return -EINVAL; + return -BCH_ERR_device_size_too_small; } BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); - if (get_capacity(sb->bdev->bd_disk) < - ca->mi.bucket_size * ca->mi.nbuckets) { - bch_err(ca, "device too small"); - return -EINVAL; - } - ret = bch2_dev_journal_init(ca, sb->sb); if (ret) return ret; @@ -1358,7 +1390,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, return 0; if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return -EINVAL; + return -BCH_ERR_device_state_not_allowed; if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); @@ -1412,6 +1444,8 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, BTREE_TRIGGER_NORUN, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, BTREE_TRIGGER_NORUN, NULL); if (ret) bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); @@ -1423,7 +1457,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_sb_field_members *mi; unsigned dev_idx = ca->dev_idx, data; - int ret = -EINVAL; + int ret; down_write(&c->state_lock); @@ -1435,6 +1469,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); + ret = -BCH_ERR_device_state_not_allowed; goto err; } @@ -1492,6 +1527,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) bch2_dev_free(ca); + /* + * At this point the device object has been removed in-core, but the + * on-disk journal might still refer to the device index via sb device + * usage entries. Recovery fails if it sees usage information for an + * invalid device. 
Flush journal pins to push the back of the journal + * past now invalid device index references before we update the + * superblock, but after the device object has been removed so any + * further journal writes elide usage info for the device. + */ + bch2_journal_flush_all_pins(&c->journal); + /* * Free this device's slot in the bch_member array - all pointers to * this device must be gone: @@ -1520,7 +1566,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); struct bch_sb_handle sb; - const char *err; struct bch_dev *ca = NULL; struct bch_sb_field_members *mi; struct bch_member dev_mi; @@ -1545,10 +1590,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } } - err = bch2_dev_may_add(sb.sb, c); - if (err) { - bch_err(c, "device add error: %s", err); - ret = -EINVAL; + ret = bch2_dev_may_add(sb.sb, c); + if (ret) { + bch_err(c, "device add error: %s", bch2_err_str(ret)); goto err; } @@ -1682,7 +1726,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path) struct bch_sb_field_members *mi; struct bch_dev *ca; unsigned dev_idx; - const char *err; int ret; down_write(&c->state_lock); @@ -1695,9 +1738,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); - if (err) { - bch_err(c, "error bringing %s online: %s", path, err); + ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); + if (ret) { + bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret)); goto err; } @@ -1726,12 +1769,16 @@ int bch2_dev_online(struct bch_fs *c, const char *path) bch2_write_super(c); mutex_unlock(&c->sb_lock); + ret = bch2_fs_freespace_init(c); + if (ret) + bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + up_write(&c->state_lock); return 0; err: up_write(&c->state_lock); bch2_free_super(&sb); - return -EINVAL; + return ret; } int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) @@ -1747,7 +1794,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); up_write(&c->state_lock); - return -EINVAL; + return -BCH_ERR_device_state_not_allowed; } __bch2_dev_offline(c, ca); @@ -1773,7 +1820,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { bch_err(ca, "New size larger than device"); - ret = -EINVAL; + ret = -BCH_ERR_device_size_too_small; goto err; } @@ -1810,7 +1857,7 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) for_each_member_device_rcu(ca, c, i, NULL) if (!strcmp(name, ca->name)) goto found; - ca = ERR_PTR(-ENOENT); + ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); found: rcu_read_unlock(); @@ -1826,15 +1873,12 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_fs *c = NULL; struct bch_sb_field_members *mi; unsigned i, best_sb = 0; - const char *err; struct printbuf errbuf = PRINTBUF; int ret = 0; if (!try_module_get(THIS_MODULE)) return ERR_PTR(-ENODEV); - pr_verbose_init(opts, ""); - if (!nr_devices) { ret = -EINVAL; goto err; @@ -1870,8 +1914,8 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, continue; } - err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); - if (err) + ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + if (ret) goto err_print; i++; } @@ -1892,9 +1936,10 @@ struct bch_fs *bch2_fs_open(char * const 
*devices, unsigned nr_devices, } up_write(&c->state_lock); - err = "insufficient devices"; - if (!bch2_fs_may_start(c)) + if (!bch2_fs_may_start(c)) { + ret = -BCH_ERR_insufficient_devices_to_start; goto err_print; + } if (!c->opts.nostart) { ret = bch2_fs_start(c); @@ -1905,12 +1950,10 @@ out: kfree(sb); printbuf_exit(&errbuf); module_put(THIS_MODULE); - pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); return c; err_print: pr_err("bch_fs_open err opening %s: %s", - devices[0], err); - ret = -EINVAL; + devices[0], bch2_err_str(ret)); err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); @@ -1957,5 +2000,8 @@ err: BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM +static unsigned bch2_metadata_version = bcachefs_metadata_version_current; +module_param_named(version, bch2_metadata_version, uint, 0400); + module_exit(bcachefs_exit); module_init(bcachefs_init); diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 3c83e9b..36bcb9e 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -223,7 +223,7 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) } struct bch_fs *bch2_dev_to_fs(dev_t); -struct bch_fs *bch2_uuid_to_fs(uuid_le); +struct bch_fs *bch2_uuid_to_fs(__uuid_t); bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, enum bch_member_state, int); @@ -251,7 +251,8 @@ int bch2_fs_read_write_early(struct bch_fs *); */ static inline void bch2_fs_lazy_rw(struct bch_fs *c) { - if (percpu_ref_is_zero(&c->writes)) + if (!test_bit(BCH_FS_RW, &c->flags) && + !test_bit(BCH_FS_WAS_RW, &c->flags)) bch2_fs_read_write_early(c); } diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 647d018..740305e 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -27,6 +27,8 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "movinggc.h" +#include "nocow_locking.h" #include "opts.h" #include "rebalance.h" #include "replicas.h" @@ -34,7 +36,6 @@ #include "tests.h" #include -#include #include #include @@ -194,8 +195,35 @@ read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(stripes_heap); read_attribute(open_buckets); +read_attribute(open_buckets_partial); +read_attribute(write_points); +read_attribute(nocow_lock_table); + +#ifdef BCH_WRITE_REF_DEBUG +read_attribute(write_refs); + +static const char * const bch2_write_refs[] = { +#define x(n) #n, + BCH_WRITE_REFS() +#undef x + NULL +}; + +static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_tabstop_push(out, 24); + + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { + prt_str(out, bch2_write_refs[i]); + prt_tab(out); + prt_printf(out, "%li", atomic_long_read(&c->writes[i])); + prt_newline(out); + } +} +#endif read_attribute(internal_uuid); +read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); @@ -221,6 +249,7 @@ read_attribute(io_timers_read); read_attribute(io_timers_write); read_attribute(data_jobs); +read_attribute(moving_ctxts); #ifdef CONFIG_BCACHEFS_TESTS write_attribute(perf_test); @@ -250,25 +279,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } -static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) -{ - long ret = 0; - struct bch_move_stats *stats; - - mutex_lock(&c->data_progress_lock); - list_for_each_entry(stats, &c->data_progress_list, list) { - prt_printf(out, "%s: data type %s btree_id %s position: ", - stats->name, - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); - prt_printf(out, 
"%s", "\n"); - } - - mutex_unlock(&c->data_progress_lock); - return ret; -} - static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans trans; @@ -369,7 +379,7 @@ static void bch2_btree_wakeup_all(struct bch_fs *c) { struct btree_trans *trans; - mutex_lock(&c->btree_trans_lock); + seqmutex_lock(&c->btree_trans_lock); list_for_each_entry(trans, &c->btree_trans_list, list) { struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); @@ -377,7 +387,7 @@ static void bch2_btree_wakeup_all(struct bch_fs *c) six_lock_wakeup_all(&b->lock); } - mutex_unlock(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); } SHOW(bch2_fs) @@ -401,9 +411,9 @@ SHOW(bch2_fs) sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ - sysfs_hprint(copy_gc_wait, - max(0LL, c->copygc_wait - - atomic64_read(&c->io_clock[WRITE].now)) << 9); + + if (attr == &sysfs_copy_gc_wait) + bch2_copygc_wait_to_text(out, c); if (attr == &sysfs_rebalance_work) bch2_rebalance_work_to_text(out, c); @@ -430,6 +440,12 @@ SHOW(bch2_fs) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c); + if (attr == &sysfs_open_buckets_partial) + bch2_open_buckets_partial_to_text(out, c); + + if (attr == &sysfs_write_points) + bch2_write_points_to_text(out, c); + if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); @@ -443,7 +459,21 @@ SHOW(bch2_fs) bch2_io_timers_to_text(out, &c->io_clock[WRITE]); if (attr == &sysfs_data_jobs) - data_progress_to_text(out, c); + bch2_data_jobs_to_text(out, c); + + if (attr == &sysfs_moving_ctxts) + bch2_fs_moving_ctxts_to_text(out, c); + +#ifdef BCH_WRITE_REF_DEBUG + if (attr == &sysfs_write_refs) + bch2_write_refs_to_text(out, c); +#endif + + if (attr == &sysfs_nocow_lock_table) + bch2_nocow_locks_to_text(out, &c->nocow_locks); + + if (attr == &sysfs_disk_groups) + bch2_disk_groups_to_text(out, c); return 0; } @@ -576,12 +606,12 @@ SHOW(bch2_fs_counters) counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ prt_printf(out, "since mount:"); \ prt_tab(out); \ - prt_human_readable_u64(out, counter_since_mount << 9); \ + prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ prt_printf(out, "since filesystem creation:"); \ prt_tab(out); \ - prt_human_readable_u64(out, counter << 9); \ + prt_human_readable_u64(out, counter); \ prt_newline(out); \ } BCH_PERSISTENT_COUNTERS() @@ -627,6 +657,12 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_new_stripes, &sysfs_stripes_heap, &sysfs_open_buckets, + &sysfs_open_buckets_partial, + &sysfs_write_points, +#ifdef BCH_WRITE_REF_DEBUG + &sysfs_write_refs, +#endif + &sysfs_nocow_lock_table, &sysfs_io_timers_read, &sysfs_io_timers_write, @@ -646,8 +682,11 @@ struct attribute *bch2_fs_internal_files[] = { sysfs_pd_controller_files(rebalance), &sysfs_data_jobs, + &sysfs_moving_ctxts, &sysfs_internal_uuid, + + &sysfs_disk_groups, NULL }; @@ -678,7 +717,7 @@ STORE(bch2_fs_opts_dir) * We don't need to take c->writes for correctness, but it eliminates an * unsightly error message in the dmesg log when we're RO: */ - if (unlikely(!percpu_ref_tryget_live(&c->writes))) + if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) return -EROFS; tmp = kstrdup(buf, GFP_KERNEL); @@ -708,7 +747,7 @@ STORE(bch2_fs_opts_dir) ret = size; err: - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); return ret; } SYSFS_OPS(bch2_fs_opts_dir); @@ -774,38 +813,100 @@ static 
void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) nr[c->open_buckets[i].data_type]++; - prt_printf(out, - "\t\t\t buckets\t sectors fragmented\n" - "capacity\t%16llu\n", - ca->mi.nbuckets - ca->mi.first_bucket); - - for (i = 0; i < BCH_DATA_NR; i++) - prt_printf(out, "%-16s%16llu%16llu%16llu\n", - bch2_data_types[i], stats.d[i].buckets, - stats.d[i].sectors, stats.d[i].fragmented); - - prt_printf(out, - "ec\t\t%16llu\n" - "\n" - "freelist_wait\t\t%s\n" - "open buckets allocated\t%u\n" - "open buckets this dev\t%u\n" - "open buckets total\t%u\n" - "open_buckets_wait\t%s\n" - "open_buckets_btree\t%u\n" - "open_buckets_user\t%u\n" - "buckets_to_invalidate\t%llu\n" - "btree reserve cache\t%u\n", - stats.buckets_ec, - c->freelist_wait.list.first ? "waiting" : "empty", - OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, - ca->nr_open_buckets, - OPEN_BUCKETS_COUNT, - c->open_buckets_wait.list.first ? "waiting" : "empty", - nr[BCH_DATA_btree], - nr[BCH_DATA_user], - should_invalidate_buckets(ca, stats), - c->btree_reserve_cache_nr); + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + + prt_tab(out); + prt_str(out, "buckets"); + prt_tab_rjust(out); + prt_str(out, "sectors"); + prt_tab_rjust(out); + prt_str(out, "fragmented"); + prt_tab_rjust(out); + prt_newline(out); + + for (i = 0; i < BCH_DATA_NR; i++) { + prt_str(out, bch2_data_types[i]); + prt_tab(out); + prt_u64(out, stats.d[i].buckets); + prt_tab_rjust(out); + prt_u64(out, stats.d[i].sectors); + prt_tab_rjust(out); + prt_u64(out, stats.d[i].fragmented); + prt_tab_rjust(out); + prt_newline(out); + } + + prt_str(out, "ec"); + prt_tab(out); + prt_u64(out, stats.buckets_ec); + prt_tab_rjust(out); + prt_newline(out); + + prt_newline(out); + + prt_printf(out, "reserves:"); + prt_newline(out); + for (i = 0; i < BCH_WATERMARK_NR; i++) { + prt_str(out, bch2_watermarks[i]); + prt_tab(out); + prt_u64(out, bch2_dev_buckets_reserved(ca, i)); + prt_tab_rjust(out); + prt_newline(out); + } + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 24); + + prt_str(out, "freelist_wait"); + prt_tab(out); + prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty"); + prt_newline(out); + + prt_str(out, "open buckets allocated"); + prt_tab(out); + prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); + prt_newline(out); + + prt_str(out, "open buckets this dev"); + prt_tab(out); + prt_u64(out, ca->nr_open_buckets); + prt_newline(out); + + prt_str(out, "open buckets total"); + prt_tab(out); + prt_u64(out, OPEN_BUCKETS_COUNT); + prt_newline(out); + + prt_str(out, "open_buckets_wait"); + prt_tab(out); + prt_str(out, c->open_buckets_wait.list.first ? 
"waiting" : "empty"); + prt_newline(out); + + prt_str(out, "open_buckets_btree"); + prt_tab(out); + prt_u64(out, nr[BCH_DATA_btree]); + prt_newline(out); + + prt_str(out, "open_buckets_user"); + prt_tab(out); + prt_u64(out, nr[BCH_DATA_user]); + prt_newline(out); + + prt_str(out, "buckets_to_invalidate"); + prt_tab(out); + prt_u64(out, should_invalidate_buckets(ca, stats)); + prt_newline(out); + + prt_str(out, "btree reserve cache"); + prt_tab(out); + prt_u64(out, c->btree_reserve_cache_nr); + prt_newline(out); } static const char * const bch2_rw[] = { diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c index 43f974e..1d4b0a5 100644 --- a/libbcachefs/tests.c +++ b/libbcachefs/tests.c @@ -15,13 +15,14 @@ static void delete_test_keys(struct bch_fs *c) int ret; ret = bch2_btree_delete_range(c, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), SPOS_MAX, - 0, - NULL); + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), + 0, NULL); BUG_ON(ret); ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), SPOS_MAX, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), 0, NULL); BUG_ON(ret); } @@ -46,7 +47,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "%s(): update error in: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "update error"); goto err; } @@ -55,7 +56,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "%s(): delete error (first): %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "delete error (first)"); goto err; } @@ -64,7 +65,7 @@ static int test_delete(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "%s(): delete error (second): %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "delete error (second)"); goto err; } err: @@ -92,7 +93,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_trans_update(&trans, &iter, &k.k_i, 0)); if (ret) { - bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "update error"); goto err; } @@ -103,7 +104,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_btree_iter_traverse(&iter) ?: bch2_btree_delete_at(&trans, &iter, 0)); if (ret) { - bch_err(c, "%s(): delete error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "delete error"); goto err; } err: @@ -136,7 +137,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "insert error"); goto err; } } @@ -145,13 +146,14 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ({ + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(k.k->p.offset != i++); 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards"); goto err; } @@ -166,7 +168,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating backwards"); goto err; } @@ -202,7 +204,7 @@ static int 
test_iterate_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "insert error"); goto err; } } @@ -211,14 +213,15 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0, k, ({ + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); i = k.k->p.offset; 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards"); goto err; } @@ -234,7 +237,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating backwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating backwards"); goto err; } @@ -269,7 +272,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "insert error"); goto err; } } @@ -278,14 +281,15 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, ({ + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(k.k->p.offset != i); i += 2; 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards"); goto err; } @@ -295,8 +299,8 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i >= nr * 2) break; @@ -308,7 +312,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) 0; })); if (ret < 0) { - bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards by slots"); goto err; } ret = 0; @@ -342,7 +346,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) { - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "insert error"); goto err; } } @@ -351,15 +355,16 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), 0, k, ({ + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); BUG_ON(k.k->size != 8); i += 16; 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards"); goto err; } @@ -369,8 +374,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; - ret = for_each_btree_key2(&trans, iter, BTREE_ID_extents, - SPOS(0, 0, U32_MAX), + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_SLOTS, k, ({ if (i == nr) break; @@ -382,7 +387,7 @@ static int 
test_iterate_slots_extents(struct bch_fs *c, u64 nr) 0; })); if (ret) { - bch_err(c, "%s(): error iterating forwards by slots: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "error iterating forwards by slots"); goto err; } ret = 0; @@ -405,10 +410,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(&trans, &iter); @@ -426,10 +431,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(&trans, &iter); @@ -439,7 +444,7 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) /* extent unit tests */ -u64 test_version; +static u64 test_version; static int insert_test_extent(struct bch_fs *c, u64 start, u64 end) @@ -456,7 +461,7 @@ static int insert_test_extent(struct bch_fs *c, ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, NULL, 0); if (ret) - bch_err(c, "%s(): insert error: %s", __func__, bch2_err_str(ret)); + bch_err_fn(c, ret); return ret; } @@ -498,6 +503,36 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) __test_extent_overwrite(c, 32, 64, 32, 128); } +static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) +{ + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.inode = inum; + k.k_i.k.p.offset = start + len; + k.k_i.k.p.snapshot = snapid; + k.k_i.k.size = len; + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) +{ + return insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ + insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: + insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: + insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ + insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: + insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: + insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); +} + /* snapshot unit tests */ /* Test skipping over keys in unrelated snapshots: */ @@ -519,7 +554,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) bch2_trans_init(&trans, c, 0, 0); bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -555,7 +590,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr) ret = 
test_snapshot_filter(c, snapids[0], snapids[1]); if (ret) { - bch_err(c, "%s(): err from test_snapshot_filter: %s", __func__, bch2_err_str(ret)); + bch_err_msg(c, ret, "from test_snapshot_filter"); return ret; } @@ -568,7 +603,7 @@ static u64 test_rand(void) { u64 v; - prandom_bytes(&v, sizeof(v)); + get_random_bytes(&v, sizeof(v)); return v; } @@ -587,11 +622,9 @@ static int rand_insert(struct bch_fs *c, u64 nr) k.k.p.snapshot = U32_MAX; ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i)); - if (ret) { - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0)); + if (ret) break; - } } bch2_trans_exit(&trans); @@ -616,18 +649,16 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) } ret = commit_do(&trans, NULL, NULL, 0, - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?: - __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i)); - if (ret) { - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); + if (ret) break; - } } bch2_trans_exit(&trans); @@ -651,10 +682,8 @@ static int rand_lookup(struct bch_fs *c, u64 nr) lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); ret = bkey_err(k); - if (ret) { - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (ret) break; - } } bch2_trans_iter_exit(&trans, &iter); @@ -675,7 +704,7 @@ static int rand_mixed_trans(struct btree_trans *trans, k = bch2_btree_iter_peek(iter); ret = bkey_err(k); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err(trans->c, "%s(): lookup error: %s", __func__, bch2_err_str(ret)); + bch_err_msg(trans->c, ret, "lookup error"); if (ret) return ret; @@ -704,10 +733,8 @@ static int rand_mixed(struct bch_fs *c, u64 nr) rand = test_rand(); ret = commit_do(&trans, NULL, NULL, 0, rand_mixed_trans(&trans, &iter, &cookie, i, rand)); - if (ret) { - bch_err(c, "%s(): update error: %s", __func__, bch2_err_str(ret)); + if (ret) break; - } } bch2_trans_iter_exit(&trans, &iter); @@ -723,7 +750,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_INTENT); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + k = bch2_btree_iter_peek(&iter); ret = bkey_err(k); if (ret) goto err; @@ -750,10 +777,8 @@ static int rand_delete(struct bch_fs *c, u64 nr) ret = commit_do(&trans, NULL, NULL, 0, __do_delete(&trans, pos)); - if (ret) { - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); + if (ret) break; - } } bch2_trans_exit(&trans); @@ -762,88 +787,59 @@ static int rand_delete(struct bch_fs *c, u64 nr) static int 
seq_insert(struct bch_fs *c, u64 nr) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_i_cookie insert; - int ret = 0; bkey_cookie_init(&insert.k_i); - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, - NULL, NULL, 0, - ({ + NULL, NULL, 0, ({ if (iter.pos.offset >= nr) break; insert.k.p = iter.pos; bch2_trans_update(&trans, &iter, &insert.k_i, 0); - })); - if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - - bch2_trans_exit(&trans); - return ret; + }))); } static int seq_lookup(struct bch_fs *c, u64 nr) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - int ret = 0; - bch2_trans_init(&trans, c, 0, 0); - - ret = for_each_btree_key2(&trans, iter, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), 0, k, - 0); - if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - - bch2_trans_exit(&trans); - return ret; + return bch2_trans_run(c, + for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, + 0)); } static int seq_overwrite(struct bch_fs *c, u64 nr) { - struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, 0, 0); - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), BTREE_ITER_INTENT, k, - NULL, NULL, 0, - ({ + NULL, NULL, 0, ({ struct bkey_i_cookie u; bkey_reassemble(&u.k_i, k); bch2_trans_update(&trans, &iter, &u.k_i, 0); - })); - if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - - bch2_trans_exit(&trans); - return ret; + }))); } static int seq_delete(struct bch_fs *c, u64 nr) { - int ret; - - ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, - SPOS(0, 0, U32_MAX), SPOS_MAX, + return bch2_btree_delete_range(c, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), 0, NULL); - if (ret) - bch_err(c, "%s(): error %s", __func__, bch2_err_str(ret)); - return ret; } typedef int (*perf_test_fn)(struct bch_fs *, u64); @@ -935,6 +931,7 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, perf_test(test_extent_overwrite_back); perf_test(test_extent_overwrite_middle); perf_test(test_extent_overwrite_all); + perf_test(test_extent_create_overlapping); perf_test(test_snapshots); diff --git a/libbcachefs/trace.c b/libbcachefs/trace.c index 7057398..d294b3d 100644 --- a/libbcachefs/trace.c +++ b/libbcachefs/trace.c @@ -2,8 +2,10 @@ #include "bcachefs.h" #include "alloc_types.h" #include "buckets.h" +#include "btree_cache.h" #include "btree_iter.h" #include "btree_locking.h" +#include "btree_update_interior.h" #include "keylist.h" #include "opts.h" @@ -11,4 +13,4 @@ #include #define CREATE_TRACE_POINTS -#include +#include "trace.h" diff --git a/include/trace/events/bcachefs.h b/libbcachefs/trace.h similarity index 82% rename from include/trace/events/bcachefs.h rename to libbcachefs/trace.h index 47ba750..a743ab4 100644 --- a/include/trace/events/bcachefs.h +++ b/libbcachefs/trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM bcachefs -#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_BCACHE_H +#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BCACHEFS_H #include @@ -33,23 +33,18 @@ 
DECLARE_EVENT_CLASS(bpos, ); DECLARE_EVENT_CLASS(bkey, - TP_PROTO(const struct bkey *k), - TP_ARGS(k), + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k), TP_STRUCT__entry( - __field(u64, inode ) - __field(u64, offset ) - __field(u32, size ) + __string(k, k ) ), TP_fast_assign( - __entry->inode = k->p.inode; - __entry->offset = k->p.offset; - __entry->size = k->size; + __assign_str(k, k); ), - TP_printk("%llu:%llu len %u", __entry->inode, - __entry->offset, __entry->size) + TP_printk("%s", __get_str(k)) ); DECLARE_EVENT_CLASS(btree_node, @@ -425,7 +420,9 @@ TRACE_EVENT(btree_path_relock_fail, else scnprintf(__entry->node, sizeof(__entry->node), "%px", b); __entry->iter_lock_seq = path->l[level].lock_seq; - __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; ), TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", @@ -480,7 +477,9 @@ TRACE_EVENT(btree_path_upgrade_fail, __entry->read_count = c.n[SIX_LOCK_read]; __entry->intent_count = c.n[SIX_LOCK_read]; __entry->iter_lock_seq = path->l[level].lock_seq; - __entry->node_lock_seq = is_btree_node(path, level) ? path->l[level].b->c.lock.state.seq : 0; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; ), TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", @@ -514,34 +513,9 @@ DEFINE_EVENT(bch_fs, gc_gens_end, /* Allocator */ -TRACE_EVENT(bucket_alloc, - TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, - bool user, u64 bucket), - TP_ARGS(ca, alloc_reserve, user, bucket), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __array(char, reserve, 16 ) - __field(bool, user ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = ca->dev; - strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); - __entry->user = user; - __entry->bucket = bucket; - ), - - TP_printk("%d,%d reserve %s user %u bucket %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->reserve, - __entry->user, - __entry->bucket) -); - -TRACE_EVENT(bucket_alloc_fail, +DECLARE_EVENT_CLASS(bucket_alloc, TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, u64 free, u64 avail, u64 copygc_wait_amount, @@ -549,12 +523,14 @@ TRACE_EVENT(bucket_alloc_fail, struct bucket_alloc_state *s, bool nonblocking, const char *err), - TP_ARGS(ca, alloc_reserve, free, avail, copygc_wait_amount, copygc_waiting_for, + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, s, nonblocking, err), TP_STRUCT__entry( - __field(dev_t, dev ) + __field(u8, dev ) __array(char, reserve, 16 ) + __field(u64, bucket ) __field(u64, free ) __field(u64, avail ) __field(u64, copygc_wait_amount ) @@ -564,12 +540,14 @@ TRACE_EVENT(bucket_alloc_fail, __field(u64, need_journal_commit ) __field(u64, nouse ) __field(bool, nonblocking ) + __field(u64, nocow ) __array(char, err, 32 ) ), TP_fast_assign( - __entry->dev = ca->dev; + __entry->dev = ca->dev_idx; strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->bucket = bucket; __entry->free = free; __entry->avail = avail; __entry->copygc_wait_amount = copygc_wait_amount; @@ -579,12 +557,14 @@ TRACE_EVENT(bucket_alloc_fail, __entry->need_journal_commit = s->skipped_need_journal_commit; __entry->nouse = s->skipped_nouse; __entry->nonblocking = nonblocking; + __entry->nocow 
= s->skipped_nocow; strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d,%d reserve %s free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", - MAJOR(__entry->dev), MINOR(__entry->dev), + TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", __entry->reserve, + __entry->dev, + __entry->bucket, __entry->free, __entry->avail, __entry->copygc_wait_amount, @@ -593,10 +573,41 @@ TRACE_EVENT(bucket_alloc_fail, __entry->open, __entry->need_journal_commit, __entry->nouse, + __entry->nocow, __entry->nonblocking, __entry->err) ); +DEFINE_EVENT(bucket_alloc, bucket_alloc, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err) +); + TRACE_EVENT(discard_buckets, TP_PROTO(struct bch_fs *c, u64 seen, u64 open, u64 need_journal_commit, u64 discarded, const char *err), @@ -655,29 +666,67 @@ TRACE_EVENT(bucket_invalidate, /* Moving IO */ +TRACE_EVENT(bucket_evacuate, + TP_PROTO(struct bch_fs *c, struct bpos *bucket), + TP_ARGS(c, bucket), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u64, bucket ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->dev_idx = bucket->inode; + __entry->bucket = bucket->offset; + ), + + TP_printk("%d:%d %u:%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dev_idx, __entry->bucket) +); + +DEFINE_EVENT(bkey, move_extent, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + DEFINE_EVENT(bkey, move_extent_read, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) ); DEFINE_EVENT(bkey, move_extent_write, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) ); DEFINE_EVENT(bkey, move_extent_finish, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) ); -DEFINE_EVENT(bkey, move_extent_race, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) +TRACE_EVENT(move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *msg), + TP_ARGS(c, msg), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __string(msg, msg ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __assign_str(msg, msg); + ), + + TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) ); DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) ); TRACE_EVENT(move_data, @@ -702,6 +751,39 @@ TRACE_EVENT(move_data, __entry->sectors_moved, __entry->keys_moved) ); +TRACE_EVENT(evacuate_bucket, + TP_PROTO(struct bch_fs *c, struct bpos *bucket, + unsigned sectors, unsigned bucket_size, + u64 fragmentation, int ret), + TP_ARGS(c, 
bucket, sectors, bucket_size, fragmentation, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, member ) + __field(u64, bucket ) + __field(u32, sectors ) + __field(u32, bucket_size ) + __field(u64, fragmentation ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->member = bucket->inode; + __entry->bucket = bucket->offset; + __entry->sectors = sectors; + __entry->bucket_size = bucket_size; + __entry->fragmentation = fragmentation; + __entry->ret = ret; + ), + + TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->member, __entry->bucket, + __entry->sectors, __entry->bucket_size, + __entry->fragmentation, __entry->ret) +); + TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, u64 sectors_moved, u64 sectors_not_moved, @@ -786,19 +868,43 @@ DEFINE_EVENT(transaction_event, trans_restart_injected, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, +TRACE_EVENT(trans_restart_split_race, TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) + unsigned long caller_ip, + struct btree *b), + TP_ARGS(trans, caller_ip, b), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, level ) + __field(u16, written ) + __field(u16, blocks ) + __field(u16, u64s_remaining ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->level = b->c.level; + __entry->written = b->written; + __entry->blocks = btree_blocks(trans->c); + __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + ), + + TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->level, + __entry->written, __entry->blocks, + __entry->u64s_remaining) ); -DEFINE_EVENT(transaction_event, trans_restart_journal_res_get, +DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), TP_ARGS(trans, caller_ip) ); - TRACE_EVENT(trans_restart_journal_preres_get, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -822,12 +928,6 @@ TRACE_EVENT(trans_restart_journal_preres_get, __entry->flags) ); -DEFINE_EVENT(transaction_event, trans_restart_journal_reclaim, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - DEFINE_EVENT(transaction_event, trans_restart_fault_inject, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), @@ -840,12 +940,6 @@ DEFINE_EVENT(transaction_event, trans_traverse_all, TP_ARGS(trans, caller_ip) ); -DEFINE_EVENT(transaction_event, trans_restart_mark_replicas, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) -); - DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip), @@ -1096,7 +1190,58 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); -#endif /* _TRACE_BCACHE_H */ +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(write_buffer_flush, + TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size), + TP_ARGS(trans, nr, skipped, fast, size), + + TP_STRUCT__entry( + __field(size_t, nr ) + __field(size_t, skipped ) + 
__field(size_t, fast ) + __field(size_t, size ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->skipped = skipped; + __entry->fast = fast; + __entry->size = size; + ), + + TP_printk("%zu/%zu skipped %zu fast %zu", + __entry->nr, __entry->size, __entry->skipped, __entry->fast) +); + +TRACE_EVENT(write_buffer_flush_slowpath, + TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), + TP_ARGS(trans, nr, size), + + TP_STRUCT__entry( + __field(size_t, nr ) + __field(size_t, size ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->size = size; + ), + + TP_printk("%zu/%zu", __entry->nr, __entry->size) +); + +#endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../fs/bcachefs + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + #include diff --git a/libbcachefs/two_state_shared_lock.c b/libbcachefs/two_state_shared_lock.c index dc508d5..9764c2e 100644 --- a/libbcachefs/two_state_shared_lock.c +++ b/libbcachefs/two_state_shared_lock.c @@ -2,32 +2,7 @@ #include "two_state_shared_lock.h" -void bch2_two_state_unlock(two_state_lock_t *lock, int s) +void __bch2_two_state_lock(two_state_lock_t *lock, int s) { - long i = s ? 1 : -1; - - BUG_ON(atomic_long_read(&lock->v) == 0); - - if (atomic_long_sub_return_release(i, &lock->v) == 0) - wake_up_all(&lock->wait); -} - -bool bch2_two_state_trylock(two_state_lock_t *lock, int s) -{ - long i = s ? 1 : -1; - long v = atomic_long_read(&lock->v), old; - - do { - old = v; - - if (i > 0 ? v < 0 : v > 0) - return false; - } while ((v = atomic_long_cmpxchg_acquire(&lock->v, - old, old + i)) != old); - return true; -} - -void bch2_two_state_lock(two_state_lock_t *lock, int s) -{ - wait_event(lock->wait, bch2_two_state_trylock(lock, s)); + __wait_event(lock->wait, bch2_two_state_trylock(lock, s)); } diff --git a/libbcachefs/two_state_shared_lock.h b/libbcachefs/two_state_shared_lock.h index 1b4f108..9058017 100644 --- a/libbcachefs/two_state_shared_lock.h +++ b/libbcachefs/two_state_shared_lock.h @@ -6,6 +6,8 @@ #include #include +#include "util.h" + /* * Two-state lock - can be taken for add or block - both states are shared, * like read side of rwsem, but conflict with other state: @@ -21,8 +23,37 @@ static inline void two_state_lock_init(two_state_lock_t *lock) init_waitqueue_head(&lock->wait); } -void bch2_two_state_unlock(two_state_lock_t *, int); -bool bch2_two_state_trylock(two_state_lock_t *, int); -void bch2_two_state_lock(two_state_lock_t *, int); +static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + + EBUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? 
v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +void __bch2_two_state_lock(two_state_lock_t *, int); + +static inline void bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + if (!bch2_two_state_trylock(lock, s)) + __bch2_two_state_lock(lock, s); +} #endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 62fa662..ae4f6de 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -240,36 +240,6 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -static void bch2_quantiles_update(struct quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) { while (nr_bits) @@ -292,45 +262,96 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines) if (!*p) break; lines = p + 1; - prefix = KERN_CONT; } console_unlock(); } -int bch2_prt_backtrace(struct printbuf *out, struct task_struct *task) +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) { - unsigned long entries[32]; - unsigned i, nr_entries; - int ret; + unsigned nr_entries = 0; + int ret = 0; - ret = down_read_killable(&task->signal->exec_update_lock); + stack->nr = 0; + ret = darray_make_room(stack, 32); if (ret) return ret; - nr_entries = stack_trace_save_tsk(task, entries, ARRAY_SIZE(entries), 0); - for (i = 0; i < nr_entries; i++) { - prt_printf(out, "[<0>] %pB", (void *)entries[i]); + if (!down_read_trylock(&task->signal->exec_update_lock)) + return -1; + + do { + nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0); + } while (nr_entries == stack->size && + !(ret = darray_make_room(stack, stack->size * 2))); + + stack->nr = nr_entries; + up_read(&task->signal->exec_update_lock); + + return ret; +} + +void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) +{ + unsigned long *i; + + darray_for_each(*stack, i) { + prt_printf(out, "[<0>] %pB", (void *) *i); prt_newline(out); } +} - up_read(&task->signal->exec_update_lock); - return 0; +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) +{ + bch_stacktrace stack = { 0 }; + int ret = bch2_save_backtrace(&stack, task); + + bch2_prt_backtrace(out, &stack); + darray_exit(&stack); + return ret; } /* time stats: */ -static void bch2_time_stats_update_one(struct time_stats *stats, - u64 start, u64 end) +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct bch2_quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? 
e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) { u64 duration, freq; if (time_after64(end, start)) { duration = end - start; - stats->duration_stats = mean_and_variance_update(stats->duration_stats, - duration); - stats->duration_stats_weighted = mean_and_variance_weighted_update( - stats->duration_stats_weighted, - duration); + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); stats->max_duration = max(stats->max_duration, duration); stats->min_duration = min(stats->min_duration, duration); bch2_quantiles_update(&stats->quantiles, duration); @@ -338,17 +359,31 @@ static void bch2_time_stats_update_one(struct time_stats *stats, if (time_after64(end, stats->last_event)) { freq = end - stats->last_event; - stats->freq_stats = mean_and_variance_update(stats->freq_stats, freq); - stats->freq_stats_weighted = mean_and_variance_weighted_update( - stats->freq_stats_weighted, - freq); + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); stats->max_freq = max(stats->max_freq, freq); stats->min_freq = min(stats->min_freq, freq); stats->last_event = end; } } -void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) +static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) +{ + struct bch2_time_stat_buffer_entry *i; + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + for (i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + spin_unlock_irqrestore(&stats->lock, flags); + + b->nr = 0; +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; @@ -363,36 +398,27 @@ void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && stats->duration_stats.n > 1024) stats->buffer = - alloc_percpu_gfp(struct time_stat_buffer, + alloc_percpu_gfp(struct bch2_time_stat_buffer, GFP_ATOMIC); spin_unlock_irqrestore(&stats->lock, flags); } else { - struct time_stat_buffer_entry *i; - struct time_stat_buffer *b; + struct bch2_time_stat_buffer *b; preempt_disable(); b = this_cpu_ptr(stats->buffer); BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct time_stat_buffer_entry) { + b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { .start = start, .end = end }; - if (b->nr == ARRAY_SIZE(b->entries)) { - spin_lock_irqsave(&stats->lock, flags); - for (i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); - spin_unlock_irqrestore(&stats->lock, flags); - - b->nr = 0; - } - + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + bch2_time_stats_clear_buffer(stats, b); preempt_enable(); } } +#endif static const struct time_unit { const char *name; @@ -402,8 +428,8 @@ static const struct time_unit { { "us", NSEC_PER_USEC }, { "ms", NSEC_PER_MSEC }, { "s", NSEC_PER_SEC }, - { "m", NSEC_PER_SEC * 60}, - { "h", NSEC_PER_SEC * 3600}, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, { "eon", U64_MAX }, }; @@ -420,7 +446,14 @@ static const struct time_unit 
*pick_time_units(u64 ns) return u; } -static void pr_time_units(struct printbuf *out, u64 ns) +void bch2_pr_time_units(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + +static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { const struct time_unit *u = pick_time_units(ns); @@ -435,11 +468,11 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 { prt_str(out, name); prt_tab(out); - pr_time_units(out, ns); + bch2_pr_time_units_aligned(out, ns); prt_newline(out); } -void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { const struct time_unit *u; s64 f_mean = 0, d_mean = 0; @@ -493,16 +526,16 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) prt_printf(out, "mean:"); prt_tab(out); - pr_time_units(out, d_mean); + bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); - pr_time_units(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); - pr_time_units(out, d_stddev); + bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); - pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -516,16 +549,16 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) prt_printf(out, "mean:"); prt_tab(out); - pr_time_units(out, f_mean); + bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); - pr_time_units(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); - pr_time_units(out, f_stddev); + bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); - pr_time_units(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -548,16 +581,16 @@ void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) } } -void bch2_time_stats_exit(struct time_stats *stats) +void bch2_time_stats_exit(struct bch2_time_stats *stats) { free_percpu(stats->buffer); } -void bch2_time_stats_init(struct time_stats *stats) +void bch2_time_stats_init(struct bch2_time_stats *stats) { memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.w = 8; - stats->freq_stats_weighted.w = 8; + stats->duration_stats_weighted.weight = 8; + stats->freq_stats_weighted.weight = 8; stats->min_duration = U64_MAX; stats->min_freq = U64_MAX; spin_lock_init(&stats->lock); @@ -723,10 +756,10 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size) } } -int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) +int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask) { while (size) { - struct page *page = alloc_page(gfp_mask); + struct page *page = alloc_pages_noprof(gfp_mask, 0); unsigned len = min_t(size_t, PAGE_SIZE, size); if (!page) diff --git a/libbcachefs/util.h 
b/libbcachefs/util.h index 473c969..5fa29da 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -20,6 +19,8 @@ #include #include +#include "darray.h" + struct closure; #ifdef CONFIG_BCACHEFS_DEBUG @@ -59,12 +60,13 @@ static inline void vpfree(void *p, size_t size) free_pages((unsigned long) p, get_order(size)); } -static inline void *vpmalloc(size_t size, gfp_t gfp_mask) +static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask) { - return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc(size, gfp_mask); + return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN, + get_order(size)) ?: + __vmalloc_noprof(size, gfp_mask); } +#define vpmalloc(_size, _gfp) alloc_hooks(vpmalloc_noprof(_size, _gfp)) static inline void kvpfree(void *p, size_t size) { @@ -74,12 +76,13 @@ static inline void kvpfree(void *p, size_t size) vpfree(p, size); } -static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) +static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask) { return size < PAGE_SIZE - ? kmalloc(size, gfp_mask) - : vpmalloc(size, gfp_mask); + ? kmalloc_noprof(size, gfp_mask) + : vpmalloc_noprof(size, gfp_mask); } +#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp)) int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); @@ -215,6 +218,34 @@ do { \ #define ANYSINT_MAX(t) \ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) +#include "printbuf.h" + +#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) +#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) +#define printbuf_str(_buf) bch2_printbuf_str(_buf) +#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) + +#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) +#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) +#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) + +#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) +#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) + +#define prt_newline(_out) bch2_prt_newline(_out) +#define prt_tab(_out) bch2_prt_tab(_out) +#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) + +#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) +#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v)) +#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) +#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) +#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) +#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) +#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) +#define prt_bitflags(...) 
bch2_prt_bitflags(__VA_ARGS__) + +void bch2_pr_time_units(struct printbuf *, u64); #ifdef __KERNEL__ static inline void pr_time(struct printbuf *out, u64 time) @@ -333,29 +364,33 @@ u64 bch2_read_flag_list(char *, const char * const[]); void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); void bch2_print_string_as_lines(const char *prefix, const char *lines); -int bch2_prt_backtrace(struct printbuf *, struct task_struct *); + +typedef DARRAY(unsigned long) bch_stacktrace; +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *); +void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *); #define NR_QUANTILES 15 #define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) #define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) #define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) -struct quantiles { - struct quantile_entry { +struct bch2_quantiles { + struct bch2_quantile_entry { u64 m; u64 step; } entries[NR_QUANTILES]; }; -struct time_stat_buffer { +struct bch2_time_stat_buffer { unsigned nr; - struct time_stat_buffer_entry { + struct bch2_time_stat_buffer_entry { u64 start; u64 end; } entries[32]; }; -struct time_stats { +struct bch2_time_stats { spinlock_t lock; /* all fields are in nanoseconds */ u64 max_duration; @@ -363,26 +398,30 @@ struct time_stats { u64 max_freq; u64 min_freq; u64 last_event; - struct quantiles quantiles; + struct bch2_quantiles quantiles; struct mean_and_variance duration_stats; struct mean_and_variance_weighted duration_stats_weighted; struct mean_and_variance freq_stats; struct mean_and_variance_weighted freq_stats_weighted; - struct time_stat_buffer __percpu *buffer; + struct bch2_time_stat_buffer __percpu *buffer; }; -void __bch2_time_stats_update(struct time_stats *stats, u64, u64); +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +#endif -static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) { __bch2_time_stats_update(stats, start, local_clock()); } -void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); +void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); -void bch2_time_stats_exit(struct time_stats *); -void bch2_time_stats_init(struct time_stats *); +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); #define ewma_add(ewma, val, weight) \ ({ \ @@ -493,7 +532,9 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) } void bch2_bio_map(struct bio *bio, void *base, size_t); -int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); +int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t); +#define bch2_bio_alloc_pages(_bio, _size, _gfp) \ + alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp)) static inline sector_t bdev_sectors(struct block_device *bdev) { @@ -506,6 +547,26 @@ do { \ submit_bio(bio); \ } while (0) +#define kthread_wait(cond) \ +({ \ + int _ret = 0; \ + \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (kthread_should_stop()) { \ + _ret = -1; \ + break; \ + } \ + \ + if (cond) \ + break; \ + \ + schedule(); \ + } \ + set_current_state(TASK_RUNNING); \ + _ret; \ +}) + #define kthread_wait_freezable(cond) \ ({ \ int _ret = 0; \ @@ -582,6 
+643,20 @@ static inline void memmove_u64s_down(void *dst, const void *src, __memmove_u64s_down(dst, src, u64s); } +static inline void __memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + memcpy_u64s_small(dst, src, u64s); +} + +static inline void memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down_small(dst, src, u64s); +} + static inline void __memmove_u64s_up_small(void *_dst, const void *_src, unsigned u64s) { @@ -766,4 +841,6 @@ static inline int u8_cmp(u8 l, u8 r) return cmp_int(l, r); } +#include + #endif /* _BCACHEFS_UTIL_H */ diff --git a/libbcachefs/varint.c b/libbcachefs/varint.c index 5143b60..ef030fc 100644 --- a/libbcachefs/varint.c +++ b/libbcachefs/varint.c @@ -22,12 +22,13 @@ int bch2_varint_encode(u8 *out, u64 v) { unsigned bits = fls64(v|1); unsigned bytes = DIV_ROUND_UP(bits, 7); + __le64 v_le; if (likely(bytes < 9)) { v <<= bytes; v |= ~(~0 << (bytes - 1)); - v = cpu_to_le64(v); - memcpy(out, &v, bytes); + v_le = cpu_to_le64(v); + memcpy(out, &v_le, bytes); } else { *out++ = 255; bytes = 9; @@ -57,9 +58,9 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) return -1; if (likely(bytes < 9)) { - v = 0; - memcpy(&v, in, bytes); - v = le64_to_cpu(v); + __le64 v_le = 0; + memcpy(&v_le, in, bytes); + v = le64_to_cpu(v_le); v >>= bytes; } else { v = get_unaligned_le64(++in); diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c index 4fc1c3a..70f7800 100644 --- a/libbcachefs/xattr.c +++ b/libbcachefs/xattr.c @@ -70,17 +70,12 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, - int rw, struct printbuf *err) + enum bkey_invalid_flags flags, + struct printbuf *err) { const struct xattr_handler *handler; struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); - if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) { - prt_printf(err, "incorrect value size (%zu < %zu)", - bkey_val_bytes(k.k), sizeof(*xattr.v)); - return -EINVAL; - } - if (bkey_val_u64s(k.k) < xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len))) { @@ -88,7 +83,7 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, bkey_val_u64s(k.k), xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len))); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } /* XXX why +4 ? 
*/ @@ -99,18 +94,18 @@ int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, bkey_val_u64s(k.k), xattr_val_u64s(xattr.v->x_name_len, le16_to_cpu(xattr.v->x_val_len) + 4)); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } handler = bch2_xattr_type_to_handler(xattr.v->x_type); if (!handler) { prt_printf(err, "invalid type (%u)", xattr.v->x_type); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { prt_printf(err, "xattr name has invalid characters"); - return -EINVAL; + return -BCH_ERR_invalid_bkey; } return 0; @@ -141,15 +136,14 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info const char *name, void *buffer, size_t size, int type) { struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); struct btree_iter iter; struct bkey_s_c_xattr xattr; struct bkey_s_c k; int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode_inum(inode), - &X_SEARCH(type, name, strlen(name)), - 0); + inode_inum(inode), &search, 0); if (ret) goto err1; @@ -169,34 +163,26 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info err2: bch2_trans_iter_exit(trans, &iter); err1: - return ret == -ENOENT ? -ENODATA : ret; -} - -int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, - const char *name, void *buffer, size_t size, int type) -{ - return bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); + return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; } int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, int type, int flags) { + struct bch_fs *c = trans->c; struct btree_iter inode_iter = { NULL }; - struct bch_inode_unpacked inode_u; int ret; - /* - * We need to do an inode update so that bi_journal_sync gets updated - * and fsync works: - * - * Perhaps we should be updating bi_mtime too? - */ + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; + + inode_u->bi_ctime = bch2_current_time(c); - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: - bch2_inode_write(trans, &inode_iter, &inode_u); + ret = bch2_inode_write(trans, &inode_iter, inode_u); bch2_trans_iter_exit(trans, &inode_iter); if (ret) @@ -235,7 +221,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, hash_info, inum, &search); } - if (ret == -ENOENT) + if (bch2_err_matches(ret, ENOENT)) ret = flags & XATTR_REPLACE ? 
-ENODATA : 0; return ret; @@ -371,14 +357,14 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret; + int ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags)); - ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags); return bch2_err_class(ret); } static int bch2_xattr_set_handler(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *vinode, const char *name, const void *value, size_t size, int flags) @@ -386,12 +372,20 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct bch_inode_unpacked inode_u; + struct btree_trans trans; int ret; - ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_set(&trans, inode_inum(inode), &hash, - name, value, size, + bch2_trans_init(&trans, c, 0, 0); + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &inode_u, + &hash, name, value, size, handler->flags, flags)); + if (!ret) + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + bch2_trans_exit(&trans); + return bch2_err_class(ret); } @@ -444,7 +438,7 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_opts opts = - bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); + bch2_inode_opts_to_opts(&inode->ei_inode); const struct bch_option *opt; int id, inode_opt_id; struct printbuf out = PRINTBUF; @@ -517,7 +511,7 @@ static int inode_opt_set_fn(struct bch_inode_info *inode, } static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, - struct user_namespace *mnt_userns, + struct mnt_idmap *idmap, struct dentry *dentry, struct inode *vinode, const char *name, const void *value, size_t size, int flags) @@ -595,7 +589,7 @@ err: opt_id == Opt_background_target)) bch2_rebalance_add_work(c, inode->v.i_blocks); - return ret; + return bch2_err_class(ret); } static const struct xattr_handler bch_xattr_bcachefs_handler = { @@ -624,8 +618,8 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { const struct xattr_handler *bch2_xattr_handlers[] = { &bch_xattr_user_handler, #ifdef CONFIG_BCACHEFS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, + &nop_posix_acl_access, + &nop_posix_acl_default, #endif &bch_xattr_trusted_handler, &bch_xattr_security_handler, @@ -639,9 +633,9 @@ const struct xattr_handler *bch2_xattr_handlers[] = { static const struct xattr_handler *bch_xattr_handler_map[] = { [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = - &posix_acl_access_xattr_handler, + &nop_posix_acl_access, [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &posix_acl_default_xattr_handler, + &nop_posix_acl_default, [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, }; diff --git a/libbcachefs/xattr.h b/libbcachefs/xattr.h index 03f1b73..f5a52e3 100644 --- a/libbcachefs/xattr.h +++ b/libbcachefs/xattr.h @@ -6,12 +6,14 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int 
bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, int, struct printbuf *); +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ .key_invalid = bch2_xattr_invalid, \ .val_to_text = bch2_xattr_to_text, \ + .min_val_size = 8, \ }) static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) @@ -36,11 +38,9 @@ struct xattr_handler; struct bch_hash_info; struct bch_inode_info; -int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, - const char *, void *, size_t, int); - +/* Exported for cmd_migrate.c in tools: */ int bch2_xattr_set(struct btree_trans *, subvol_inum, - const struct bch_hash_info *, + struct bch_inode_unpacked *, const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t); diff --git a/linux/blkdev.c b/linux/blkdev.c index 9b3ea93..ea901a4 100644 --- a/linux/blkdev.c +++ b/linux/blkdev.c @@ -118,6 +118,14 @@ int blkdev_issue_discard(struct block_device *bdev, return 0; } +int blkdev_issue_zeroout(struct block_device *bdev, + sector_t sector, sector_t nr_sects, + gfp_t gfp_mask, unsigned flags) +{ + /* Not yet implemented: */ + BUG(); +} + unsigned bdev_logical_block_size(struct block_device *bdev) { struct stat statbuf; @@ -175,18 +183,34 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, else if (mode & FMODE_WRITE) flags = O_WRONLY; + if (!(mode & FMODE_BUFFERED)) + flags |= O_DIRECT; + #if 0 /* using O_EXCL doesn't work with opening twice for an O_SYNC fd: */ if (mode & FMODE_EXCL) flags |= O_EXCL; #endif + buffered_fd = open(path, flags & ~O_DIRECT); + if (buffered_fd < 0) + return ERR_PTR(-errno); - fd = open(path, flags|O_DIRECT); + fd = open(path, flags); if (fd < 0) + fd = dup(buffered_fd); + if (fd < 0) { + close(buffered_fd); return ERR_PTR(-errno); + } - sync_fd = xopen(path, flags|O_DIRECT|O_SYNC); - buffered_fd = xopen(path, flags); + sync_fd = open(path, flags|O_SYNC); + if (sync_fd < 0) + sync_fd = open(path, (flags & ~O_DIRECT)|O_SYNC); + if (sync_fd < 0) { + close(fd); + close(buffered_fd); + return ERR_PTR(-errno); + } bdev = malloc(sizeof(*bdev)); memset(bdev, 0, sizeof(*bdev)); diff --git a/linux/closure.c b/linux/closure.c index b38ded0..0855e69 100644 --- a/linux/closure.c +++ b/linux/closure.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff --git a/linux/fs.c b/linux/fs.c index 0002846..623ca26 100644 --- a/linux/fs.c +++ b/linux/fs.c @@ -3,12 +3,12 @@ #include #include -const struct xattr_handler posix_acl_access_xattr_handler = { +const struct xattr_handler nop_posix_acl_access = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, }; -const struct xattr_handler posix_acl_default_xattr_handler = { +const struct xattr_handler nop_posix_acl_default = { .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, }; diff --git a/linux/kthread.c b/linux/kthread.c index 3c7bdb8..17830e5 100644 --- a/linux/kthread.c +++ b/linux/kthread.c @@ -80,7 +80,13 @@ struct task_struct *kthread_create(int (*thread_fn)(void *data), pthread_attr_init(&attr); pthread_attr_setstacksize(&attr, 32 << 10); - ret = pthread_create(&p->thread, &attr, kthread_start_fn, p); + for (unsigned i = 0; i < 10; i++) { + ret = pthread_create(&p->thread, &attr, kthread_start_fn, p); + if (!ret) + break; + + run_shrinkers(GFP_KERNEL, true); 
+ } if (ret) return ERR_PTR(-ret); pthread_setname_np(p->thread, p->comm); @@ -99,6 +105,11 @@ bool kthread_should_stop(void) return test_bit(KTHREAD_SHOULD_STOP, ¤t->kthread_flags); } +bool kthread_freezable_should_stop(bool *was_frozen) +{ + return test_bit(KTHREAD_SHOULD_STOP, ¤t->kthread_flags); +} + /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c index aa95db1..eb5f2ba 100644 --- a/linux/mean_and_variance.c +++ b/linux/mean_and_variance.c @@ -42,47 +42,29 @@ #include #include #include -#include - -/** - * fast_divpow2() - fast approximation for n / (1 << d) - * @n: numerator - * @d: the power of 2 denominator. - * - * note: this rounds towards 0. - */ -s64 fast_divpow2(s64 n, u8 d) -{ - return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; -} - -/** - * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 - * and return it. - * @s1: the mean_and_variance to update. - * @v1: the new sample. - * - * see linked pdf equation 12. - */ -struct mean_and_variance mean_and_variance_update(struct mean_and_variance s1, s64 v1) +u128_u u128_div(u128_u n, u64 d) { - struct mean_and_variance s2; - u64 v2 = abs(v1); - - s2.n = s1.n + 1; - s2.sum = s1.sum + v1; - s2.sum_squares = u128_add(s1.sum_squares, u128_square(v2)); - return s2; + u128_u r; + u64 rem; + u64 hi = u128_hi(n); + u64 lo = u128_lo(n); + u64 h = hi & ((u64) U32_MAX << 32); + u64 l = (hi & (u64) U32_MAX) << 32; + + r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); + r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); + r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); + return r; } -EXPORT_SYMBOL_GPL(mean_and_variance_update); +EXPORT_SYMBOL_GPL(u128_div); /** * mean_and_variance_get_mean() - get mean from @s */ s64 mean_and_variance_get_mean(struct mean_and_variance s) { - return div64_u64(s.sum, s.n); + return s.n ? div64_u64(s.sum, s.n) : 0; } EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); @@ -93,10 +75,14 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); */ u64 mean_and_variance_get_variance(struct mean_and_variance s1) { - u128 s2 = u128_div(s1.sum_squares, s1.n); - u64 s3 = abs(mean_and_variance_get_mean(s1)); + if (s1.n) { + u128_u s2 = u128_div(s1.sum_squares, s1.n); + u64 s3 = abs(mean_and_variance_get_mean(s1)); - return u128_to_u64(u128_sub(s2, u128_square(s3))); + return u128_lo(u128_sub(s2, u128_square(s3))); + } else { + return 0; + } } EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); @@ -117,32 +103,26 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. */ -struct mean_and_variance_weighted mean_and_variance_weighted_update(struct mean_and_variance_weighted s1, - s64 x) +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) { - struct mean_and_variance_weighted s2; // previous weighted variance. - u64 var_w0 = s1.variance; - u8 w = s2.w = s1.w; + u8 w = s->weight; + u64 var_w0 = s->variance; // new value weighted. - s64 x_w = x << w; - s64 diff_w = x_w - s1.mean; - s64 diff = fast_divpow2(diff_w, w); + s64 x_w = x << w; + s64 diff_w = x_w - s->mean; + s64 diff = fast_divpow2(diff_w, w); // new mean weighted. 
- s64 u_w1 = s1.mean + diff; + s64 u_w1 = s->mean + diff; - BUG_ON(w % 2 != 0); - - if (!s1.init) { - s2.mean = x_w; - s2.variance = 0; + if (!s->init) { + s->mean = x_w; + s->variance = 0; } else { - s2.mean = u_w1; - s2.variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; + s->mean = u_w1; + s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; } - s2.init = true; - - return s2; + s->init = true; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); @@ -151,7 +131,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); */ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) { - return fast_divpow2(s.mean, s.w); + return fast_divpow2(s.mean, s.weight); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); @@ -161,7 +141,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) { // always positive don't need fast divpow2 - return s.variance >> s.w; + return s.variance >> s.weight; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); diff --git a/linux/pretty-printers.c b/linux/pretty-printers.c deleted file mode 100644 index addbac9..0000000 --- a/linux/pretty-printers.c +++ /dev/null @@ -1,60 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1+ -/* Copyright (C) 2022 Kent Overstreet */ - -#include -#include -#include -#include - -/** - * prt_string_option - Given a list of strings, print out the list and indicate - * which option is selected, with square brackets (sysfs style) - * - * @out: The printbuf to output to - * @list: List of strings to choose from - * @selected: The option to highlight, with square brackets - */ -void prt_string_option(struct printbuf *out, - const char * const list[], - size_t selected) -{ - size_t i; - - for (i = 0; list[i]; i++) { - if (i) - prt_char(out, ' '); - if (i == selected) - prt_char(out, '['); - prt_str(out, list[i]); - if (i == selected) - prt_char(out, ']'); - } -} -EXPORT_SYMBOL(prt_string_option); - -/** - * prt_bitflags: Given a bitmap and a list of names for each bit, print out which - * bits are on, comma separated - * - * @out: The printbuf to output to - * @list: List of names for each bit - * @flags: Bits to print - */ -void prt_bitflags(struct printbuf *out, - const char * const list[], u64 flags) -{ - unsigned bit, nr = 0; - bool first = true; - - while (list[nr]) - nr++; - - while (flags && (bit = __ffs(flags)) < nr) { - if (!first) - prt_char(out, ','); - first = false; - prt_str(out, list[bit]); - flags ^= 1 << bit; - } -} -EXPORT_SYMBOL(prt_bitflags); diff --git a/linux/printbuf_userspace.c b/linux/printbuf_userspace.c deleted file mode 100644 index 0ae56ee..0000000 --- a/linux/printbuf_userspace.c +++ /dev/null @@ -1,34 +0,0 @@ - -#include -#include - -void prt_vprintf(struct printbuf *out, const char *fmt, va_list args) -{ - int len; - - do { - va_list args2; - - va_copy(args2, args); - len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); - } while (len + 1 >= printbuf_remaining(out) && - !printbuf_make_room(out, len + 1)); - - len = min_t(size_t, len, - printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); - out->pos += len; -} - -void prt_printf(struct printbuf *out, const char *fmt, ...) 
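The weighted variant above keeps an exponentially weighted mean and variance with alpha = 2^-weight, storing the running mean shifted left by `weight` bits for extra precision. A small userspace sketch of just the mean part of that fixed-point update, with made-up sample values and the variance term omitted for brevity; divpow2() mirrors the fast_divpow2() helper used by the patch:

#include <stdio.h>
#include <stdint.h>

/* round-towards-zero divide by 2^d, mirroring fast_divpow2() */
static int64_t divpow2(int64_t n, uint8_t d)
{
	return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d;
}

int main(void)
{
	const uint8_t w = 3;		/* alpha = 1/8 */
	int64_t mean_w = 0;		/* running mean, stored << w */
	int initialized = 0;
	int64_t samples[] = { 100, 110, 90, 105 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		int64_t x_w = samples[i] << w;

		if (!initialized) {
			mean_w = x_w;
			initialized = 1;
		} else {
			/* mean += (x - mean) / 2^w, all in the shifted domain */
			mean_w += divpow2(x_w - mean_w, w);
		}
		printf("sample %3lld -> weighted mean %lld\n",
		       (long long) samples[i], (long long) divpow2(mean_w, w));
	}
	return 0;
}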
-{ - va_list args; - - va_start(args, fmt); - prt_vprintf(out, fmt, args); - va_end(args); -} - -void prt_u64(struct printbuf *out, u64 v) -{ - prt_printf(out, "%llu", v); -} diff --git a/linux/seq_buf.c b/linux/seq_buf.c new file mode 100644 index 0000000..cf8709a --- /dev/null +++ b/linux/seq_buf.c @@ -0,0 +1,152 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * seq_buf.c + * + * Copyright (C) 2014 Red Hat Inc, Steven Rostedt + * + * The seq_buf is a handy tool that allows you to pass a descriptor around + * to a buffer that other functions can write to. It is similar to the + * seq_file functionality but has some differences. + * + * To use it, the seq_buf must be initialized with seq_buf_init(). + * This will set up the counters within the descriptor. You can call + * seq_buf_init() more than once to reset the seq_buf to start + * from scratch. + */ +#include +#include + +/** + * seq_buf_can_fit - can the new data fit in the current buffer? + * @s: the seq_buf descriptor + * @len: The length to see if it can fit in the current buffer + * + * Returns true if there's enough unused space in the seq_buf buffer + * to fit the amount of new data according to @len. + */ +static bool seq_buf_can_fit(struct seq_buf *s, size_t len) +{ + return s->len + len <= s->size; +} + +/** + * seq_buf_vprintf - sequence printing of information. + * @s: seq_buf descriptor + * @fmt: printf format string + * @args: va_list of arguments from a printf() type function + * + * Writes a vnprintf() format into the sequencce buffer. + * + * Returns zero on success, -1 on overflow. + */ +int seq_buf_vprintf(struct seq_buf *s, const char *fmt, va_list args) +{ + int len; + + WARN_ON(s->size == 0); + + if (s->len < s->size) { + len = vsnprintf(s->buffer + s->len, s->size - s->len, fmt, args); + if (s->len + len < s->size) { + s->len += len; + return 0; + } + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_printf - sequence printing of information + * @s: seq_buf descriptor + * @fmt: printf format string + * + * Writes a printf() format into the sequence buffer. + * + * Returns zero on success, -1 on overflow. + */ +int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = seq_buf_vprintf(s, fmt, ap); + va_end(ap); + + return ret; +} + +/** + * seq_buf_puts - sequence printing of simple string + * @s: seq_buf descriptor + * @str: simple string to record + * + * Copy a simple string into the sequence buffer. + * + * Returns zero on success, -1 on overflow + */ +int seq_buf_puts(struct seq_buf *s, const char *str) +{ + size_t len = strlen(str); + + WARN_ON(s->size == 0); + + /* Add 1 to len for the trailing null byte which must be there */ + len += 1; + + if (seq_buf_can_fit(s, len)) { + memcpy(s->buffer + s->len, str, len); + /* Don't count the trailing null byte against the capacity */ + s->len += len - 1; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_putc - sequence printing of simple character + * @s: seq_buf descriptor + * @c: simple character to record + * + * Copy a single character into the sequence buffer. 
+ * + * Returns zero on success, -1 on overflow + */ +int seq_buf_putc(struct seq_buf *s, unsigned char c) +{ + WARN_ON(s->size == 0); + + if (seq_buf_can_fit(s, 1)) { + s->buffer[s->len++] = c; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} + +/** + * seq_buf_putmem - write raw data into the sequenc buffer + * @s: seq_buf descriptor + * @mem: The raw memory to copy into the buffer + * @len: The length of the raw memory to copy (in bytes) + * + * There may be cases where raw memory needs to be written into the + * buffer and a strcpy() would not work. Using this function allows + * for such cases. + * + * Returns zero on success, -1 on overflow + */ +int seq_buf_putmem(struct seq_buf *s, const void *mem, unsigned int len) +{ + WARN_ON(s->size == 0); + + if (seq_buf_can_fit(s, len)) { + memcpy(s->buffer + s->len, mem, len); + s->len += len; + return 0; + } + seq_buf_set_overflow(s); + return -1; +} diff --git a/linux/shrinker.c b/linux/shrinker.c index 23e288d..0b5715b 100644 --- a/linux/shrinker.c +++ b/linux/shrinker.c @@ -1,6 +1,7 @@ #include +#include #include #include #include @@ -126,3 +127,31 @@ void run_shrinkers(gfp_t gfp_mask, bool allocation_failed) } mutex_unlock(&shrinker_lock); } + +static int shrinker_thread(void *arg) +{ + while (!kthread_should_stop()) { + sleep(1); + run_shrinkers(GFP_KERNEL, false); + } + + return 0; +} + +struct task_struct *shrinker_task; + +__attribute__((constructor(103))) +static void shrinker_thread_init(void) +{ + shrinker_task = kthread_run(shrinker_thread, NULL, "shrinkers"); + BUG_ON(IS_ERR(shrinker_task)); +} + +__attribute__((destructor(103))) +static void shrinker_thread_exit(void) +{ + int ret = kthread_stop(shrinker_task); + BUG_ON(ret); + + shrinker_task = NULL; +} diff --git a/linux/six.c b/linux/six.c index 39a9bd6..0b9c4bb 100644 --- a/linux/six.c +++ b/linux/six.c @@ -11,70 +11,82 @@ #include #include +#include + #ifdef DEBUG -#define EBUG_ON(cond) BUG_ON(cond) +#define EBUG_ON(cond) BUG_ON(cond) #else -#define EBUG_ON(cond) do {} while (0) +#define EBUG_ON(cond) do {} while (0) #endif -#define six_acquire(l, t, r) lock_acquire(l, 0, t, r, 1, NULL, _RET_IP_) -#define six_release(l) lock_release(l, _RET_IP_) +#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) +#define six_release(l, ip) lock_release(l, ip) static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); +#define SIX_LOCK_HELD_read_OFFSET 0 +#define SIX_LOCK_HELD_read ~(~0U << 26) +#define SIX_LOCK_HELD_intent (1U << 26) +#define SIX_LOCK_HELD_write (1U << 27) +#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) +#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) +#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) +#define SIX_LOCK_NOSPIN (1U << 31) + struct six_lock_vals { /* Value we add to the lock in order to take the lock: */ - u64 lock_val; + u32 lock_val; /* If the lock has this value (used as a mask), taking the lock fails: */ - u64 lock_fail; - - /* Value we add to the lock in order to release the lock: */ - u64 unlock_val; + u32 lock_fail; /* Mask that indicates lock is held for this type: */ - u64 held_mask; + u32 held_mask; /* Waitlist we wakeup when releasing the lock: */ enum six_lock_type unlock_wakeup; }; -#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) - -#define LOCK_VALS { \ - [SIX_LOCK_read] = { \ - .lock_val = __SIX_VAL(read_lock, 1), \ - .lock_fail = 
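As the seq_buf file comment above explains, a seq_buf is initialized over a caller-supplied buffer and then written to with the printf-style helpers, with overflow latched in the descriptor rather than handled on every call. A short usage sketch, assuming seq_buf_init() and seq_buf_has_overflowed() from the include/linux/seq_buf.h added by this release; describe_thing() and nr_free are made up:

#include <linux/seq_buf.h>

static int describe_thing(char *buf, unsigned size, unsigned nr_free)
{
	struct seq_buf s;

	seq_buf_init(&s, buf, size);
	seq_buf_printf(&s, "free buckets: %u\n", nr_free);
	seq_buf_puts(&s, "done\n");

	/* overflow is recorded in the descriptor, so one check at the end suffices */
	return seq_buf_has_overflowed(&s) ? -1 : 0;
}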
__SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\ - .unlock_val = -__SIX_VAL(read_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_read, \ - .unlock_wakeup = SIX_LOCK_write, \ - }, \ - [SIX_LOCK_intent] = { \ - .lock_val = __SIX_VAL(intent_lock, 1), \ - .lock_fail = __SIX_LOCK_HELD_intent, \ - .unlock_val = -__SIX_VAL(intent_lock, 1), \ - .held_mask = __SIX_LOCK_HELD_intent, \ - .unlock_wakeup = SIX_LOCK_intent, \ - }, \ - [SIX_LOCK_write] = { \ - .lock_val = __SIX_VAL(seq, 1), \ - .lock_fail = __SIX_LOCK_HELD_read, \ - .unlock_val = __SIX_VAL(seq, 1), \ - .held_mask = __SIX_LOCK_HELD_write, \ - .unlock_wakeup = SIX_LOCK_read, \ - }, \ +static const struct six_lock_vals l[] = { + [SIX_LOCK_read] = { + .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, + .lock_fail = SIX_LOCK_HELD_write, + .held_mask = SIX_LOCK_HELD_read, + .unlock_wakeup = SIX_LOCK_write, + }, + [SIX_LOCK_intent] = { + .lock_val = SIX_LOCK_HELD_intent, + .lock_fail = SIX_LOCK_HELD_intent, + .held_mask = SIX_LOCK_HELD_intent, + .unlock_wakeup = SIX_LOCK_intent, + }, + [SIX_LOCK_write] = { + .lock_val = SIX_LOCK_HELD_write, + .lock_fail = SIX_LOCK_HELD_read, + .held_mask = SIX_LOCK_HELD_write, + .unlock_wakeup = SIX_LOCK_read, + }, +}; + +static inline void six_set_bitmask(struct six_lock *lock, u32 mask) +{ + if ((atomic_read(&lock->state) & mask) != mask) + atomic_or(mask, &lock->state); +} + +static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) +{ + if (atomic_read(&lock->state) & mask) + atomic_and(~mask, &lock->state); } static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, - union six_lock_state old, - struct task_struct *owner) + u32 old, struct task_struct *owner) { if (type != SIX_LOCK_intent) return; - if (!old.intent_lock) { + if (!(old & SIX_LOCK_HELD_intent)) { EBUG_ON(lock->owner); lock->owner = owner; } else { @@ -92,22 +104,24 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) return read_count; } -/* This is probably up there with the more evil things I've done */ -#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) - -static int __do_six_trylock_type(struct six_lock *lock, - enum six_lock_type type, - struct task_struct *task, - bool try) +/* + * __do_six_trylock() - main trylock routine + * + * Returns 1 on success, 0 on failure + * + * In percpu reader mode, a failed trylock may cause a spurious trylock failure + * for anoter thread taking the competing lock type, and we may havve to do a + * wakeup: when a wakeup is required, we return -1 - wakeup_type. + */ +static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, + struct task_struct *task, bool try) { - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old, new; int ret; - u64 v; + u32 old; EBUG_ON(type == SIX_LOCK_write && lock->owner != task); - EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1)); - EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking))); + EBUG_ON(type == SIX_LOCK_write && + (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); /* * Percpu reader mode: @@ -122,92 +136,70 @@ static int __do_six_trylock_type(struct six_lock *lock, * the lock, then issues a full memory barrier, then reads from the * other thread's variable to check if the other thread thinks it has * the lock. If we raced, we backoff and retry/sleep. + * + * Failure to take the lock may cause a spurious trylock failure in + * another thread, because we temporarily set the lock to indicate that + * we held it. 
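The comment above (and the read fast path that follows) describes a store-then-check handshake: a reader bumps its percpu counter, issues a full barrier, then checks for a writer; the writer does the mirror image. A deliberately simplified single-counter model in C11 atomics, purely illustrative; the real code uses percpu counters and the SIX_LOCK_* state bits defined above:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned readers;	/* stand-in for the percpu reader count */
static _Atomic unsigned state;		/* stand-in for lock->state */

#define HELD_WRITE	(1U << 27)	/* mirrors SIX_LOCK_HELD_write above */

static bool trylock_read(void)
{
	/* mark ourselves as a reader first; the seq_cst RMW doubles as the full barrier */
	atomic_fetch_add(&readers, 1);

	bool ok = !(atomic_load(&state) & HELD_WRITE);

	if (!ok)
		atomic_fetch_sub(&readers, 1);	/* back off; the writer may now need a wakeup */
	return ok;
}

Because the reader is briefly visible as holding the lock even when it backs off, a concurrent writer can see a spurious conflict, which is why the failure path in the patch returns a deferred wakeup rather than performing it under wait_lock.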
This would be a problem for a thread in six_lock(), when + * they are calling trylock after adding themself to the waitlist and + * prior to sleeping. + * + * Therefore, if we fail to get the lock, and there were waiters of the + * type we conflict with, we will have to issue a wakeup. + * + * Since we may be called under wait_lock (and by the wakeup code + * itself), we return that the wakeup has to be done instead of doing it + * here. */ - if (type == SIX_LOCK_read && lock->readers) { preempt_disable(); this_cpu_inc(*lock->readers); /* signal that we own lock */ smp_mb(); - old.v = READ_ONCE(lock->state.v); - ret = !(old.v & l[type].lock_fail); + old = atomic_read(&lock->state); + ret = !(old & l[type].lock_fail); this_cpu_sub(*lock->readers, !ret); preempt_enable(); - /* - * If we failed because a writer was trying to take the - * lock, issue a wakeup because we might have caused a - * spurious trylock failure: - */ - if (old.write_locking) + if (!ret && (old & SIX_LOCK_WAITING_write)) ret = -1 - SIX_LOCK_write; } else if (type == SIX_LOCK_write && lock->readers) { if (try) { - atomic64_add(__SIX_VAL(write_locking, 1), - &lock->state.counter); - smp_mb__after_atomic(); - } else if (!(lock->state.waiters & (1 << SIX_LOCK_write))) { - atomic64_add(__SIX_VAL(waiters, 1 << SIX_LOCK_write), - &lock->state.counter); - /* - * pairs with barrier after unlock and before checking - * for readers in unlock path - */ + atomic_add(SIX_LOCK_HELD_write, &lock->state); smp_mb__after_atomic(); } ret = !pcpu_read_count(lock); - /* - * On success, we increment lock->seq; also we clear - * write_locking unless we failed from the lock path: - */ - v = 0; - if (ret) - v += __SIX_VAL(seq, 1); - if (ret || try) - v -= __SIX_VAL(write_locking, 1); - if (try && !ret) { - old.v = atomic64_add_return(v, &lock->state.counter); - if (old.waiters & (1 << SIX_LOCK_read)) + old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); + if (old & SIX_LOCK_WAITING_read) ret = -1 - SIX_LOCK_read; - } else { - atomic64_add(v, &lock->state.counter); } } else { - v = READ_ONCE(lock->state.v); + old = atomic_read(&lock->state); do { - new.v = old.v = v; - - if (!(old.v & l[type].lock_fail)) { - new.v += l[type].lock_val; - - if (type == SIX_LOCK_write) - new.write_locking = 0; - } else if (!try && !(new.waiters & (1 << type))) - new.waiters |= 1 << type; - else - break; /* waiting bit already set */ - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, new.v)) != old.v); - - ret = !(old.v & l[type].lock_fail); - - EBUG_ON(ret && !(lock->state.v & l[type].held_mask)); + ret = !(old & l[type].lock_fail); + if (!ret || (type == SIX_LOCK_write && !try)) { + smp_mb(); + break; + } + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); + + EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); } if (ret > 0) six_set_owner(lock, type, old, task); - EBUG_ON(type == SIX_LOCK_write && (try || ret > 0) && (lock->state.write_locking)); + EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && + (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); return ret; } -static inline void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) { struct six_lock_waiter *w, *next; struct task_struct *task; @@ -226,7 +218,7 @@ again: goto unlock; saw_one = true; - ret = __do_six_trylock_type(lock, lock_type, w->task, false); + ret = __do_six_trylock(lock, lock_type, w->task, false); if (ret <= 0) 
goto unlock; @@ -241,7 +233,7 @@ again: wake_up_process(task); } - clear_bit(waitlist_bitnr(lock_type), (unsigned long *) &lock->state.v); + six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); unlock: raw_spin_unlock(&lock->wait_lock); @@ -251,152 +243,198 @@ unlock: } } -static inline void six_lock_wakeup(struct six_lock *lock, - union six_lock_state state, - enum six_lock_type lock_type) +__always_inline +static void six_lock_wakeup(struct six_lock *lock, u32 state, + enum six_lock_type lock_type) { - if (lock_type == SIX_LOCK_write && state.read_lock) + if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) return; - if (!(state.waiters & (1 << lock_type))) + if (!(state & (SIX_LOCK_WAITING_read << lock_type))) return; __six_lock_wakeup(lock, lock_type); } -static bool do_six_trylock_type(struct six_lock *lock, - enum six_lock_type type, - bool try) +__always_inline +static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) { int ret; - ret = __do_six_trylock_type(lock, type, current, try); + ret = __do_six_trylock(lock, type, current, try); if (ret < 0) __six_lock_wakeup(lock, -ret - 1); return ret > 0; } -__always_inline __flatten -static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) +/** + * six_trylock_ip - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) { - if (!do_six_trylock_type(lock, type, true)) + if (!do_six_trylock(lock, type, true)) return false; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); return true; } - -__always_inline __flatten -static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, - unsigned seq) +EXPORT_SYMBOL_GPL(six_trylock_ip); + +/** + * six_relock_ip - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. 
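six_relock_ip() above supports the "unlock, do work, revalidate" pattern: record the sequence number while holding the lock, drop it, and later re-take it only if no write happened in between. A hedged usage sketch, assuming the per-type wrappers (six_lock_read(), six_relock_read(), six_unlock_read()) and six_lock_seq() from include/linux/six.h; struct obj is hypothetical:

#include <linux/six.h>

struct obj {
	struct six_lock	lock;
	/* ... guarded state ... */
};

static bool read_then_revalidate(struct obj *o)
{
	unsigned seq;

	six_lock_read(&o->lock, NULL, NULL);
	/* ... snapshot some guarded state ... */
	seq = six_lock_seq(&o->lock);
	six_unlock_read(&o->lock);

	/* work that must not be done while holding the lock */

	if (!six_relock_read(&o->lock, seq))
		return false;		/* a writer intervened; caller retries */

	/* seq unchanged: the earlier snapshot is still valid */
	six_unlock_read(&o->lock);
	return true;
}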
+ */ +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip) { - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state old; - u64 v; + if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) + return false; - EBUG_ON(type == SIX_LOCK_write); + if (six_lock_seq(lock) != seq) { + six_unlock_ip(lock, type, ip); + return false; + } - if (type == SIX_LOCK_read && - lock->readers) { - bool ret; + return true; +} +EXPORT_SYMBOL_GPL(six_relock_ip); - preempt_disable(); - this_cpu_inc(*lock->readers); +#ifdef CONFIG_LOCK_SPIN_ON_OWNER - smp_mb(); +static inline bool six_can_spin_on_owner(struct six_lock *lock) +{ + struct task_struct *owner; + bool ret; - old.v = READ_ONCE(lock->state.v); - ret = !(old.v & l[type].lock_fail) && old.seq == seq; + if (need_resched()) + return false; - this_cpu_sub(*lock->readers, !ret); - preempt_enable(); + rcu_read_lock(); + owner = READ_ONCE(lock->owner); + ret = !owner || owner_on_cpu(owner); + rcu_read_unlock(); + + return ret; +} +static inline bool six_spin_on_owner(struct six_lock *lock, + struct task_struct *owner, + u64 end_time) +{ + bool ret = true; + unsigned loop = 0; + + rcu_read_lock(); + while (lock->owner == owner) { /* - * Similar to the lock path, we may have caused a spurious write - * lock fail and need to issue a wakeup: + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. */ - if (old.write_locking) - six_lock_wakeup(lock, old, SIX_LOCK_write); - - if (ret) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); + barrier(); - return ret; - } + if (!owner_on_cpu(owner) || need_resched()) { + ret = false; + break; + } - v = READ_ONCE(lock->state.v); - do { - old.v = v; + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); + ret = false; + break; + } - if (old.seq != seq || old.v & l[type].lock_fail) - return false; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, - old.v + l[type].lock_val)) != old.v); + cpu_relax(); + } + rcu_read_unlock(); - six_set_owner(lock, type, old, current); - if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read); - return true; + return ret; } -/* - * We don't see stable performance with SIX_LOCK_SPIN_ON_OWNER enabled, so it's - * off for now: - */ -#ifdef SIX_LOCK_SPIN_ON_OWNER - -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait) +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) { - struct task_struct *owner, *task = current; + struct task_struct *task = current; + u64 end_time; - switch (wait->lock_want) { - case SIX_LOCK_read: - break; - case SIX_LOCK_intent: - if (lock->wait_list.next != &wait->list) - return false; - break; - case SIX_LOCK_write: + if (type == SIX_LOCK_write) return false; - } - rcu_read_lock(); - owner = READ_ONCE(lock->owner); + preempt_disable(); + if (!six_can_spin_on_owner(lock)) + goto fail; + + if (!osq_lock(&lock->osq)) + goto fail; + + end_time = sched_clock() + 10 * NSEC_PER_USEC; + + while (1) { + struct task_struct *owner; - while (owner && lock->owner == owner) { /* - * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. 
If it still matches, - * the rcu_read_lock() ensures the memory stays valid. + * If there's an owner, wait for it to either + * release the lock or go to sleep. */ - barrier(); + owner = READ_ONCE(lock->owner); + if (owner && !six_spin_on_owner(lock, owner, end_time)) + break; + + if (do_six_trylock(lock, type, false)) { + osq_unlock(&lock->osq); + preempt_enable(); + return true; + } /* - * If we're an RT task that will live-lock because we won't let + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let * the owner complete. */ - if (wait->lock_acquired || - !owner->on_cpu || - rt_task(task) || - need_resched()) + if (!owner && (need_resched() || rt_task(task))) break; + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ cpu_relax(); } - rcu_read_unlock(); - return wait->lock_acquired; + osq_unlock(&lock->osq); +fail: + preempt_enable(); + + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock again. This avoids getting + * scheduled out right after we obtained the lock. + */ + if (need_resched()) + schedule(); + + return false; } #else /* CONFIG_LOCK_SPIN_ON_OWNER */ -static inline bool six_optimistic_spin(struct six_lock *lock, - struct six_lock_waiter *wait) +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) { return false; } @@ -404,33 +442,36 @@ static inline bool six_optimistic_spin(struct six_lock *lock, #endif noinline -static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) +static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { - union six_lock_state old; int ret = 0; if (type == SIX_LOCK_write) { - EBUG_ON(lock->state.write_locking); - atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter); + EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + atomic_add(SIX_LOCK_HELD_write, &lock->state); smp_mb__after_atomic(); } - lock_contended(&lock->dep_map, _RET_IP_); + trace_contention_begin(lock, 0); + lock_contended(&lock->dep_map, ip); + + if (six_optimistic_spin(lock, type)) + goto out; wait->task = current; wait->lock_want = type; wait->lock_acquired = false; raw_spin_lock(&lock->wait_lock); - if (!(lock->state.waiters & (1 << type))) - set_bit(waitlist_bitnr(type), (unsigned long *) &lock->state.v); + six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); /* - * Retry taking the lock after taking waitlist lock, have raced with an - * unlock: + * Retry taking the lock after taking waitlist lock, in case we raced + * with an unlock: */ - ret = __do_six_trylock_type(lock, type, current, false); + ret = __do_six_trylock(lock, type, current, false); if (ret <= 0) { wait->start_time = local_clock(); @@ -457,9 +498,6 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty ret = 0; } - if (six_optimistic_spin(lock, wait)) - goto out; - while (1) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -473,7 +511,7 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty list_del(&wait->list); 
raw_spin_unlock(&lock->wait_lock); - if (wait->lock_acquired) + if (unlikely(wait->lock_acquired)) do_six_unlock_type(lock, type); break; } @@ -483,52 +521,73 @@ static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type ty __set_current_state(TASK_RUNNING); out: - if (ret && type == SIX_LOCK_write && lock->state.write_locking) { - old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1), - &lock->state.counter); - six_lock_wakeup(lock, old, SIX_LOCK_read); + if (ret && type == SIX_LOCK_write) { + six_clear_bitmask(lock, SIX_LOCK_HELD_write); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); } + trace_contention_end(lock, 0); return ret; } -__always_inline __flatten -static int __six_lock_type_waiter(struct six_lock *lock, enum six_lock_type type, - struct six_lock_waiter *wait, - six_lock_should_sleep_fn should_sleep_fn, void *p) +/** + * six_lock_ip_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * This is the most general six_lock() variant, with parameters to support full + * cycle detection for deadlock avoidance. + * + * The code calling this function must implement tracking of held locks, and the + * @wait object should be embedded into the struct that tracks held locks - + * which must also be accessible in a thread-safe way. + * + * @should_sleep_fn should invoke the cycle detector; it should walk each + * lock's waiters, and for each waiter recursively walk their held locks. + * + * When this function must block, @wait will be added to @lock's waitlist before + * calling trylock, and before calling @should_sleep_fn, and @wait will not be + * removed from the lock waitlist until the lock has been successfully acquired, + * or we abort. + * + * @wait.start_time will be monotonically increasing for any given waitlist, and + * thus may be used as a loop cursor. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) { int ret; wait->start_time = 0; if (type != SIX_LOCK_write) - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); - ret = do_six_trylock_type(lock, type, true) ? 0 - : __six_lock_type_slowpath(lock, type, wait, should_sleep_fn, p); + ret = do_six_trylock(lock, type, true) ? 
0 + : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); if (ret && type != SIX_LOCK_write) - six_release(&lock->dep_map); + six_release(&lock->dep_map, ip); if (!ret) - lock_acquired(&lock->dep_map, _RET_IP_); + lock_acquired(&lock->dep_map, ip); return ret; } +EXPORT_SYMBOL_GPL(six_lock_ip_waiter); __always_inline -static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, - six_lock_should_sleep_fn should_sleep_fn, void *p) -{ - struct six_lock_waiter wait; - - return __six_lock_type_waiter(lock, type, &wait, should_sleep_fn, p); -} - -__always_inline __flatten static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) { - const struct six_lock_vals l[] = LOCK_VALS; - union six_lock_state state; + u32 state; if (type == SIX_LOCK_intent) lock->owner = NULL; @@ -538,27 +597,47 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) smp_mb(); /* unlock barrier */ this_cpu_dec(*lock->readers); smp_mb(); /* between unlocking and checking for waiters */ - state.v = READ_ONCE(lock->state.v); + state = atomic_read(&lock->state); } else { - EBUG_ON(!(lock->state.v & l[type].held_mask)); - state.v = atomic64_add_return_release(l[type].unlock_val, - &lock->state.counter); + u32 v = l[type].lock_val; + + if (type != SIX_LOCK_read) + v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; + + EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); + state = atomic_sub_return_release(v, &lock->state); } six_lock_wakeup(lock, state, l[type].unlock_wakeup); } -__always_inline __flatten -static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) +/** + * six_unlock_ip - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. 
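The six_lock_ip_waiter() documentation above describes the full waitlist interface used for deadlock avoidance: the caller embeds the waiter in its own held-locks tracking structure and supplies a should_sleep_fn that runs the cycle detector. A hypothetical caller sketch, assuming the six_lock_should_sleep_fn signature from include/linux/six.h; my_trans and my_cycle_detected() are made up:

#include <linux/errno.h>
#include <linux/six.h>

struct my_trans {
	struct six_lock_waiter	wait;
	/* ... this thread's held-locks tracking ... */
};

static bool my_cycle_detected(struct my_trans *trans);

static int my_should_sleep(struct six_lock *lock, void *p)
{
	struct my_trans *trans = p;

	/* walk lock waiters and their held locks; nonzero aborts the lock attempt */
	return my_cycle_detected(trans) ? -EDEADLK : 0;
}

static int my_lock_intent(struct my_trans *trans, struct six_lock *lock)
{
	return six_lock_ip_waiter(lock, SIX_LOCK_intent, &trans->wait,
				  my_should_sleep, trans, _THIS_IP_);
}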
+ * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) { EBUG_ON(type == SIX_LOCK_write && - !(lock->state.v & __SIX_LOCK_HELD_intent)); + !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); EBUG_ON((type == SIX_LOCK_write || type == SIX_LOCK_intent) && lock->owner != current); if (type != SIX_LOCK_write) - six_release(&lock->dep_map); + six_release(&lock->dep_map, ip); + else + lock->seq++; if (type == SIX_LOCK_intent && lock->intent_lock_recurse) { @@ -568,48 +647,14 @@ static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) do_six_unlock_type(lock, type); } +EXPORT_SYMBOL_GPL(six_unlock_ip); -#define __SIX_LOCK(type) \ -bool six_trylock_##type(struct six_lock *lock) \ -{ \ - return __six_trylock_type(lock, SIX_LOCK_##type); \ -} \ -EXPORT_SYMBOL_GPL(six_trylock_##type); \ - \ -bool six_relock_##type(struct six_lock *lock, u32 seq) \ -{ \ - return __six_relock_type(lock, SIX_LOCK_##type, seq); \ -} \ -EXPORT_SYMBOL_GPL(six_relock_##type); \ - \ -int six_lock_##type(struct six_lock *lock, \ - six_lock_should_sleep_fn should_sleep_fn, void *p) \ -{ \ - return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ -} \ -EXPORT_SYMBOL_GPL(six_lock_##type); \ - \ -int six_lock_waiter_##type(struct six_lock *lock, \ - struct six_lock_waiter *wait, \ - six_lock_should_sleep_fn should_sleep_fn, void *p)\ -{ \ - return __six_lock_type_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p);\ -} \ -EXPORT_SYMBOL_GPL(six_lock_waiter_##type); \ - \ -void six_unlock_##type(struct six_lock *lock) \ -{ \ - __six_unlock_type(lock, SIX_LOCK_##type); \ -} \ -EXPORT_SYMBOL_GPL(six_unlock_##type); - -__SIX_LOCK(read) -__SIX_LOCK(intent) -__SIX_LOCK(write) - -#undef __SIX_LOCK - -/* Convert from intent to read: */ +/** + * six_lock_downgrade - convert an intent lock to a read lock + * @lock: lock to dowgrade + * + * @lock will have read count incremented and intent count decremented + */ void six_lock_downgrade(struct six_lock *lock) { six_lock_increment(lock, SIX_LOCK_read); @@ -617,25 +662,32 @@ void six_lock_downgrade(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_downgrade); +/** + * six_lock_tryupgrade - attempt to convert read lock to an intent lock + * @lock: lock to upgrade + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ bool six_lock_tryupgrade(struct six_lock *lock) { - union six_lock_state old, new; - u64 v = READ_ONCE(lock->state.v); + u32 old = atomic_read(&lock->state), new; do { - new.v = old.v = v; + new = old; - if (new.intent_lock) + if (new & SIX_LOCK_HELD_intent) return false; if (!lock->readers) { - EBUG_ON(!new.read_lock); - new.read_lock--; + EBUG_ON(!(new & SIX_LOCK_HELD_read)); + new -= l[SIX_LOCK_read].lock_val; } - new.intent_lock = 1; - } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, - old.v, new.v)) != old.v); + new |= SIX_LOCK_HELD_intent; + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); if (lock->readers) this_cpu_dec(*lock->readers); @@ -646,6 +698,17 @@ bool six_lock_tryupgrade(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_tryupgrade); +/** + * six_trylock_convert - attempt to convert a held lock from one type to another + * 
@lock: lock to upgrade + * @from: SIX_LOCK_read or SIX_LOCK_intent + * @to: SIX_LOCK_read or SIX_LOCK_intent + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ bool six_trylock_convert(struct six_lock *lock, enum six_lock_type from, enum six_lock_type to) @@ -664,15 +727,20 @@ bool six_trylock_convert(struct six_lock *lock, } EXPORT_SYMBOL_GPL(six_trylock_convert); -/* - * Increment read/intent lock count, assuming we already have it read or intent - * locked: +/** + * six_lock_increment - increase held lock count on a lock that is already held + * @lock: lock to increment + * @type: SIX_LOCK_read or SIX_LOCK_intent + * + * @lock must already be held, with a lock type that is greater than or equal to + * @type + * + * A corresponding six_unlock_type() call will be required for @lock to be fully + * unlocked. */ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) { - const struct six_lock_vals l[] = LOCK_VALS; - - six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read); + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); /* XXX: assert already locked, and that we don't overflow: */ @@ -681,13 +749,14 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) if (lock->readers) { this_cpu_inc(*lock->readers); } else { - EBUG_ON(!lock->state.read_lock && - !lock->state.intent_lock); - atomic64_add(l[type].lock_val, &lock->state.counter); + EBUG_ON(!(atomic_read(&lock->state) & + (SIX_LOCK_HELD_read| + SIX_LOCK_HELD_intent))); + atomic_add(l[type].lock_val, &lock->state); } break; case SIX_LOCK_intent: - EBUG_ON(!lock->state.intent_lock); + EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); lock->intent_lock_recurse++; break; case SIX_LOCK_write: @@ -697,9 +766,19 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) } EXPORT_SYMBOL_GPL(six_lock_increment); +/** + * six_lock_wakeup_all - wake up all waiters on @lock + * @lock: lock to wake up waiters for + * + * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then + * abort the lock operation. + * + * This function is never needed in a bug-free program; it's only useful in + * debug code, e.g. to determine if a cycle detector is at fault. + */ void six_lock_wakeup_all(struct six_lock *lock) { - union six_lock_state state = lock->state; + u32 state = atomic_read(&lock->state); struct six_lock_waiter *w; six_lock_wakeup(lock, state, SIX_LOCK_read); @@ -713,45 +792,102 @@ void six_lock_wakeup_all(struct six_lock *lock) } EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -void six_lock_pcpu_free(struct six_lock *lock) +/** + * six_lock_counts - return held lock counts, for each lock type + * @lock: lock to return counters for + * + * Return: the number of times a lock is held for read, intent and write. + */ +struct six_lock_count six_lock_counts(struct six_lock *lock) { - BUG_ON(lock->readers && pcpu_read_count(lock)); - BUG_ON(lock->state.read_lock); + struct six_lock_count ret; - free_percpu(lock->readers); - lock->readers = NULL; + ret.n[SIX_LOCK_read] = !lock->readers + ? 
atomic_read(&lock->state) & SIX_LOCK_HELD_read + : pcpu_read_count(lock); + ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + + lock->intent_lock_recurse; + ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + + return ret; } -EXPORT_SYMBOL_GPL(six_lock_pcpu_free); +EXPORT_SYMBOL_GPL(six_lock_counts); -void six_lock_pcpu_alloc(struct six_lock *lock) +/** + * six_lock_readers_add - directly manipulate reader count of a lock + * @lock: lock to add/subtract readers for + * @nr: reader count to add/subtract + * + * When an upper layer is implementing lock reentrency, we may have both read + * and intent locks on the same lock. + * + * When we need to take a write lock, the read locks will cause self-deadlock, + * because six locks themselves do not track which read locks are held by the + * current thread and which are held by a different thread - it does no + * per-thread tracking of held locks. + * + * The upper layer that is tracking held locks may however, if trylock() has + * failed, count up its own read locks, subtract them, take the write lock, and + * then re-add them. + * + * As in any other situation when taking a write lock, @lock must be held for + * intent one (or more) times, so @lock will never be left unlocked. + */ +void six_lock_readers_add(struct six_lock *lock, int nr) { -#ifdef __KERNEL__ - if (!lock->readers) - lock->readers = alloc_percpu(unsigned); -#endif + if (lock->readers) { + this_cpu_add(*lock->readers, nr); + } else { + EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); + /* reader count starts at bit 0 */ + atomic_add(nr, &lock->state); + } } -EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc); - -/* - * Returns lock held counts, for both read and intent +EXPORT_SYMBOL_GPL(six_lock_readers_add); + +/** + * six_lock_exit - release resources held by a lock prior to freeing + * @lock: lock to exit + * + * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is + * required to free the percpu read counts. 
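six_lock_readers_add() above exists for callers that do their own per-thread read-lock accounting: before taking a write lock that would self-deadlock on their own readers, they subtract those readers, lock, then add them back. A hypothetical sketch of that sequence, assuming the six_trylock_write()/six_lock_write() wrappers from include/linux/six.h; my_ctx and my_count_held_read_locks() are made up, and the lock is assumed to already be held for intent, as the comment requires:

#include <linux/six.h>

struct my_ctx;
int my_count_held_read_locks(struct my_ctx *, struct six_lock *);

static void my_lock_write(struct my_ctx *ctx, struct six_lock *lock)
{
	int readers;

	if (six_trylock_write(lock))
		return;

	readers = my_count_held_read_locks(ctx, lock);

	six_lock_readers_add(lock, -readers);
	six_lock_write(lock, NULL, NULL);	/* our own readers can no longer self-deadlock us */
	six_lock_readers_add(lock, readers);
}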
*/ -struct six_lock_count six_lock_counts(struct six_lock *lock) +void six_lock_exit(struct six_lock *lock) { - struct six_lock_count ret; + WARN_ON(lock->readers && pcpu_read_count(lock)); + WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); - ret.n[SIX_LOCK_read] = 0; - ret.n[SIX_LOCK_intent] = lock->state.intent_lock + lock->intent_lock_recurse; - ret.n[SIX_LOCK_write] = lock->state.seq & 1; + free_percpu(lock->readers); + lock->readers = NULL; +} +EXPORT_SYMBOL_GPL(six_lock_exit); - if (!lock->readers) - ret.n[SIX_LOCK_read] += lock->state.read_lock; - else { - int cpu; +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags) +{ + atomic_set(&lock->state, 0); + raw_spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *) lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif - for_each_possible_cpu(cpu) - ret.n[SIX_LOCK_read] += *per_cpu_ptr(lock->readers, cpu); + /* + * Don't assume that we have real percpu variables available in + * userspace: + */ +#ifdef __KERNEL__ + if (flags & SIX_LOCK_INIT_PCPU) { + /* + * We don't return an error here on memory allocation failure + * since percpu is an optimization, and locks will work with the + * same semantics in non-percpu mode: callers can check for + * failure if they wish by checking lock->readers, but generally + * will not want to treat it as an error. + */ + lock->readers = alloc_percpu(unsigned); } - - return ret; +#endif } -EXPORT_SYMBOL_GPL(six_lock_counts); +EXPORT_SYMBOL_GPL(__six_lock_init); diff --git a/linux/string_helpers.c b/linux/string_helpers.c index 29c498a..0810ca1 100644 --- a/linux/string_helpers.c +++ b/linux/string_helpers.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/mount.bcachefs b/mount.bcachefs new file mode 100755 index 0000000..5900232 --- /dev/null +++ b/mount.bcachefs @@ -0,0 +1,4 @@ +#!/bin/sh + +SDIR="$(readlink -f "$0")" +exec "${SDIR%/*}/bcachefs" mount "$@" diff --git a/nix/bcachefs-kernel.nix b/nix/bcachefs-kernel.nix deleted file mode 100644 index c937df4..0000000 --- a/nix/bcachefs-kernel.nix +++ /dev/null @@ -1,34 +0,0 @@ -{ lib -, fetchpatch -, fetchgit -, fetchFromGitHub -, buildLinux -, commit -, sha256 ? lib.fakeSha256 -, kernelVersion ? "5.13.0" -, kernelPatches ? [] # must always be defined in bcachefs' all-packages.nix entry because it's also a top-level attribute supplied by callPackage -, argsOverride ? {} -, versionString ? (builtins.substring 0 8 commit) -, ... 
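For the initialization interface added above, a minimal lifecycle sketch, assuming the six_lock_init() macro and per-type lock/unlock wrappers from include/linux/six.h; SIX_LOCK_INIT_PCPU selects the percpu-reader mode, which as the comment notes is only a real optimization in kernel builds:

#include <linux/six.h>

static void six_lifecycle_example(void)
{
	struct six_lock lock;

	six_lock_init(&lock, 0);	/* or SIX_LOCK_INIT_PCPU for percpu readers */

	six_lock_intent(&lock, NULL, NULL);
	six_lock_write(&lock, NULL, NULL);	/* write locks require intent to be held */
	six_unlock_write(&lock);
	six_unlock_intent(&lock);

	six_lock_exit(&lock);		/* frees the percpu counters, if any */
}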
-} @ args: - -buildLinux { - inherit kernelPatches; - - # pname = "linux"; - version = "${kernelVersion}-bcachefs-${versionString}"; - - modDirVersion = kernelVersion; - - - src = fetchFromGitHub { - name = "bcachefs-kernel-src"; - owner = "koverstreet"; - repo = "bcachefs"; - rev = commit; - inherit sha256; - }; - - extraConfig = "BCACHEFS_FS m"; - # NIX_DEBUG=5; -} \ No newline at end of file diff --git a/nix/bcachefs.rev.sha256 b/nix/bcachefs.rev.sha256 deleted file mode 100644 index 3f06215..0000000 --- a/nix/bcachefs.rev.sha256 +++ /dev/null @@ -1 +0,0 @@ -sha256-JsWrbuxrs047YKGES+r7mMfPdDWIMAGrg1fWi8qU4+A= \ No newline at end of file diff --git a/nix/fetchnix.nix b/nix/fetchnix.nix deleted file mode 100644 index 2f98788..0000000 --- a/nix/fetchnix.nix +++ /dev/null @@ -1,48 +0,0 @@ -# `builtins.fetchTarball` only accepts a `sha256` argument in Nix version 1.12 -# or later, so here we provide a function that can provide a compatible interface -# to Nix 1.11 or Nix 1.12 -# -# TODO FIXME: remove this sometime after Nix 1.12 goes stable - -{ url # URL of the nixpkgs tarball to download -, rev # The Git revision of nixpkgs to fetch -, sha256 # The SHA256 of the downloaded data -, system ? builtins.currentSystem # This is overridable if necessary -}: - -with { - ifThenElse = { bool, thenValue, elseValue }: ( - if bool then thenValue else elseValue); -}; - -ifThenElse { - bool = (0 <= builtins.compareVersions builtins.nixVersion "1.12"); - - # In Nix 1.12, we can just give a `sha256` to `builtins.fetchTarball`. - thenValue = (builtins.fetchTarball { inherit url sha256; }); - - # This hack should at least work for Nix 1.11 - elseValue = ( - (rec { - tarball = import { inherit url sha256; }; - builtin-paths = import ; - - script = builtins.toFile "nixpkgs-unpacker" '' - "$coreutils/mkdir" "$out" - cd "$out" - "$gzip" --decompress < "$tarball" | "$tar" -x --strip-components=1 - ''; - - nixpkgs = builtins.derivation { - name = "nixpkgs-${builtins.substring 0 6 rev}"; - - builder = builtins.storePath builtin-paths.shell; - args = [ script ]; - - inherit tarball system; - tar = builtins.storePath builtin-paths.tar; - gzip = builtins.storePath builtin-paths.gzip; - coreutils = builtins.storePath builtin-paths.coreutils; - }; - }).nixpkgs); -} diff --git a/nix/nixpkgs.json b/nix/nixpkgs.json deleted file mode 100644 index a5a11d0..0000000 --- a/nix/nixpkgs.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "url": "https://github.com/nixos/nixpkgs/archive/5ae883b8c3b04e0c4a9c92a5ab3c7c84b9942943.tar.gz", - "rev": "5ae883b8c3b04e0c4a9c92a5ab3c7c84b9942943", - "sha256": "1s2nhax586v2fax7r5qd1s3d2gdg25isva7k7r9pf9x9ay630cmb" -} diff --git a/nix/nixpkgs.nix b/nix/nixpkgs.nix deleted file mode 100644 index 0067366..0000000 --- a/nix/nixpkgs.nix +++ /dev/null @@ -1,9 +0,0 @@ -let - # Grab the versions we specified in the JSON file - nixpkgs = builtins.fromJSON (builtins.readFile ./nixpkgs.json); - - # Bootstrap a copy of nixpkgs, based on this. - src = import ./fetchnix.nix { inherit (nixpkgs) url rev sha256; }; - -# We use the default nixpkgs configuration during bootstrap. -in import src { config = {}; } diff --git a/nix/overlay.nix b/nix/overlay.nix deleted file mode 100644 index 8138f20..0000000 --- a/nix/overlay.nix +++ /dev/null @@ -1,28 +0,0 @@ -{ filter, self, ... 
}: -final: prev: { - bcachefs = { - tools = final.callPackage ../default.nix { - testWithValgrind = false; - filter = filter.lib; - versionString = self.version; - }; - toolsValgrind = final.bcachefs.tools.override { - testWithValgrind = true; - }; - toolsDebug = final.bcachefs.toolsValgrind.override { - debugMode = true; - }; - - bch_bindgen = final.callPackage ../rust-src/bch_bindgen {}; - - mount = final.callPackage ../rust-src/mount {}; - - kernelPackages = final.recurseIntoAttrs (final.linuxPackagesFor final.bcachefs.kernel); - kernel = final.callPackage ./bcachefs-kernel.nix { - commit = final.bcachefs.tools.bcachefs_revision; - # This needs to be recalculated for every revision change - sha256 = builtins.readFile ./bcachefs.rev.sha256; - kernelPatches = []; - }; - }; -} diff --git a/nix/update-nixpkgs.sh b/nix/update-nixpkgs.sh deleted file mode 100755 index 770d280..0000000 --- a/nix/update-nixpkgs.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env bash -set -e - -if [[ "x$1" == "x" ]]; then - echo "Must provide a revision argument" - echo "Usage:" - echo " ./update-nixpkgs.sh " - echo " ./update-nixpkgs.sh https://github.com/foo/nixpkgs " - exit 1 -fi - -if [[ "x$2" == "x" ]]; then - REV="$1" - URL="https://github.com/nixos/nixpkgs" -else - REV="$2" - URL="$1" -fi - -DOWNLOAD="$URL/archive/$REV.tar.gz" -echo "Updating to nixpkgs revision $REV from $URL" -SHA256=$(nix-prefetch-url "$DOWNLOAD") - -cat > nixpkgs.json < - 2022.11.15-1 +- NOTE: This binary RPM has been built directly from the bcachefs-tools + git tree with "make rpm" from the git hash indicated in the package version. +- Update spec file to allow in-tree rpm builds +- Remove maually added Requires: and unneeded build-requires + * Tue Jan 21 2020 Michael Adams - 2020.01.21-1 - Updated RPM package definition to reflect that changes in codebase have occurred. + * Tue Jan 07 2020 Michael Adams - 2020.01.07-1 - Initial RPM package definition - Makefile needs further work to accomodate RPM macros. 
diff --git a/rust-src/mount/.gitignore b/rust-src/.gitignore similarity index 100% rename from rust-src/mount/.gitignore rename to rust-src/.gitignore diff --git a/rust-src/mount/Cargo.lock b/rust-src/Cargo.lock similarity index 54% rename from rust-src/mount/Cargo.lock rename to rust-src/Cargo.lock index 92d13cf..c4dd7f5 100644 --- a/rust-src/mount/Cargo.lock +++ b/rust-src/Cargo.lock @@ -4,36 +4,27 @@ version = 3 [[package]] name = "aho-corasick" -version = "0.7.10" +version = "0.7.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" dependencies = [ "memchr", ] [[package]] -name = "ansi_term" -version = "0.11.0" +name = "android_system_properties" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" dependencies = [ - "winapi", -] - -[[package]] -name = "ansi_term" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" -dependencies = [ - "winapi", + "libc", ] [[package]] name = "anyhow" -version = "1.0.28" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9a60d744a80c30fcb657dfe2c1b22bcb3e814c1a1e3674f32bf5820b570fbff" +checksum = "2cb2f989d18dd141ab8ae82f64d1a8cdd37e0840f73a406896cf5e99502fab61" [[package]] name = "atty" @@ -41,39 +32,37 @@ version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ - "hermit-abi", + "hermit-abi 0.1.19", "libc", "winapi", ] [[package]] name = "autocfg" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] -name = "bcachefs-mount" +name = "bcachefs-rust" version = "0.3.1" dependencies = [ "anyhow", + "atty", "bch_bindgen", "byteorder", - "camino", + "chrono", "clap", + "colored", "either", "errno", "gag", "getset", "itertools", "libc", + "log", "parse-display", "rpassword", - "structopt", - "tracing", - "tracing-attributes", - "tracing-log", - "tracing-subscriber", "udev", "uuid", ] @@ -85,22 +74,23 @@ dependencies = [ "anyhow", "bindgen", "bitfield", + "bitflags", "byteorder", + "chrono", + "colored", "gag", "libc", "memoffset", + "paste", "pkg-config", - "tracing", - "tracing-attributes", "udev", "uuid", ] [[package]] name = "bindgen" -version = "0.59.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375" +version = "0.64.0" +source = "git+https://evilpiepirate.org/git/rust-bindgen.git#f773267b090bf16b9e8375fcbdcd8ba5e88806a8" dependencies = [ "bitflags", "cexpr", @@ -113,59 +103,48 @@ dependencies = [ "regex", "rustc-hash", "shlex", + "syn", ] [[package]] name = "bitfield" -version = "0.13.2" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" +checksum = 
"2d7e60934ceec538daadb9d8432424ed043a904d8e0243f3c6446bce549a46ac" [[package]] name = "bitflags" -version = "1.2.1" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] -name = "bitvec" -version = "0.19.5" +name = "bumpalo" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] +checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "byteorder" -version = "1.3.4" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] -name = "camino" -version = "1.0.5" +name = "cc" +version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52d74260d9bf6944e2208aa46841b4b8f0d7ffc0849a06837b2f510337f86b2b" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" [[package]] name = "cexpr" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ "nom", ] -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" - [[package]] name = "cfg-if" version = "1.0.0" @@ -174,21 +153,24 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.19" +version = "0.4.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" +checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" dependencies = [ - "libc", + "iana-time-zone", + "js-sys", "num-integer", "num-traits", + "time", + "wasm-bindgen", "winapi", ] [[package]] name = "clang-sys" -version = "1.2.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10612c0ec0e0a1ff0e97980647cb058a6e7aedb913d01d009c406b8b7d0b26ee" +checksum = "77ed9a53e5d4d9c573ae844bfac6872b159cb1d1585a83b29e7a64b7eef7332a" dependencies = [ "glob", "libc", @@ -196,31 +178,124 @@ dependencies = [ [[package]] name = "clap" -version = "2.33.0" +version = "4.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" +checksum = "f13b9c79b5d1dd500d20ef541215a6423c75829ef43117e1b4d17fd8af0b5d76" dependencies = [ - "ansi_term 0.11.0", - "atty", "bitflags", + "clap_derive", + "clap_lex", + "is-terminal", + "once_cell", "strsim", - "term_size", - "textwrap", + "termcolor", + "terminal_size", +] + +[[package]] +name = "clap_derive" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" +dependencies = [ + "heck", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + 
+[[package]] +name = "clap_lex" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" +dependencies = [ + "os_str_bytes", +] + +[[package]] +name = "codespan-reporting" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", "unicode-width", - "vec_map", +] + +[[package]] +name = "colored" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd" +dependencies = [ + "atty", + "lazy_static", + "winapi", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" + +[[package]] +name = "cxx" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc831ee6a32dd495436e317595e639a587aa9907bef96fe6e6abc290ab6204e9" +dependencies = [ + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94331d54f1b1a8895cd81049f7eaaaef9d05a7dcb4d1fd08bf3ff0806246789d" +dependencies = [ + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48dcd35ba14ca9b40d6e4b4b39961f23d835dbb8eed74565ded361d93e1feb8a" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bbeb29798b407ccd82a3324ade1a7286e0d29851475990b612670f6f5124d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] name = "either" -version = "1.5.3" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" +checksum = "7fcaabb2fef8c910e7f4c7ce9f67a1283a1715879a7c230ca9d6d1ae31f16d91" [[package]] name = "errno" -version = "0.2.5" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b480f641ccf0faf324e20c1d3e53d81b7484c698b42ea677f6907ae4db195371" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" dependencies = [ "errno-dragonfly", "libc", @@ -229,31 +304,34 @@ dependencies = [ [[package]] name = "errno-dragonfly" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14ca354e36190500e1e1fb267c647932382b54053c50b14970856c0b00a35067" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" dependencies = [ - "gcc", + "cc", "libc", ] +[[package]] +name = "fastrand" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499" +dependencies = [ + "instant", +] + [[package]] name = "filedescriptor" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" +checksum = 
"7199d965852c3bac31f779ef99cbb4537f80e952e2d6aa0ffeb30cce00f4f46e" dependencies = [ "libc", "thiserror", "winapi", ] -[[package]] -name = "funty" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" - [[package]] name = "gag" version = "1.0.0" @@ -265,56 +343,100 @@ dependencies = [ ] [[package]] -name = "gcc" -version = "0.3.55" +name = "getset" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f5f3913fa0bfe7ee1fd8248b6b9f42a5af4b9d65ec2dd2c3c26132b950ecfc2" +checksum = "e45727250e75cc04ff2846a66397da8ef2b3db8e40e0cef4df67950a07621eb9" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] [[package]] -name = "getrandom" -version = "0.1.14" +name = "glob" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ - "cfg-if 0.1.10", "libc", - "wasi", ] [[package]] -name = "getset" -version = "0.1.0" +name = "hermit-abi" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f62a139c59ae846c3964c392f12aac68f1997d1a40e9d3b40a89a4ab553e04a0" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" dependencies = [ - "proc-macro-error 0.4.12", - "proc-macro2", - "quote", - "syn", + "libc", ] [[package]] -name = "glob" -version = "0.3.0" +name = "iana-time-zone" +version = "0.1.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" +checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "winapi", +] [[package]] -name = "heck" -version = "0.3.1" +name = "iana-time-zone-haiku" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" dependencies = [ - "unicode-segmentation", + "cxx", + "cxx-build", ] [[package]] -name = "hermit-abi" +name = "instant" version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61565ff7aaace3525556587bd2dc31d4a07071957be715e63ce7b1eccf51a8f4" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" dependencies = [ "libc", + "windows-sys", +] + +[[package]] +name = "is-terminal" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189" +dependencies = [ + "hermit-abi 0.2.6", + "io-lifetimes", + "rustix", + "windows-sys", ] [[package]] @@ -327,10 +449,13 @@ dependencies = [ ] [[package]] -name = "itoa" -version = "0.4.8" +name = "js-sys" +version = "0.3.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" +checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +dependencies = [ + "wasm-bindgen", +] [[package]] name = "lazy_static" @@ -340,15 +465,15 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "lazycell" -version = "1.2.1" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b294d6fa9ee409a054354afc4352b0b9ef7ca222c69b8812cbea9e7d2bf3783f" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.69" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99e85c08494b21a9054e7fe1374a732aeadaff3980b6990b94bfd3a70f690005" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libudev-sys" @@ -361,55 +486,65 @@ dependencies = [ ] [[package]] -name = "log" -version = "0.4.8" +name = "link-cplusplus" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" +checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" dependencies = [ - "cfg-if 0.1.10", + "cc", ] [[package]] -name = "matchers" -version = "0.0.1" +name = "linux-raw-sys" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" + +[[package]] +name = "log" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ - "regex-automata", + "cfg-if", ] [[package]] name = "memchr" -version = "2.3.3" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" -version = "0.5.4" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8" +checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" dependencies = [ "autocfg", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "nom" -version = "6.2.1" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c5c51b9083a3c620fa67a2a635d1ce7d95b897e957d6b28ff9a5da960a103a6" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" dependencies = [ - "bitvec", - "funty", "memchr", - "version_check", + "minimal-lexical", ] [[package]] name = "num-integer" -version = "0.1.44" +version = "0.1.45" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ "autocfg", "num-traits", @@ -417,18 +552,30 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", ] +[[package]] +name = "once_cell" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" + +[[package]] +name = "os_str_bytes" +version = "6.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" + [[package]] name = "parse-display" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "718b422bc6b056b6374f7ffc3b2d9b55180a4af59a089835df1963994676d8b6" +checksum = "bd87725635cbae7fe960f91f55a114ed104e637790317cc8d9197ea16b058010" dependencies = [ "lazy_static", "parse-display-derive", @@ -437,9 +584,9 @@ dependencies = [ [[package]] name = "parse-display-derive" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7cf2deb364a60cc0f633c1ffe619b42463993c91352ae367010b8420e442655" +checksum = "cc52b391380aa8550348736a356bf028f5469391d580533a566e97543f55e813" dependencies = [ "lazy_static", "proc-macro2", @@ -450,184 +597,96 @@ dependencies = [ ] [[package]] -name = "peeking_take_while" -version = "0.1.2" +name = "paste" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba" [[package]] -name = "pin-project-lite" -version = "0.2.7" +name = "peeking_take_while" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] name = "pkg-config" -version = "0.3.17" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677" - -[[package]] -name = "ppv-lite86" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74490b50b9fbe561ac330df47c08f3f33073d2d00c150f719147d7c54522fa1b" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" [[package]] name = "proc-macro-error" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18f33027081eba0a6d8aba6d1b1c3a3be58cbb12106341c2d5759fcd9b5277e7" -dependencies = [ - "proc-macro-error-attr 0.4.12", - "proc-macro2", - "quote", - "syn", - "version_check", -] - -[[package]] -name = "proc-macro-error" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678" -dependencies = [ - "proc-macro-error-attr 1.0.2", - "proc-macro2", - "quote", - "syn", 
- "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "0.4.12" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a5b4b77fdb63c1eca72173d68d24501c54ab1269409f6b672c85deb18af69de" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ + "proc-macro-error-attr", "proc-macro2", "quote", "syn", - "syn-mid", "version_check", ] [[package]] name = "proc-macro-error-attr" -version = "1.0.2" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ "proc-macro2", "quote", - "syn", - "syn-mid", "version_check", ] [[package]] name = "proc-macro2" -version = "1.0.12" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8872cf6f48eee44265156c111456a700ab3483686b3f96df4cf5481c89157319" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" dependencies = [ - "unicode-xid", + "unicode-ident", ] [[package]] name = "quote" -version = "1.0.4" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1f4b0efa5fc5e8ceb705136bfee52cfdb6a4e3509f770b478cd6ed434232a7" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" dependencies = [ "proc-macro2", ] [[package]] -name = "radium" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" - -[[package]] -name = "rand" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" -dependencies = [ - "getrandom", - "libc", - "rand_chacha", - "rand_core", - "rand_hc", -] - -[[package]] -name = "rand_chacha" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rand_hc" -version = "0.2.0" +name = "redox_syscall" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ - "rand_core", + "bitflags", ] -[[package]] -name = "redox_syscall" -version = "0.1.56" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" - [[package]] name = "regex" -version = "1.3.7" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692" +checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" dependencies = [ "aho-corasick", "memchr", "regex-syntax", - "thread_local", -] - -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" -dependencies = [ - "regex-syntax", ] [[package]] name = "regex-syntax" -version = "0.6.17" +version = "0.6.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" [[package]] name = "remove_dir_all" -version = "0.5.2" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a83fa3702a688b9359eccba92d153ac33fd2e8462f9e0e3fdf155239ea7792e" +checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" dependencies = [ "winapi", ] @@ -649,36 +708,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] -name = "ryu" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" - -[[package]] -name = "serde" -version = "1.0.130" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" - -[[package]] -name = "serde_json" -version = "1.0.67" +name = "rustix" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ - "itoa", - "ryu", - "serde", + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", ] [[package]] -name = "sharded-slab" -version = "0.1.3" +name = "scratch" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "740223c51853f3145fe7c90360d2d4232f2b62e3449489c207eccde818979982" -dependencies = [ - "lazy_static", -] +checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" [[package]] name = "shlex" @@ -686,118 +733,70 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" -[[package]] -name = "smallvec" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe0f37c9e8f3c5a4a66ad655a93c74daac4ad00c441533bf5c6e7990bb42604e" - [[package]] name = "strsim" -version = "0.8.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - -[[package]] -name = "structopt" -version = "0.3.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf9d950ef167e25e0bdb073cf1d68e9ad2795ac826f2f3f59647817cf23c0bfa" -dependencies = [ - "clap", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134d838a2c9943ac3125cf6df165eda53493451b719f3255b2a26b85f772d0ba" -dependencies = [ - "heck", - "proc-macro-error 1.0.2", - "proc-macro2", - "quote", - "syn", -] +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "1.0.18" +version = "1.0.107" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"410a7488c0a728c7ceb4ad59b9567eb4053d02e8cc7f5c0e0eeeb39518369213" +checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] -[[package]] -name = "syn-mid" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - [[package]] name = "tempfile" -version = "3.1.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9" +checksum = "5cdb1ef4eaeeaddc8fbd371e5017057064af0911902ef36b39801f67cc6d79e4" dependencies = [ - "cfg-if 0.1.10", + "cfg-if", + "fastrand", "libc", - "rand", "redox_syscall", "remove_dir_all", "winapi", ] [[package]] -name = "term_size" -version = "0.3.2" +name = "termcolor" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e4129646ca0ed8f45d09b929036bafad5377103edd06e50bf574b353d2b08d9" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ - "libc", - "winapi", + "winapi-util", ] [[package]] -name = "textwrap" -version = "0.11.0" +name = "terminal_size" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +checksum = "cb20089a8ba2b69debd491f8d2d023761cbf196e999218c591fa1e7e15a21907" dependencies = [ - "term_size", - "unicode-width", + "rustix", + "windows-sys", ] [[package]] name = "thiserror" -version = "1.0.21" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "318234ffa22e0920fe9a40d7b8369b5f649d490980cf7aadcf1eb91594869b42" +checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.21" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cae2447b6282786c3493999f40a9be2a6ad20cb8bd268b0a0dbf5a065535c0ab" +checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" dependencies = [ "proc-macro2", "quote", @@ -805,165 +804,195 @@ dependencies = [ ] [[package]] -name = "thread_local" -version = "1.0.1" +name = "time" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" dependencies = [ - "lazy_static", + "libc", + "wasi", + "winapi", ] [[package]] -name = "tracing" -version = "0.1.26" +name = "udev" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09adeb8c97449311ccd28a427f96fb563e7fd31aabf994189879d9da2394b89d" +checksum = "4ebdbbd670373442a12fe9ef7aeb53aec4147a5a27a00bbc3ab639f08f48191a" dependencies = [ - "cfg-if 1.0.0", - "pin-project-lite", - "tracing-attributes", - "tracing-core", + "libc", + "libudev-sys", + "pkg-config", ] [[package]] -name = "tracing-attributes" -version = "0.1.15" +name = "unicode-ident" +version = "1.0.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "c42e6fa53307c8a17e4ccd4dc81cf5ec38db9209f59b222210375b54ee40d1e2" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" [[package]] -name = "tracing-core" -version = "0.1.19" +name = "unicode-width" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ca517f43f0fb96e0c3072ed5c275fe5eece87e8cb52f4a77b69226d3b1c9df8" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "uuid" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasm-bindgen" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" dependencies = [ - "lazy_static", + "cfg-if", + "wasm-bindgen-macro", ] [[package]] -name = "tracing-log" -version = "0.1.2" +name = "wasm-bindgen-backend" +version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6923477a48e41c1951f1999ef8bb5a3023eb723ceadafe78ffb65dc366761e3" +checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" dependencies = [ - "lazy_static", + "bumpalo", "log", - "tracing-core", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", ] [[package]] -name = "tracing-serde" -version = "0.1.2" +name = "wasm-bindgen-macro" +version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb65ea441fbb84f9f6748fd496cf7f63ec9af5bca94dd86456978d055e8eb28b" +checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" dependencies = [ - "serde", - "tracing-core", + "quote", + "wasm-bindgen-macro-support", ] [[package]] -name = "tracing-subscriber" -version = "0.2.20" +name = "wasm-bindgen-macro-support" +version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9cbe87a2fa7e35900ce5de20220a582a9483a7063811defce79d7cbd59d4cfe" +checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" dependencies = [ - "ansi_term 0.12.1", - "chrono", - "lazy_static", - "matchers", - "regex", - "serde", - "serde_json", - "sharded-slab", - "smallvec", - "thread_local", - "tracing", - "tracing-core", - "tracing-log", - "tracing-serde", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", ] [[package]] -name = "udev" -version = "0.4.0" +name = "wasm-bindgen-shared" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" + +[[package]] +name = "winapi" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24953d50a3bce0f5f5a9a2766567072dc9af8096f8c40ea81815da651066bc9f" +checksum = 
"5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ - "libc", - "libudev-sys", + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", ] [[package]] -name = "unicode-segmentation" -version = "1.6.0" +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] -name = "unicode-width" -version = "0.1.7" +name = "winapi-util" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] [[package]] -name = "unicode-xid" -version = "0.2.0" +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "uuid" -version = "0.8.1" +name = "windows-sys" +version = "0.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fde2f6a4bea1d6e007c4ad38c6839fa71cbb63b6dbf5b595aa38dc9b1093c11" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] [[package]] -name = "vec_map" -version = "0.8.1" +name = "windows_aarch64_gnullvm" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a" +checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" [[package]] -name = "version_check" -version = "0.9.1" +name = "windows_aarch64_msvc" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "078775d0255232fb988e6fccf26ddc9d1ac274299aaedcedce21c6f72cc533ce" +checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" [[package]] -name = "wasi" -version = "0.9.0+wasi-snapshot-preview1" +name = "windows_i686_gnu" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" +checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" [[package]] -name = "winapi" -version = "0.3.8" +name = "windows_i686_msvc" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] +checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" [[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" +name = "windows_x86_64_gnu" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" [[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" +name = 
"windows_x86_64_gnullvm" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" [[package]] -name = "wyz" -version = "0.2.0" +name = "windows_x86_64_msvc" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" +checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" diff --git a/rust-src/Cargo.toml b/rust-src/Cargo.toml new file mode 100644 index 0000000..92a3853 --- /dev/null +++ b/rust-src/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "bcachefs-rust" +version = "0.3.1" +authors = ["Yuxuan Shui ", "Kayla Firestack "] +edition = "2018" + +[lib] +crate-type = ["staticlib"] + +[dependencies] +atty = "0.2.14" +log = { version = "0.4", features = ["std"] } +chrono = "0.4" +colored = "2" +clap = { version = "4.0.32", features = ["derive", "wrap_help"] } +anyhow = "1.0" +libc = "0.2.69" +udev = "0.7.0" +uuid = "1.2.2" +gag = "1.0.0" +getset = "0.1" +itertools = "0.9" +parse-display = "0.1" +errno = "0.2" +either = "1.5" +rpassword = "4" +bch_bindgen = { path = "bch_bindgen" } +byteorder = "1.3" diff --git a/rust-src/mount/README.md b/rust-src/README.md similarity index 100% rename from rust-src/mount/README.md rename to rust-src/README.md diff --git a/rust-src/bch_bindgen/Cargo.lock b/rust-src/bch_bindgen/Cargo.lock index 2138d33..b274197 100644 --- a/rust-src/bch_bindgen/Cargo.lock +++ b/rust-src/bch_bindgen/Cargo.lock @@ -2,17 +2,37 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anyhow" -version = "1.0.44" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224afbd727c3d6e4b90103ece64b8d1b67fbb1973b1046c2281eed3f3803f800" + +[[package]] +name = "atty" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61604a8f862e1d5c3229fdd78f8b02c68dcf73a4c4b05fd636d12240aaa242c1" +checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" +dependencies = [ + "hermit-abi", + "libc", + "winapi", +] [[package]] name = "autocfg" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "bch_bindgen" @@ -21,22 +41,23 @@ dependencies = [ "anyhow", "bindgen", "bitfield", + "bitflags", "byteorder", + "chrono", + "colored", "gag", "libc", "memoffset", + "paste", "pkg-config", - "tracing", - "tracing-attributes", "udev", "uuid", ] [[package]] name = "bindgen" -version = "0.59.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453c49e5950bb0eb63bb3df640e31618846c89d5b7faa54040d76e98e0134375" +version = "0.64.0" +source = "git+https://evilpiepirate.org/git/rust-bindgen.git#f773267b090bf16b9e8375fcbdcd8ba5e88806a8" dependencies = [ "bitflags", "cexpr", @@ -49,13 +70,14 @@ dependencies = [ "regex", "rustc-hash", "shlex", + "syn", ] [[package]] name = "bitfield" -version = 
"0.13.2" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719" +checksum = "2d7e60934ceec538daadb9d8432424ed043a904d8e0243f3c6446bce549a46ac" [[package]] name = "bitflags" @@ -64,16 +86,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] -name = "bitvec" -version = "0.19.5" +name = "bumpalo" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" -dependencies = [ - "funty", - "radium", - "tap", - "wyz", -] +checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "byteorder" @@ -81,11 +97,17 @@ version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + [[package]] name = "cexpr" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db507a7679252d2276ed0dd8113c6875ec56d3089f9225b2b42c30cc1f8e5c89" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" dependencies = [ "nom", ] @@ -96,32 +118,142 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-integer", + "num-traits", + "time", + "wasm-bindgen", + "winapi", +] + [[package]] name = "clang-sys" -version = "1.2.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10612c0ec0e0a1ff0e97980647cb058a6e7aedb913d01d009c406b8b7d0b26ee" +checksum = "77ed9a53e5d4d9c573ae844bfac6872b159cb1d1585a83b29e7a64b7eef7332a" dependencies = [ "glob", "libc", ] [[package]] -name = "filedescriptor" -version = "0.8.1" +name = "codespan-reporting" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", + "unicode-width", +] + +[[package]] +name = "colored" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed3d8a5e20435ff00469e51a0d82049bae66504b5c429920dadf9bb54d47b3f" +checksum = "b3616f750b84d8f0de8a58bda93e08e2a81ad3f523089b05f1dffecab48c6cbd" dependencies = [ + "atty", + "lazy_static", + "winapi", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" + +[[package]] +name = "cxx" +version = "1.0.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86d3488e7665a7a483b57e25bdd90d0aeb2bc7608c8d0346acf2ad3f1caf1d62" +dependencies = [ + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = 
"1.0.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48fcaf066a053a41a81dfb14d57d99738b767febb8b735c3016e469fac5da690" +dependencies = [ + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2", + "quote", + "scratch", + "syn", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2ef98b8b717a829ca5603af80e1f9e2e48013ab227b68ef37872ef84ee479bf" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "086c685979a698443656e5cf7856c95c642295a38599f12fb1ff76fb28d19892" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", "libc", - "thiserror", "winapi", ] [[package]] -name = "funty" -version = "1.1.0" +name = "errno-dragonfly" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "filedescriptor" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7199d965852c3bac31f779ef99cbb4537f80e952e2d6aa0ffeb30cce00f4f46e" +dependencies = [ + "libc", + "thiserror", + "winapi", +] [[package]] name = "gag" @@ -134,21 +266,71 @@ dependencies = [ ] [[package]] -name = "getrandom" -version = "0.2.3" +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + +[[package]] +name = "hermit-abi" +version = "0.1.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" +dependencies = [ + "libc", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +dependencies = [ + "cxx", + "cxx-build", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ "cfg-if", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1abeb7a0dd0f8181267ff8adc397075586500b81b28a73e8a0208b00fc170fb3" +dependencies = [ "libc", - "wasi", + "windows-sys 
0.45.0", ] [[package]] -name = "glob" -version = "0.3.0" +name = "js-sys" +version = "0.3.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" +checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +dependencies = [ + "wasm-bindgen", +] [[package]] name = "lazy_static" @@ -164,9 +346,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" [[package]] name = "libc" -version = "0.2.103" +version = "0.2.139" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8f7255a17a627354f321ef0055d63b898c6fb27eff628af4d1b66b7331edf6" +checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79" [[package]] name = "libudev-sys" @@ -179,158 +361,170 @@ dependencies = [ ] [[package]] -name = "memchr" -version = "2.3.4" +name = "link-cplusplus" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" +checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +dependencies = [ + "cc", +] [[package]] -name = "memoffset" -version = "0.5.6" +name = "linux-raw-sys" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "043175f069eda7b85febe4a74abbaeff828d9f8b448515d3151a14a3542811aa" -dependencies = [ - "autocfg", -] +checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" [[package]] -name = "nom" -version = "6.2.1" +name = "log" +version = "0.4.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c5c51b9083a3c620fa67a2a635d1ce7d95b897e957d6b28ff9a5da960a103a6" +checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ - "bitvec", - "funty", - "memchr", - "version_check", + "cfg-if", ] [[package]] -name = "peeking_take_while" -version = "0.1.2" +name = "memchr" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] -name = "pin-project-lite" -version = "0.2.7" +name = "memoffset" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d31d11c69a6b52a174b42bdc0c30e5e11670f90788b2c471c31c1d17d449443" +checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +dependencies = [ + "autocfg", +] [[package]] -name = "pkg-config" -version = "0.3.20" +name = "minimal-lexical" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c9b1041b4387893b91ee6746cddfc28516aff326a3519fb2adf820932c5e6cb" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] -name = "ppv-lite86" -version = "0.2.10" +name = "nom" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] [[package]] -name = "proc-macro2" -version = "1.0.29" +name = "num-integer" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" +checksum = 
"225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" dependencies = [ - "unicode-xid", + "autocfg", + "num-traits", ] [[package]] -name = "quote" -version = "1.0.9" +name = "num-traits" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" +checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ - "proc-macro2", + "autocfg", ] [[package]] -name = "radium" -version = "0.5.3" +name = "once_cell" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8" +checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" [[package]] -name = "rand" -version = "0.8.4" +name = "paste" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", - "rand_hc", -] +checksum = "d01a5bd0424d00070b0098dd17ebca6f961a959dead1dbcbbbc1d1cd8d3deeba" [[package]] -name = "rand_chacha" -version = "0.3.1" +name = "peeking_take_while" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] +checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" [[package]] -name = "rand_core" -version = "0.6.3" +name = "pkg-config" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" +checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" + +[[package]] +name = "proc-macro2" +version = "1.0.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d727cae5b39d21da60fa540906919ad737832fe0b1c165da3a34d6548c849d6" dependencies = [ - "getrandom", + "unicode-ident", ] [[package]] -name = "rand_hc" -version = "0.3.1" +name = "quote" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" +checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" dependencies = [ - "rand_core", + "proc-macro2", ] [[package]] name = "redox_syscall" -version = "0.2.10" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.5.4" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" +checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733" dependencies = [ "regex-syntax", ] [[package]] name = "regex-syntax" -version = "0.6.25" +version = "0.6.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848" + +[[package]] +name = "rustc-hash" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" [[package]] -name = "remove_dir_all" -version = "0.5.3" +name = "rustix" +version = "0.36.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7" +checksum = "f43abb88211988493c1abb44a70efa56ff0ce98f233b7b276146f1f3f7ba9644" dependencies = [ - "winapi", + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys 0.45.0", ] [[package]] -name = "rustc-hash" -version = "1.1.0" +name = "scratch" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" [[package]] name = "shlex" @@ -340,49 +534,51 @@ checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" [[package]] name = "syn" -version = "1.0.77" +version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5239bc68e0fef57495900cfea4e8dc75596d9a319d7e16b1e0a440d24e6fe0a0" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", - "unicode-xid", + "unicode-ident", ] -[[package]] -name = "tap" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" - [[package]] name = "tempfile" -version = "3.2.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dac1c663cfc93810f88aed9b8941d48cabf856a1b111c29a40439018d870eb22" +checksum = "af18f7ae1acd354b992402e9ec5864359d693cd8a79dcbef59f76891701c1e95" dependencies = [ "cfg-if", - "libc", - "rand", + "fastrand", "redox_syscall", - "remove_dir_all", - "winapi", + "rustix", + "windows-sys 0.42.0", +] + +[[package]] +name = "termcolor" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +dependencies = [ + "winapi-util", ] [[package]] name = "thiserror" -version = "1.0.29" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88" +checksum = "6a9cd18aa97d5c45c6603caea1da6628790b37f7a34b6ca89522331c5180fed0" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.29" +version = "1.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c" +checksum = "1fb327af4685e4d03fa8cbcf1716380da910eeb2bb8be417e7f9fd3fb164f36f" dependencies = [ "proc-macro2", "quote", @@ -390,70 +586,104 @@ dependencies = [ ] [[package]] -name = "tracing" -version = "0.1.28" +name = "time" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84f96e095c0c82419687c20ddf5cb3eadb61f4e1405923c9dc8e53a1adacbda8" +checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" dependencies = [ - "cfg-if", - "pin-project-lite", - "tracing-attributes", - "tracing-core", + "libc", + "wasi", + "winapi", ] [[package]] -name = "tracing-attributes" -version = "0.1.16" +name = "udev" +version = "0.7.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "98863d0dd09fa59a1b79c6750ad80dbda6b75f4e71c437a6a1a8cb91a8bcbd77" +checksum = "4ebdbbd670373442a12fe9ef7aeb53aec4147a5a27a00bbc3ab639f08f48191a" dependencies = [ - "proc-macro2", - "quote", - "syn", + "libc", + "libudev-sys", + "pkg-config", ] [[package]] -name = "tracing-core" -version = "0.1.20" +name = "unicode-ident" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46125608c26121c81b0c6d693eab5a420e416da7e43c426d2e8f7df8da8a3acf" -dependencies = [ - "lazy_static", -] +checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" [[package]] -name = "udev" -version = "0.4.0" +name = "unicode-width" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24953d50a3bce0f5f5a9a2766567072dc9af8096f8c40ea81815da651066bc9f" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "uuid" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1674845326ee10d37ca60470760d4288a6f80f304007d92e5c53bab78c9cfd79" + +[[package]] +name = "wasi" +version = "0.10.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" + +[[package]] +name = "wasm-bindgen" +version = "0.2.84" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" dependencies = [ - "libc", - "libudev-sys", + "cfg-if", + "wasm-bindgen-macro", ] [[package]] -name = "unicode-xid" -version = "0.2.2" +name = "wasm-bindgen-backend" +version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" +checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] [[package]] -name = "uuid" -version = "0.8.2" +name = "wasm-bindgen-macro" +version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7" +checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] [[package]] -name = "version_check" -version = "0.9.3" +name = "wasm-bindgen-macro-support" +version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" +checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] [[package]] -name = "wasi" -version = "0.10.2+wasi-snapshot-preview1" +name = "wasm-bindgen-shared" +version = "0.2.84" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" +checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" [[package]] name = "winapi" @@ -471,6 +701,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = 
"0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +dependencies = [ + "winapi", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" @@ -478,7 +717,82 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] -name = "wyz" -version = "0.2.0" +name = "windows-sys" +version = "0.42.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2522491fbfcd58cc84d47aeb2958948c4b8982e9a2d8a2a35bbaed431390e7" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7" + +[[package]] +name = "windows_i686_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640" + +[[package]] +name = "windows_i686_msvc" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214" +checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd" diff --git a/rust-src/bch_bindgen/Cargo.toml b/rust-src/bch_bindgen/Cargo.toml index 91cc77f..cb341ee 100644 --- a/rust-src/bch_bindgen/Cargo.toml +++ b/rust-src/bch_bindgen/Cargo.toml @@ -9,18 +9,19 @@ crate-type = ["lib"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -tracing = "0.1.26" +chrono = "0.4" +colored = "2" anyhow = "1.0" -udev = "0.4" -uuid = "0.8" -bitfield = "0.13" -memoffset = "0.5" +udev = "0.7.0" +uuid = "1.2.2" 
+bitfield = "0.14.0" +memoffset = "0.8.0" byteorder = "1.3" -tracing-attributes = "0.1.15" libc = "0.2.69" gag = "1.0.0" - +bitflags = "1.3.2" +paste = "1.0.11" [build-dependencies] pkg-config = "0.3" -bindgen = { version = "0.59.1", default-features = false } +bindgen = { git = "https://evilpiepirate.org/git/rust-bindgen.git", default-features = false } diff --git a/rust-src/bch_bindgen/build.rs b/rust-src/bch_bindgen/build.rs index fd570db..92ec3ce 100644 --- a/rust-src/bch_bindgen/build.rs +++ b/rust-src/bch_bindgen/build.rs @@ -1,74 +1,109 @@ + +#[derive(Debug)] +pub struct Fix753 {} +impl bindgen::callbacks::ParseCallbacks for Fix753 { + fn item_name(&self, original_item_name: &str) -> Option { + Some(original_item_name.trim_start_matches("Fix753_").to_owned()) + } +} + fn main() { - use std::path::PathBuf; - // use std::process::Command; + use std::path::PathBuf; - let out_dir: PathBuf = std::env::var_os("OUT_DIR").expect("ENV Var 'OUT_DIR' Expected").into(); - let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR") - .expect("ENV Var 'CARGO_MANIFEST_DIR' Expected") - .into(); - let libbcachefs_inc_dir = - std::env::var("LIBBCACHEFS_INCLUDE").unwrap_or_else(|_| top_dir.join("libbcachefs").display().to_string()); - let libbcachefs_inc_dir = std::path::Path::new(&libbcachefs_inc_dir); - println!("{}", libbcachefs_inc_dir.display()); + let out_dir: PathBuf = std::env::var_os("OUT_DIR") + .expect("ENV Var 'OUT_DIR' Expected") + .into(); + let top_dir: PathBuf = std::env::var_os("CARGO_MANIFEST_DIR") + .expect("ENV Var 'CARGO_MANIFEST_DIR' Expected") + .into(); - println!("cargo:rustc-link-lib=dylib=bcachefs"); - println!("cargo:rustc-link-search={}", env!("LIBBCACHEFS_LIB")); + let libbcachefs_inc_dir = std::path::Path::new("../.."); - let _libbcachefs_dir = top_dir.join("libbcachefs").join("libbcachefs"); - let bindings = bindgen::builder() - .header(top_dir.join("src").join("libbcachefs_wrapper.h").display().to_string()) - .clang_arg(format!("-I{}", libbcachefs_inc_dir.join("include").display())) - .clang_arg(format!("-I{}", libbcachefs_inc_dir.display())) - .clang_arg("-DZSTD_STATIC_LINKING_ONLY") - .clang_arg("-DNO_BCACHEFS_FS") - .clang_arg("-D_GNU_SOURCE") - .derive_debug(true) - .derive_default(true) - .derive_eq(true) - .layout_tests(true) - .default_enum_style(bindgen::EnumVariation::Rust { non_exhaustive: true }) - .allowlist_function(".*bch2_.*") - // .allowlist_function("bch2_read_super") - // .allowlist_function("bch2_sb_field_.*") - // .allowlist_function("bch2_super_write") - // .allowlist_function("bch2_chacha_encrypt_key") - // .allowlist_function("__bch2_super_read") - .allowlist_function("bio_.*") - .allowlist_function("bch2_super_write_fd") - .allowlist_function("derive_passphrase") - .allowlist_function("request_key") - .allowlist_function("add_key") - .allowlist_function("keyctl_search") - .blocklist_type("bch_extent_ptr") - .blocklist_type("btree_node") - .blocklist_type("bch_extent_crc32") - .blocklist_type("rhash_lock_head") - .blocklist_type("srcu_struct") - .allowlist_var("BCH_.*") - .allowlist_var("KEY_SPEC_.*") - .allowlist_type("bch_kdf_types") - .allowlist_type("bch_sb_field_.*") - .allowlist_type("bch_encrypted_key") - .allowlist_type("nonce") - .newtype_enum("bch_kdf_types") - .opaque_type("gendisk") - .opaque_type("bkey") - // .opaque_type("bch_extent_ptr") - // .opaque_type("bch_extent_crc32") - .opaque_type("open_bucket.*") - .generate() - .expect("BindGen Generation Failiure: [libbcachefs_wrapper]"); - bindings - 
.write_to_file(out_dir.join("bcachefs.rs")) - .expect("Writing to output file failed for: `bcachefs.rs`"); + let _libbcachefs_dir = top_dir.join("libbcachefs").join("libbcachefs"); + let bindings = bindgen::builder() + .header( + top_dir + .join("src") + .join("libbcachefs_wrapper.h") + .display() + .to_string(), + ) + .clang_arg(format!( + "-I{}", + libbcachefs_inc_dir.join("include").display() + )) + .clang_arg(format!("-I{}", libbcachefs_inc_dir.display())) + .clang_arg("-DZSTD_STATIC_LINKING_ONLY") + .clang_arg("-DNO_BCACHEFS_FS") + .clang_arg("-D_GNU_SOURCE") + .clang_arg("-fkeep-inline-functions") + .derive_debug(true) + .derive_default(true) + .layout_tests(true) + .default_enum_style(bindgen::EnumVariation::Rust { + non_exhaustive: true, + }) + .allowlist_function(".*bch2_.*") + .allowlist_function("bio_.*") + .allowlist_function("derive_passphrase") + .allowlist_function("request_key") + .allowlist_function("add_key") + .allowlist_function("keyctl_search") + .allowlist_function("match_string") + .allowlist_function("printbuf.*") + .blocklist_type("bch_extent_ptr") + .blocklist_type("btree_node") + .blocklist_type("bch_extent_crc32") + .blocklist_type("rhash_lock_head") + .blocklist_type("srcu_struct") + .allowlist_var("BCH_.*") + .allowlist_var("KEY_SPEC_.*") + .allowlist_var("Fix753_FMODE_.*") + .allowlist_var("bch.*") + .allowlist_var("__BTREE_ITER.*") + .allowlist_var("BTREE_ITER.*") + .blocklist_item("bch2_bkey_ops") + .allowlist_type("bch_.*") + .allowlist_type("fsck_err_opts") + .rustified_enum("fsck_err_opts") + .allowlist_type("nonce") + .no_debug("bch_replicas_padded") + .newtype_enum("bch_kdf_types") + .rustified_enum("bch_key_types") + .opaque_type("gendisk") + .opaque_type("gc_stripe") + .opaque_type("open_bucket.*") + .opaque_type("replicas_delta_list") + .no_copy("btree_trans") + .no_copy("printbuf") + .no_partialeq("bkey") + .no_partialeq("bpos") + .generate_inline_functions(true) + .parse_callbacks(Box::new(Fix753 {})) + .generate() + .expect("BindGen Generation Failiure: [libbcachefs_wrapper]"); + bindings + .write_to_file(out_dir.join("bcachefs.rs")) + .expect("Writing to output file failed for: `bcachefs.rs`"); - let keyutils = pkg_config::probe_library("libkeyutils").expect("Failed to find keyutils lib"); - let bindings = bindgen::builder() - .header(top_dir.join("src").join("keyutils_wrapper.h").display().to_string()) - .clang_args(keyutils.include_paths.iter().map(|p| format!("-I{}", p.display()))) - .generate() - .expect("BindGen Generation Failiure: [Keyutils]"); - bindings - .write_to_file(out_dir.join("keyutils.rs")) - .expect("Writing to output file failed for: `keyutils.rs`"); + let keyutils = pkg_config::probe_library("libkeyutils").expect("Failed to find keyutils lib"); + let bindings = bindgen::builder() + .header( + top_dir + .join("src") + .join("keyutils_wrapper.h") + .display() + .to_string(), + ) + .clang_args( + keyutils + .include_paths + .iter() + .map(|p| format!("-I{}", p.display())), + ) + .generate() + .expect("BindGen Generation Failiure: [Keyutils]"); + bindings + .write_to_file(out_dir.join("keyutils.rs")) + .expect("Writing to output file failed for: `keyutils.rs`"); } diff --git a/rust-src/bch_bindgen/default.nix b/rust-src/bch_bindgen/default.nix deleted file mode 100644 index f6053d5..0000000 --- a/rust-src/bch_bindgen/default.nix +++ /dev/null @@ -1,76 +0,0 @@ -{ lib -, stdenv -, rustPlatform -, llvmPackages -, bcachefs -, pkg-config - -, udev -, liburcu -, zstd -, keyutils -, libaio - -, lz4 # liblz4 -, libsodium -, libuuid -, 
zlib # zlib1g -, libscrypt - -, rustfmt - -, glibc -, ... -}: let - include = { - glibc = "${glibc.dev}/include"; - clang = let libc = llvmPackages.libclang; in - "${libc.lib}/lib/clang/${libc.version}/include"; - urcu = "${liburcu}/include"; - zstd = "${zstd.dev}/include"; - }; - cargo = lib.trivial.importTOML ./Cargo.toml; -in rustPlatform.buildRustPackage { - pname = cargo.package.name; - version = cargo.package.version; - - src = builtins.path { path = ./.; name = "bch_bindgen"; }; - - cargoLock = { lockFile = ./Cargo.lock; }; - - nativeBuildInputs = [ rustfmt pkg-config ]; - buildInputs = [ - - # libaio - keyutils # libkeyutils - lz4 # liblz4 - libsodium - liburcu - libuuid - zstd # libzstd - zlib # zlib1g - udev - libscrypt - libaio - ]; - - LIBBCACHEFS_LIB ="${bcachefs.tools}/lib"; - LIBBCACHEFS_INCLUDE = bcachefs.tools.src; - LIBCLANG_PATH = "${llvmPackages.libclang.lib}/lib"; - BINDGEN_EXTRA_CLANG_ARGS = lib.replaceStrings ["\n" "\t"] [" " ""] '' - -std=gnu99 - -I${include.glibc} - -I${include.clang} - -I${include.urcu} - -I${include.zstd} - ''; - - postPatch = '' - cp ${./Cargo.lock} Cargo.lock - ''; - - - doCheck = true; - - # NIX_DEBUG = 4; -} \ No newline at end of file diff --git a/rust-src/bch_bindgen/rustfmt.toml b/rust-src/bch_bindgen/rustfmt.toml index a2b7f32..42f2ad7 100644 --- a/rust-src/bch_bindgen/rustfmt.toml +++ b/rust-src/bch_bindgen/rustfmt.toml @@ -1,2 +1,3 @@ -max_width=120 -hard_tabs = true +# Default settings, i.e. idiomatic rust +edition = "2021" +newline_style = "Unix" \ No newline at end of file diff --git a/rust-src/bch_bindgen/src/bcachefs.rs b/rust-src/bch_bindgen/src/bcachefs.rs index cc98ffc..fa8dbde 100644 --- a/rust-src/bch_bindgen/src/bcachefs.rs +++ b/rust-src/bch_bindgen/src/bcachefs.rs @@ -7,116 +7,104 @@ include!(concat!(env!("OUT_DIR"), "/bcachefs.rs")); use bitfield::bitfield; bitfield! { - pub struct bch_scrypt_flags(u64); - pub N, _: 15, 0; - pub R, _: 31, 16; - pub P, _: 47, 32; + pub struct bch_scrypt_flags(u64); + pub N, _: 15, 0; + pub R, _: 31, 16; + pub P, _: 47, 32; } bitfield! { - pub struct bch_crypt_flags(u64); - pub TYPE, _: 4, 0; + pub struct bch_crypt_flags(u64); + pub TYPE, _: 4, 0; } use memoffset::offset_of; impl bch_sb_field_crypt { - pub fn scrypt_flags(&self) -> Option { - use std::convert::TryInto; - match bch_kdf_types(bch_crypt_flags(self.flags).TYPE().try_into().ok()?) { - bch_kdf_types::BCH_KDF_SCRYPT => Some(bch_scrypt_flags(self.kdf_flags)), - _ => None, - } - } - pub fn key(&self) -> &bch_encrypted_key { - &self.key - } + pub fn scrypt_flags(&self) -> Option { + use std::convert::TryInto; + match bch_kdf_types(bch_crypt_flags(self.flags).TYPE().try_into().ok()?) 
{ + bch_kdf_types::BCH_KDF_SCRYPT => Some(bch_scrypt_flags(self.kdf_flags)), + _ => None, + } + } + pub fn key(&self) -> &bch_encrypted_key { + &self.key + } } impl PartialEq for bch_sb { - fn eq(&self, other: &Self) -> bool { - self.magic.b == other.magic.b - && self.user_uuid.b == other.user_uuid.b - && self.block_size == other.block_size - && self.version == other.version - && self.uuid.b == other.uuid.b - && self.seq == other.seq - } -} - -impl std::fmt::Debug for bch_sb { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("bch_sb") - .field("uuid", &self.uuid()) - .field("version", &(self.version, self.version_min)) - .field("block_size", &self.block_size) - .field("device_idx", &self.dev_idx) - .field("seq", &self.seq) - .field("csum", &(self.csum.lo, self.csum.hi)) - .field("offset", &self.offset) - .finish_non_exhaustive() + fn eq(&self, other: &Self) -> bool { + self.magic.b == other.magic.b + && self.user_uuid.b == other.user_uuid.b + && self.block_size == other.block_size + && self.version == other.version + && self.uuid.b == other.uuid.b + && self.seq == other.seq } } - impl bch_sb { - pub fn crypt(&self) -> Option<&bch_sb_field_crypt> { - unsafe { - let ptr = bch2_sb_field_get(self as *const _ as *mut _, bch_sb_field_type::BCH_SB_FIELD_crypt) as *const u8; - if ptr.is_null() { - None - } else { - let offset = offset_of!(bch_sb_field_crypt, field); - Some(&*((ptr.sub(offset)) as *const _)) - } - } - } - pub fn uuid(&self) -> uuid::Uuid { - uuid::Uuid::from_bytes(self.user_uuid.b) - } + pub fn crypt(&self) -> Option<&bch_sb_field_crypt> { + unsafe { + let ptr = bch2_sb_field_get( + self as *const _ as *mut _, + bch_sb_field_type::BCH_SB_FIELD_crypt, + ) as *const u8; + if ptr.is_null() { + None + } else { + let offset = offset_of!(bch_sb_field_crypt, field); + Some(&*((ptr.sub(offset)) as *const _)) + } + } + } + pub fn uuid(&self) -> uuid::Uuid { + uuid::Uuid::from_bytes(self.user_uuid.b) + } - /// Get the nonce used to encrypt the superblock - pub fn nonce(&self) -> nonce { - use byteorder::{LittleEndian, ReadBytesExt}; - let mut internal_uuid = &self.uuid.b[..]; - let dword1 = internal_uuid.read_u32::().unwrap(); - let dword2 = internal_uuid.read_u32::().unwrap(); - nonce { - d: [0, 0, dword1, dword2], - } - } + /// Get the nonce used to encrypt the superblock + pub fn nonce(&self) -> nonce { + use byteorder::{LittleEndian, ReadBytesExt}; + let mut internal_uuid = &self.uuid.b[..]; + let dword1 = internal_uuid.read_u32::().unwrap(); + let dword2 = internal_uuid.read_u32::().unwrap(); + nonce { + d: [0, 0, dword1, dword2], + } + } } impl bch_sb_handle { - pub fn sb(&self) -> &bch_sb { - unsafe { &*self.sb } - } + pub fn sb(&self) -> &bch_sb { + unsafe { &*self.sb } + } - pub fn bdev(&self) -> &block_device { - unsafe { &*self.bdev } - } + pub fn bdev(&self) -> &block_device { + unsafe { &*self.bdev } + } } #[repr(C)] // #[repr(align(8))] #[derive(Debug, Default, Copy, Clone)] pub struct bch_extent_ptr { - pub _bitfield_1: __BindgenBitfieldUnit<[u8; 8usize]>, + pub _bitfield_1: __BindgenBitfieldUnit<[u8; 8usize]>, } #[repr(C, packed(8))] pub struct btree_node { - pub csum: bch_csum, - pub magic: __le64, - pub flags: __le64, - pub min_key: bpos, - pub max_key: bpos, - pub _ptr: bch_extent_ptr, - pub format: bkey_format, - pub __bindgen_anon_1: btree_node__bindgen_ty_1, + pub csum: bch_csum, + pub magic: __le64, + pub flags: __le64, + pub min_key: bpos, + pub max_key: bpos, + pub _ptr: bch_extent_ptr, + pub format: bkey_format, + pub 
__bindgen_anon_1: btree_node__bindgen_ty_1, } #[repr(C, packed(8))] // #[repr(align(8))] #[derive(Debug, Default, Copy, Clone)] pub struct bch_extent_crc32 { - pub _bitfield_1: __BindgenBitfieldUnit<[u8; 4usize]>, - pub csum: __u32, + pub _bitfield_1: __BindgenBitfieldUnit<[u8; 4usize]>, + pub csum: __u32, } // #[repr(u8)] diff --git a/rust-src/bch_bindgen/src/bkey.rs b/rust-src/bch_bindgen/src/bkey.rs new file mode 100644 index 0000000..64697ea --- /dev/null +++ b/rust-src/bch_bindgen/src/bkey.rs @@ -0,0 +1,121 @@ +#![allow(non_camel_case_types)] + +use crate::c; +use crate::fs::Fs; +use crate::btree::BtreeIter; +use crate::printbuf_to_formatter; +use std::fmt; +use std::marker::PhantomData; +use std::mem::transmute; + +pub struct BkeySC<'a> { + pub k: &'a c::bkey, + pub v: &'a c::bch_val, + pub(crate) iter: PhantomData<&'a mut BtreeIter<'a>> +} + +pub enum BkeyValC<'a> { + deleted, + whiteout, + error, + cookie(&'a c::bch_cookie), + hash_whiteout(&'a c::bch_hash_whiteout), + btree_ptr(&'a c::bch_btree_ptr), + extent(&'a c::bch_extent), + reservation(&'a c::bch_reservation), + inode(&'a c::bch_inode), + inode_generation(&'a c::bch_inode_generation), + dirent(&'a c::bch_dirent), + xattr(&'a c::bch_xattr), + alloc(&'a c::bch_alloc), + quota(&'a c::bch_quota), + stripe(&'a c::bch_stripe), + reflink_p(&'a c::bch_reflink_p), + reflink_v(&'a c::bch_reflink_v), + inline_data(&'a c::bch_inline_data), + btree_ptr_v2(&'a c::bch_btree_ptr_v2), + indirect_inline_data(&'a c::bch_indirect_inline_data), + alloc_v2(&'a c::bch_alloc_v2), + subvolume(&'a c::bch_subvolume), + snapshot(&'a c::bch_snapshot), + inode_v2(&'a c::bch_inode_v2), + alloc_v3(&'a c::bch_alloc_v3), + set, + lru(&'a c::bch_lru), + alloc_v4(&'a c::bch_alloc_v4), + backpointer(&'a c::bch_backpointer), + inode_v3(&'a c::bch_inode_v3), + bucket_gens(&'a c::bch_bucket_gens), + snapshot_tree(&'a c::bch_snapshot_tree), +} + +impl<'a, 'b> BkeySC<'a> { + unsafe fn to_raw(&self) -> c::bkey_s_c { + c::bkey_s_c { k: self.k, v: self.v } + } + + pub fn to_text(&'a self, fs: &'b Fs) -> BkeySCToText<'a, 'b> { + BkeySCToText { k: self, fs } + } + + pub fn v(&'a self) -> BkeyValC { + let ty: c::bch_bkey_type = unsafe { transmute(self.k.type_ as u32) }; + + use c::bch_bkey_type::*; + use BkeyValC::*; + match ty { + KEY_TYPE_deleted => deleted, + KEY_TYPE_whiteout => whiteout, + KEY_TYPE_error => error, + KEY_TYPE_cookie => cookie(unsafe { transmute(self.v) }), + KEY_TYPE_hash_whiteout => hash_whiteout(unsafe { transmute(self.v) }), + KEY_TYPE_btree_ptr => btree_ptr(unsafe { transmute(self.v) }), + KEY_TYPE_extent => extent(unsafe { transmute(self.v) }), + KEY_TYPE_reservation => reservation(unsafe { transmute(self.v) }), + KEY_TYPE_inode => inode(unsafe { transmute(self.v) }), + KEY_TYPE_inode_generation => inode_generation(unsafe { transmute(self.v) }), + KEY_TYPE_dirent => dirent(unsafe { transmute(self.v) }), + KEY_TYPE_xattr => xattr(unsafe { transmute(self.v) }), + KEY_TYPE_alloc => alloc(unsafe { transmute(self.v) }), + KEY_TYPE_quota => quota(unsafe { transmute(self.v) }), + KEY_TYPE_stripe => stripe(unsafe { transmute(self.v) }), + KEY_TYPE_reflink_p => reflink_p(unsafe { transmute(self.v) }), + KEY_TYPE_reflink_v => reflink_v(unsafe { transmute(self.v) }), + KEY_TYPE_inline_data => inline_data(unsafe { transmute(self.v) }), + KEY_TYPE_btree_ptr_v2 => btree_ptr_v2(unsafe { transmute(self.v) }), + KEY_TYPE_indirect_inline_data => indirect_inline_data(unsafe { transmute(self.v) }), + KEY_TYPE_alloc_v2 => alloc_v2(unsafe { transmute(self.v) }), + 
KEY_TYPE_subvolume => subvolume(unsafe { transmute(self.v) }), + KEY_TYPE_snapshot => snapshot(unsafe { transmute(self.v) }), + KEY_TYPE_inode_v2 => inode_v2(unsafe { transmute(self.v) }), + KEY_TYPE_alloc_v3 => inode_v3(unsafe { transmute(self.v) }), + KEY_TYPE_set => set, + KEY_TYPE_lru => lru(unsafe { transmute(self.v) }), + KEY_TYPE_alloc_v4 => alloc_v4(unsafe { transmute(self.v) }), + KEY_TYPE_backpointer => backpointer(unsafe { transmute(self.v) }), + KEY_TYPE_inode_v3 => inode_v3(unsafe { transmute(self.v) }), + KEY_TYPE_bucket_gens => bucket_gens(unsafe { transmute(self.v) }), + KEY_TYPE_snapshot_tree => snapshot_tree(unsafe { transmute(self.v) }), + KEY_TYPE_MAX => unreachable!(), + } + } +} + +impl<'a> From<&'a c::bkey_i> for BkeySC<'a> { + fn from(k: &'a c::bkey_i) -> Self { + BkeySC { k: &k.k, v: &k.v, iter: PhantomData } + } +} + +pub struct BkeySCToText<'a, 'b> { + k: &'a BkeySC<'a>, + fs: &'b Fs, +} + +impl<'a, 'b> fmt::Display for BkeySCToText<'a, 'b> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + unsafe { + printbuf_to_formatter(f, |buf| c::bch2_bkey_val_to_text(buf, self.fs.raw, self.k.to_raw())) + } + } +} diff --git a/rust-src/bch_bindgen/src/btree.rs b/rust-src/bch_bindgen/src/btree.rs new file mode 100644 index 0000000..32b4e74 --- /dev/null +++ b/rust-src/bch_bindgen/src/btree.rs @@ -0,0 +1,202 @@ +use crate::SPOS_MAX; +use crate::c; +use crate::bkey::BkeySC; +use crate::fs::Fs; +use crate::errcode::{bch_errcode, errptr_to_result_c}; +use crate::printbuf_to_formatter; +use std::fmt; +use std::marker::PhantomData; +use std::mem::MaybeUninit; +use std::ptr; +use bitflags::bitflags; + +pub struct BtreeTrans<'f> { + raw: c::btree_trans, + fs: PhantomData<&'f Fs> +} + +impl<'f> BtreeTrans<'f> { + pub fn new(fs: &'f Fs) -> BtreeTrans { + unsafe { + let mut trans: MaybeUninit = MaybeUninit::uninit(); + + c::__bch2_trans_init(&mut (*trans.as_mut_ptr()), fs.raw, 0); + BtreeTrans { raw: trans.assume_init(), fs: PhantomData } + } + } +} + +impl<'f> Drop for BtreeTrans<'f> { + fn drop(&mut self) { + unsafe { c::bch2_trans_exit(&mut self.raw) } + } +} + +bitflags! 
{ + pub struct BtreeIterFlags: u16 { + const SLOTS = c::BTREE_ITER_SLOTS as u16; + const ALL_LEVELS = c::BTREE_ITER_ALL_LEVELS as u16; + const INTENT = c::BTREE_ITER_INTENT as u16; + const PREFETCH = c::BTREE_ITER_PREFETCH as u16; + const IS_EXTENTS = c::BTREE_ITER_IS_EXTENTS as u16; + const NOT_EXTENTS = c::BTREE_ITER_NOT_EXTENTS as u16; + const CACHED = c::BTREE_ITER_CACHED as u16; + const KEY_CACHED = c::BTREE_ITER_WITH_KEY_CACHE as u16; + const WITH_UPDATES = c::BTREE_ITER_WITH_UPDATES as u16; + const WITH_JOURNAL = c::BTREE_ITER_WITH_JOURNAL as u16; + const __ALL_SNAPSHOTS = c::__BTREE_ITER_ALL_SNAPSHOTS as u16; + const ALL_SNAPSHOTS = c::BTREE_ITER_ALL_SNAPSHOTS as u16; + const FILTER_SNAPSHOTS = c::BTREE_ITER_FILTER_SNAPSHOTS as u16; + const NOPRESERVE = c::BTREE_ITER_NOPRESERVE as u16; + const CACHED_NOFILL = c::BTREE_ITER_CACHED_NOFILL as u16; + const KEY_CACHE_FILL = c::BTREE_ITER_KEY_CACHE_FILL as u16; + } +} + +pub struct BtreeIter<'t> { + raw: c::btree_iter, + trans: PhantomData<&'t BtreeTrans<'t>>, +} + +impl<'t> BtreeIter<'t> { + pub fn new(trans: &'t BtreeTrans<'t>, btree: c::btree_id, pos: c::bpos, flags: BtreeIterFlags) -> BtreeIter<'t> { + unsafe { + let mut iter: MaybeUninit = MaybeUninit::uninit(); + + c::bch2_trans_iter_init_outlined( + ptr::addr_of!(trans.raw).cast_mut(), + iter.as_mut_ptr(), + btree as u32, + pos, + flags.bits as u32); + + BtreeIter { raw: iter.assume_init(), trans: PhantomData } + } + } + + pub fn peek_upto<'i>(&'i mut self, end: c::bpos) -> Result, bch_errcode> { + unsafe { + let k = c::bch2_btree_iter_peek_upto(&mut self.raw, end); + errptr_to_result_c(k.k) + .map(|_| if !k.k.is_null() { Some(BkeySC { k: &*k.k, v: &*k.v, iter: PhantomData }) } else { None } ) + } + } + + pub fn peek(&mut self) -> Result, bch_errcode> { + self.peek_upto(SPOS_MAX) + } + + pub fn peek_and_restart(&mut self) -> Result, bch_errcode> { + unsafe { + let k = c::bch2_btree_iter_peek_and_restart_outlined(&mut self.raw); + + errptr_to_result_c(k.k) + .map(|_| if !k.k.is_null() { Some(BkeySC{ k: &*k.k, v: &*k.v, iter: PhantomData }) } else { None } ) + } + } + + pub fn advance(&mut self) { + unsafe { + c::bch2_btree_iter_advance(&mut self.raw); + } + } +} + +impl<'t> Drop for BtreeIter<'t> { + fn drop(&mut self) { + unsafe { c::bch2_trans_iter_exit(self.raw.trans, &mut self.raw) } + } +} + +pub struct BtreeNodeIter<'t> { + raw: c::btree_iter, + trans: PhantomData<&'t BtreeTrans<'t>>, +} + +impl<'t> BtreeNodeIter<'t> { + pub fn new(trans: &'t BtreeTrans<'t>, + btree: c::btree_id, + pos: c::bpos, + locks_want: u32, + depth: u32, + flags: BtreeIterFlags) -> BtreeNodeIter { + unsafe { + let mut iter: MaybeUninit = MaybeUninit::uninit(); + c::bch2_trans_node_iter_init( + ptr::addr_of!(trans.raw).cast_mut(), + iter.as_mut_ptr(), + btree, + pos, + locks_want, + depth, + flags.bits as u32); + + BtreeNodeIter { raw: iter.assume_init(), trans: PhantomData } + } + } + + pub fn peek<'i>(&'i mut self) -> Result, bch_errcode> { + unsafe { + let b = c::bch2_btree_iter_peek_node(&mut self.raw); + errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None }) + } + } + + pub fn peek_and_restart<'i>(&'i mut self) -> Result, bch_errcode> { + unsafe { + let b = c::bch2_btree_iter_peek_node_and_restart(&mut self.raw); + errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None }) + } + } + + pub fn advance<'i>(&'i mut self) { + unsafe { + c::bch2_btree_iter_next_node(&mut self.raw); + } + } + + pub fn next<'i>(&'i mut self) -> Result, bch_errcode> { + unsafe { + let b = 
c::bch2_btree_iter_next_node(&mut self.raw); + errptr_to_result_c(b).map(|b| if !b.is_null() { Some(&*b) } else { None }) + } + } +} + +impl<'t> Drop for BtreeNodeIter<'t> { + fn drop(&mut self) { + unsafe { c::bch2_trans_iter_exit(self.raw.trans, &mut self.raw) } + } +} + +impl<'b, 'f> c::btree { + pub fn to_text(&'b self, fs: &'f Fs) -> BtreeNodeToText<'b, 'f> { + BtreeNodeToText { b: &self, fs } + } + + pub fn ondisk_to_text(&'b self, fs: &'f Fs) -> BtreeNodeOndiskToText<'b, 'f> { + BtreeNodeOndiskToText { b: &self, fs } + } +} + +pub struct BtreeNodeToText<'b, 'f> { + b: &'b c::btree, + fs: &'f Fs, +} + +impl<'b, 'f> fmt::Display for BtreeNodeToText<'b, 'f> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + printbuf_to_formatter(f, |buf| unsafe { c::bch2_btree_node_to_text(buf, self.fs.raw, self.b) }) + } +} + +pub struct BtreeNodeOndiskToText<'b, 'f> { + b: &'b c::btree, + fs: &'f Fs, +} + +impl<'b, 'f> fmt::Display for BtreeNodeOndiskToText<'b, 'f> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + printbuf_to_formatter(f, |buf| unsafe { c::bch2_btree_node_ondisk_to_text(buf, self.fs.raw, self.b) }) + } +} diff --git a/rust-src/bch_bindgen/src/errcode.rs b/rust-src/bch_bindgen/src/errcode.rs new file mode 100644 index 0000000..4d75f1d --- /dev/null +++ b/rust-src/bch_bindgen/src/errcode.rs @@ -0,0 +1,40 @@ +use crate::bcachefs; +use std::ffi::CStr; +use std::fmt; + +pub use crate::c::bch_errcode; + +impl fmt::Display for bch_errcode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let s = unsafe { CStr::from_ptr(bcachefs::bch2_err_str(*self as i32)) }; + write!(f, "{:?}", s) + } +} + +/* Can we make a function generic over ptr constness? */ + +pub fn errptr_to_result(p: *mut T) -> Result<*mut T, bch_errcode> { + let addr = p as usize; + let max_err: isize = -4096; + if addr > max_err as usize { + let addr = addr as i32; + let err: bch_errcode = unsafe { std::mem::transmute(-addr) }; + Err(err) + } else { + Ok(p) + } +} + +pub fn errptr_to_result_c(p: *const T) -> Result<*const T, bch_errcode> { + let addr = p as usize; + let max_err: isize = -4096; + if addr > max_err as usize { + let addr = addr as i32; + let err: bch_errcode = unsafe { std::mem::transmute(-addr) }; + Err(err) + } else { + Ok(p) + } +} + +impl std::error::Error for bch_errcode {} diff --git a/rust-src/bch_bindgen/src/fs.rs b/rust-src/bch_bindgen/src/fs.rs new file mode 100644 index 0000000..b26c51b --- /dev/null +++ b/rust-src/bch_bindgen/src/fs.rs @@ -0,0 +1,27 @@ +use std::ffi::CString; +use std::os::unix::ffi::OsStrExt; +use std::path::PathBuf; +use crate::c; +use crate::errcode::{bch_errcode, errptr_to_result}; + +pub struct Fs { + pub raw: *mut c::bch_fs, +} + +impl Fs { + pub fn open(devs: &Vec, opts: c::bch_opts) -> Result { + let devs: Vec<_> = devs.iter() + .map(|i| CString::new(i.as_os_str().as_bytes()).unwrap().into_raw()) + .collect(); + + let ret = unsafe { c::bch2_fs_open(devs[..].as_ptr(), devs.len() as u32, opts) }; + + errptr_to_result(ret).map(|fs| Fs { raw: fs}) + } +} + +impl Drop for Fs { + fn drop(&mut self) { + unsafe { c::bch2_fs_stop(self.raw) } + } +} diff --git a/rust-src/bch_bindgen/src/lib.rs b/rust-src/bch_bindgen/src/lib.rs index c19b5a2..73aeef6 100644 --- a/rust-src/bch_bindgen/src/lib.rs +++ b/rust-src/bch_bindgen/src/lib.rs @@ -1,7 +1,168 @@ pub mod bcachefs; +pub mod btree; +pub mod bkey; +pub mod errcode; pub mod keyutils; pub mod rs; +pub mod fs; +pub mod opts; +pub use paste::paste; pub mod c { - pub use crate::bcachefs::*; + pub use 
crate::bcachefs::*; +} + +use c::bpos as Bpos; + +pub const fn spos(inode: u64, offset: u64, snapshot: u32) -> Bpos { + Bpos { inode, offset, snapshot } +} + +pub const fn pos(inode: u64, offset: u64) -> Bpos { + spos(inode, offset, 0) +} + +pub const POS_MIN: Bpos = spos(0, 0, 0); +pub const POS_MAX: Bpos = spos(u64::MAX, u64::MAX, 0); +pub const SPOS_MAX: Bpos = spos(u64::MAX, u64::MAX, u32::MAX); + +use std::cmp::Ordering; + +impl PartialEq for Bpos { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl Eq for Bpos {} + +impl PartialOrd for Bpos { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Bpos { + fn cmp(&self, other: &Self) -> Ordering { + let l_inode = self.inode; + let r_inode = other.inode; + let l_offset = self.offset; + let r_offset = other.offset; + let l_snapshot = self.snapshot; + let r_snapshot = other.snapshot; + + l_inode.cmp(&r_inode) + .then(l_offset.cmp(&r_offset)) + .then(l_snapshot.cmp(&r_snapshot)) + } +} + +use std::ffi::CStr; +use std::fmt; + +impl fmt::Display for c::btree_id { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let s = unsafe { CStr::from_ptr(*c::bch2_btree_ids.get_unchecked(*self as usize)) }; + let s = s.to_str().unwrap(); + write!(f, "{}", s) + } +} + +use std::str::FromStr; +use std::ffi::CString; + +use std::error::Error; + +#[derive(Debug)] +pub struct InvalidBtreeId; + +impl fmt::Display for InvalidBtreeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "invalid btree id") + } +} + +impl Error for InvalidBtreeId { +} + +impl FromStr for c::btree_id { + type Err = InvalidBtreeId; + + fn from_str(s: &str) -> Result { + let s = CString::new(s).unwrap(); + let p = s.as_ptr(); + + let v = unsafe {c::match_string(c::bch2_btree_ids[..].as_ptr(), (-(1 as isize)) as usize, p)}; + if v >= 0 { + Ok(unsafe { std::mem::transmute(v) }) + } else { + Err(InvalidBtreeId) + } + } +} + +impl c::printbuf { + fn new() -> c::printbuf { + let mut buf: c::printbuf = Default::default(); + + buf.set_heap_allocated(true); + buf + } +} + +impl Drop for c::printbuf { + fn drop(&mut self) { + unsafe { c::bch2_printbuf_exit(self) } + } +} + +impl fmt::Display for Bpos { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut buf = c::printbuf::new(); + + unsafe { c::bch2_bpos_to_text(&mut buf, *self) }; + + let s = unsafe { CStr::from_ptr(buf.buf) }; + let s = s.to_str().unwrap(); + write!(f, "{}", s) + } +} + +impl FromStr for c::bpos { + type Err = InvalidBtreeId; + + fn from_str(s: &str) -> Result { + if s == "POS_MIN" { + return Ok(POS_MIN); + } + + if s == "POS_MAX" { + return Ok(POS_MAX); + } + + if s == "SPOS_MAX" { + return Ok(SPOS_MAX); + } + + let mut fields = s.split(':'); + let ino_str = fields.next().ok_or(InvalidBtreeId)?; + let off_str = fields.next().ok_or(InvalidBtreeId)?; + let snp_str = fields.next(); + + let ino: u64 = ino_str.parse().map_err(|_| InvalidBtreeId)?; + let off: u64 = off_str.parse().map_err(|_| InvalidBtreeId)?; + let snp: u32 = snp_str.map(|s| s.parse().ok()).flatten().unwrap_or(0); + + Ok(c::bpos { inode: ino, offset: off, snapshot: snp }) + } +} + +pub fn printbuf_to_formatter(f: &mut fmt::Formatter<'_>, func: F) -> fmt::Result + where F: Fn(*mut c::printbuf) { + let mut buf = c::printbuf::new(); + + func(&mut buf); + + let s = unsafe { CStr::from_ptr(buf.buf) }; + f.write_str(s.to_str().unwrap()) } diff --git a/rust-src/bch_bindgen/src/libbcachefs_wrapper.h 
b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h index 2a0e702..e7bcfcf 100644 --- a/rust-src/bch_bindgen/src/libbcachefs_wrapper.h +++ b/rust-src/bch_bindgen/src/libbcachefs_wrapper.h @@ -1,8 +1,20 @@ #include "../libbcachefs/super-io.h" #include "../libbcachefs/checksum.h" #include "../libbcachefs/bcachefs_format.h" +#include "../libbcachefs/btree_cache.h" +#include "../libbcachefs/btree_iter.h" +#include "../libbcachefs/debug.h" +#include "../libbcachefs/errcode.h" +#include "../libbcachefs/error.h" #include "../libbcachefs/opts.h" #include "../libbcachefs.h" #include "../crypto.h" #include "../include/linux/bio.h" +#include "../include/linux/blkdev.h" + +#define MARK_FIX_753(req_name) const fmode_t Fix753_##req_name = req_name; + +MARK_FIX_753(FMODE_READ); +MARK_FIX_753(FMODE_WRITE); +MARK_FIX_753(FMODE_EXCL); \ No newline at end of file diff --git a/rust-src/bch_bindgen/src/opts.rs b/rust-src/bch_bindgen/src/opts.rs new file mode 100644 index 0000000..d38d469 --- /dev/null +++ b/rust-src/bch_bindgen/src/opts.rs @@ -0,0 +1,35 @@ +#[macro_export] +macro_rules! opt_set { + ($opts:ident, $n:ident, $v:expr) => { + bch_bindgen::paste! { + $opts.$n = $v; + $opts.[](1) + } + }; +} + +#[macro_export] +macro_rules! opt_defined { + ($opts:ident, $n:ident) => { + bch_bindgen::paste! { + $opts.[< $n _defined>]() + } + }; +} + +#[macro_export] +macro_rules! opt_get { + ($opts:ident, $n:ident) => { + if bch_bindgen::opt_defined!($opts, $n) == 0 { + bch_bindgen::paste! { + unsafe { + bch_bindgen::bcachefs::bch2_opts_default.$n + } + } + } else { + bch_bindgen::paste! { + $opts.$n + } + } + }; +} diff --git a/rust-src/bch_bindgen/src/rs.rs b/rust-src/bch_bindgen/src/rs.rs index 4452f0b..24594ae 100644 --- a/rust-src/bch_bindgen/src/rs.rs +++ b/rust-src/bch_bindgen/src/rs.rs @@ -1,58 +1,29 @@ +use anyhow::anyhow; use crate::bcachefs; - -pub const SUPERBLOCK_MAGIC: uuid::Uuid = uuid::Uuid::from_u128( - 0x_c68573f6_4e1a_45ca_8265_f57f48ba6d81 -); - -extern "C" { - pub static stdout: *mut libc::FILE; -} - -pub enum ReadSuperErr { - Io(std::io::Error), -} - -type RResult = std::io::Result>; - -#[tracing_attributes::instrument(skip(opts))] -pub fn read_super_opts(path: &std::path::Path, mut opts: bcachefs::bch_opts) -> RResult { - // let devp = camino::Utf8Path::from_path(devp).unwrap(); - - use std::os::unix::ffi::OsStrExt; - let path = std::ffi::CString::new(path.as_os_str().as_bytes())?; - - let mut sb = std::mem::MaybeUninit::zeroed(); - - // use gag::{BufferRedirect}; - // // Stop libbcachefs from spamming the output - // let gag = BufferRedirect::stderr().unwrap(); - // tracing::trace!("entering libbcachefs"); - - let ret = unsafe { crate::bcachefs::bch2_read_super(path.as_ptr(), &mut opts, sb.as_mut_ptr()) }; - tracing::trace!(%ret); - - match -ret { - libc::EACCES => Err(std::io::Error::new( - std::io::ErrorKind::PermissionDenied, - "Access Permission Denied", - )), - 0 => Ok(Ok(unsafe { sb.assume_init() })), - 22 => Ok(Err(std::io::Error::new( - std::io::ErrorKind::InvalidData, - "Not a BCacheFS SuperBlock", - ))), - code => { - tracing::debug!(msg = "BCacheFS return error code", ?code); - Ok(Err(std::io::Error::new( - std::io::ErrorKind::Other, - "Failed to Read SuperBlock", - ))) - } - } +use crate::bcachefs::*; +use crate::errcode::bch_errcode; + +pub fn read_super_opts( + path: &std::path::Path, + mut opts: bch_opts, +) -> anyhow::Result { + use std::os::unix::ffi::OsStrExt; + let path = std::ffi::CString::new(path.as_os_str().as_bytes()).unwrap(); + + let mut sb = 
std::mem::MaybeUninit::zeroed(); + + let ret = + unsafe { crate::bcachefs::bch2_read_super(path.as_ptr(), &mut opts, sb.as_mut_ptr()) }; + + if ret != 0 { + let err: bch_errcode = unsafe { ::std::mem::transmute(ret) }; + Err(anyhow!(err)) + } else { + Ok(unsafe { sb.assume_init() }) + } } -#[tracing_attributes::instrument] -pub fn read_super(path: &std::path::Path) -> RResult { - let opts = bcachefs::bch_opts::default(); //unsafe {std::mem::MaybeUninit::zeroed().assume_init()}; - read_super_opts(path, opts) +pub fn read_super(path: &std::path::Path) -> anyhow::Result { + let opts = bcachefs::bch_opts::default(); + read_super_opts(path, opts) } diff --git a/rust-src/mount/Cargo.toml b/rust-src/mount/Cargo.toml deleted file mode 100644 index d48d4f7..0000000 --- a/rust-src/mount/Cargo.toml +++ /dev/null @@ -1,30 +0,0 @@ -[package] -name = "bcachefs-mount" -version = "0.3.1" -authors = ["Yuxuan Shui ", "Kayla Firestack "] -edition = "2018" - -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html - -[dependencies] -tracing = "0.1.26" -tracing-log = "0.1.2" -tracing-subscriber = "0.2.20" -tracing-attributes = "0.1.15" -clap = { version = "2.33", features = [ "wrap_help" ] } -anyhow = "1.0" -libc = "0.2.69" -uuid = "0.8" -udev = "0.4" -gag = "1.0.0" -getset = "0.1" -itertools = "0.9" -structopt = "0.3.23" -parse-display = "0.1" -errno = "0.2" -either = "1.5" -rpassword = "4" -camino = "1.0.5" -bch_bindgen = { path = "../bch_bindgen" } -byteorder = "1.3" - diff --git a/rust-src/mount/default.nix b/rust-src/mount/default.nix deleted file mode 100644 index dab7db7..0000000 --- a/rust-src/mount/default.nix +++ /dev/null @@ -1,41 +0,0 @@ -{ lib - -, stdenv -, glibc -, llvmPackages -, rustPlatform - -, bcachefs - -, ... -}: rustPlatform.buildRustPackage ( let - cargo = lib.trivial.importTOML ./Cargo.toml; -in { - pname = "mount.bcachefs"; - version = cargo.package.version; - - src = builtins.path { path = ../.; name = "rust-src"; }; - sourceRoot = "rust-src/mount"; - - cargoLock = { lockFile = ./Cargo.lock; }; - - nativeBuildInputs = bcachefs.bch_bindgen.nativeBuildInputs; - buildInputs = bcachefs.bch_bindgen.buildInputs; - inherit (bcachefs.bch_bindgen) - LIBBCACHEFS_INCLUDE - LIBBCACHEFS_LIB - LIBCLANG_PATH - BINDGEN_EXTRA_CLANG_ARGS; - - postInstall = '' - ln $out/bin/${cargo.package.name} $out/bin/mount.bcachefs - ln -s $out/bin $out/sbin - ''; - # -isystem ${llvmPackages.libclang.lib}/lib/clang/${lib.getVersion llvmPackages.libclang}/include"; - # CFLAGS = "-I${llvmPackages.libclang.lib}/include"; - # LDFLAGS = "-L${libcdev}"; - - doCheck = false; - - # NIX_DEBUG = 4; -}) \ No newline at end of file diff --git a/rust-src/mount/module.nix b/rust-src/mount/module.nix deleted file mode 100644 index b62aa7d..0000000 --- a/rust-src/mount/module.nix +++ /dev/null @@ -1,54 +0,0 @@ -## Mirrors: https://github.com/NixOS/nixpkgs/blob/nixos-unstable/nixos/modules/tasks/filesystems/bcachefs.nix -## with changes to use flakes and import mount.bcachefs -{ config, lib, pkgs, utils, ... 
}: - -with lib; - -let - - bootFs = filterAttrs (n: fs: (fs.fsType == "bcachefs") && (utils.fsNeededForBoot fs)) config.fileSystems; - cfg = config.filesystems.bcachefs; -in - -{ - options.filesystems.bcachefs.packages.tools = lib.mkOption { - description = "Which package to use to link in the bcachefs tools package"; - default = pkgs.bcachefs.tools; - type = lib.types.package; - }; - options.filesystems.bcachefs.packages.mount = lib.mkOption { - description = "Which package to use to link in the bcachefs mount package"; - default = pkgs.bcachefs.mount; - type = lib.types.package; - }; - options.filesystems.bcachefs.packages.kernelPackages = lib.mkOption { - description = "Which package to use to link in the kernel package to use"; - default = pkgs.bcachefs.kernelPackages; - type = lib.types.attrs; - - }; - - config = mkIf (elem "bcachefs" config.boot.supportedFilesystems) (mkMerge [ - { - system.fsPackages = [ cfg.packages.tools cfg.packages.mount ]; - - # use kernel package with bcachefs support until it's in mainline - boot.kernelPackages = cfg.packages.kernelPackages; - } - - (mkIf ((elem "bcachefs" config.boot.initrd.supportedFilesystems) || (bootFs != {})) { - # chacha20 and poly1305 are required only for decryption attempts - boot.initrd.availableKernelModules = [ "sha256" "chacha20" "poly1305" ]; - boot.initrd.kernelModules = [ "bcachefs" ]; - - boot.initrd.extraUtilsCommands = '' - copy_bin_and_libs ${cfg.packages.tools}/bin/bcachefs - copy_bin_and_libs ${cfg.packages.mount}/bin/mount.bcachefs - ''; - boot.initrd.extraUtilsCommandsTest = '' - $out/bin/bcachefs version - $out/bin/mount.bcachefs --version - ''; - }) - ]); -} diff --git a/rust-src/mount/rustfmt.toml b/rust-src/mount/rustfmt.toml deleted file mode 100644 index a2b7f32..0000000 --- a/rust-src/mount/rustfmt.toml +++ /dev/null @@ -1,2 +0,0 @@ -max_width=120 -hard_tabs = true diff --git a/rust-src/mount/src/filesystem.rs b/rust-src/mount/src/filesystem.rs deleted file mode 100644 index b1575c2..0000000 --- a/rust-src/mount/src/filesystem.rs +++ /dev/null @@ -1,208 +0,0 @@ -extern "C" { - pub static stdout: *mut libc::FILE; -} - -use getset::{CopyGetters, Getters}; -use std::path::PathBuf; -#[derive(Getters, CopyGetters)] -pub struct FileSystem { - /// External UUID of the bcachefs - #[getset(get = "pub")] - uuid: uuid::Uuid, - /// Whether filesystem is encrypted - #[getset(get_copy = "pub")] - encrypted: bool, - /// Super block - #[getset(get = "pub")] - sb: bcachefs::bch_sb_handle, - /// Member devices for this filesystem - #[getset(get = "pub")] - devices: Vec, -} -impl std::fmt::Debug for FileSystem { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("FileSystem") - .field("uuid", &self.uuid) - .field("encrypted", &self.encrypted) - .field("devices", &self.device_string()) - .finish() - } -} -use std::fmt; -impl std::fmt::Display for FileSystem { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let devs = self.device_string(); - write!( - f, - "{:?}: locked?={lock} ({}) ", - self.uuid, - devs, - lock = self.encrypted - ) - } -} - -impl FileSystem { - pub(crate) fn new(sb: bcachefs::bch_sb_handle) -> Self { - Self { - uuid: sb.sb().uuid(), - encrypted: sb.sb().crypt().is_some(), - sb: sb, - devices: Vec::new(), - } - } - - pub fn device_string(&self) -> String { - use itertools::Itertools; - self.devices.iter().map(|d| d.display()).join(":") - } - - pub fn mount( - &self, - target: impl AsRef, - options: impl AsRef, - ) -> anyhow::Result<()> { - 
tracing::info_span!("mount").in_scope(|| { - let src = self.device_string(); - let (data, mountflags) = parse_mount_options(options); - // let fstype = c_str!("bcachefs"); - - tracing::info!(msg="mounting bcachefs filesystem", target=%target.as_ref().display()); - mount_inner(src, target, "bcachefs", mountflags, data) - }) - } -} - -fn mount_inner( - src: String, - target: impl AsRef, - fstype: &str, - mountflags: u64, - data: Option, -) -> anyhow::Result<()> { - use std::{ - ffi::{c_void, CString}, - os::{raw::c_char, unix::ffi::OsStrExt}, - }; - - // bind the CStrings to keep them alive - let src = CString::new(src)?; - let target = CString::new(target.as_ref().as_os_str().as_bytes())?; - let data = data.map(CString::new).transpose()?; - let fstype = CString::new(fstype)?; - - // convert to pointers for ffi - let src = src.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; - let target = target.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; - let data = data.as_ref().map_or(std::ptr::null(), |data| { - data.as_c_str().to_bytes_with_nul().as_ptr() as *const c_void - }); - let fstype = fstype.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; - - let ret = {let _entered = tracing::info_span!("libc::mount").entered(); - tracing::info!("mounting filesystem"); - // REQUIRES: CAP_SYS_ADMIN - unsafe { libc::mount(src, target, fstype, mountflags, data) } - }; - match ret { - 0 => Ok(()), - _ => Err(crate::ErrnoError(errno::errno()).into()), - } -} - -/// Parse a comma-separated mount options and split out mountflags and filesystem -/// specific options. -#[tracing_attributes::instrument(skip(options))] -fn parse_mount_options(options: impl AsRef) -> (Option, u64) { - use either::Either::*; - tracing::debug!(msg="parsing mount options", options=?options.as_ref()); - let (opts, flags) = options - .as_ref() - .split(",") - .map(|o| match o { - "dirsync" => Left(libc::MS_DIRSYNC), - "lazytime" => Left(1 << 25), // MS_LAZYTIME - "mand" => Left(libc::MS_MANDLOCK), - "noatime" => Left(libc::MS_NOATIME), - "nodev" => Left(libc::MS_NODEV), - "nodiratime" => Left(libc::MS_NODIRATIME), - "noexec" => Left(libc::MS_NOEXEC), - "nosuid" => Left(libc::MS_NOSUID), - "ro" => Left(libc::MS_RDONLY), - "rw" => Left(0), - "relatime" => Left(libc::MS_RELATIME), - "strictatime" => Left(libc::MS_STRICTATIME), - "sync" => Left(libc::MS_SYNCHRONOUS), - "" => Left(0), - o @ _ => Right(o), - }) - .fold((Vec::new(), 0), |(mut opts, flags), next| match next { - Left(f) => (opts, flags | f), - Right(o) => { - opts.push(o); - (opts, flags) - } - }); - - use itertools::Itertools; - ( - if opts.len() == 0 { - None - } else { - Some(opts.iter().join(",")) - }, - flags, - ) -} - -use bch_bindgen::bcachefs; -use std::collections::HashMap; -use uuid::Uuid; - -#[tracing_attributes::instrument] -pub fn probe_filesystems() -> anyhow::Result> { - tracing::trace!("enumerating udev devices"); - let mut udev = udev::Enumerator::new()?; - - udev.match_subsystem("block")?; // find kernel block devices - - let mut fs_map = HashMap::new(); - let devresults = - udev.scan_devices()? - .into_iter() - .filter_map(|dev| dev.devnode().map(ToOwned::to_owned)); - - for pathbuf in devresults { - match get_super_block_uuid(&pathbuf)? 
{ - - Ok((uuid_key, superblock)) => { - let fs = fs_map.entry(uuid_key).or_insert_with(|| { - tracing::info!(msg="found bcachefs pool", uuid=?uuid_key); - FileSystem::new(superblock) - }); - - fs.devices.push(pathbuf); - }, - - Err(e) => { tracing::debug!(inner2_error=?e);} - } - } - - - tracing::info!(msg = "found filesystems", count = fs_map.len()); - Ok(fs_map) -} - -// #[tracing_attributes::instrument(skip(dev, fs_map))] -fn get_super_block_uuid(path: &std::path::Path) -> std::io::Result> { - let sb = bch_bindgen::rs::read_super(&path)?; - let super_block = match sb { - Err(e) => { return Ok(Err(e)); } - Ok(sb) => sb, - }; - - let uuid = (&super_block).sb().uuid(); - tracing::debug!(found="bcachefs superblock", devnode=?path, ?uuid); - - Ok(Ok((uuid, super_block))) -} diff --git a/rust-src/mount/src/key.rs b/rust-src/mount/src/key.rs deleted file mode 100644 index 91c92d1..0000000 --- a/rust-src/mount/src/key.rs +++ /dev/null @@ -1,97 +0,0 @@ -use tracing::info; - -fn check_for_key(key_name: &std::ffi::CStr) -> anyhow::Result { - use bch_bindgen::keyutils::{self, keyctl_search}; - let key_name = key_name.to_bytes_with_nul().as_ptr() as *const _; - let key_type = c_str!("logon"); - - let key_id = unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 0) }; - if key_id > 0 { - info!("Key has became avaiable"); - Ok(true) - } else if errno::errno().0 != libc::ENOKEY { - Err(crate::ErrnoError(errno::errno()).into()) - } else { - Ok(false) - } -} - -fn wait_for_key(uuid: &uuid::Uuid) -> anyhow::Result<()> { - let key_name = std::ffi::CString::new(format!("bcachefs:{}", uuid)).unwrap(); - loop { - if check_for_key(&key_name)? { - break Ok(()); - } - - std::thread::sleep(std::time::Duration::from_secs(1)); - } -} - -const BCH_KEY_MAGIC: &str = "bch**key"; -use crate::filesystem::FileSystem; -fn ask_for_key(fs: &FileSystem) -> anyhow::Result<()> { - use anyhow::anyhow; - use byteorder::{LittleEndian, ReadBytesExt}; - use bch_bindgen::bcachefs::{self, bch2_chacha_encrypt_key, bch_encrypted_key, bch_key}; - use std::os::raw::c_char; - - let key_name = std::ffi::CString::new(format!("bcachefs:{}", fs.uuid())).unwrap(); - if check_for_key(&key_name)? 
{ - return Ok(()); - } - - let bch_key_magic = BCH_KEY_MAGIC.as_bytes().read_u64::().unwrap(); - let crypt = fs.sb().sb().crypt().unwrap(); - let pass = rpassword::read_password_from_tty(Some("Enter passphrase: "))?; - let pass = std::ffi::CString::new(pass.trim_end())?; // bind to keep the CString alive - let mut output: bch_key = unsafe { - bcachefs::derive_passphrase( - crypt as *const _ as *mut _, - pass.as_c_str().to_bytes_with_nul().as_ptr() as *const _, - ) - }; - - let mut key = crypt.key().clone(); - let ret = unsafe { - bch2_chacha_encrypt_key( - &mut output as *mut _, - fs.sb().sb().nonce(), - &mut key as *mut _ as *mut _, - std::mem::size_of::() as u64, - ) - }; - if ret != 0 { - Err(anyhow!("chacha decryption failure")) - } else if key.magic != bch_key_magic { - Err(anyhow!("failed to verify the password")) - } else { - let key_type = c_str!("logon"); - let ret = unsafe { - bch_bindgen::keyutils::add_key( - key_type, - key_name.as_c_str().to_bytes_with_nul() as *const _ as *const c_char, - &output as *const _ as *const _, - std::mem::size_of::() as u64, - bch_bindgen::keyutils::KEY_SPEC_USER_KEYRING, - ) - }; - if ret == -1 { - Err(anyhow!("failed to add key to keyring: {}", errno::errno())) - } else { - Ok(()) - } - } -} - -#[tracing_attributes::instrument] -pub fn prepare_key(fs: &FileSystem, password: crate::KeyLocation) -> anyhow::Result<()> { - use crate::KeyLocation::*; - use anyhow::anyhow; - - tracing::info!(msg = "checking if key exists for filesystem"); - match password { - Fail => Err(anyhow!("no key available")), - Wait => Ok(wait_for_key(fs.uuid())?), - Ask => ask_for_key(fs), - } -} diff --git a/rust-src/mount/src/lib.rs b/rust-src/mount/src/lib.rs deleted file mode 100644 index 4e918e1..0000000 --- a/rust-src/mount/src/lib.rs +++ /dev/null @@ -1,91 +0,0 @@ -use anyhow::anyhow; -use structopt::StructOpt; - -pub mod err { - pub enum GError { - Unknown{ - message: std::borrow::Cow<'static, String> - } - } - pub type GResult =::core::result::Result< ::core::result::Result, OE>; - pub type Result = GResult; -} - -#[macro_export] -macro_rules! c_str { - ($lit:expr) => { - unsafe { - std::ffi::CStr::from_ptr(concat!($lit, "\0").as_ptr() as *const std::os::raw::c_char) - .to_bytes_with_nul() - .as_ptr() as *const std::os::raw::c_char - } - }; -} - -#[derive(Debug)] -struct ErrnoError(errno::Errno); -impl std::fmt::Display for ErrnoError { - fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { - self.0.fmt(f) - } -} -impl std::error::Error for ErrnoError {} - -#[derive(Debug)] -pub enum KeyLocation { - Fail, - Wait, - Ask, -} - -#[derive(Debug)] -pub struct KeyLoc(pub Option); -impl std::ops::Deref for KeyLoc { - type Target = Option; - fn deref(&self) -> &Self::Target { - &self.0 - } -} -impl std::str::FromStr for KeyLoc { - type Err = anyhow::Error; - fn from_str(s: &str) -> anyhow::Result { - // use anyhow::anyhow; - match s { - "" => Ok(KeyLoc(None)), - "fail" => Ok(KeyLoc(Some(KeyLocation::Fail))), - "wait" => Ok(KeyLoc(Some(KeyLocation::Wait))), - "ask" => Ok(KeyLoc(Some(KeyLocation::Ask))), - _ => Err(anyhow!("invalid password option")), - } - } -} - -#[derive(StructOpt, Debug)] -/// Mount a bcachefs filesystem by its UUID. -pub struct Options { - /// Where the password would be loaded from. 
- /// - /// Possible values are: - /// "fail" - don't ask for password, fail if filesystem is encrypted; - /// "wait" - wait for password to become available before mounting; - /// "ask" - prompt the user for password; - #[structopt(short, long, default_value = "")] - pub key_location: KeyLoc, - - /// External UUID of the bcachefs filesystem - pub uuid: uuid::Uuid, - - /// Where the filesystem should be mounted. If not set, then the filesystem - /// won't actually be mounted. But all steps preceeding mounting the - /// filesystem (e.g. asking for passphrase) will still be performed. - pub mountpoint: Option, - - /// Mount options - #[structopt(short, default_value = "")] - pub options: String, -} - -pub mod filesystem; -pub mod key; - -// pub fn mnt_in_use() diff --git a/rust-src/mount/src/main.rs b/rust-src/mount/src/main.rs deleted file mode 100644 index 92b6917..0000000 --- a/rust-src/mount/src/main.rs +++ /dev/null @@ -1,63 +0,0 @@ -fn main() { - // convert existing log statements to tracing events - // tracing_log::LogTracer::init().expect("logtracer init failed!"); - // format tracing log data to env_logger like stdout - tracing_subscriber::fmt::init(); - - if let Err(e) = crate::main_inner() { - tracing::error!(fatal_error = ?e); - } -} - - - -#[tracing_attributes::instrument("main")] -pub fn main_inner() -> anyhow::Result<()> { - use structopt::StructOpt; - use bcachefs_mount::{Options, filesystem, key}; - unsafe { - libc::setvbuf( - filesystem::stdout, - std::ptr::null_mut(), - libc::_IONBF, - 0, - ); - // libc::fflush(filesystem::stdout); - } - let opt = Options::from_args(); - - - tracing::trace!(?opt); - - let fss = filesystem::probe_filesystems()?; - let fs = fss - .get(&opt.uuid) - .ok_or_else(|| anyhow::anyhow!("filesystem was not found"))?; - - tracing::info!(msg="found filesystem", %fs); - if fs.encrypted() { - let key = opt - .key_location - .0 - .ok_or_else(|| anyhow::anyhow!("no keyoption specified for locked filesystem"))?; - - key::prepare_key(&fs, key)?; - } - - let mountpoint = opt - .mountpoint - .ok_or_else(|| anyhow::anyhow!("mountpoint option was not specified"))?; - - fs.mount(&mountpoint, &opt.options)?; - - Ok(()) -} - -#[cfg(test)] -mod test { - // use insta::assert_debug_snapshot; - // #[test] - // fn snapshot_testing() { - // insta::assert_debug_snapshot!(); - // } -} diff --git a/rust-src/rustfmt.toml b/rust-src/rustfmt.toml new file mode 100644 index 0000000..42f2ad7 --- /dev/null +++ b/rust-src/rustfmt.toml @@ -0,0 +1,3 @@ +# Default settings, i.e. idiomatic rust +edition = "2021" +newline_style = "Unix" \ No newline at end of file diff --git a/rust-src/src/cmd_list.rs b/rust-src/src/cmd_list.rs new file mode 100644 index 0000000..3f86b8c --- /dev/null +++ b/rust-src/src/cmd_list.rs @@ -0,0 +1,171 @@ +use atty::Stream; +use log::{error}; +use bch_bindgen::bcachefs; +use bch_bindgen::opt_set; +use bch_bindgen::fs::Fs; +use bch_bindgen::bkey::BkeySC; +use bch_bindgen::btree::BtreeTrans; +use bch_bindgen::btree::BtreeIter; +use bch_bindgen::btree::BtreeNodeIter; +use bch_bindgen::btree::BtreeIterFlags; +use clap::Parser; +use std::ffi::{CStr, OsStr, c_int, c_char}; +use std::os::unix::ffi::OsStrExt; + +fn list_keys(fs: &Fs, opt: Cli) -> anyhow::Result<()> { + let trans = BtreeTrans::new(fs); + let mut iter = BtreeIter::new(&trans, opt.btree, opt.start, + BtreeIterFlags::ALL_SNAPSHOTS| + BtreeIterFlags::PREFETCH); + + while let Some(k) = iter.peek_and_restart()? 
{ + if k.k.p > opt.end { + break; + } + + println!("{}", k.to_text(fs)); + iter.advance(); + } + + Ok(()) +} + +fn list_btree_formats(fs: &Fs, opt: Cli) -> anyhow::Result<()> { + let trans = BtreeTrans::new(fs); + let mut iter = BtreeNodeIter::new(&trans, opt.btree, opt.start, + 0, opt.level, + BtreeIterFlags::PREFETCH); + + while let Some(b) = iter.peek_and_restart()? { + if b.key.k.p > opt.end { + break; + } + + println!("{}", b.to_text(fs)); + iter.advance(); + } + + Ok(()) +} + +fn list_btree_nodes(fs: &Fs, opt: Cli) -> anyhow::Result<()> { + let trans = BtreeTrans::new(fs); + let mut iter = BtreeNodeIter::new(&trans, opt.btree, opt.start, + 0, opt.level, + BtreeIterFlags::PREFETCH); + + while let Some(b) = iter.peek_and_restart()? { + if b.key.k.p > opt.end { + break; + } + + println!("{}", BkeySC::from(&b.key).to_text(fs)); + iter.advance(); + } + + Ok(()) +} + +fn list_nodes_ondisk(fs: &Fs, opt: Cli) -> anyhow::Result<()> { + let trans = BtreeTrans::new(fs); + let mut iter = BtreeNodeIter::new(&trans, opt.btree, opt.start, + 0, opt.level, + BtreeIterFlags::PREFETCH); + + while let Some(b) = iter.peek_and_restart()? { + if b.key.k.p > opt.end { + break; + } + + println!("{}", b.ondisk_to_text(fs)); + iter.advance(); + } + + Ok(()) +} + +#[derive(Clone, clap::ValueEnum)] +enum Mode { + Keys, + Formats, + Nodes, + NodesOndisk, +} + +#[derive(Parser)] +struct Cli { + /// Btree to list from + #[arg(short, long, default_value_t=bcachefs::btree_id::BTREE_ID_extents)] + btree: bcachefs::btree_id, + + /// Btree depth to descend to (0 == leaves) + #[arg(short, long, default_value_t=0)] + level: u32, + + /// Start position to list from + #[arg(short, long, default_value="POS_MIN")] + start: bcachefs::bpos, + + /// End position + #[arg(short, long, default_value="SPOS_MAX")] + end: bcachefs::bpos, + + #[arg(short, long, default_value="keys")] + mode: Mode, + + /// Check (fsck) the filesystem first + #[arg(short, long, default_value_t=false)] + fsck: bool, + + /// Force color on/off. 
Default: autodetect tty + #[arg(short, long, action = clap::ArgAction::Set, default_value_t=atty::is(Stream::Stdout))] + colorize: bool, + + /// Verbose mode + #[arg(short, long)] + verbose: bool, + + #[arg(required(true))] + devices: Vec, +} + +fn cmd_list_inner(opt: Cli) -> anyhow::Result<()> { + let mut fs_opts: bcachefs::bch_opts = Default::default(); + + opt_set!(fs_opts, nochanges, 1); + opt_set!(fs_opts, norecovery, 1); + opt_set!(fs_opts, degraded, 1); + opt_set!(fs_opts, errors, bcachefs::bch_error_actions::BCH_ON_ERROR_continue as u8); + + if opt.fsck { + opt_set!(fs_opts, fix_errors, bcachefs::fsck_err_opts::FSCK_FIX_yes as u8); + opt_set!(fs_opts, norecovery, 0); + } + + if opt.verbose { + opt_set!(fs_opts, verbose, 1); + } + + let fs = Fs::open(&opt.devices, fs_opts)?; + + match opt.mode { + Mode::Keys => list_keys(&fs, opt), + Mode::Formats => list_btree_formats(&fs, opt), + Mode::Nodes => list_btree_nodes(&fs, opt), + Mode::NodesOndisk => list_nodes_ondisk(&fs, opt), + } +} + +#[no_mangle] +pub extern "C" fn cmd_list(argc: c_int, argv: *const *const c_char) { + let argv: Vec<_> = (0..argc) + .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) }) + .map(|i| OsStr::from_bytes(i.to_bytes())) + .collect(); + + let opt = Cli::parse_from(argv); + colored::control::set_override(opt.colorize); + if let Err(e) = cmd_list_inner(opt) { + error!("Fatal error: {}", e); + } +} diff --git a/rust-src/src/cmd_mount.rs b/rust-src/src/cmd_mount.rs new file mode 100644 index 0000000..0150ffd --- /dev/null +++ b/rust-src/src/cmd_mount.rs @@ -0,0 +1,245 @@ +use atty::Stream; +use bch_bindgen::{bcachefs, bcachefs::bch_sb_handle}; +use log::{info, debug, error, LevelFilter}; +use clap::Parser; +use uuid::Uuid; +use std::path::PathBuf; +use crate::key; +use crate::key::KeyLoc; +use crate::logger::SimpleLogger; +use std::ffi::{CStr, CString, OsStr, c_int, c_char, c_void}; +use std::os::unix::ffi::OsStrExt; + +fn mount_inner( + src: String, + target: impl AsRef, + fstype: &str, + mountflags: u64, + data: Option, +) -> anyhow::Result<()> { + + // bind the CStrings to keep them alive + let src = CString::new(src)?; + let target = CString::new(target.as_ref().as_os_str().as_bytes())?; + let data = data.map(CString::new).transpose()?; + let fstype = CString::new(fstype)?; + + // convert to pointers for ffi + let src = src.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; + let target = target.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; + let data = data.as_ref().map_or(std::ptr::null(), |data| { + data.as_c_str().to_bytes_with_nul().as_ptr() as *const c_void + }); + let fstype = fstype.as_c_str().to_bytes_with_nul().as_ptr() as *const c_char; + + let ret = { + info!("mounting filesystem"); + // REQUIRES: CAP_SYS_ADMIN + unsafe { libc::mount(src, target, fstype, mountflags, data) } + }; + match ret { + 0 => Ok(()), + _ => Err(crate::ErrnoError(errno::errno()).into()), + } +} + +/// Parse a comma-separated mount options and split out mountflags and filesystem +/// specific options. 
+fn parse_mount_options(options: impl AsRef) -> (Option, u64) { + use either::Either::*; + debug!("parsing mount options: {}", options.as_ref()); + let (opts, flags) = options + .as_ref() + .split(",") + .map(|o| match o { + "dirsync" => Left(libc::MS_DIRSYNC), + "lazytime" => Left(1 << 25), // MS_LAZYTIME + "mand" => Left(libc::MS_MANDLOCK), + "noatime" => Left(libc::MS_NOATIME), + "nodev" => Left(libc::MS_NODEV), + "nodiratime" => Left(libc::MS_NODIRATIME), + "noexec" => Left(libc::MS_NOEXEC), + "nosuid" => Left(libc::MS_NOSUID), + "relatime" => Left(libc::MS_RELATIME), + "remount" => Left(libc::MS_REMOUNT), + "ro" => Left(libc::MS_RDONLY), + "rw" => Left(0), + "strictatime" => Left(libc::MS_STRICTATIME), + "sync" => Left(libc::MS_SYNCHRONOUS), + "" => Left(0), + o @ _ => Right(o), + }) + .fold((Vec::new(), 0), |(mut opts, flags), next| match next { + Left(f) => (opts, flags | f), + Right(o) => { + opts.push(o); + (opts, flags) + } + }); + + use itertools::Itertools; + ( + if opts.len() == 0 { + None + } else { + Some(opts.iter().join(",")) + }, + flags, + ) +} + +fn mount( + device: String, + target: impl AsRef, + options: impl AsRef, +) -> anyhow::Result<()> { + let (data, mountflags) = parse_mount_options(options); + + info!( + "mounting bcachefs filesystem, {}", + target.as_ref().display() + ); + mount_inner(device, target, "bcachefs", mountflags, data) +} + +fn read_super_silent(path: &std::path::PathBuf) -> anyhow::Result { + // Stop libbcachefs from spamming the output + let _gag = gag::BufferRedirect::stdout().unwrap(); + + bch_bindgen::rs::read_super(&path) +} + +fn get_devices_by_uuid(uuid: Uuid) -> anyhow::Result> { + debug!("enumerating udev devices"); + let mut udev = udev::Enumerator::new()?; + + udev.match_subsystem("block")?; + + let devs = udev + .scan_devices()? + .into_iter() + .filter_map(|dev| dev.devnode().map(ToOwned::to_owned)) + .map(|dev| (dev.clone(), read_super_silent(&dev))) + .filter_map(|(dev, sb)| sb.ok().map(|sb| (dev, sb))) + .filter(|(_, sb)| sb.sb().uuid() == uuid) + .collect(); + Ok(devs) +} + +/// Mount a bcachefs filesystem by its UUID. +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +struct Cli { + /// Where the password would be loaded from. + /// + /// Possible values are: + /// "fail" - don't ask for password, fail if filesystem is encrypted; + /// "wait" - wait for password to become available before mounting; + /// "ask" - prompt the user for password; + #[arg(short, long, default_value = "ask", verbatim_doc_comment)] + key_location: KeyLoc, + + /// Device, or UUID= + dev: String, + + /// Where the filesystem should be mounted. If not set, then the filesystem + /// won't actually be mounted. But all steps preceeding mounting the + /// filesystem (e.g. asking for passphrase) will still be performed. + mountpoint: std::path::PathBuf, + + /// Mount options + #[arg(short, default_value = "")] + options: String, + + /// Force color on/off. 
Default: autodetect tty + #[arg(short, long, action = clap::ArgAction::Set, default_value_t=atty::is(Stream::Stdout))] + colorize: bool, + + /// Verbose mode + #[arg(short, long, action = clap::ArgAction::Count)] + verbose: u8, +} + +fn devs_str_sbs_from_uuid(uuid: String) -> anyhow::Result<(String, Vec)> { + debug!("enumerating devices with UUID {}", uuid); + + let devs_sbs = Uuid::parse_str(&uuid) + .map(|uuid| get_devices_by_uuid(uuid))??; + + let devs_str = devs_sbs + .iter() + .map(|(dev, _)| dev.to_str().unwrap()) + .collect::>() + .join(":"); + + let sbs: Vec = devs_sbs.iter().map(|(_, sb)| *sb).collect(); + + Ok((devs_str, sbs)) + +} + +fn cmd_mount_inner(opt: Cli) -> anyhow::Result<()> { + let (devs, sbs) = if opt.dev.starts_with("UUID=") { + let uuid = opt.dev.replacen("UUID=", "", 1); + devs_str_sbs_from_uuid(uuid)? + } else if opt.dev.starts_with("OLD_BLKID_UUID=") { + let uuid = opt.dev.replacen("OLD_BLKID_UUID=", "", 1); + devs_str_sbs_from_uuid(uuid)? + } else { + let mut sbs = Vec::new(); + + for dev in opt.dev.split(':') { + let dev = PathBuf::from(dev); + sbs.push(bch_bindgen::rs::read_super(&dev)?); + } + + (opt.dev, sbs) + }; + + if sbs.len() == 0 { + Err(anyhow::anyhow!("No device found from specified parameters"))?; + } else if unsafe { bcachefs::bch2_sb_is_encrypted(sbs[0].sb) } { + let key = opt + .key_location + .0 + .ok_or_else(|| anyhow::anyhow!("no keyoption specified for locked filesystem"))?; + + key::prepare_key(&sbs[0], key)?; + } + + info!( + "mounting with params: device: {}, target: {}, options: {}", + devs, + &opt.mountpoint.to_string_lossy(), + &opt.options + ); + + mount(devs, &opt.mountpoint, &opt.options)?; + Ok(()) +} + +#[no_mangle] +pub extern "C" fn cmd_mount(argc: c_int, argv: *const *const c_char) { + let argv: Vec<_> = (0..argc) + .map(|i| unsafe { CStr::from_ptr(*argv.add(i as usize)) }) + .map(|i| OsStr::from_bytes(i.to_bytes())) + .collect(); + + let opt = Cli::parse_from(argv); + + log::set_boxed_logger(Box::new(SimpleLogger)).unwrap(); + + // @TODO : more granular log levels via mount option + log::set_max_level(match opt.verbose { + 0 => LevelFilter::Warn, + 1 => LevelFilter::Trace, + 2_u8..=u8::MAX => todo!(), + }); + + colored::control::set_override(opt.colorize); + if let Err(e) = cmd_mount_inner(opt) { + error!("Fatal error: {}", e); + } else { + info!("Successfully mounted"); + } +} diff --git a/rust-src/src/key.rs b/rust-src/src/key.rs new file mode 100644 index 0000000..2b4fc45 --- /dev/null +++ b/rust-src/src/key.rs @@ -0,0 +1,129 @@ +use log::{info}; +use bch_bindgen::bcachefs::bch_sb_handle; +use crate::c_str; +use anyhow::anyhow; + +#[derive(Clone, Debug)] +pub enum KeyLocation { + Fail, + Wait, + Ask, +} + +#[derive(Clone, Debug)] +pub struct KeyLoc(pub Option); +impl std::ops::Deref for KeyLoc { + type Target = Option; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl std::str::FromStr for KeyLoc { + type Err = anyhow::Error; + fn from_str(s: &str) -> anyhow::Result { + match s { + "" => Ok(KeyLoc(None)), + "fail" => Ok(KeyLoc(Some(KeyLocation::Fail))), + "wait" => Ok(KeyLoc(Some(KeyLocation::Wait))), + "ask" => Ok(KeyLoc(Some(KeyLocation::Ask))), + _ => Err(anyhow!("invalid password option")), + } + } +} + +fn check_for_key(key_name: &std::ffi::CStr) -> anyhow::Result { + use bch_bindgen::keyutils::{self, keyctl_search}; + let key_name = key_name.to_bytes_with_nul().as_ptr() as *const _; + let key_type = c_str!("logon"); + + let key_id = unsafe { keyctl_search(keyutils::KEY_SPEC_USER_KEYRING, key_type, key_name, 
0) }; + if key_id > 0 { + info!("Key has became available"); + Ok(true) + } else if errno::errno().0 != libc::ENOKEY { + Err(crate::ErrnoError(errno::errno()).into()) + } else { + Ok(false) + } +} + +fn wait_for_key(uuid: &uuid::Uuid) -> anyhow::Result<()> { + let key_name = std::ffi::CString::new(format!("bcachefs:{}", uuid)).unwrap(); + loop { + if check_for_key(&key_name)? { + break Ok(()); + } + + std::thread::sleep(std::time::Duration::from_secs(1)); + } +} + +const BCH_KEY_MAGIC: &str = "bch**key"; +fn ask_for_key(sb: &bch_sb_handle) -> anyhow::Result<()> { + use bch_bindgen::bcachefs::{self, bch2_chacha_encrypt_key, bch_encrypted_key, bch_key}; + use byteorder::{LittleEndian, ReadBytesExt}; + use std::os::raw::c_char; + + let key_name = std::ffi::CString::new(format!("bcachefs:{}", sb.sb().uuid())).unwrap(); + if check_for_key(&key_name)? { + return Ok(()); + } + + let bch_key_magic = BCH_KEY_MAGIC.as_bytes().read_u64::().unwrap(); + let crypt = sb.sb().crypt().unwrap(); + let pass = if atty::is(atty::Stream::Stdin) { + rpassword::read_password_from_tty(Some("Enter passphrase: "))? + } else { + let mut line = String::new(); + std::io::stdin().read_line(&mut line)?; + line + }; + let pass = std::ffi::CString::new(pass.trim_end())?; // bind to keep the CString alive + let mut output: bch_key = unsafe { + bcachefs::derive_passphrase( + crypt as *const _ as *mut _, + pass.as_c_str().to_bytes_with_nul().as_ptr() as *const _, + ) + }; + + let mut key = crypt.key().clone(); + let ret = unsafe { + bch2_chacha_encrypt_key( + &mut output as *mut _, + sb.sb().nonce(), + &mut key as *mut _ as *mut _, + std::mem::size_of::() as usize, + ) + }; + if ret != 0 { + Err(anyhow!("chacha decryption failure")) + } else if key.magic != bch_key_magic { + Err(anyhow!("failed to verify the password")) + } else { + let key_type = c_str!("logon"); + let ret = unsafe { + bch_bindgen::keyutils::add_key( + key_type, + key_name.as_c_str().to_bytes_with_nul() as *const _ as *const c_char, + &output as *const _ as *const _, + std::mem::size_of::() as usize, + bch_bindgen::keyutils::KEY_SPEC_USER_KEYRING, + ) + }; + if ret == -1 { + Err(anyhow!("failed to add key to keyring: {}", errno::errno())) + } else { + Ok(()) + } + } +} + +pub fn prepare_key(sb: &bch_sb_handle, password: KeyLocation) -> anyhow::Result<()> { + info!("checking if key exists for filesystem {}", sb.sb().uuid()); + match password { + KeyLocation::Fail => Err(anyhow!("no key available")), + KeyLocation::Wait => Ok(wait_for_key(&sb.sb().uuid())?), + KeyLocation::Ask => ask_for_key(sb), + } +} diff --git a/rust-src/src/lib.rs b/rust-src/src/lib.rs new file mode 100644 index 0000000..159d049 --- /dev/null +++ b/rust-src/src/lib.rs @@ -0,0 +1,24 @@ +pub mod key; +pub mod logger; +pub mod cmd_mount; +pub mod cmd_list; + +#[macro_export] +macro_rules! 
c_str { + ($lit:expr) => { + unsafe { + std::ffi::CStr::from_ptr(concat!($lit, "\0").as_ptr() as *const std::os::raw::c_char) + .to_bytes_with_nul() + .as_ptr() as *const std::os::raw::c_char + } + }; +} + +#[derive(Debug)] +struct ErrnoError(errno::Errno); +impl std::fmt::Display for ErrnoError { + fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> { + self.0.fmt(f) + } +} +impl std::error::Error for ErrnoError {} diff --git a/rust-src/src/logger.rs b/rust-src/src/logger.rs new file mode 100644 index 0000000..2cd7b36 --- /dev/null +++ b/rust-src/src/logger.rs @@ -0,0 +1,28 @@ +use colored::Colorize; +use log::{Level, Metadata, Record}; + +pub struct SimpleLogger; + +impl log::Log for SimpleLogger { + fn enabled(&self, _: &Metadata) -> bool { + true + } + + fn log(&self, record: &Record) { + let debug_prefix = match record.level() { + Level::Error => "ERROR".bright_red(), + Level::Warn => "WARN".bright_yellow(), + Level::Info => "INFO".green(), + Level::Debug => "DEBUG".bright_blue(), + Level::Trace => "TRACE".into(), + }; + println!( + "{} - {}: {}", + debug_prefix, + record.module_path().unwrap_or_default().bright_black(), + record.args() + ); + } + + fn flush(&self) {} +} diff --git a/shell.nix b/shell.nix deleted file mode 100644 index fc7929d..0000000 --- a/shell.nix +++ /dev/null @@ -1,18 +0,0 @@ -{ kversion ? "linux_5_15" -, pkgs ? import {} }: - -with pkgs; - -let - tools = pkgs.callPackage ./default.nix { doCheck = false ;} ; -in -mkShell { - buildInputs = [ - linuxKernel.packages.${kversion}.perf - gdb - ccls # code completion in neovim/emacs - ]; - inputsFrom = [ - tools - ]; -} diff --git a/smoke_test b/smoke_test deleted file mode 100755 index 1122808..0000000 --- a/smoke_test +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -# -# This is a smoke test of bcachefs-tools. -# -# It builds the source with multiple options (debug, release, valgrind, FUSE) -# and runs the test suite. -# -# Returns 0 on success, nonzero on any failure. -# -# Dependencies: -# -# valgrind, python3-pytest, python3-pytest-xdist -# -# On debian/ubuntu based systems, install with: -# -# apt install valgrind python3-pytest python3-pytest-xdist -# -# You also currently need fuse 3.7 or later. Fuse 3.7 unfortunately requires -# debian sid or bullseye at this time, so you may need to install from source. - -set -e - -PYTEST="${PYTEST:-pytest-3}" -spam=$(mktemp) -unset BCACHEFS_FUSE BCACHEFS_TEST_USE_VALGRIND BCACHEFS_DEBUG - -trap "set +x; cat ${spam}; rm -f ${spam} ; echo; echo FAILED." EXIT - -echo -- Verify dependencies -- -pkg-config --atleast-version 3.7.0 fuse3 -python3 -c "import pytest" -python3 -c "import xdist" -which valgrind > /dev/null -echo OK - -JOBS=$(nproc) -function build() { - echo Building. - make -j ${JOBS} clean > ${spam} 2>&1 - make -j ${JOBS} tests bcachefs > ${spam} 2>&1 - truncate -s0 ${spam} -} - -function test() { - echo Running tests. - ( - ${PYTEST} -n${JOBS} - ) > ${spam} 2>&1 -} - -function test_vg() { - echo Running tests with valgrind. - ( - export BCACHEFS_TEST_USE_VALGRIND=yes - ${PYTEST} -n${JOBS} - ) > ${spam} 2>&1 -} - - -echo -- Test: default -- -build -test - -echo -- Test: debug -- -export BCACHEFS_DEBUG=1 -build -test - -echo -- Test: debug with valgrind -- -test_vg - -#echo -- Test: fuse debug -- -#export BCACHEFS_FUSE=1 -#build -#test - -#echo -- Test: fuse debug with valgrind -- -#test_vg - -rm -f ${spam} -trap "set +x; echo; echo SUCCESS." 
EXIT diff --git a/tools-util.c b/tools-util.c index f29d202..624656a 100644 --- a/tools-util.c +++ b/tools-util.c @@ -330,21 +330,21 @@ struct fiemap_extent fiemap_iter_next(struct fiemap_iter *iter) { struct fiemap_extent e; - BUG_ON(iter->idx > iter->f.fm_mapped_extents); + BUG_ON(iter->idx > iter->f->fm_mapped_extents); - if (iter->idx == iter->f.fm_mapped_extents) { - xioctl(iter->fd, FS_IOC_FIEMAP, &iter->f); + if (iter->idx == iter->f->fm_mapped_extents) { + xioctl(iter->fd, FS_IOC_FIEMAP, iter->f); - if (!iter->f.fm_mapped_extents) + if (!iter->f->fm_mapped_extents) return (struct fiemap_extent) { .fe_length = 0 }; iter->idx = 0; } - e = iter->f.fm_extents[iter->idx++]; + e = iter->f->fm_extents[iter->idx++]; BUG_ON(!e.fe_length); - iter->f.fm_start = e.fe_logical + e.fe_length; + iter->f->fm_start = e.fe_logical + e.fe_length; return e; } @@ -605,21 +605,72 @@ int dev_mounted(char *dev) return 2; } +static int kstrtoull_symbolic(const char *s, unsigned int base, unsigned long long *res) +{ + if (!strcmp(s, "U64_MAX")) { + *res = U64_MAX; + return 0; + } + + if (!strcmp(s, "U32_MAX")) { + *res = U32_MAX; + return 0; + } + + return kstrtoull(s, base, res); +} + +static int kstrtouint_symbolic(const char *s, unsigned int base, unsigned *res) +{ + unsigned long long tmp; + int rv; + + rv = kstrtoull_symbolic(s, base, &tmp); + if (rv < 0) + return rv; + if (tmp != (unsigned long long)(unsigned int)tmp) + return -ERANGE; + *res = tmp; + return 0; +} + struct bpos bpos_parse(char *buf) { - char *s = buf, *field; + char *orig = strdup(buf); + char *s = buf; + + char *inode_s = strsep(&s, ":"); + char *offset_s = strsep(&s, ":"); + char *snapshot_s = strsep(&s, ":"); + + if (!inode_s || !offset_s || s) + die("invalid bpos %s", orig); + free(orig); + u64 inode_v = 0, offset_v = 0; + u32 snapshot_v = 0; + if (kstrtoull_symbolic(inode_s, 10, &inode_v)) + die("invalid bpos.inode %s", inode_s); - if (!(field = strsep(&s, ":")) || - kstrtoull(field, 10, &inode_v)) - die("invalid bpos %s", buf); + if (kstrtoull_symbolic(offset_s, 10, &offset_v)) + die("invalid bpos.offset %s", offset_s); - if ((field = strsep(&s, ":")) && - kstrtoull(field, 10, &offset_v)) - die("invalid bpos %s", buf); + if (snapshot_s && + kstrtouint_symbolic(snapshot_s, 10, &snapshot_v)) + die("invalid bpos.snapshot %s", snapshot_s); - if (s) - die("invalid bpos %s", buf); + return (struct bpos) { .inode = inode_v, .offset = offset_v, .snapshot = snapshot_v }; +} + +struct bbpos bbpos_parse(char *buf) +{ + char *s = buf, *field; + struct bbpos ret; + + if (!(field = strsep(&s, ":"))) + die("invalid bbpos %s", buf); - return (struct bpos) { .inode = inode_v, .offset = offset_v }; + ret.btree = read_string_list_or_die(field, bch2_btree_ids, "btree id"); + ret.pos = bpos_parse(s); + return ret; } diff --git a/tools-util.h b/tools-util.h index d1122f5..e7bdd2c 100644 --- a/tools-util.h +++ b/tools-util.h @@ -18,6 +18,8 @@ #include #include #include +#include "libbcachefs/bcachefs.h" +#include "libbcachefs/bbpos.h" #include "libbcachefs/darray.h" #define noreturn __attribute__((noreturn)) @@ -113,8 +115,7 @@ static inline struct range hole_iter_next(struct hole_iter *iter) #include struct fiemap_iter { - struct fiemap f; - struct fiemap_extent fe[1024]; + struct fiemap *f; unsigned idx; int fd; }; @@ -123,11 +124,20 @@ static inline void fiemap_iter_init(struct fiemap_iter *iter, int fd) { memset(iter, 0, sizeof(*iter)); - iter->f.fm_extent_count = ARRAY_SIZE(iter->fe); - iter->f.fm_length = FIEMAP_MAX_OFFSET; + iter->f = 
xmalloc(sizeof(struct fiemap) + + sizeof(struct fiemap_extent) * 1024); + + iter->f->fm_extent_count = 1024; + iter->f->fm_length = FIEMAP_MAX_OFFSET; iter->fd = fd; } +static inline void fiemap_iter_exit(struct fiemap_iter *iter) +{ + free(iter->f); + memset(iter, 0, sizeof(*iter)); +} + struct fiemap_extent fiemap_iter_next(struct fiemap_iter *); #define fiemap_for_each(fd, iter, extent) \ @@ -159,5 +169,6 @@ do { \ }) struct bpos bpos_parse(char *); +struct bbpos bbpos_parse(char *); #endif /* _TOOLS_UTIL_H */ -- 2.39.2
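
The tools-util.c hunk above reworks bpos_parse() to take an inode:offset[:snapshot] string whose fields may also be the symbolic values U64_MAX or U32_MAX, and adds bbpos_parse() for positions prefixed with a btree name. The short standalone C sketch below only illustrates that position syntax; parse_field() is a simplified stand-in for the new kstrtoull_symbolic() helper and is not code from the patch.

#define _GNU_SOURCE
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Simplified stand-in for the kstrtoull_symbolic() helper added above. */
static uint64_t parse_field(const char *s)
{
	if (!strcmp(s, "U64_MAX"))
		return UINT64_MAX;
	if (!strcmp(s, "U32_MAX"))
		return UINT32_MAX;
	return strtoull(s, NULL, 10);
}

int main(void)
{
	/* Position strings in the inode:offset[:snapshot] form bpos_parse() accepts. */
	const char *examples[] = {
		"4096:0",                  /* snapshot omitted, defaults to 0 */
		"4096:U64_MAX",            /* symbolic offset */
		"U64_MAX:U64_MAX:U32_MAX", /* every field at its maximum */
	};

	for (size_t i = 0; i < sizeof(examples) / sizeof(examples[0]); i++) {
		char *buf = strdup(examples[i]);
		char *s = buf;
		const char *inode    = strsep(&s, ":");
		const char *offset   = strsep(&s, ":");
		const char *snapshot = strsep(&s, ":");   /* NULL when omitted */

		printf("%-26s -> inode=%" PRIu64 " offset=%" PRIu64 " snapshot=%" PRIu32 "\n",
		       examples[i], parse_field(inode), parse_field(offset),
		       (uint32_t)(snapshot ? parse_field(snapshot) : 0));
		free(buf);
	}
	return 0;
}

In the patch itself, malformed strings are rejected with die() rather than being silently skipped, and bbpos_parse() first resolves a leading btree name against bch2_btree_ids before handing the remainder to bpos_parse(), so a full btree position reads, for example, extents:4096:0.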